In [16]:
from sentence_transformers import SentenceTransformer, util
import os
import zipfile
import shutil
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import BertTokenizer
from sklearn.decomposition import PCA
from nltk import tokenize

In [12]:
# Load the pretrained sentence-embedding model used for every encode() call below.
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [4]:
# Extract the tender request documents from the zip archives (using Mitch's code).

def search(search_path):
    """Find tender request documents inside the zip archives under ``search_path``.

    Walks ``search_path`` recursively. For every file ending in ``.zip`` the
    archive members whose path contains "request" (case-insensitive) and ends
    in ``.doc`` or ``.docx`` are recorded under the archive's reference.

    The reference is the archive file name with the ``-specification.zip``
    suffix removed; archives that do not follow that naming scheme fall back
    to the file name without its ``.zip`` extension.

    Archives that cannot be opened as zip files are reported and skipped so
    that one corrupt download does not abort the whole scan.

    Args:
        search_path: Directory to scan recursively.

    Returns:
        dict mapping each reference to a list of
        ``(member name, zip file path)`` tuples. References without any
        matching member are omitted.
    """
    spec_suffix = "-specification.zip"
    ref_dict = {}

    # recursively through all files and folders
    for root, dirs, files in os.walk(search_path):
        for filename in files:
            if not filename.endswith(".zip"):
                continue

            # get ref num
            if filename.endswith(spec_suffix):
                ref = filename[:-len(spec_suffix)]
            else:
                # Not a "<ref>-specification.zip" archive: drop only the extension
                # so the reference never carries a trailing ".zip".
                ref = os.path.splitext(filename)[0]
            file_path = os.path.join(root, filename)

            # open read zip
            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    file_list = zip_ref.namelist()
            except zipfile.BadZipFile:
                print(f"Skipping unreadable ZIP file: {file_path}")
                continue

            doc_files = [
                file for file in file_list
                if "request" in file.lower() and file.lower().endswith(('.doc', '.docx'))
            ]

            if doc_files:
                # add sub files, if any
                ref_dict.setdefault(ref, []).extend((doc_name, file_path) for doc_name in doc_files)

    for ref, doc_list in ref_dict.items():
        print(f"Reference: {ref}")
        for doc_name, file_path in doc_list:
            print(f"Document Name: {doc_name}, ZIP File Path: {file_path}")

    return ref_dict

###########################################
### filter for one relevant doc per ref ###
###########################################

def copy(copy_path, ref_dict):
    """Extract the documents listed in ``ref_dict`` into ``copy_path``.

    Each document is written as ``<ref><ext>``, where ``<ext>`` is the
    lower-cased extension of the archive member (``.doc`` or ``.docx``).
    Keeping the real extension stops a legacy ``.doc`` file from being saved
    under a ``.docx`` name, where it could overwrite a readable ``.docx`` for
    the same reference. Members without an extension fall back to ``.docx``.

    Several documents with the same reference and extension still overwrite
    each other; the last one in the list wins.

    Args:
        copy_path: Output directory; created if it does not exist.
        ref_dict: Mapping of reference -> list of
            ``(member name, zip file path)`` tuples, as returned by ``search``.

    Returns:
        None.
    """
    # make output folder, if not already there
    os.makedirs(copy_path, exist_ok=True)

    for ref, doc_list in ref_dict.items():
        for doc_name, file_path in doc_list:
            extension = os.path.splitext(doc_name)[1].lower() or ".docx"
            extracted_path = os.path.join(copy_path, f"{ref}{extension}")

            # copy relevant docs from zip to output folder
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                if doc_name not in zip_ref.namelist():
                    # Member is not in this archive: nothing to copy.
                    continue
                with zip_ref.open(doc_name) as source, open(extracted_path, "wb") as target:
                    shutil.copyfileobj(source, target)
            print(f"Extracted: {doc_name} from {file_path} to {extracted_path}")

# Windows-style relative paths. Both are raw strings so the backslashes stay
# literal: in a normal string "\.", "\d" and "\T" are invalid escape sequences
# that Python only tolerates with a warning.
zip_files_path = r"..\..\..\data\Tenders\Tenders"

# Folder that receives the extracted request documents.
output_folder = r"..\..\..\data\tender_docs_extracted_"

def extract(search_path, copy_path):
    """Run ``search`` on ``search_path`` and hand its result to ``copy`` for ``copy_path``."""
    copy(copy_path, search(search_path))

# Run the extraction: scan the tender archives and write the documents to the output folder.
extract(search_path=zip_files_path, copy_path=output_folder)

In [5]:
from docx import Document

def extract_doc_by_header(docx_path, target_header):
    """Return the text of the section that starts at the heading ``target_header``.

    A paragraph counts as a heading when its style name starts with
    "Heading". Capturing starts at a heading whose text equals
    ``target_header`` and continues paragraph by paragraph. The first heading
    with a different text is still appended to the result and then ends the
    capture, so the returned text finishes with the following heading's text.

    Args:
        docx_path: Path of the .docx file to read.
        target_header: Exact heading text that opens the wanted section.

    Returns:
        The captured paragraphs (heading first) joined with newlines, or an
        empty string when no matching heading exists.
    """
    collected = []
    active_header = None
    capturing = False

    for para in Document(docx_path).paragraphs:
        is_heading = para.style.name.startswith("Heading")
        text = para.text

        if is_heading and text == target_header:
            # Start (or restart) capturing at the wanted heading.
            active_header = text
            capturing = True
            collected.append(text)
        elif capturing:
            collected.append(text)

        # Any heading other than the active one closes the section; its text
        # has already been appended by the branch above.
        if is_heading and text != active_header:
            capturing = False

    return "\n".join(collected)


In [6]:
# Pull the section under the `header` heading out of every extracted document.
header = "Background"
documents = []
failed = 0  # number of files that could not be read

# sorted() fixes the processing order, since os.listdir() returns entries in
# arbitrary order.
for item in sorted(os.listdir(output_folder)):
    link = os.path.join(output_folder, item)
    try:
        documents.append(extract_doc_by_header(link, header))
    except Exception as exc:
        # Best effort: count and report the unreadable file, then move on.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        failed += 1
        print(f"Could not read {link}: {exc}")

In [7]:
# Drop the first 11 and the last 20 characters of every extracted section.
# NOTE(review): presumably the 11 leading characters are the "Background"
# heading plus its newline, and the 20 trailing characters are the text of the
# following heading. TODO confirm against the source documents.
clipped_documents = [item[11:-20] for item in documents]
clipped_documents

['The Perth Theatre Trust (PTT) is a statutory authority, established and constituted under the Perth Theatre Trust Act 1979, to manage and operate several theatres and entertainment/art centres across Western Australia (WA). \nTo support the local arts sector and provide an opportunity for audience enhancement following COVID-19, PTT has identified a requirement for the capability to broadcast/stream live performances to the world.\nPTT is seeking offers from suitably experienced Respondents for the supply of live streaming broadcast systems for five of its Perth-based and Regional venues, and one roaming system. \nRefer to Schedule 2 – Specification / Statement of Requirements and any attachment to that Schedule for further details.',
 '',
 '',
 'The Art Gallery of Western Australia (AGWA, the Customer) requires the services of a consultant to assist with the day to day running of AGWA Rooftop Bar and AGWA Café and Bar.\nThe Rooftop Bar opens three nights per week Friday through Sund

In [13]:
# Quick check of the model on two example sentences.
sample_passages = [
    'London has 9,787,426 inhabitants at the 2011 census',
    'London is known for its finacial district',
]
passage_embedding = model.encode(sample_passages)

In [14]:
# Length of a single sentence embedding.
passage_embedding.shape[1]

768

In [17]:
# Split every non-empty clipped document into sentences and encode them;
# each entry of `outputs` is the encode() result for one document.
with torch.no_grad():
    outputs = [
        model.encode(tokenize.sent_tokenize(doc))
        for doc in clipped_documents
        if doc != ''
    ]

In [18]:
# Save one array per document under the keys arr_0, arr_1, ... (the names
# np.savez assigns to positional arrays).
np.savez(
    'sbert_embeddings.npz',
    **{f'arr_{idx}': doc_embeddings for idx, doc_embeddings in enumerate(outputs)},
)