# Set up

In [29]:
from TendersWA.Models import Embedding_Model as em
from TendersWA.Preprocessing import Text as text
import os
import pandas as pd
import numpy as np

In [2]:
# Instantiate the project's sentence-embedding wrapper. This object is later passed to
# query_tenders, which calls it on the query string to get the query embedding.
model = em.Sentence_transformer()

In [24]:
# Load the precomputed tender embeddings. np.load on an .npz archive returns a lazy
# NpzFile that keeps the underlying file open, so every array is read inside a `with`
# block and the handle is released as soon as the list has been materialised.
with np.load('../data/embedding_data/sent_transformer_embeddings.npz') as embedding_archive:
    embs = [embedding_archive[array_name] for array_name in embedding_archive.files]

# One row per stored array, in archive order; the reference numbers are attached
# positionally in a later cell, so this order must not be changed.
tender_embedding_df = pd.DataFrame({"Embedding": embs})

In [30]:
import pandas as pd
tenders_structured_path = r"../data/UpdatedAgainTenders.xlsx"

# Read the structured tender export. "Reference Number" is forced to str so it can be
# matched against the string references parsed from the embeddings' reference file.
# Only the columns needed for display are kept; rows without a reference number are
# dropped, and only the first row of each duplicated reference number is kept.
tenders_structured = (
    pd.read_excel(tenders_structured_path, dtype={"Reference Number": str})
    .loc[:, ["Reference Number", "Contract Title", "Description"]]
    .dropna(subset=["Reference Number"])
    .drop_duplicates(subset=["Reference Number"])
)

# Clean the descriptions: strip HTML markup from every description in a single pass
# rather than iterating rows and writing each value back individually.
tenders_structured["Description"] = tenders_structured["Description"].map(text.remove_html_tags)

In [18]:
# Read the tender reference numbers that label the stored embeddings.
# NOTE(review): the first line appears to be a printed Python list of quoted
# reference numbers (e.g. "['A1', 'B2']") -- confirm against whatever wrote the file.
# Only the first line is used, matching the original behaviour.
with open("../data/embedding_data/tender_references.txt", "r") as tender_refs:
    content = tender_refs.readline()

# Strip the list punctuation. The closing "]" must be removed as well as the opening
# "[", otherwise the last reference keeps a trailing "]" and never matches in the merge.
for list_punctuation in ("[", "]", "'", ","):
    content = content.replace(list_punctuation, "")

# split() with no argument also discards the trailing newline and any empty tokens
# produced by repeated spaces, so len(refs) equals the number of references.
refs = content.split()

In [31]:
# Label each embedding row with its reference number (positional: refs[i] belongs to
# row i), then left-join the tender metadata so every embedding row is kept even when
# no matching tender record exists.
tender_embedding_df["Reference Number"] = refs
merged_data = pd.merge(
    tender_embedding_df,
    tenders_structured,
    how="left",
    on="Reference Number",
)

In [97]:
def mmr(query_emb, embs, d = 0.5, n = 10):
    """Pick up to ``n`` embeddings by greedy Maximal Marginal Relevance (MMR).

    Each round selects the not-yet-picked embedding with the highest score

        d * sim(query, emb) - (1 - d) * max(0, max_{p in picked} sim(emb, p))

    where ``sim`` is ``em.cosine_sim``. The redundancy term starts at 0, so negative
    similarities to already-picked items never raise a candidate's score.

    Args:
        query_emb: Embedding of the query, in whatever form ``em.cosine_sim`` accepts.
        embs: Indexable, sized collection of candidate embeddings (list or 2-D array).
        d: Trade-off weight; higher favours query relevance, lower favours diversity.
        n: Maximum number of embeddings to pick.

    Returns:
        List of positions into ``embs`` in the order they were picked. It holds
        ``min(n, len(embs))`` entries (fewer only if no candidate has a comparable
        score, e.g. every remaining score is NaN) and is empty when ``embs`` is empty
        or ``n <= 0``.
    """
    num_embs = len(embs)
    # Never ask for more picks than there are candidates. Without this cap the
    # "nothing found" sentinel (-1) would be used as a real index.
    num_to_pick = max(0, min(n, num_embs))
    if num_to_pick == 0:
        return []

    # Similarity to the query does not change between rounds, so compute it once.
    query_sims = [em.cosine_sim(query_emb, emb) for emb in embs]
    # Running maximum similarity of each candidate to anything picked so far
    # (floored at 0). Updating it after each pick gives the same value as
    # re-scanning every picked embedding for every candidate.
    max_sim_to_picked = [0] * num_embs
    picked = [False] * num_embs
    picked_indices = []

    for _ in range(num_to_pick):
        best_score = float("-inf")
        best_index = -1
        for index in range(num_embs):
            if picked[index]:  # skip embeddings that were already selected
                continue
            score = d * query_sims[index] - (1 - d) * max_sim_to_picked[index]
            # Strict '>' keeps the earliest candidate on ties.
            if score > best_score:
                best_score = score
                best_index = index

        if best_index < 0:
            # No candidate produced a comparable score; stop instead of emitting -1.
            break

        picked[best_index] = True
        picked_indices.append(best_index)
        if len(picked_indices) == num_to_pick:
            break

        # Fold the newly picked embedding into every remaining candidate's running max.
        newest = embs[best_index]
        for index in range(num_embs):
            if picked[index]:
                continue
            sim_to_newest = em.cosine_sim(embs[index], newest)
            if sim_to_newest > max_sim_to_picked[index]:
                max_sim_to_picked[index] = sim_to_newest
    return picked_indices

from sklearn.metrics.pairwise import cosine_similarity
def query_tenders(query, model, merged_data, top_k=10, algorithm = "top", d = 0.5):
    """Return the tenders that best match a free-text query.

    Args:
        query: Query text, passed as-is to ``model``.
        model: Callable that maps the query to its embedding. It needs to be the same
            model that generated the tender embeddings.
        merged_data: DataFrame with an "Embedding" column (one array per row) plus
            "Reference Number", "Contract Title" and "Description".
        top_k: Maximum number of rows to return; values <= 0 return no rows.
        algorithm: "top" ranks purely by cosine similarity to the query. Any other
            value selects results with ``mmr``.
        d: Relevance/diversity trade-off forwarded to ``mmr``; ignored for "top".

    Returns:
        DataFrame with the "Reference Number", "Contract Title" and "Description"
        columns of the selected rows, best match first.
    """
    # Encode the query, model needs to be the same that generated tender embeddings
    query_embedding = model(query)

    # Stack the per-row embedding arrays into one (rows x dims) matrix.
    unpacked_embs = np.vstack(merged_data["Embedding"].to_list())

    if algorithm == "top":
        similarities = cosine_similarity(query_embedding.reshape(1, -1), unpacked_embs)
        # Reverse to descending order first, then truncate. Slicing the ascending
        # order with [-top_k:] would return every row when top_k == 0.
        found_indices = similarities[0].argsort()[::-1][:max(top_k, 0)]
    else:
        found_indices = mmr(query_embedding, unpacked_embs, d, top_k)
    return merged_data.iloc[found_indices][["Reference Number", "Contract Title", "Description"]]

# Querying

Run the same query with plain top-k ranking and with MMR re-ranking to compare the results.

In [96]:
# Baseline: rank purely by cosine similarity to the query (default top_k of 10).
query_tenders("hospital plumbing", model, merged_data, algorithm = "top")

Unnamed: 0,Reference Number,Contract Title,Description
18441,SMHS202311395,Fremantle Hospital - Hydraulic Fixture Audit a...,"South Metropolitan Health Service, Fremantle H..."
19692,WACHS20205659,Panel Contract for Plumbing Services to WACHS ...,WACHS Goldfields requires plumbing services ac...
19341,WACHS202310069,Panel Contract for Plumbing Services to WA Cou...,The WA Country Health Service South West (WACH...
10647,EMHS202210377,Water Filtration System for South Campus Mains...,East Metropolitan Health Service (EMHS) requir...
14177,GRA20200103,Provision of Replacement Evaporative Air Condi...,North Metropolitan Health Service (NMHS) requi...
19673,WACHS20194721,Patient Bathroom Conversion to Universal Acces...,WACHS-SW Infrastructure requires a contractor ...
19483,WACHS20229235,Repairs To Water Supply Issues to Fitzroy Cros...,Repairs To Water Supply Issues to Fitzroy Cros...
19227,FINW0390421,WA Country Health Service - Margaret River Hos...,WA Country Health Service - Margaret River Hos...
10775,EMHS20217567,Replacement of Emergency Domestic Water Line A...,EMHS requires damaged cast steel emergency dom...
19348,WACHS202311112,Patient Bathroom Upgrade at Boyup Brook Hospital,WA Country Health Service South West requires ...


In [93]:
# Same query selected with MMR; d = 0.7 weights query relevance at 0.7 and the
# penalty for similarity to already-selected tenders at 0.3.
query_tenders("hospital plumbing", model, merged_data, algorithm = "mmr", d = 0.7)

Unnamed: 0,Reference Number,Contract Title,Description
18441,SMHS202311395,Fremantle Hospital - Hydraulic Fixture Audit a...,"South Metropolitan Health Service, Fremantle H..."
14177,GRA20200103,Provision of Replacement Evaporative Air Condi...,North Metropolitan Health Service (NMHS) requi...
19692,WACHS20205659,Panel Contract for Plumbing Services to WACHS ...,WACHS Goldfields requires plumbing services ac...
19673,WACHS20194721,Patient Bathroom Conversion to Universal Acces...,WACHS-SW Infrastructure requires a contractor ...
10625,EMHS202311376,SUPPLY OF HEMOFILTRATION FLUIDS,SUPPLY OF HEMOFILTRATION FLUIDS RPH ICU
6130,DOH202210703,Department of Health - Maintenance of Medical ...,Department of Health - Maintenance of Medical ...
18567,SMHS20182363,Supply of Haemodialysis Machines and Associate...,The South Metropolitan Health Service (SMHS) h...
10764,20023951016,Armadale Health Service_Negative Pressure Room...,To carry out works in existing patient rooms t...
527,CAHS19029,Patient Meals Kitchen High Level Cleaning & Co...,Patient Meals Kitchen High Level Cleaning & Co...
13963,NMHS202210645,"Supply, Delivery, Installation, Commissioning ...",North Metropolitan Health Service (NMHS) requi...
