In [2]:
from sklearn.metrics.pairwise import cosine_similarity

In [170]:
import numpy as np
# concat to df
# sorted_inds are similarity ranks
# df and sims should have same # of rows
def form_most_similar(df, sorted_inds, n):
    # put a graph index in
    df["graphID"] = np.arange(len(df))
    # put similarity cols in
    for i in range(n):
        col_name = "top_sim_" + str(i)
        df[col_name] = -1
        col_name = "bot_sim_" + str(i)
        df[col_name] = -1
        
    # direction: top - get the top n, bot - get the bot n
    # assumes row is sorted smallest to biggest
    # will ignore row_i if it sees it
    def get_n(row, row_i, n, direction):
        collected_n = []
        row_j = 0
        if direction == "top":
            row_j = len(row) - 1
        added = 0
        while added < n and row_j >= 0 and row_j < len(row):
            if row[row_j] != row_i:
                collected_n.append(row[row_j])
                added = added + 1
            if direction == "top":
                row_j = row_j - 1
            else:
                row_j = row_j + 1
        return collected_n
    
    # form the top and bottom n similar tenders
    for i, row in enumerate(sorted_inds):
        bot_n = get_n(row, i, n, "bot")
        top_n = get_n(row, i, n, "top")
        for j in range(n):
            col_name = "top_sim_" + str(j)
            df.at[i, col_name] = top_n[j]
            col_name = "bot_sim_" + str(j)
            df.at[i, col_name] = bot_n[j]

In [171]:
import pandas as pd
fast_text_embs = pd.read_csv("../fasttext_emb/fasttext_emb.csv", header = None)
tenders = pd.read_excel("../../../data/UpdatedAgainTenders.xlsx")

In [172]:
shuffle_i = np.random.choice(a=[False, True], size=(len(tenders),), p = [0.9, 0.1])
fast_text_embs = fast_text_embs.loc[shuffle_i]
tenders = tenders.loc[shuffle_i]
fast_text_embs.reset_index(drop=True, inplace=True)
tenders.reset_index(drop=True, inplace=True)

In [173]:
sims = cosine_similarity(fast_text_embs)
sims.shape

(2639, 2639)

In [174]:
%%time
# sorts smallest to biggest for the provided similarity matrix
# this call takes about a minute for 24k rows
sorted_inds = np.argsort(sims)
sorted_inds.shape

CPU times: user 536 ms, sys: 37.2 ms, total: 573 ms
Wall time: 149 ms


(2639, 2639)

In [175]:
saved_t = tenders.copy()
saved_e = fast_text_embs.copy()

In [154]:
tenders = saved_t.copy()
fast_text_embs = saved_e.copy()

In [176]:
form_most_similar(tenders, sorted_inds, 5)

In [181]:
# tests
assert tenders.at[0, "top_sim_0"] == sorted_inds[0, len(sorted_inds) - 2]
assert tenders.at[0, "bot_sim_0"] == sorted_inds[0, 0]
assert tenders.at[0, "top_sim_1"] == sorted_inds[0, len(sorted_inds) - 3]
assert tenders.at[0, "bot_sim_1"] == sorted_inds[0, 1]
assert tenders.at[0, "top_sim_2"] == sorted_inds[0, len(sorted_inds) - 4]
assert tenders.at[0, "bot_sim_2"] == sorted_inds[0, 2]
assert tenders.at[0, "top_sim_3"] == sorted_inds[0, len(sorted_inds) - 5]
assert tenders.at[0, "bot_sim_3"] == sorted_inds[0, 3]
assert tenders.at[0, "top_sim_4"] == sorted_inds[0, len(sorted_inds) - 6]
assert tenders.at[0, "bot_sim_4"] == sorted_inds[0, 4]
assert tenders.at[10, "top_sim_4"] == sorted_inds[10, len(sorted_inds) - 6]
assert tenders.at[1337, "top_sim_4"] == sorted_inds[1337, len(sorted_inds) - 6]

In [182]:
# fix up Description column for consumption
# Adapted from Joyce's 'deal_data.py' file
import re
def clean(content):
    text = re.findall(r'>([^<]+)<', content)
    text = [t.strip() for t in text if t.strip()]
    text = " ".join(text)
    text = re.sub("\xa0", '', text)
    return text

tenders["CleanedDescription"] = ""
for i, row in tenders.iterrows():
    tenders.at[i,"CleanedDescription"] = clean(tenders.at[i, "Description"])

In [184]:
tenders.to_csv("tenders.csv")

In [56]:
tenders.columns

Index(['Client Agency', 'Client Agency Address', 'Developing Agency Parent',
       'Reference Number', 'Type of Work', 'Contract Title', 'Description',
       'Tender Closing Date', 'UNSPSC Code', 'UNSPSC Title',
       'Procurement Method', 'Period Contract', 'CUA Contract',
       'Original Contract Value', 'Revised Contract Value', 'Region/s',
       'Awarded Date', 'Start Date', 'Initial Expiry Date', 'Expiry Date',
       'Number of Submissions', 'Panel Contract', 'Contract Developer',
       'Contract Manager', 'Supplier Name', 'Supplier ABN', 'Supplier Address',
       'DCSP Contract', 'supplier contract price', 'Number of Variations',
       'Variation Amount', 'Contract Expenditure', 'Tender Number',
       'Tender Open Date', 'Tender Close Date', 'Tender ID', 'TenderLink',
       'CleanedDescription', 'graphID', 'top_sim_0', 'bot_sim_0', 'top_sim_1',
       'bot_sim_1', 'top_sim_2', 'bot_sim_2', 'top_sim_3', 'bot_sim_3',
       'top_sim_4', 'bot_sim_4'],
      dtype='object'

In [183]:
tenders

Unnamed: 0,Client Agency,Client Agency Address,Developing Agency Parent,Reference Number,Type of Work,Contract Title,Description,Tender Closing Date,UNSPSC Code,UNSPSC Title,...,bot_sim_0,top_sim_1,bot_sim_1,top_sim_2,bot_sim_2,top_sim_3,bot_sim_3,top_sim_4,bot_sim_4,CleanedDescription
0,Arts and Culture Trust,825 Hay Street Perth WA 6000,Department of Culture and the Arts,RFQ06042022PTT,Goods and Services,"Event Health Services, First Aid Training and ...","<p><span style=""font-size:12.0pt"">The Perth Th...",2022-06-29,85100000,Comprehensive health services,...,739,120,2602,963,1607,131,2598,130,2257,The Perth Theatre Trust (PTT) wasseeking offer...
1,Botanic Gardens and Parks Authority,1 Kattidj Close Kings Park WA 6005,"Department of Biodiversity, Conservation and A...",DBCABGPA20032023,Goods and Services,"Repairs, maintenance and licence - Irrigation ...","<p>Repairs, maintenance and licence - Irrigati...",2022-12-01,70171700,Irrigation system maintenance and management s...,...,739,669,923,682,2203,1368,1111,1855,2197,"Repairs, maintenance and licence - Irrigation ..."
2,Botanic Gardens and Parks Authority,1 Kattidj Close Kings Park WA 6005,"Department of Biodiversity, Conservation and A...",DBCABGPA20012022,Goods and Services,Establish a Low Value Maintenance Panel for Bu...,"<p style=""text-align:justify""><span style=""fon...",2022-04-12,72101500,Building maintenance and repair services,...,739,684,1592,685,1607,683,1405,1516,1111,The Panel Arrangement is expected to include 6...
3,Botanic Gardens and Parks Authority,1 Kattidj Close Kings Park WA 6005,"Department of Biodiversity, Conservation and A...",DBCABGPA20012022,Goods and Services,Establish a Low Value Maintenance Panel for Bu...,"<p style=""text-align:justify""><span style=""fon...",2022-04-12,72101500,Building maintenance and repair services,...,739,684,1592,685,1607,683,1405,1516,1111,The Panel Arrangement is expected to include 6...
4,Botanic Gardens and Parks Authority,1 Kattidj Close Kings Park WA 6005,"Department of Biodiversity, Conservation and A...",DBCABGPA20242022,Goods and Services,Provision of Herbicide spraying services in Bo...,"<p style=""text-align:justify""><span style=""fon...",2022-08-16,70141604,Herbicide services,...,739,992,1607,865,45,947,1405,1924,1616,The Customer is seeking offers for the provisi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2634,Western Australian Treasury Corporation,Level 12 225 St Georges Terrace Perth WA 6000,Western Australian Treasury Corporation,WATC1234,Goods and Services,Client Portal Upgrade,<p>Client Portal Upgrade</p>,2021-03-19,81160000,Information technology service delivery,...,739,1163,1589,1152,1592,1010,2225,2444,1054,Client Portal Upgrade
2635,Workcover Western Australia,2 Bedbrook Place Shenton Park WA 6008,WorkCover Western Australia Authority,WCE0123,Goods and Services,Provision of Security Services,<table> <tbody> <tr> <td> <p>Prov...,2023-06-06,92121504,Security guard services,...,739,2065,1607,1708,584,2328,1906,1465,2208,Provide Security Services: Static Guard Level ...
2636,Workcover Western Australia,2 Bedbrook Place Shenton Park WA 6008,WorkCover Western Australia Authority,WCWAHRMIS0119AB,Goods and Services,Aggregated Buy for Human Resource Management I...,<p>Aa part of an aggrebated by for HRMIS0119AB...,2019-08-27,43231505,Human resources software,...,739,829,1405,854,2178,1311,282,1700,1090,Aa part of an aggrebated by for HRMIS0119AB.
2637,Workcover Western Australia,2 Bedbrook Place Shenton Park WA 6008,WorkCover Western Australia Authority,WCQ0122,Goods and Services,Supply of 96 OptiPlex 7070 UFF CTO,<p>Supply of 96 OptiPlex 7070 UFF CTO</p>,2020-02-27,43211500,Computers,...,739,1033,45,2224,2135,2523,923,702,1607,Supply of 96 OptiPlex 7070 UFF CTO
