In [1]:
from TendersWA import Evaluation as eval
import TendersWA.Preprocessing.Text as text
from TendersWA import Panda

In [6]:
# Load the structured tender spreadsheet, requesting only the columns used below.
# The path is defined once and reused so the variable and the call cannot drift apart.
tenders_structured_path = "../../data/UpdatedAgainTenders.xlsx"
tenders_structured = Panda.load_tender_uniques(
    tenders_structured_path,
    cols = ["Reference Number", "Contract Title", "Description", "UNSPSC Title"],
)

In [3]:
import pandas as pd
# For each clustering approach (BERT, sentence-BERT, summary) two CSVs are read:
#   *_tender_clusters : "Reference Number" is read as a string instead of an inferred dtype
#   *_cluster_topics  : "Topics" is read as a string instead of an inferred dtype
bert_tender_clusters = pd.read_csv(
    "../../data/clustering/bert_clusters.csv",
    dtype={"Reference Number": str},
)
bert_cluster_topics = pd.read_csv(
    "../../data/clustering/bert_cluster_topics.csv",
    dtype={"Topics": str},
)

sentence_bert_tender_clusters = pd.read_csv(
    "../../data/clustering/sentence_bert_clusters.csv",
    dtype={"Reference Number": str},
)
sentence_bert_cluster_topics = pd.read_csv(
    "../../data/clustering/sentence_bert_cluster_topics.csv",
    dtype={"Topics": str},
)

summary_tender_clusters = pd.read_csv(
    "../../data/clustering/summary_clusters.csv",
    dtype={"Reference Number": str},
)
summary_cluster_topics = pd.read_csv(
    "../../data/clustering/summary_cluster_topics.csv",
    dtype={"Topics": str},
)

In [4]:
# noticed issues with weird stray characters.
def strip_bad_chars(df):
    """Remove stray "[" and "]" characters from ``df["Reference Number"]`` in place.

    The column is rewritten on ``df`` itself and nothing is returned.

    The replacement is vectorised rather than done row by row with
    ``df.at[index, ...]``: label-based writes hit every row sharing a label,
    so a frame with duplicate index labels ended up with wrong values, and a
    missing (NaN) reference number raised ``AttributeError``. With ``.str``
    methods missing values are simply left as they are.

    Args:
        df: DataFrame with a string-valued "Reference Number" column.

    Raises:
        KeyError: if ``df`` has no "Reference Number" column.
    """
    # regex=False: "[" and "]" are treated as literal characters, not a character class.
    without_open = df["Reference Number"].str.replace("[", "", regex=False)
    df["Reference Number"] = without_open.str.replace("]", "", regex=False)

# Clean the reference numbers of each tender-to-cluster table (modified in place).
for _clusters_df in (
    bert_tender_clusters,
    sentence_bert_tender_clusters,
    summary_tender_clusters,
):
    strip_bad_chars(_clusters_df)

In [5]:
import os
import pickle

# Build summary_map: reference number -> unpickled contents of
# "<summary_path>/<ref>.pickle.sum", for every tender that has such a file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point summary_path at files this project produced itself.
summary_map = {}
summary_path = "../../data/extended_summaries/"
for ref in list(tenders_structured["Reference Number"]):
    sum_file = os.path.join(summary_path, f"{ref}.pickle.sum")
    if os.path.exists(sum_file):
        # Context manager so each file handle is closed straight after loading
        # (the bare open() previously left that to the garbage collector).
        with open(sum_file, "rb") as sum_handle:
            loaded_sum = pickle.load(sum_handle)
        summary_map[ref] = loaded_sum

In [11]:
# form unspsc clusters naturally from the titles
# Each distinct "UNSPSC Title" gets an integer id in order of first appearance.
# factorize() assigns those ids positionally, so it is unaffected by duplicate
# index labels (the earlier row-by-row df.at[index, ...] writes were not), and
# it leaves rows with a missing title at -1, the same "unassigned" value the
# column used to be initialised with.
_unspsc_codes, _unspsc_titles = tenders_structured["UNSPSC Title"].factorize()

# title -> cluster id lookup, and the number of clusters found
unspsc_clusters = {title: cluster_id for cluster_id, title in enumerate(_unspsc_titles)}
unspsc_n = len(unspsc_clusters)
tenders_structured["unspsc_cluster"] = _unspsc_codes

KeyError: 'UNSPSC Title'

In [12]:
# manual labels
# Read the hand-labelled spreadsheet, rename "Content" -> "Description" and
# "Unnamed: 4" -> "Topics", drop the "Topic" and "Processed_Content" columns,
# and finally cast every remaining column to str.
manual_labelled = (
    pd.read_excel("../../data/manual_labels.xlsx", dtype={"Reference Number": str})
    .rename(columns={"Content": "Description", "Unnamed: 4": "Topics"})
    .drop(columns=["Topic", "Processed_Content"])
    .astype(str)
)

# Perplexity

In [7]:
# https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM
from evaluate import load
def eval_perplexity(generates_list, model_id = "bigscience/bloom-560m"):
    """Score a list of texts with the `evaluate` "perplexity" metric.

    Args:
        generates_list: texts to score, passed to the metric as ``predictions``.
        model_id: model identifier forwarded to the metric's ``compute`` call.

    Returns:
        Whatever the metric's ``compute`` returns; callers in this notebook
        read the per-text scores from its "perplexities" entry.
    """
    metric = load("perplexity", module_type="metric")
    scores = metric.compute(predictions=generates_list, model_id=model_id)
    return scores

In [14]:
def join_df_with_clusters_and_topics(tender_data, tender_clusters, cluster_topics):
    """Attach cluster ids and cluster topics to the tender rows.

    Two left joins are applied in sequence: ``tender_clusters`` is joined on
    "Reference Number", then ``cluster_topics`` is joined on "Cluster". Because
    both joins are left joins, every row of ``tender_data`` is kept even when
    it has no cluster or its cluster has no topics.

    Returns:
        A new DataFrame; the inputs are not modified.
    """
    with_clusters = tender_data.merge(tender_clusters, on="Reference Number", how="left")
    return with_clusters.merge(cluster_topics, on="Cluster", how="left")

def compute_perplexity(combined_df, use_summaries = False):
    """Build one prompt per row of ``combined_df`` and score them with eval_perplexity.

    Each prompt is "The next sentences have topics <Topics>. <text>", where
    <text> is the row's "Description". When ``use_summaries`` is true and
    ``summary_map`` holds a non-empty "summary" for the row's
    "Reference Number", that summary is used instead of the description.

    Args:
        combined_df: DataFrame with "Reference Number", "Topics" and
            "Description" columns.
        use_summaries: prefer the stored summary over the description when one
            is available.

    Returns:
        The result of ``eval_perplexity`` on the generated prompts.
    """
    generates = []
    for _, row in combined_df.iterrows():
        # The description is the default text; a usable summary overrides it.
        next_text = row["Description"]
        if use_summaries:
            entry = summary_map.get(row["Reference Number"])
            if entry is not None:
                summary = entry["summary"]
                if summary is not None and summary != "":
                    next_text = summary
        generates.append(f"The next sentences have topics {row['Topics']}. {next_text}")

    return eval_perplexity(generates)

def compute_unspsc_evals(tender_data):
    """Score each tender description using its "UNSPSC Title" as the topic.

    One prompt is built per row of ``tender_data``, in the form
    "The next sentences have topics <UNSPSC Title>. <Description>", and the
    prompts are passed to ``eval_perplexity``.

    Args:
        tender_data: DataFrame with "UNSPSC Title" and "Description" columns.

    Returns:
        The result of ``eval_perplexity`` on the generated prompts.
    """
    # Iterating a DataFrame directly yields its column labels, not its rows,
    # so the rows must be walked with iterrows().
    generates = [
        f"The next sentences have topics {row['UNSPSC Title']}. {row['Description']}"
        for _, row in tender_data.iterrows()
    ]

    return eval_perplexity(generates)

In [16]:
# Join the BERT cluster assignments and topics onto the tenders, then score
# the description-based prompts (summaries are not used here).
bert_combined = join_df_with_clusters_and_topics(
    tender_data=tenders_structured,
    tender_clusters=bert_tender_clusters,
    cluster_topics=bert_cluster_topics,
)
perplexities = compute_perplexity(combined_df=bert_combined)

NameError: name 'load' is not defined

In [11]:
# Three probe prompts: the sentence about the fire department is the same in
# each, only the topic list at the start differs.
sentences = [
    "Use the topics Water, Agriculture to write the next sentences. The fire department is looking to service buildings.",
    "Use the topics Fire, Department to write the next sentences. The fire department is looking to service buildings.",
    "Use the topics Green, Vegetation to write the next sentences. The fire department is looking to service buildings.",
]

In [9]:
def do_perplex_and_present(sentences):
    """Score ``sentences`` and print a Sentence / Perplexity table.

    The per-sentence scores are read from the "perplexities" entry of the
    ``eval_perplexity`` result and shown with ``Panda.pretty_print``.
    Nothing is returned.
    """
    scores = eval_perplexity(sentences)["perplexities"]
    table = pd.DataFrame({"Sentence": sentences, "Perplexity": scores})
    Panda.pretty_print(table)

In [13]:
# Score the three probe prompts, which differ only in their topic prefix.
# NOTE: in the recorded output below, the on-topic "Fire, Department" prefix
# has the highest perplexity and "Green, Vegetation" the lowest.
do_perplex_and_present(sentences)

  0%|          | 0/1 [00:00<?, ?it/s]

Sentence,Perplexity
"Use the topics Water, Agriculture to write the next sentences. The fire department is looking to service buildings.",914.029236
"Use the topics Fire, Department to write the next sentences. The fire department is looking to service buildings.",1062.815186
"Use the topics Green, Vegetation to write the next sentences. The fire department is looking to service buildings.",280.577454


In [14]:
# Sanity check of the metric: in the recorded output below, the grammatical
# sentence scores a far lower perplexity than the nonsensical one.
do_perplex_and_present(["The weather is nice", "The weather are shoes"])

  0%|          | 0/1 [00:00<?, ?it/s]

Sentence,Perplexity
The weather is nice,267.797089
The weather are shoes,9338.162109
