In [132]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [199]:
# 2. Load the glossary
glossary_df = pd.read_csv("../data/1b_glossary_descriptions.csv")            # your glossary CSV

# Filter incomplete rows ONCE so the three candidate lists below share the same
# positions. Dropping NaNs per list (as before) shifts indices whenever a row has
# a term but no description, and the matching loop maps an embedding position
# back to a term by index.
glossary_complete = glossary_df.dropna(subset=['Glossary', 'Description'])

# Method 1: embed the bare glossary term
glossary_terms = glossary_complete['Glossary'].tolist()

# Method 2: embed "<term> can be defined as <description>"
glossary_full = (
    glossary_complete['Glossary'] + ' can be defined as ' + glossary_complete['Description']
).tolist()

# Method 3: embed the description only
glossary_desc = glossary_complete['Description'].tolist()

# Sanity check: position i must refer to the same glossary row in every list.
assert len(glossary_terms) == len(glossary_full) == len(glossary_desc)

print(glossary_full[0])

Assets can be defined as Resources owned by a company (e.g., cash, inventory, equipment).


In [None]:
def build_full_text(row):
    """Compose the text representation of one glossary row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'Glossary', 'Description' and 'Formulas, if any'.

    Returns:
        str: "<Glossary> can be defined as <Description>", followed by
        " Its Formula is:  <formula>" when the formula value is not null.
    """
    pieces = [f"{row['Glossary']} can be defined as {row['Description']}"]
    formula = row['Formulas, if any']
    # The formula is optional; only mention it when the cell actually holds a value.
    if not pd.isnull(formula):
        pieces.append(f" Its Formula is:  {formula}")
    return "".join(pieces)

# Rebuild the Method-2 texts, this time appending the formula where one exists.
# Rows without a term or description are skipped: the f-string in build_full_text
# would otherwise turn them into literal "nan can be defined as nan" candidates.
# The result is a plain list (like the earlier definition of glossary_full), so
# integer lookups such as glossary_full[i] are positional rather than label-based.
glossary_full = [
    build_full_text(entry)
    for _, entry in glossary_df.dropna(subset=['Glossary', 'Description']).iterrows()
]
print(glossary_full[0])


'\ndef build_full_text(row):\n    text = f"{row[\'Glossary\']} can be defined as {row[\'Description\']}"\n    if pd.notnull(row[\'Formulas, if any\']):\n        text += f" Its Formula is:  {row[\'Formulas, if any\']}"\n    return text\n\nglossary_full = glossary_df.apply(build_full_text, axis=1)\nprint(glossary_full[0])\n'

In [201]:
# 3. Load your NL queries alongside their ground-truth glossary terms
#    Input CSV must have columns: 'NL_Query' and 'GT_Glossary'
queries_df = pd.read_csv("../data/1a_simpler_dataset_noisy_nl_to_glossary_gt.csv")

# Fail here with a clear message instead of a KeyError deep inside the matching loop.
missing_query_cols = {'NL_Query', 'GT_Glossary'} - set(queries_df.columns)
if missing_query_cols:
    raise ValueError(f"Query CSV is missing required column(s): {sorted(missing_query_cols)}")

In [202]:
# 4. Initialize the embedding model
#    Alternative checkpoints (swap the name below to use one):
#      'mixedbread-ai/mxbai-embed-large-v1'
#      'sentence-transformers/all-mpnet-base-v2'
embedding_model_name = 'BAAI/bge-large-en-v1.5'
model = SentenceTransformer(embedding_model_name)

In [203]:
# 5. Encode every glossary text once, up front, so the per-query loop only has
#    to embed the query itself. The texts come from glossary_full.
term_embeddings = model.encode(
    glossary_full,
    normalize_embeddings=True,
    convert_to_tensor=True,
)

In [204]:

# Cross-encoder used to re-score the retrieved candidates for each query.
reranker_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(reranker_model_name)

In [205]:
# 6. Iterate through each query and find the best matching glossary term.
#    Two stages per query: (a) retrieve the top-k glossary texts by cosine
#    similarity of the embeddings, (b) let the cross-encoder pick the winner
#    among those candidates.
results = []
y_true = []
y_pred = []

# How many candidates to pass to the reranker. torch.topk raises when k exceeds
# the number of available entries, so cap it for glossaries with fewer than 10 rows.
top_k = min(10, len(glossary_full))

for _, row in queries_df.iterrows():
    query_text = row['NL_Query']
    original = row['GT_Glossary']

    # Embed the query
    q_emb = model.encode(query_text, convert_to_tensor=True, normalize_embeddings=True)

    # Compute cosine similarity to all glossary texts; row 0 belongs to this query
    sims = util.cos_sim(q_emb, term_embeddings)

    # Stage (a): positions of the top-k most similar glossary texts
    top_k_indices = torch.topk(sims[0], k=top_k).indices.tolist()
    top_k_full = [glossary_full[i] for i in top_k_indices]

    # Stage (b): score every (query, candidate) pair with the cross-encoder
    pairs = [(query_text, term) for term in top_k_full]
    rerank_scores = reranker.predict(pairs)
    best_rerank_idx = torch.tensor(rerank_scores).argmax().item()

    # The candidate text starts with "<term> can be defined as ..."; keep the term part.
    predicted = top_k_full[best_rerank_idx].split(' can be defined as ')[0]
    # Plain Python float so round() and the resulting column are not NumPy scalars.
    score = float(rerank_scores[best_rerank_idx])

    y_true.append(original)
    y_pred.append(predicted)

    results.append({
        'NL_Query': query_text,
        'GT_Glossary': original,
        'Predicted_Glossary': predicted,
        # Column name kept for downstream files; the value is the reranker's
        # score for the chosen candidate, not a cosine similarity.
        'Similarity_Score': round(score, 4)
    })


In [206]:
# 7. Collect the per-query records into a table and preview up to the first 50 rows
results_df = pd.DataFrame(data=results)
results_df.head(n=50)

Unnamed: 0,NL_Query,GT_Glossary,Predicted_Glossary,Similarity_Score
0,Can yu share what was the Assets last year?,Assets,Current Assets,-7.7502
1,Can u tell me what was the Liabilities last year?,Liabilities,Current Liabilities,2.0672
2,Can yu share what was the Equity last year?,Equity,Equity,-10.3681
3,Can yu share what was the Current Assets last ...,Current Assets,Current Assets,-2.2529
4,What's was the Non-Current Assets last quarter?,Non-Current Assets,Non-Current Assets,2.9399
5,Wat was the Current Liabilities last quarter?,Current Liabilities,Current Liabilities,-0.2632
6,What's was the Non-Current Liabilities last qu...,Non-Current Liabilities,Non-Current Liabilities,3.1812
7,Can u tell me what was the Working Capital las...,Working Capital,Working Capital,-1.1451
8,Give me Net Worth figure for last quater?,Net Worth,Net Worth,-7.5832
9,Can you tell me what is the Revenue last year?,Revenue,Revenue,-2.1026


In [207]:
# 8. Evaluate predicted glossary labels against the ground truth
label_pairs = (y_true, y_pred)
accuracy = accuracy_score(*label_pairs)
f1_macro = f1_score(*label_pairs, average="macro")

print(f"Accuracy (labels): {accuracy:.4f}")
print(f"Macro F1 (labels): {f1_macro:.4f}")
# For a per-label breakdown, print classification_report(y_true, y_pred, digits=4).

Accuracy (labels): 0.9688
Macro F1 (labels): 0.9583


In [198]:
# Save the results to a CSV file
results_path = Path('../results/25May_1a_simpler_clean/stage1_nl2glossary_noisy_data.csv')
# to_csv does not create missing folders, so make sure the output directory exists first.
results_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)