In [132]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import CrossEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [159]:
# 2. Load the glossary and derive three alternative text representations of it
glossary_df = pd.read_csv("../data/1b_glossary_descriptions.csv")            # your glossary CSV
term_column = glossary_df['Glossary']
description_column = glossary_df['Description']

# Method 1: the bare glossary terms (rows without a term are skipped)
glossary_terms = term_column.dropna().tolist()

# Method 2: "<term> can be defined as <description>".
# Concatenating with a missing value yields NaN, so rows lacking either part are dropped.
term_with_description = term_column + ' can be defined as ' + description_column
glossary_full = term_with_description.dropna().tolist()

# Method 3: the descriptions on their own
glossary_desc = description_column.dropna().tolist()

print(glossary_full[0])

Assets can be defined as Resources owned by a company (e.g., cash, inventory, equipment).


In [160]:
def build_full_text(row):
    """Compose the text used to represent a single glossary entry.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys 'Glossary', 'Description' and 'Formulas, if any'.

    Returns:
        str: "<Glossary> can be defined as <Description>", with
        " Its Formula is:  <formula>" appended only when the formula value
        is not null.
    """
    definition = f"{row['Glossary']} can be defined as {row['Description']}"
    formula = row['Formulas, if any']
    # Guard clause: entries without a formula are just the definition sentence.
    if not pd.notnull(formula):
        return definition
    return definition + f" Its Formula is:  {formula}"

# Build one embedding text per glossary row (this replaces the simpler "Method 2" list).
# Materialize the result as a plain list: the encoder and the positional lookups
# later on (glossary_full[i]) then no longer depend on the DataFrame's index labels
# happening to coincide with row positions.
glossary_full = glossary_df.apply(build_full_text, axis=1).tolist()
print(glossary_full[0])


Assets can be defined as Resources owned by a company (e.g., cash, inventory, equipment).


In [None]:
# 3. Load your NL queries alongside their ground-truth glossary terms
#    Input CSV must have columns: 'NL_Query' and 'GT_Glossary'
queries_df = pd.read_csv("../data/1a_simpler_dataset_noisy_nl_to_glossary_gt.csv")

# Fail fast with a clear message instead of a KeyError in the middle of the matching loop.
missing_query_columns = {'NL_Query', 'GT_Glossary'} - set(queries_df.columns)
if missing_query_columns:
    raise ValueError(
        f"Queries CSV is missing required column(s): {sorted(missing_query_columns)}"
    )

In [163]:
# 4. Initialize the embedding model
# Other candidate checkpoints (swap the name below to compare):
#   'mixedbread-ai/mxbai-embed-large-v1'
#   'sentence-transformers/all-mpnet-base-v2'
embedding_model_name = 'BAAI/bge-large-en-v1.5'
model = SentenceTransformer(embedding_model_name)

In [164]:
# 5. Pre-compute embeddings for all glossary terms
# Computed once here so the query loop only has to embed the query itself.
term_embeddings = model.encode(
    glossary_full,
    convert_to_tensor=True,      # keep the result as a tensor for util.cos_sim / torch.topk
    normalize_embeddings=True,
)

In [165]:

# Cross-encoder that re-scores (query, glossary text) pairs after the embedding retrieval step
reranker_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(reranker_model_name)

In [None]:
# 6. Iterate through each query and find the best matching glossary term.
#    Stage 1: retrieve the top-k glossary texts by cosine similarity of embeddings.
#    Stage 2: re-rank those candidates with the cross-encoder and keep the best one.
results = []
y_true = []
y_pred = []

# How many retrieved candidates to pass to the reranker. Clamped to the glossary
# size because torch.topk raises when k exceeds the number of available entries.
top_k = min(10, len(glossary_full))
if top_k == 0:
    raise ValueError("glossary_full is empty; there is nothing to match queries against")

for query_text, original in zip(queries_df['NL_Query'], queries_df['GT_Glossary']):
    # Embed the query (normalized, like the pre-computed glossary embeddings)
    q_emb = model.encode(query_text, convert_to_tensor=True, normalize_embeddings=True)

    # Cosine similarity of this query to all glossary texts; sims[0] is the row for the query
    sims = util.cos_sim(q_emb, term_embeddings)

    # Stage 1: positions and texts of the top-k most similar glossary entries
    top_k_indices = torch.topk(sims[0], k=top_k).indices.tolist()
    top_k_full = [glossary_full[i] for i in top_k_indices]

    # Stage 2: score every (query, candidate text) pair with the reranker
    pairs = [(query_text, term) for term in top_k_full]
    rerank_scores = reranker.predict(pairs)
    best_rerank_idx = int(np.argmax(rerank_scores))

    # The glossary term is the part of the candidate text before the fixed separator.
    # Recovering it from the text (rather than indexing glossary_terms) avoids any
    # misalignment with lists that had missing rows dropped.
    predicted = top_k_full[best_rerank_idx].split(' can be defined as ')[0]
    # Reranker score of the winning candidate, as a plain Python float so that
    # round() and CSV export behave predictably. The values are not bounded to [0, 1].
    score = float(rerank_scores[best_rerank_idx])

    y_true.append(original)
    y_pred.append(predicted)

    results.append({
        'NL_Query': query_text,
        'GT_Glossary': original,
        'Predicted_Glossary': predicted,
        'Similarity_Score': round(score, 4)
    })


In [189]:
# 7. Collect the per-query records into a DataFrame and preview the first rows
results_df = pd.DataFrame(data=results)
results_df.head(n=50)

Unnamed: 0,NL_Query,GT_Glossary,Predicted_Glossary,Similarity_Score
0,Can you tell me what was the Assets last year?,Assets,Current Assets,0.0325
1,Give me the Assets figure for the last quarter.,Assets,Current Assets,-8.6411
2,Can you tell me what was the Liabilities last ...,Liabilities,Current Liabilities,2.958
3,Can you provide the Liabilities amount for the...,Liabilities,Provisions,-3.4675
4,Can you tell me what was the Equity last year?,Equity,Equity,-5.5001
5,Could you share the Equity for the last quarter?,Equity,Equity,-9.1591
6,Can you tell me what was the Current Assets la...,Current Assets,Current Assets,4.5202
7,Tell me the value of Current Assets in the las...,Current Assets,Current Assets,-1.7828
8,Can you tell me what was the Non-Current Asset...,Non-Current Assets,Non-Current Assets,4.4408
9,What was the Non-Current Assets in the last qu...,Non-Current Assets,Non-Current Assets,3.0412


In [190]:
# 8. Evaluate: compare predicted glossary labels with the ground truth as exact string matches
f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average="macro")
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)

print(f"Accuracy (labels): {accuracy:.4f}")
print(f"Macro F1 (labels): {f1_macro:.4f}")
# Optional per-label breakdown:
#print("\nClassification Report (labels):\n")
#print(classification_report(y_true, y_pred, digits=4))

Accuracy (labels): 0.9621
Macro F1 (labels): 0.9503


In [181]:
# Save the results to a CSV file
output_path = Path('../results/25May_1a_simpler_clean/stage1_nl2glossary.csv')
# DataFrame.to_csv does not create missing parent directories, so make sure the
# run-specific results folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)