In [37]:
from pathlib import Path

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [38]:
# 2. Load the glossary
glossary_df = pd.read_csv("../data/1b_glossary_descriptions.csv")            # your glossary CSV

# Drop incomplete rows ONCE, before deriving any list. The three lists below are
# used interchangeably as "text to embed" and "label to report", so position i
# must refer to the same glossary row in every one of them. Calling dropna() on
# each list separately would shift positions as soon as a single Glossary or
# Description cell is empty.
glossary_complete = glossary_df.dropna(subset=['Glossary', 'Description'])

# Method 1: the bare glossary term
glossary_terms = glossary_complete['Glossary'].tolist()

# Method 2: "<term>: <description>"
glossary_full = (glossary_complete['Glossary'] + ': ' + glossary_complete['Description']).tolist()

# Method 3: the description only
glossary_desc = glossary_complete['Description'].tolist()

# Quick sanity check of the first term
print(glossary_terms[0])

Assets


In [39]:
# 3. Load your NL queries alongside their ground-truth glossary terms
#    Input CSV must have columns: 'NL_Query' and 'GT_Glossary'
queries_df = pd.read_csv("../data/1a_nl_to_glossary_gt.csv")

# Enforce the column contract here so a wrong or renamed input file fails at
# load time with a clear message rather than deep inside the matching loop.
missing_columns = {'NL_Query', 'GT_Glossary'} - set(queries_df.columns)
if missing_columns:
    raise KeyError(f"Query CSV is missing required column(s): {sorted(missing_columns)}")

In [40]:
# 4. Initialize the embedding model
#    The same model instance encodes both the glossary entries and the queries.
EMBEDDING_MODEL_NAME = 'BAAI/bge-base-en-v1.5'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [57]:
# 5. Pre-compute one normalized embedding per glossary entry.
#    The text embedded here is `glossary_full` (Method 2: "<term>: <description>"),
#    returned as a single tensor.
term_embeddings = model.encode(
    glossary_full,
    normalize_embeddings=True,
    convert_to_tensor=True,
)

In [58]:
# 6. Match every query to its most similar glossary entry.
#
# Produces three lists consumed by later cells:
#   results - one dict per query (query, ground truth, prediction, score)
#   y_true  - ground-truth glossary terms, in query order
#   y_pred  - predicted glossary terms, in query order
results = []
y_true = []
y_pred = []

# The argmax below is a row position in `term_embeddings`, and it is translated
# to a label through `glossary_terms`. That is only valid if both have the same
# number of entries in the same order, so fail loudly on a size mismatch.
# NOTE(review): equal length is necessary but not sufficient - `glossary_terms`
# must come from the same rows, in the same order, as the texts that were embedded.
if len(glossary_terms) != term_embeddings.shape[0]:
    raise ValueError(
        f"glossary_terms has {len(glossary_terms)} entries but term_embeddings has "
        f"{term_embeddings.shape[0]} rows; labels and embeddings are not index-aligned"
    )

query_texts = queries_df['NL_Query'].tolist()
gt_labels = queries_df['GT_Glossary'].tolist()

if query_texts:
    # Embed all queries in one batched call instead of one model call per row.
    query_embeddings = model.encode(query_texts, convert_to_tensor=True, normalize_embeddings=True)

    # sims[i, j] = cosine similarity between query i and glossary entry j.
    sims = util.cos_sim(query_embeddings, term_embeddings)

    # Best-scoring glossary entry (score and position) for each query.
    best_scores, best_indices = torch.max(sims, dim=1)

    for query_text, original, best_idx, score in zip(
        query_texts, gt_labels, best_indices.tolist(), best_scores.tolist()
    ):
        predicted = glossary_terms[best_idx]

        # For metric calc
        y_true.append(original)
        y_pred.append(predicted)

        results.append({
            'NL_Query': query_text,
            'GT_Glossary': original,
            'Predicted_Glossary': predicted,
            'Similarity_Score': round(score, 4)
        })


In [59]:
# 7. Collect the per-query records into a DataFrame and preview the first rows
results_df = pd.DataFrame(data=results)
results_df.iloc[:5]

Unnamed: 0,NL_Query,GT_Glossary,Predicted_Glossary,Similarity_Score
0,How much money did we bring in from sales this...,Revenue,Gross Profit,0.6444
1,What did it cost us to make the stuff we sold ...,Cost of Goods Sold (COGS),Inventory Turnover,0.6226
2,"After paying for production, how much did we k...",Gross Profit,Working Capital,0.6058
3,What are we spending to keep the business runn...,Operating Expenses,Operating Cash Flow,0.6148
4,How much did our main operations earn us in th...,Operating Profit (EBIT),Operating Cash Flow,0.6447


In [60]:
# 8. Evaluate: compare predicted glossary terms against the ground truth.
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average="macro")

# Report both metrics, one per line, to four decimal places.
print(
    f"Accuracy (labels): {accuracy:.4f}",
    f"Macro F1 (labels): {f1_macro:.4f}",
    sep="\n",
)

# For a per-label breakdown, uncomment:
#print("\nClassification Report (labels):\n")
#print(classification_report(y_true, y_pred, digits=4))

Accuracy (labels): 0.3100
Macro F1 (labels): 0.2320


In [None]:
# Save the results to a CSV file
output_path = Path('../results/24May_refactor/stage1_nl2glossary.csv')

# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

results_df.to_csv(output_path, index=False)