In [1]:
import os
# An empty CUDA_VISIBLE_DEVICES hides every CUDA device from this process.
# NOTE(review): presumably intended to force CPU-only inference for the
# embedding models loaded later; it has to be set before any GPU-aware
# library is imported, which is why it sits in the very first cell.
os.environ["CUDA_VISIBLE_DEVICES"] = ""


In [2]:
import pandas as pd

from fastembed import (
    SparseTextEmbedding,
    TextEmbedding,
    LateInteractionTextEmbedding,
    ImageEmbedding,
)
from fastembed.rerank.cross_encoder import TextCrossEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import time
import numpy as np
import pandas as pd

In [4]:
# Read the messy district list and derive the composite lookup key
# "DISTRICT__STATE" (district and state joined by a double underscore)
# in a single expression.
df = pd.read_csv("../data/List-Statewise-Districts-Messy.csv").assign(
    DISTRICT__STATE=lambda frame: frame["DISTRICT"] + "__" + frame["STATE"]
)

In [5]:
# Preview the loaded frame, including the derived DISTRICT__STATE column.
df

Unnamed: 0,S. No.,DISTRICT,STATE,Messy_Version1,Messy_Version2,Messy_Version3,Messy_Version4,DISTRICT__STATE
0,1,ANDAMAN_NICOBAR_IS,Andman & Nicobar Island,ANDAMAN_NICOBAR_IS,ANDAMAN_NICOB^AR_IS,ANDAMAN_NICOBAR_IS,ANDAMAN_NRIIOCS_AB,ANDAMAN_NICOBAR_IS__Andman & Nicobar Island
1,2,ADILABAD,Andhra Pradesh,ADILABAD,A$DILABAD,ADILABAD,ADILBADA,ADILABAD__Andhra Pradesh
2,3,ANANTAPUR,Andhra Pradesh,ANANTAPUR,ANAN!TAPUR,ANANTAPUR,ANANTRUAP,ANANTAPUR__Andhra Pradesh
3,4,CHITTOOR,Andhra Pradesh,CHITTOOR,CHITTOO&R,CHITTOOR,CHITOROT,CHITTOOR__Andhra Pradesh
4,5,EAST_GODAVARI,Andhra Pradesh,EAST_GODAVARI,EAST_GODA)VARI,EAST_GODAVARI,EAST_GOVAIDRA,EAST_GODAVARI__Andhra Pradesh
...,...,...,...,...,...,...,...,...
620,621,NORTH_24_PARGANAS,West Bengal,NORTH_24_PARGANAS,*NORTH_24_PARGANAS,NORTH_24_PARGANAS,NORTH_24_RSAAPGAN,NORTH_24_PARGANAS__West Bengal
621,622,PURULIYA,West Bengal,PURULIYA,PURU!LIYA,PURULIYA,PURULIAY,PURULIYA__West Bengal
622,623,SOUTH_24_PARGANAS,West Bengal,SOUTH_24_PARGANAS,SO$UTH_24_PARGANAS,SOUTH_24_PARGANAS,SOUTH_24_RPNAASGA,SOUTH_24_PARGANAS__West Bengal
623,624,UTTAR_DINAJPUR,West Bengal,UTTAR_DINAJPUR,UTTAR_DINA)JPUR,UTTAR_DINAJPUR,UTTAR_DRPIJNAU,UTTAR_DINAJPUR__West Bengal


In [None]:
# Ground-truth key for every row, in row order; predictions are scored
# against this list position by position.
correct_entries = df["DISTRICT__STATE"].tolist()

# Candidate pool to match against: the distinct keys.
# dict.fromkeys drops duplicates while keeping first-seen order, so the list
# (and therefore the tie-break between equally similar candidates) is the same
# on every kernel run. list(set(...)) would give an order that changes with
# string hash randomisation.
reference_entries = list(dict.fromkeys(correct_entries))


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when
        either vector has zero norm and the ratio is undefined.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning), and NaN scores cannot
        # be ranked meaningfully; treat "no direction" as "no similarity".
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [16]:

def run_fastembed(model_name, incorrect_entries, reference_entries, embedder=None):
    """Match each noisy entry to its most similar reference entry.

    Both lists are embedded in batch and compared with cosine similarity;
    the reference with the highest similarity wins (the earliest reference
    wins a tie).

    Args:
        model_name: fastembed model identifier, used to build a
            ``TextEmbedding`` when ``embedder`` is not supplied.
        incorrect_entries: Iterable of strings to be matched.
        reference_entries: Iterable of candidate strings; duplicates are
            collapsed, keeping the first occurrence.
        embedder: Optional pre-built object exposing
            ``embed(texts, **kwargs)`` and yielding one vector per text.
            Passing one avoids re-loading the model on every call.

    Returns:
        ``pd.DataFrame`` with one row per incorrect entry and the columns
        ``entity_given``, ``entity_predicted`` and
        ``entity_predicted_score`` (cosine similarity of the chosen match).

    Raises:
        ValueError: If there are entries to match but no reference entries.
    """
    columns = ["entity_given", "entity_predicted", "entity_predicted_score"]

    incorrect_entries = list(incorrect_entries)
    if not incorrect_entries:
        # Nothing to match: keep the result schema without loading a model.
        return pd.DataFrame(columns=columns)

    # De-duplicate while keeping first-seen order so ties stay deterministic.
    reference_entries = list(dict.fromkeys(reference_entries))
    if not reference_entries:
        raise ValueError("reference_entries must contain at least one entry")

    if embedder is None:
        embedder = TextEmbedding(model_name=model_name, cache_dir=".fastembed")

    # Batch embed both sides; each row of a matrix is one entry's vector.
    reference_matrix = np.vstack(list(embedder.embed(reference_entries, as_numpy=True)))
    query_matrix = np.vstack(list(embedder.embed(incorrect_entries, as_numpy=True)))

    # All-pairs cosine similarity with a single matrix product instead of a
    # Python-level loop over every (query, reference) pair.
    denominators = np.outer(
        np.linalg.norm(query_matrix, axis=1),
        np.linalg.norm(reference_matrix, axis=1),
    )
    # A zero-norm vector has a zero dot product with everything; dividing by 1
    # instead of 0 yields similarity 0.0 rather than NaN.
    denominators[denominators == 0] = 1.0
    similarities = (query_matrix @ reference_matrix.T) / denominators

    best_indices = similarities.argmax(axis=1)
    best_scores = similarities[np.arange(len(incorrect_entries)), best_indices]

    return pd.DataFrame(
        {
            "entity_given": incorrect_entries,
            "entity_predicted": [reference_entries[i] for i in best_indices],
            "entity_predicted_score": best_scores,
        },
        columns=columns,
    )


# Run evaluation: for every model and every messy column, match the noisy
# "<district>__<state>" strings against reference_entries and record the
# accuracy and the time spent matching.
results = []          # (model, column, accuracy, seconds) summary tuples
results_df_list = []  # per-run prediction frames, in evaluation order

# Add further fastembed model identifiers here to compare them,
# e.g. a larger BGE variant (NOTE(review): accuracy gain not verified here).
models = ["BAAI/bge-small-en-v1.5"]
messy_columns = ["Messy_Version1", "Messy_Version2", "Messy_Version3", "Messy_Version4"]

for model in models:
    print("Started:", model)
    # This instance is not referenced again in this cell; run_fastembed is
    # called with the model name only. Presumably kept so the model files are
    # fetched into the ".fastembed" cache before any timing starts.
    embedding_model = TextEmbedding(model_name=model, cache_dir=".fastembed")
    print(f"The model {model} is ready to use.")

    for col in messy_columns:
        incorrect_entries = (df[col] + "__" + df["STATE"]).tolist()

        # Time only the matching step, with a monotonic high-resolution
        # clock, so scoring and bookkeeping below do not inflate the figure.
        start_time = time.perf_counter()
        results_df = run_fastembed(model, incorrect_entries, reference_entries)
        time_taken = time.perf_counter() - start_time

        # Row i of the predictions corresponds to row i of df, so an
        # element-wise comparison against correct_entries gives accuracy.
        accuracy = np.mean(results_df["entity_predicted"] == correct_entries)

        # Suffix the prediction columns with the source column name so frames
        # from different runs stay distinguishable.
        results_df.columns = [f"{given_col}_{col}" for given_col in results_df.columns]
        results_df["correct_entry"] = correct_entries
        results_df["model"] = model

        results_df_list.append(results_df)
        results.append((model, col, accuracy, time_taken))

# Print summary
for r in results:
    run_model, run_col, run_accuracy, run_seconds = r
    print(f"Model: {run_model}, Column: {run_col}, Accuracy: {run_accuracy:.2f}, Time Taken: {run_seconds:.2f}s")

Started: BAAI/bge-small-en-v1.5
The model BAAI/bge-small-en-v1.5 is ready to use.
Model: BAAI/bge-small-en-v1.5, Column: Messy_Version1, Accuracy: 0.97, Time Taken: 18.52s
Model: BAAI/bge-small-en-v1.5, Column: Messy_Version2, Accuracy: 0.85, Time Taken: 21.49s
Model: BAAI/bge-small-en-v1.5, Column: Messy_Version3, Accuracy: 1.00, Time Taken: 19.47s
Model: BAAI/bge-small-en-v1.5, Column: Messy_Version4, Accuracy: 0.90, Time Taken: 18.86s


In [21]:
# Show the first 10 predictions of the most recent evaluation run: `model`,
# `col` and `results_df` still hold the values left by the last iteration of
# the evaluation loop.
preview_columns = [
    f"entity_given_{col}",
    f"entity_predicted_{col}",
    f"entity_predicted_score_{col}",
    "correct_entry",
]
print(f"\nPredictions for {model} on {col}")
print(results_df[preview_columns].head(10))



Predictions for BAAI/bge-small-en-v1.5 on Messy_Version4
                   entity_given_Messy_Version4  \
0  ANDAMAN_NRIIOCS_AB__Andman & Nicobar Island   
1                     ADILBADA__Andhra Pradesh   
2                    ANANTRUAP__Andhra Pradesh   
3                     CHITOROT__Andhra Pradesh   
4                EAST_GOVAIDRA__Andhra Pradesh   
5                       GUNTUR__Andhra Pradesh   
6                    HYDERBADA__Andhra Pradesh   
7                       Kadaap__Andhra Pradesh   
8                   KARIMRANGA__Andhra Pradesh   
9         KHAMMAM/BHARCADAMLAH__Andhra Pradesh   

               entity_predicted_Messy_Version4  \
0  ANDAMAN_NICOBAR_IS__Andman & Nicobar Island   
1                     ADILABAD__Andhra Pradesh   
2                    ANANTAPUR__Andhra Pradesh   
3                     CHITTOOR__Andhra Pradesh   
4                EAST_GODAVARI__Andhra Pradesh   
5                       GUNTUR__Andhra Pradesh   
6                      KRISHNA__Andhra Pr

In [22]:
# Frames were appended in evaluation order, so index 1 is the second run:
# the predictions for the Messy_Version2 column.
results_df_list[1]

Unnamed: 0,entity_given_Messy_Version2,entity_predicted_Messy_Version2,entity_predicted_score_Messy_Version2,correct_entry,model
0,ANDAMAN_NICOB^AR_IS__Andman & Nicobar Island,ANDAMAN_NICOBAR_IS__Andman & Nicobar Island,0.985446,ANDAMAN_NICOBAR_IS__Andman & Nicobar Island,BAAI/bge-small-en-v1.5
1,A$DILABAD__Andhra Pradesh,KRISHNA__Andhra Pradesh,0.866682,ADILABAD__Andhra Pradesh,BAAI/bge-small-en-v1.5
2,ANAN!TAPUR__Andhra Pradesh,ANANTAPUR__Andhra Pradesh,0.834260,ANANTAPUR__Andhra Pradesh,BAAI/bge-small-en-v1.5
3,CHITTOO&R__Andhra Pradesh,CHITTOOR__Andhra Pradesh,0.937009,CHITTOOR__Andhra Pradesh,BAAI/bge-small-en-v1.5
4,EAST_GODA)VARI__Andhra Pradesh,EAST_GODAVARI__Andhra Pradesh,0.930600,EAST_GODAVARI__Andhra Pradesh,BAAI/bge-small-en-v1.5
...,...,...,...,...,...
620,*NORTH_24_PARGANAS__West Bengal,NORTH_24_PARGANAS__West Bengal,0.997393,NORTH_24_PARGANAS__West Bengal,BAAI/bge-small-en-v1.5
621,PURU!LIYA__West Bengal,PURULIYA__West Bengal,0.891123,PURULIYA__West Bengal,BAAI/bge-small-en-v1.5
622,SO$UTH_24_PARGANAS__West Bengal,SOUTH_24_PARGANAS__West Bengal,0.850178,SOUTH_24_PARGANAS__West Bengal,BAAI/bge-small-en-v1.5
623,UTTAR_DINA)JPUR__West Bengal,UTTAR_DINAJPUR__West Bengal,0.917938,UTTAR_DINAJPUR__West Bengal,BAAI/bge-small-en-v1.5
