In [2]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.20.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading transformers-4.46.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [1]:
import pandas as pd

df = pd.read_csv('issuer-mfm-mappings.csv')

In [4]:
words_to_find = df.apply(lambda row: row['nport_issuer_name'] if pd.notna(row['nport_issuer_name']) else row['nport_security_name'], axis=1)

word_list = df['slug']

In [7]:
# all-MiniLM-L6-v2
import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')


embeddings_to_find = model.encode(words_to_find.tolist(), convert_to_tensor=True)
embeddings_list = model.encode(word_list.tolist(), convert_to_tensor=True)

# Cosine similarity matrix
cosine_scores = util.cos_sim(embeddings_to_find, embeddings_list)

# Find top matches
top_k = 5  
results = []

correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k


for idx, (word, scores) in enumerate(zip(words_to_find, cosine_scores)):
    # Get top_k matches
    top_results = torch.topk(scores, k=top_k)
    matches = []
    ground_truth_found = False

    for rank, (score, index) in enumerate(zip(top_results.values, top_results.indices)):
        matched_word = word_list[index.item()]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': score.item()
        })
        
        # Check if ground truth is among the top_k matches
        if index.item() == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1  # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })

# Calculate accuracy metrics
total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")

# display detailed results
# for result in results:
#     print(f"Word to find: {result['word_to_find']}")
#     for match in result['matches']:
#         print(f"  Rank {match['rank']}: {match['match_word']}, Score: {match['score']:.4f}")
#     print(f"Ground truth match found: {'Yes' if result['ground_truth_found'] else 'No'}\n")


Top-1 Accuracy: 23.58%
Top-5 Accuracy: 73.13%


In [18]:
stop_words_normalized = [
        "healthcare",
        "technologies",
        "therapeutics",
        "financial",
        "software",
        "holdings",
        "transportation",
        "pharmaceuticals",
        "capital",
        "copper",
        "communications",
        "biotechnology",
        "biopharmaceuticals",
        "group",
        "technology",
        "media",
        "energy",
        "industries",
        "biotherapeutics",
        "solution",
        "bioscience",
        "industries",
        "corporation",
        "systems",
        "enterprises",
        "robotics",
        "bank",
        "inc",
        "llc",
        "pp",
        "series a",
        "series seed",
        "series b",
        "series c",
        "series d",
        "series e",
        "series f",
        "series g",
        "series h",
        "series i"
    ]

def normalize_name(name):
    name = name.lower() 
    for word in stop_words_normalized:
        name = name.replace(word, '')
    return name.strip()

In [19]:
# all-MiniLM-L6-v2 with cleaning 
import torch
from sentence_transformers import SentenceTransformer, util
import re

def clean_text(text):
    text = normalize_name(str(text))
    return text


df['nport_issuer_name_clean'] = df['nport_issuer_name'].apply(clean_text)
df['nport_security_name_clean'] = df['nport_security_name'].apply(clean_text)
df['slug_clean'] = df['slug'].apply(clean_text)


cleaned_words_to_find = df.apply(
    lambda row: row['nport_issuer_name_clean'] if row['nport_issuer_name_clean'] else row['nport_security_name_clean'],
    axis=1
).tolist()

cleaned_word_list = df['slug_clean'].tolist()

model = SentenceTransformer('all-MiniLM-L6-v2')


embeddings_to_find = model.encode(cleaned_words_to_find, convert_to_tensor=True, normalize_embeddings=True)
embeddings_list = model.encode(cleaned_word_list, convert_to_tensor=True, normalize_embeddings=True)


cosine_scores = util.cos_sim(embeddings_to_find, embeddings_list)

correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5        

results = []

for idx, (word, scores) in enumerate(zip(cleaned_words_to_find, cosine_scores)):
    # Get top_k matches
    top_results = torch.topk(scores, k=top_k)
    matches = []
    ground_truth_found = False

    for rank, (score, index) in enumerate(zip(top_results.values, top_results.indices)):
        matched_word = cleaned_word_list[index.item()]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': score.item()
        })
        
        # Check if ground truth is among the top_k matches
        if index.item() == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1  # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(cleaned_words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")



Top-1 Accuracy: 23.26%
Top-5 Accuracy: 71.88%


In [20]:
# all-mpnet-base-v2

import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-mpnet-base-v2')


embeddings_to_find = model.encode(words_to_find.tolist(), convert_to_tensor=True)
embeddings_list = model.encode(word_list.tolist(), convert_to_tensor=True)

cosine_scores = util.cos_sim(embeddings_to_find, embeddings_list)


top_k = 5  
results = []

correct_top1 = 0  
correct_topk = 0  


for idx, (word, scores) in enumerate(zip(words_to_find, cosine_scores)):
    top_results = torch.topk(scores, k=top_k)
    matches = []
    ground_truth_found = False

    for rank, (score, index) in enumerate(zip(top_results.values, top_results.indices)):
        matched_word = word_list[index.item()]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': score.item()
        })
        
        # Check if ground truth is among the top_k matches
        if index.item() == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1  # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })

total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Top-1 Accuracy: 21.69%
Top-5 Accuracy: 69.31%


In [21]:
# all-mpnet-base-v2 with cleaning 
import torch
from sentence_transformers import SentenceTransformer, util
import re

def clean_text(text):
    text = normalize_name(str(text))
    return text


df['nport_issuer_name_clean'] = df['nport_issuer_name'].apply(clean_text)
df['nport_security_name_clean'] = df['nport_security_name'].apply(clean_text)
df['slug_clean'] = df['slug'].apply(clean_text)


cleaned_words_to_find = df.apply(
    lambda row: row['nport_issuer_name_clean'] if row['nport_issuer_name_clean'] else row['nport_security_name_clean'],
    axis=1
).tolist()

cleaned_word_list = df['slug_clean'].tolist()

model = SentenceTransformer('all-mpnet-base-v2')


embeddings_to_find = model.encode(cleaned_words_to_find, convert_to_tensor=True, normalize_embeddings=True)
embeddings_list = model.encode(cleaned_word_list, convert_to_tensor=True, normalize_embeddings=True)


cosine_scores = util.cos_sim(embeddings_to_find, embeddings_list)

correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5        

results = []

for idx, (word, scores) in enumerate(zip(cleaned_words_to_find, cosine_scores)):
    # Get top_k matches
    top_results = torch.topk(scores, k=top_k)
    matches = []
    ground_truth_found = False

    for rank, (score, index) in enumerate(zip(top_results.values, top_results.indices)):
        matched_word = cleaned_word_list[index.item()]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': score.item()
        })
        
        # Check if ground truth is among the top_k matches
        if index.item() == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1  # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(cleaned_words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")



Top-1 Accuracy: 22.65%
Top-5 Accuracy: 70.66%


Character level embeddings
- FastText

In [22]:
!pip install gensim

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl (30.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.4/30.4 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.0
    Uninstalling scipy-1.14.0:
      Successfully uninstalled scipy-1.14.0
Successfully installed gensim-4.3.3 sci

In [23]:
import numpy as np
import re
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity


all_company_names = list(set(words_to_find + word_list))


sentences = [name.split() for name in all_company_names]


fasttext_model = FastText(
    sentences,
    vector_size=100,
    window=3,
    min_count=1,
    workers=4,
    sg=1,
    min_n=3,
    max_n=6
)

# Function to get embedding for a company name
def get_embedding(model, text):
    words = text.split()
    word_embeddings = []
    for word in words:
        if word in model.wv:
            word_embeddings.append(model.wv[word])
        else:
            # Handle out-of-vocabulary words
            word_embeddings.append(np.zeros(model.vector_size))
    if word_embeddings:
        return np.mean(word_embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)


embeddings_to_find = np.array([get_embedding(fasttext_model, name) for name in words_to_find])
embeddings_list = np.array([get_embedding(fasttext_model, name) for name in word_list])


cosine_sim_matrix = cosine_similarity(embeddings_to_find, embeddings_list)


correct_top1 = 0  
correct_topk = 0  
top_k = 5         

results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })

        
        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  
            correct_topk += 1 

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")



Top-1 Accuracy: 0.39%
Top-5 Accuracy: 1.06%


In [32]:
# Bert
import numpy as np
import re
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

def clean_text(text):
    text = normalize_name(str(text))
    return text


df['nport_issuer_name_clean'] = df['nport_issuer_name'].apply(clean_text)
df['nport_security_name_clean'] = df['nport_security_name'].apply(clean_text)
df['slug_clean'] = df['slug'].apply(clean_text)


cleaned_words_to_find = df.apply(
    lambda row: row['nport_issuer_name_clean'] if row['nport_issuer_name_clean'] else row['nport_security_name_clean'],
    axis=1
).tolist()

cleaned_word_list = df['slug_clean'].tolist()

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embedding for a text
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=32)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Use the [CLS] token's embedding as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_embedding.flatten()

# Compute embeddings for words_to_find and word_list
embeddings_to_find = np.array([get_embedding(text) for text in cleaned_words_to_find])
embeddings_list = np.array([get_embedding(text) for text in cleaned_word_list])

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(embeddings_to_find, embeddings_list)

# Initialize counters for accuracy calculation
correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5         # You can adjust this value

# Store results for analysis
results = []

for idx, (word, scores) in enumerate(zip(cleaned_words_to_find, cosine_sim_matrix)):
    # Get top_k matches
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = cleaned_word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })

        # Check if ground truth is among the top_k matches
        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1  # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })

# Calculate accuracy metrics
total = len(cleaned_words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")



Top-1 Accuracy: 7.21%
Top-5 Accuracy: 24.36%


Combined textual columns

In [35]:
df_merged_result = pd.read_csv('data/merged_result.csv')
df_mfm_mappings = pd.read_csv('issuer-mfm-mappings.csv')


df_all = df_mfm_mappings.merge(df_merged_result, on='slug', how='left')


# df_all.to_csv('data/mfm_mappings_merged_result.csv', index=False)

In [33]:
# all-MiniLM-L6-v2 plus cleaning
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def clean_text(text):
    text = normalize_name(str(text))
    return text

# Combine 'nport_issuer_name' and 'nport_security_name' for the inputs
def combine_input_columns(row):
    texts = []
    if pd.notnull(row['nport_issuer_name']):
        texts.append(clean_text(row['nport_issuer_name']))
    if pd.notnull(row['nport_security_name']):
        texts.append(clean_text(row['nport_security_name']))
    combined_text = ' '.join(texts)
    return combined_text

df_all['combined_input'] = df_all.apply(combine_input_columns, axis=1)

output_columns = [
    'slug',
    'company_name',
    'legal_entity_name',
    # 'NAME',
    # 'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_all['combined_output'] = df_all.apply(combine_output_columns, axis=1)

# Prepare the words_to_find and word_list
words_to_find = df_all['combined_input'].tolist()
word_list = df_all['combined_output'].tolist()

# model = SentenceTransformer('all-mpnet-base-v2')  
model = SentenceTransformer('all-MiniLM-L6-v2')


embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)
cosine_sim_matrix = cosine_similarity(embeddings_to_find, embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5         
results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })


        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1     # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")



Top-1 Accuracy: 23.42%
Top-5 Accuracy: 74.16%


In [34]:
# all-mpnet-base-v2 plus cleaning
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def clean_text(text):
    text = normalize_name(str(text))
    return text

# Combine 'nport_issuer_name' and 'nport_security_name' for the inputs
def combine_input_columns(row):
    texts = []
    if pd.notnull(row['nport_issuer_name']):
        texts.append(clean_text(row['nport_issuer_name']))
    if pd.notnull(row['nport_security_name']):
        texts.append(clean_text(row['nport_security_name']))
    combined_text = ' '.join(texts)
    return combined_text

df_all['combined_input'] = df_all.apply(combine_input_columns, axis=1)

output_columns = [
    'slug',
    'company_name',
    'legal_entity_name',
    # 'NAME',
    # 'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_all['combined_output'] = df_all.apply(combine_output_columns, axis=1)

# Prepare the words_to_find and word_list
words_to_find = df_all['combined_input'].tolist()
word_list = df_all['combined_output'].tolist()

model = SentenceTransformer('all-mpnet-base-v2')  
# model = SentenceTransformer('all-MiniLM-L6-v2')


embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)
cosine_sim_matrix = cosine_similarity(embeddings_to_find, embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5         
results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })


        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1     # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")



Top-1 Accuracy: 22.94%
Top-5 Accuracy: 73.55%


Weighted columns

In [41]:
# Weight functions

from datetime import datetime

def weight_domicile_country_code(code):
    return 1 if code == 'US' else 0

def weight_domicile_state_code(code):
    return 1 if code == 'DE' else 0


funding_round_ranking = {
    'seed': 0,
    'series a': 1,
    'series b': 2,
    'series c': 3,
    'series d': 4,
    'series e': 5,
    'series f': 6,
    'series g': 7,
    'series h': 8,
    'series i': 9
}
def normalize_and_weight_series(series_types):  
    if pd.isna(series_types) or not series_types:  # Handle NaN or empty lists
        return 0  # Neutral weight for missing values or empty lists

    normalized_weights = []
    
    for series in series_types:
        series_lower = series.lower().strip()
        
        match = re.search(r'series [a-z]', series_lower)
        if match:
            normalized_type = match.group()
            weight = funding_round_ranking.get(normalized_type, 0)
            normalized_weights.append(weight)
        else:
            # Fallback for unrecognized series types (neutral)
            normalized_weights.append(0)  
    
    return max(normalized_weights)  # Return the highest weight in the list


def weight_recency(funding_dates_str):
    if pd.isna(funding_dates_str) or not funding_dates_str: # Handle NaN or empty lists
        return -99999  # Lowest weight for missing or empty values
    
    funding_dates_clean = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}', funding_dates_str)
    funding_dates = [datetime.strptime(date, '%Y-%m-%d %H:%M:%S.%f') for date in funding_dates_clean]
    most_recent_date = max(funding_dates)
    now = datetime.now()
    recency_weight = (now - most_recent_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)


def weight_price_source(price_source):
    if pd.isna(price_source):  # Handle NaN or missing values
        return -0.1  # Assign negative weight to missing values
    price_source_lower = price_source.lower().strip()
    
    if re.search(r'vwap', price_source_lower):
        return 1  
    elif re.search(r'iois', price_source_lower):
        return 0.5  
    elif re.search(r'primary', price_source_lower):
        return 0.2  
    else:
        return -0.1  # Negative weight for other or unrecognized types

def weight_price_issuer_tier(tier):
    if pd.isna(tier):  # Handle NaN or missing values
        return 0  # Neutral weight for missing
    tier_lower = tier.lower().strip()
    if tier_lower == 'tier_1':
        return 1  # Highest weight for TIER_1
    else:
        return 0.5  # Neutral or medium weight for other tiers


def weight_price(price):
    if pd.isna(price):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return price  # Higher value means higher weight

def weight_implied_valuation(valuation):
    if pd.isna(valuation):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return valuation  # Higher value means higher weight


In [42]:
# all-MiniLM-L6-v2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

def clean_text(text):
    text = normalize_name(str(text))
    return text


def combine_input_columns(row):
    texts = []
    if pd.notnull(row['nport_issuer_name']):
        texts.append(clean_text(row['nport_issuer_name']))
    if pd.notnull(row['nport_security_name']):
        texts.append(clean_text(row['nport_security_name']))
    combined_text = ' '.join(texts)
    return combined_text

df_all['combined_input'] = df_all.apply(combine_input_columns, axis=1)


output_columns = [
    'slug',
    'company_name',
    'legal_entity_name',
    # 'NAME',
    # 'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_all['combined_output'] = df_all.apply(combine_output_columns, axis=1)

words_to_find = df_all['combined_input'].tolist()
word_list = df_all['combined_output'].tolist()


model = SentenceTransformer('all-MiniLM-L6-v2')  

embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)


 
# Apply the weight functions to the DataFrame
df_all['DOMICILECOUNTRYCODE_WEIGHT'] = df_all['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_all['DOMICILESTATECODE_WEIGHT'] = df_all['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_all['SHARE_TYPE_WEIGHT'] = df_all['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_all['FUNDING_DATE_WEIGHT'] = df_all['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_all['FORGE_PRICE_SOURCE_WEIGHT'] = df_all['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_all['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_all['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_all['FORGE_PRICE_WEIGHT'] = df_all['FORGE_PRICE'].apply(weight_price)
df_all['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_all['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)

# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT'
]

# Fill NaN values with zeros
df_all[weighted_feature_columns] = df_all[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_all[weighted_feature_columns])


combined_embeddings_to_find = np.hstack((embeddings_to_find, weighted_features))
combined_embeddings_list = np.hstack((embeddings_list, weighted_features))
cosine_sim_matrix = cosine_similarity(combined_embeddings_to_find, combined_embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5 
results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })

        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1     # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")


Top-1 Accuracy: 26.48%
Top-5 Accuracy: 77.70%


Not matched from perfect matchings

In [43]:
df_merged_result = pd.read_csv('data/merged_result.csv')
df_non_matches = pd.read_csv('data/output.csv')


df_not_matched = df_non_matches.merge(df_merged_result, on='slug', how='left')


df_not_matched.to_csv('data/not_matched_merged_result.csv', index=False)

In [45]:
# all-MiniLM-L6-v2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

def clean_text(text):
    text = normalize_name(str(text))
    return text


def combine_input_columns(row):
    texts = []
    if pd.notnull(row['nport_issuer_name']):
        texts.append(clean_text(row['nport_issuer_name']))
    # if pd.notnull(row['nport_security_name']):
    #     texts.append(clean_text(row['nport_security_name']))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched['combined_input'] = df_not_matched.apply(combine_input_columns, axis=1)


output_columns = [
    'slug',
    'company_name',
    'legal_entity_name',
    # 'NAME',
    # 'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched['combined_output'] = df_not_matched.apply(combine_output_columns, axis=1)

words_to_find = df_not_matched['combined_input'].tolist()
word_list = df_not_matched['combined_output'].tolist()


model = SentenceTransformer('all-MiniLM-L6-v2')  

embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)


 
# Apply the weight functions to the DataFrame
df_not_matched['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched['DOMICILESTATECODE_WEIGHT'] = df_not_matched['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched['SHARE_TYPE_WEIGHT'] = df_not_matched['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched['FUNDING_DATE_WEIGHT'] = df_not_matched['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched['FORGE_PRICE_WEIGHT'] = df_not_matched['FORGE_PRICE'].apply(weight_price)
df_not_matched['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)

# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched[weighted_feature_columns] = df_not_matched[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched[weighted_feature_columns])


combined_embeddings_to_find = np.hstack((embeddings_to_find, weighted_features))
combined_embeddings_list = np.hstack((embeddings_list, weighted_features))
cosine_sim_matrix = cosine_similarity(combined_embeddings_to_find, combined_embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5 
results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })

        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1     # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")


Top-1 Accuracy: 36.49%
Top-5 Accuracy: 80.18%


Not matched from perfect matchings + fuzzy match

In [46]:
df_merged_result = pd.read_csv('data/merged_result.csv')
df_non_matches_2 = pd.read_csv('data/output_not_matched.csv')


df_not_matched_2 = df_non_matches_2.merge(df_merged_result, on='slug', how='left')


df_not_matched_2.to_csv('data/not_matched_merged_result.csv', index=False)

In [47]:
# all-MiniLM-L6-v2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

def clean_text(text):
    text = normalize_name(str(text))
    return text


def combine_input_columns(row):
    texts = []
    if pd.notnull(row['nport_issuer_name']):
        texts.append(clean_text(row['nport_issuer_name']))
    # if pd.notnull(row['nport_security_name']):
    #     texts.append(clean_text(row['nport_security_name']))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched_2['combined_input'] = df_not_matched_2.apply(combine_input_columns, axis=1)


output_columns = [
    'slug',
    'company_name',
    'legal_entity_name',
    # 'NAME',
    # 'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched_2['combined_output'] = df_not_matched_2.apply(combine_output_columns, axis=1)

words_to_find = df_not_matched_2['combined_input'].tolist()
word_list = df_not_matched_2['combined_output'].tolist()


model = SentenceTransformer('all-MiniLM-L6-v2')  

embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)


 
# Apply the weight functions to the DataFrame
df_not_matched_2['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched_2['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched_2['DOMICILESTATECODE_WEIGHT'] = df_not_matched_2['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched_2['SHARE_TYPE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched_2['FUNDING_DATE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched_2['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched_2['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched_2['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched_2['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched_2['FORGE_PRICE_WEIGHT'] = df_not_matched_2['FORGE_PRICE'].apply(weight_price)
df_not_matched_2['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched_2['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)

# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched_2[weighted_feature_columns] = df_not_matched_2[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched_2[weighted_feature_columns])


combined_embeddings_to_find = np.hstack((embeddings_to_find, weighted_features))
combined_embeddings_list = np.hstack((embeddings_list, weighted_features))
cosine_sim_matrix = cosine_similarity(combined_embeddings_to_find, combined_embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5 
results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })

        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1     # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")


Top-1 Accuracy: 29.63%
Top-5 Accuracy: 76.85%


In [48]:
# Vsevolod/company-names-similarity-sentence-transformer
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

def clean_text(text):
    text = normalize_name(str(text))
    return text


def combine_input_columns(row):
    texts = []
    if pd.notnull(row['nport_issuer_name']):
        texts.append(clean_text(row['nport_issuer_name']))
    # if pd.notnull(row['nport_security_name']):
    #     texts.append(clean_text(row['nport_security_name']))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched_2['combined_input'] = df_not_matched_2.apply(combine_input_columns, axis=1)


output_columns = [
    'slug',
    'company_name',
    'legal_entity_name',
    # 'NAME',
    # 'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched_2['combined_output'] = df_not_matched_2.apply(combine_output_columns, axis=1)

words_to_find = df_not_matched_2['combined_input'].tolist()
word_list = df_not_matched_2['combined_output'].tolist()


model = SentenceTransformer("Vsevolod/company-names-similarity-sentence-transformer")


embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)


 
# Apply the weight functions to the DataFrame
df_not_matched_2['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched_2['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched_2['DOMICILESTATECODE_WEIGHT'] = df_not_matched_2['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched_2['SHARE_TYPE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched_2['FUNDING_DATE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched_2['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched_2['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched_2['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched_2['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched_2['FORGE_PRICE_WEIGHT'] = df_not_matched_2['FORGE_PRICE'].apply(weight_price)
df_not_matched_2['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched_2['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)

# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched_2[weighted_feature_columns] = df_not_matched_2[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched_2[weighted_feature_columns])


combined_embeddings_to_find = np.hstack((embeddings_to_find, weighted_features))
combined_embeddings_list = np.hstack((embeddings_list, weighted_features))
cosine_sim_matrix = cosine_similarity(combined_embeddings_to_find, combined_embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5 
results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })

        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1     # Ground truth is within top_k matches

    results.append({
        'word_to_find': word,
        'matches': matches,
        'ground_truth_found': ground_truth_found
    })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Top-1 Accuracy: 25.93%
Top-5 Accuracy: 61.11%
