In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz, process
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


In [2]:
# df_mfm = pd.read_csv('MFM names to match(in).csv')
df_mfm = pd.read_csv('private_companies.csv')

df_forge_price = pd.read_csv('../data/forge_price_sample.csv')
df_funding_rounds = pd.read_csv('../data/funding-rounds.csv')
df_issuers = pd.read_csv('../data/issuers.csv')
df_trade_facts = pd.read_csv('../data/trade_facts.csv')


df_merged_result = df_issuers.merge(df_forge_price, on='ISSUER_SLUG', how='left') \
            .merge(df_funding_rounds, on='ISSUER_SLUG', how='left') \
            .merge(df_trade_facts, on='ISSUER_SLUG', how='left')
df_merged_result = df_merged_result.rename(columns={'ISSUER_SLUG': 'slug'})
df_merged_result

Unnamed: 0,slug,NAME,LEGALENTITYNAME,SEARCHALIASES,DESCRIPTION,STRUCTURED_DESCRIPTION,LIFECYCLESTATUS,BANNERMESSAGE,SUB_SECTOR,SECTOR,...,FORGE_PRICE_ISSUER_TIER,FORGE_PRICE,FORGE_IMPLIED_VALUATION,FORGE_PRICE_SOURCE_EXTERNAL,HAS_IOIS,ARRAY_AGG(FUNDING_DATE),ARRAY_AGG(FR.SHARE_TYPE),NUM_TRADES,LAST_CLOSED_TRADE_DATE,LAST_PENDING_TRADE_DATE
0,tradealgo,TradeAlgo,TA Fintech Inc.,[],,,,,Personal Finance,FinTech,...,,,,,,,,,,
1,latitude,Latitude,"Oasis Tech, Inc.",[],,,,,Gaming,Consumer & Lifestyle,...,,,,,,,,,,
2,revery-ai,Revery AI,Revery AI Inc.,[],,,,,E-commerce software,Consumer & Lifestyle,...,,,,,,,,,,
3,soci,SOCi,"SOCi, Inc.",[],,,,,Sales & Marketing / Adtech,Enterprise Software,...,,,,,,,,,,
4,colossyan,Colossyan,Colossyan Inc.,[],,,,,Productivity,Enterprise Software,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5291,vetster,Vetster,"Vetster, Inc.",[],Vetster is a telehealth company which offers o...,,,,Pets,Consumer & Lifestyle,...,,,,,,,,,,
5292,embrace,Embrace,"Embrace Mobile, Inc.",[],Embrace is a mobile app monitoring company whi...,,,,Data Management/Storage,Enterprise Software,...,,,,,,,,,,
5293,particle-health,Particle Health,"Particle Health, Inc.",[],Particle Health is a healthcare technology com...,,,,Digital Health,Healthcare,...,,,,,,,,,,
5294,leaf-trade,Leaf Trade,Leaf Trade Inc.,[],Leaf Trade is a technology company which offer...,,,,Business Operations,Enterprise Software,...,,,,,,,,,,


In [9]:
stop_words_normalized = [
        "healthcare",
        "technologies",
        "therapeutics",
        "financial",
        "software",
        "holdings",
        "transportation",
        "pharmaceuticals",
        "capital",
        "copper",
        "communications",
        "biotechnology",
        "biopharmaceuticals",
        "group",
        "technology",
        "media",
        "energy",
        "industries",
        "biotherapeutics",
        "solution",
        "bioscience",
        "industries",
        "corporation",
        "systems",
        "enterprises",
        "robotics",
        "bank",
        "inc",
        "llc",
        "pp",
        "series a",
        "series seed",
        "series b",
        "series c",
        "series d",
        "series e",
        "series f",
        "series g",
        "series h",
        "series i"
    ]

def normalize_name(name):
    name = name.lower() 
    for word in stop_words_normalized:
        name = name.replace(word, '')
    return name.strip()


stop_words = set(stopwords.words('english'))
def remove_stop_words(sentence):
    words = sentence.split()
    filtered_sentence = [w for w in words if not w.lower() in stop_words]
    return ' '.join(filtered_sentence)


def remove_punctuation(sentence):
    return sentence.translate(str.maketrans('', '', string.punctuation))


def clean_text(word_to_find):
    word_to_find_normalzed = normalize_name(word_to_find)

    word_to_find_no_stop_word = remove_stop_words(word_to_find_normalzed)

    word_to_find_no_punc = remove_punctuation(word_to_find_no_stop_word)

    return word_to_find_no_punc

In [3]:
# read the column nport_name from df_mfm_mappings where the column count is greater than or equal 1000
# df_mfm_mappings = df_mfm[df_mfm['count'] >= 1000][['nport_name']]
df_mfm_mappings = df_mfm[:1000]


In [4]:
words_to_find = df_mfm_mappings['nport_name']
word_list = df_merged_result['slug']
print(len(word_list), len(words_to_find))

5296 1000


In [5]:
# Weight functions

from datetime import datetime

def weight_domicile_country_code(code):
    return 1 if code == 'US' else 0

def weight_domicile_state_code(code):
    return 1 if code == 'DE' else 0


funding_round_ranking = {
    'seed': 0,
    'series a': 1,
    'series b': 2,
    'series c': 3,
    'series d': 4,
    'series e': 5,
    'series f': 6,
    'series g': 7,
    'series h': 8,
    'series i': 9
}
def normalize_and_weight_series(series_types):  
    if pd.isna(series_types) or not series_types:  # Handle NaN or empty lists
        return 0  # Neutral weight for missing values or empty lists

    normalized_weights = []
    
    for series in series_types:
        series_lower = series.lower().strip()
        
        match = re.search(r'series [a-z]', series_lower)
        if match:
            normalized_type = match.group()
            weight = funding_round_ranking.get(normalized_type, 0)
            normalized_weights.append(weight)
        else:
            # Fallback for unrecognized series types (neutral)
            normalized_weights.append(0)  
    
    return max(normalized_weights)  # Return the highest weight in the list


def weight_recency(funding_dates_str):
    if pd.isna(funding_dates_str) or not funding_dates_str: # Handle NaN or empty lists
        return -99999  # Lowest weight for missing or empty values
    
    funding_dates_clean = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}', funding_dates_str)
    funding_dates = [datetime.strptime(date, '%Y-%m-%d %H:%M:%S.%f') for date in funding_dates_clean]
    most_recent_date = max(funding_dates)
    now = datetime.now()
    recency_weight = (now - most_recent_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)


def weight_price_source(price_source):
    if pd.isna(price_source):  # Handle NaN or missing values
        return -0.1  # Assign negative weight to missing values
    price_source_lower = price_source.lower().strip()
    
    if re.search(r'vwap', price_source_lower):
        return 1  
    elif re.search(r'iois', price_source_lower):
        return 0.5  
    elif re.search(r'primary', price_source_lower):
        return 0.2  
    else:
        return -0.1  # Negative weight for other or unrecognized types

def weight_price_issuer_tier(tier):
    if pd.isna(tier):  # Handle NaN or missing values
        return 0  # Neutral weight for missing
    tier_lower = tier.lower().strip()
    if tier_lower == 'tier_1':
        return 1  # Highest weight for TIER_1
    else:
        return 0.5  # Neutral or medium weight for other tiers


def weight_price(price):
    if pd.isna(price):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return price  # Higher value means higher weight

def weight_implied_valuation(valuation):
    if pd.isna(valuation):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return valuation  # Higher value means higher weight

#create weight functions for HAS_IOIS, if true less weight, if false more weight
def weight_has_iois(has_iois):
    return 0 if has_iois else 1

# create weight functions for NUM_TRADES, if more trades, higher weight
def weight_num_trades(num_trades):
    return num_trades if num_trades else 0

# create weight functions for LAST_CLOSED_TRADE_DATE, if more recent, higher weight
def weight_last_closed_trade_date(last_closed_trade_date):
    if pd.isna(last_closed_trade_date) or not last_closed_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_closed_trade_date = datetime.strptime(last_closed_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_closed_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# create weight functions for LAST_PENDING_TRADE_DATE, if more recent, higher weight
def weight_last_pending_trade_date(last_pending_trade_date):
    if pd.isna(last_pending_trade_date) or not last_pending_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_pending_trade_date = datetime.strptime(last_pending_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_pending_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# create weight functions for ARCHIVEDAT, if present, lower weight
def weight_archived_at(archived_at):
    return 0 if pd.isna(archived_at) else 1


# output_columns += [
#     'YEARFOUNDED',
#     'CITY'
# ]


In [13]:
def match_company(name, master_list, top_n=5):
    name = normalize_name(name)
    
    # Extract the top N matches along with their scores
    ranked_matches = process.extract(name, master_list, scorer=fuzz.token_set_ratio, limit=top_n)
    return ranked_matches if ranked_matches else []

# matched_companies = [match_company(c, word_list) for c in words_to_find]
# matched_companies


In [45]:
def match_company_with_weights(name, df_master, output_columns, column_weights, weighted_feature_columns, top_n=5):
    name = normalize_name(name)
    matches = []

    # Calculate max values for normalization
    max_feature_values = {col: df_master[col].max() for col in weighted_feature_columns}
    
    for index, row in df_master.iterrows():
        total_score = 0
        
        # Similarity scores for text columns
        for col in output_columns:
            col_value = str(row[col]) if pd.notnull(row[col]) else ''
            col_similarity = fuzz.token_set_ratio(name, normalize_name(col_value))
            weight = column_weights.get(col, 1.0)
            total_score += weight * col_similarity

        for col in weighted_feature_columns:
            feature_weight = row[col] if pd.notnull(row[col]) else 0

            # Normalize the feature weight
            max_value = max_feature_values[col] if max_feature_values[col] != 0 else 1
            scaled_feature_weight = feature_weight / max_value

            total_score += scaled_feature_weight

        matches.append((row['slug'], total_score, row))
    
    # Sort the matches based on total_score in descending order
    matches.sort(key=lambda x: x[1], reverse=True)
    
    # Get the top N matches
    top_matches = matches[:top_n]
    
    return top_matches


In [7]:
df_master = df_merged_result.copy()

df_master['DOMICILECOUNTRYCODE_WEIGHT'] = df_master['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_master['DOMICILESTATECODE_WEIGHT'] = df_master['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_master['SHARE_TYPE_WEIGHT'] = df_master['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_master['FUNDING_DATE_WEIGHT'] = df_master['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_master['FORGE_PRICE_SOURCE_WEIGHT'] = df_master['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_master['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_master['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_master['FORGE_PRICE_WEIGHT'] = df_master['FORGE_PRICE'].apply(weight_price)
df_master['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_master['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)
df_master['HAS_IOIS_WEIGHT'] = df_master['HAS_IOIS'].apply(weight_has_iois)
df_master['NUM_TRADES_WEIGHT'] = df_master['NUM_TRADES'].apply(weight_num_trades)
df_master['LAST_CLOSED_TRADE_DATE_WEIGHT'] = df_master['LAST_CLOSED_TRADE_DATE'].apply(weight_last_closed_trade_date)
df_master['LAST_PENDING_TRADE_DATE_WEIGHT'] = df_master['LAST_PENDING_TRADE_DATE'].apply(weight_last_pending_trade_date)


output_columns = [
    'slug',
    'NAME',
    'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION',
    'LIFECYCLESTATUS',
    'BANNERMESSAGE',
    'SUB_SECTOR',
    'SECTOR',
    'WEBSITE',
    'CRUNCHBASEURL'
]

# Define weights for each output column
column_weights = {
    'slug': 1.0,
    'NAME': 2.0,
    'LEGALENTITYNAME': 2.0,
    'WEBSITE': 1.5,
    'SEARCHALIASES': 1.5,
    'DESCRIPTION': 1.0,
    'STRUCTURED_DESCRIPTION': 1.0,
    'LIFECYCLESTATUS': 1.0,
    'BANNERMESSAGE': 1.0,
    'SUB_SECTOR': 1.0,
    'SECTOR': 1.0,
    'CRUNCHBASEURL': 1.0
}

weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT',
    'HAS_IOIS_WEIGHT',
    'NUM_TRADES_WEIGHT',
    'LAST_CLOSED_TRADE_DATE_WEIGHT',
    'LAST_PENDING_TRADE_DATE_WEIGHT'
]


In [47]:
result = []
# for c in words_to_find:
#     # print(c)
#     res = match_company(c, word_list)
#     first_match = res[0][0] if res else None
#     if res:
#         result.append((c, first_match, res))

# result

for c in words_to_find:
    top_matches = match_company_with_weights(
        name=c,
        df_master=df_master,
        output_columns=output_columns,
        column_weights=column_weights,
        weighted_feature_columns=weighted_feature_columns,
        top_n=5
    )
    if top_matches:
        first_match = top_matches[0][0]  # Get the slug of the first match
        # Extract slugs and scores for all top matches
        top_slugs_with_scores = [(slug, score) for slug, score, _ in top_matches]
        result.append((c, first_match, top_slugs_with_scores))
    else:
        first_match = None
        result.append((c, first_match, []))

result

[('FNMA GTD MTG PASS THRU CTF',
  'pat-mcgrath-labs',
  [('pat-mcgrath-labs', 9026.411344537815),
   ('the-players-tribune', 9010.911344537815),
   ('magisto', 8997.911344537815),
   ('the-virtual-reality-company', 8994.411344537815),
   ('smartsheet-com', 8992.411748396169)]),
 ('Freddie Mac',
  'stay-alfred',
  [('stay-alfred', 9074.411344537815),
   ('freeline-therapeutics', 9072.411344537815),
   ('kreditech', 9070.411344537815),
   ('freetrade', 9068.411344537815),
   ('fieldwirelabs', 9035.911344537815)]),
 ('UGCard',
  'popsugar',
  [('popsugar', 9074.911344537815),
   ('autogrid', 9047.411344537815),
   ('garuda-therapeutics', 9031.911344537815),
   ('scalyr', 9021.911344537815),
   ('tango-card', 9020.411344537815)]),
 ('Government National Mortgage Association',
  'valon-mortgage',
  [('valon-mortgage', 9177.911344537815),
   ('generation-bio', 9119.911360910452),
   ('convergent-dental', 9092.411344537815),
   ('real-time-translation', 9089.411344537815),
   ('turnkey-vacati

In [24]:
len(result)

1

In [48]:
df_result = pd.DataFrame(result, columns=['nport_name', 'first_matched_company', 'all_matches'])
# df_result.to_csv('fuzzy_matched.csv', index=False)
df_result.to_csv('weighted_fuzzy_matched.csv', index=False)

In [21]:
matched_companies_full_record = [(c, word_to_find) for c, word_to_find in zip(matched_companies, words_to_find)]
matched_companies_full_record

[([('portalone', 63, 635),
   ('lotame', 62, 2066),
   ('notable', 59, 920),
   ('oportun', 59, 3583),
   ('incorta', 59, 4650)],
  'nport_name')]

In [None]:
# Ranked matchings
exact_matches = 0
partial_matches = 0
total_items = len(word_list)
matched = []
partial_matched = []
not_matched = []

for ranked_match_list, expected in zip(matched_companies_full_record, word_list):
    if ranked_match_list:
        first_match = ranked_match_list[0][0][0]  # Get the first-ranked match
        nport_issuer_name = ranked_match_list[1]
        

        # Exact match
        if first_match == expected:
            exact_matches += 1
            matched.append((nport_issuer_name, expected, first_match))
            
        # Partial match (i.e., the expected company appears anywhere else in the ranked list)
        elif any(match[0] == expected for match in ranked_match_list[0]):
            partial_matches += 1
            partial_matched.append((nport_issuer_name, expected, ranked_match_list[0]))
            
        else:
            not_matched.append((nport_issuer_name, expected, ranked_match_list[0]))


exact_match_accuracy = exact_matches / total_items if total_items > 0 else 0
overall_match_accuracy = (exact_matches + partial_matches) / total_items if total_items > 0 else 0


print(f"Total items: {total_items}")
print(f"Number of exact matches: {exact_matches}")
print(f"Number of partial matches: {partial_matches}")
print(f"Exact Match Accuracy: {exact_match_accuracy:.2%}")
print(f"Overall Accuracy (including partial matches): {overall_match_accuracy:.2%}")



In [18]:
print("Partial matched items:")
for item in partial_matched:
    print(item)

partial_matched_df = pd.DataFrame(partial_matched, columns=['nport_issuer_name', 'slug', 'rankings'])
partial_matched_df['method'] = 'fuzzy_matched'
partial_matched_df.to_csv('data/partial_matched_fuzzy.csv', index=False)


Partial matched items:


In [35]:
# Function to filter duplicates while keeping the highest score for each unique name
def filter_unique_matches(ranked_matches):
    unique_ranked_matches = {}
    for match in ranked_matches:
        if match[0] not in unique_ranked_matches or unique_ranked_matches[match[0]][1] < match[1]:
            unique_ranked_matches[match[0]] = match
    return list(unique_ranked_matches.values())

# # Apply the filtering to create a new column with filtered matches in partial_matched
# filtered_partial_matches = [
#     (nport_issuer_name, expected, ranked_list, filter_unique_matches(ranked_list))
#     for nport_issuer_name, expected, ranked_list in partial_matched
# ]

# # Create DataFrame with both original and filtered matches
# partial_matched_df = pd.DataFrame(filtered_partial_matches, columns=['nport_issuer_name', 'slug', 'rankings', 'filtered_rankings'])
# partial_matched_df['method'] = 'fuzzy_matched'
# partial_matched_df.to_csv('data/partial_matched_fuzzy.csv', index=False)

In [37]:
# apply filter_unique_matches to df_result
df_result['filtered_rankings'] = df_result['matched_companies'].apply(filter_unique_matches)
df_result.to_csv('MFM names/fuzzy_matched.csv', index=False)

In [None]:
print(f"Number of not matches: {len(not_matched)}")
print("Not matched items:")
# for item in not_matched:
#     print(item)

In [20]:

not_matched_slugs = not_matched + partial_matched


In [239]:
# Perfect matching 
def find_matches(phrase, word_list, exact_match_weight=3):
    cleaned_words = clean_text(phrase).split()
    
    # A dictionary to keep track of match scores
    match_scores = {w: 0 for w in word_list}
    
    for word in cleaned_words:
        regex_pattern = re.escape(word)
        print(regex_pattern)
        compiled_pattern = re.compile(regex_pattern, re.IGNORECASE)
        print(compiled_pattern)
        
        # Iterate through the word list and conut matches
        for list_item in word_list:
            if compiled_pattern.search(clean_text(list_item)):
                match_scores[list_item] += 1

    # Add exact match weight if the whole phrase is found
    for list_item in word_list:
        if re.search(re.escape(clean_text(phrase)), clean_text(list_item)):
            match_scores[list_item] += exact_match_weight
    
    # Rank the results based on the number of matches
    ranked_results = sorted(match_scores.items(), key=lambda x: x[1], reverse=True)
    
    return ranked_results



In [38]:
# match with weights on first word

def find_matches_with_weights(phrase, word_list, exact_match_weight=3, first_word_weight=2, other_word_weight=1):
    cleaned_words = clean_text(phrase).split()
    match_scores = {w: 0 for w in word_list}
    
    # Handle the first word with a higher weight
    if cleaned_words:
        first_word = cleaned_words[0]
        first_word_regex = re.escape(first_word)
        # print(first_word_regex)
        compiled_first_word_pattern = re.compile(first_word_regex, re.IGNORECASE)
        # print(compiled_first_word_pattern)
        
        # Add higher weight for first word matches
        for list_item in word_list:
            if compiled_first_word_pattern.search(clean_text(list_item)):
                match_scores[list_item] += first_word_weight
    
    # Handle other words with a standard weight
    for word in cleaned_words[1:]:  
        regex_pattern = re.escape(word)
        # print(regex_pattern)
        compiled_pattern = re.compile(regex_pattern, re.IGNORECASE)       
        # print(compiled_pattern)
        
        for list_item in word_list:
            if compiled_pattern.search(clean_text(list_item)):
                match_scores[list_item] += other_word_weight

    # Add exact match weight if the whole phrase is found
    for list_item in word_list:
        if re.search(re.escape(clean_text(phrase)), clean_text(list_item)):
            match_scores[list_item] += exact_match_weight
    
    # Rank the results based on the number of matches
    ranked_results = sorted(match_scores.items(), key=lambda x: x[1], reverse=True)
    
    return ranked_results


In [65]:
all_matches = []
for t in words_to_find:
    matches = find_matches_with_weights(t, word_list)
    ranked_matches = [match for match in matches if match[1] > 0]
    ranked_matches_str = ' '.join([str(match) for match in ranked_matches[:5]])
    first_match = ranked_matches[0][0] if ranked_matches else None
    all_matches.append((t, first_match, ranked_matches_str))
all_matches

[('Fannie Mae', 'zunum-aero', "('zunum-aero', 1) ('panorama-education', 1)"),
 ('FNMA GTD MTG PASS THRU CTF',
  'cloudpassage',
  "('cloudpassage', 1) ('moviepass', 1) ('imperfect-foods', 1) ('compass', 1) ('passport', 1)"),
 ('Block, Inc.',
  'blockfi',
  "('blockfi', 2) ('blockstream', 2) ('blockchain-capital', 2) ('autoblocks-ai', 2) ('cityblock-health', 2)"),
 ('Freddie Mac',
  'chromacode',
  "('chromacode', 1) ('humacyte', 1) ('alto-pharmacy', 1) ('thought-machine', 1) ('karmacheck', 1)"),
 ('UGCard', None, ''),
 ('Government National Mortgage Association',
  'better-mortgage',
  "('better-mortgage', 1) ('c3-international', 1) ('zipline-international', 1) ('valon-mortgage', 1) ('branch-international', 1)"),
 ('Upstart Network, Inc.',
  'upstart-network',
  "('upstart-network', 3) ('cumulus-networks', 1) ('eagle-eye-networks', 1) ('broadview-networks', 1) ('illusive-networks', 1)"),
 ('United States Treasury',
  'unitedmasters',
  "('unitedmasters', 2) ('kitchen-united', 2) ('mode

In [66]:
all_matches_df = pd.DataFrame(all_matches, columns=['nport_name', 'first_matched_company', 'all_matches'])
all_matches_df.to_csv('MFM names/perfect_matched.csv', index=False)

In [30]:
correct_first_match = 0
partial_matches = 0
total_items = len(not_matched_slugs)

unmatched_items = []
partial_matched_items = []
matched_items = []
df_rows = []

for t in not_matched_slugs:
    not_matched_slug = t[1]  # slug
    not_matched_nport_issuer_name = t[0]  # nport_issuer_name
    
    # Get ranked matches
    matches = find_matches_with_weights(not_matched_nport_issuer_name, word_list)
    
    ranked_matches = [match for match in matches if match[1] > 0]
    ranked_matches_str = ', '.join([f"('{item[0]}', {item[1]})" for item in ranked_matches])
    
    match_status = ""
    
    if ranked_matches:
        # Check the first match
        first_match = ranked_matches[0][0]
        
        if first_match == not_matched_slug:
            # Exact match in the first position
            correct_first_match += 1
            match_status = "Matched"
            # add t[0], t[1], ranked_matches to list
            matched_items.append((t[0], t[1], ranked_matches[0]))
        else:
            # If second itme exists in the rest of the ranked matches
            if any(match[0] == not_matched_slug for match in ranked_matches[1:]):
                partial_matches += 1
                partial_matched_items.append((t[0], t[1], ranked_matches))
                match_status = "Partial Match"
            else:
                unmatched_items.append((t[0], t[1], ranked_matches))
                match_status = "Not Matched"
    else:
        unmatched_items.append(t)
        match_status = "Not Ranked - Not Matched"

#     df_rows.append([not_matched_slug, not_matched_nport_issuer_name, ranked_matches_str, match_status])

# _df = pd.DataFrame(df_rows, columns=['not_matched_slug', 'not_matched_nport_issuer_name', 'Ranked Matches', 'Match Status'])

# _df.to_csv('data/output.csv', index=False)

first_match_accuracy = correct_first_match / total_items if total_items > 0 else 0
overall_match_accuracy = (correct_first_match + partial_matches) / total_items if total_items > 0 else 0

print(f"First Match Accuracy: {first_match_accuracy:.2f}")
print(f"Overall Match Accuracy (including partial matches): {overall_match_accuracy:.2f}")
print("Unmatched items:", len(unmatched_items))


First Match Accuracy: 0.22
Overall Match Accuracy (including partial matches): 0.52
Unmatched items: 108


In [31]:
matched_items_df = pd.DataFrame(matched_items, columns=['nport_issuer_name', 'slug', 'rankings'])
matched_items_df.to_csv('data/_test.csv', index=False)

In [32]:
# create a df out of _df where match status is Partial Match
partial_matched_df = pd.DataFrame(partial_matched_items, columns=['nport_issuer_name', 'slug', 'rankings'])
partial_matched_df

Unnamed: 0,nport_issuer_name,slug,rankings
0,Life Healthcare Group Holdings Ltd,1life-healthcare,"[(caris-life-sciences, 20), (1life-healthcare,..."
1,Ant Group Co Ltd,ant-financial,"[(palantir, 26), (convoy, 15), (ant-financial,..."
2,Ant International Co. Limited,ant-financial,"[(palantir, 26), (convoy, 15), (ant-financial,..."
3,Ant International Co. Limited Class C Shares,ant-financial,"[(convoy, 30), (palantir, 26), (databricks, 21..."
4,AppLovin Corp.,applovin,"[(hashicorp, 10), (applovin, 8), (brain-corp, ..."
...,...,...,...
61,"TEBRA TECHNOLOGIES, INC. (F.K.A. KAREO, INC.) ...",tebra,"[(kareo, 4), (tebra, 4), (commonbond, 2)]"
62,The Honest Company Inc.,the-honest-company,"[(the-honest-company-inc, 12), (the-honest-com..."
63,"The Honest Company, Inc.",the-honest-company,"[(the-honest-company-inc, 12), (the-honest-com..."
64,THE HONEST COMPANY INC,the-honest-company,"[(the-honest-company-inc, 12), (the-honest-com..."


In [37]:
# Apply the filtering to create a new column with filtered matches in partial_matched
filtered_partial_matched = [
    (row['nport_issuer_name'], row['slug'], row['rankings'], filter_unique_matches(row['rankings']))
    for _, row in partial_matched_df.iterrows()
]
filtered_partial_matched_df = pd.DataFrame(filtered_partial_matched, columns=['nport_issuer_name', 'slug', 'rankings', 'filtered_rankings'])
filtered_partial_matched_df['method'] = 'perfect_matched'
filtered_partial_matched_df.to_csv('data/partial_matched_perfect.csv', index=False)



In [None]:
# create a pd dataframe of the unmatched items
_not_matched_list = []
for t in unmatched_items:
    _not_matched_list.append([t[1], t[0]])

_df_not_matched = pd.DataFrame(_not_matched_list, columns=['slug', 'nport_issuer_name'])
print(_df_not_matched.head())
_df_not_matched.to_csv('data/unmatched_perfect.csv', index=False)

           slug                 nport_issuer_name
0        byju-s        Think & Learn Private Ltd.
1  warby-parker                          JAND INC
2   3d-robotics           Nauticus Robotics, Inc.
3        adroll  NEXTROLL, INC. COMMON SHARES   /
4        adroll        NEXTROLL COMMON SHARES   /


In [58]:
not_matched_list = []
for t in unmatched_items:
    not_matched_list.append([t[0], t[1]])
for t in partial_matched_items:
    not_matched_list.append((t[0], t[1]))

df_not_matched = pd.DataFrame(not_matched_list, columns=['nport_issuer_name','slug'])

In [None]:

df_forge_price = pd.read_csv('data/forge_price_sample.csv')
df_funding_rounds = pd.read_csv('data/funding-rounds.csv')
df_issuers = pd.read_csv('data/issuers.csv')
df_trade_facts = pd.read_csv('data/trade_facts.csv')


df_merged_result = df_issuers.merge(df_forge_price, on='ISSUER_SLUG', how='left') \
            .merge(df_funding_rounds, on='ISSUER_SLUG', how='left') \
            .merge(df_trade_facts, on='ISSUER_SLUG', how='left')
# df_merged_result.to_csv('data/merged_result.csv', index=False)
df_merged_result = df_merged_result.rename(columns={'ISSUER_SLUG': 'slug'})

# join df_not_matched with df_merged_result on left join where df_not_matched.slug = df_merged_result.ISSUER_SLUG 
df_joined_unmatched_merged_result = df_not_matched.merge(df_merged_result, how='left')

print(df_joined_unmatched_merged_result.head())

df_joined_unmatched_merged_result.to_csv('data/joined_unmatched_merged_result.csv', index=False)

In [70]:
# Weight functions

from datetime import datetime

def weight_domicile_country_code(code):
    return 1 if code == 'US' else 0

def weight_domicile_state_code(code):
    return 1 if code == 'DE' else 0


funding_round_ranking = {
    'seed': 0,
    'series a': 1,
    'series b': 2,
    'series c': 3,
    'series d': 4,
    'series e': 5,
    'series f': 6,
    'series g': 7,
    'series h': 8,
    'series i': 9
}
def normalize_and_weight_series(series_types):  
    if pd.isna(series_types) or not series_types:  # Handle NaN or empty lists
        return 0  # Neutral weight for missing values or empty lists

    normalized_weights = []
    
    for series in series_types:
        series_lower = series.lower().strip()
        
        match = re.search(r'series [a-z]', series_lower)
        if match:
            normalized_type = match.group()
            weight = funding_round_ranking.get(normalized_type, 0)
            normalized_weights.append(weight)
        else:
            # Fallback for unrecognized series types (neutral)
            normalized_weights.append(0)  
    
    return max(normalized_weights)  # Return the highest weight in the list


def weight_recency(funding_dates_str):
    if pd.isna(funding_dates_str) or not funding_dates_str: # Handle NaN or empty lists
        return -99999  # Lowest weight for missing or empty values
    
    funding_dates_clean = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}', funding_dates_str)
    funding_dates = [datetime.strptime(date, '%Y-%m-%d %H:%M:%S.%f') for date in funding_dates_clean]
    most_recent_date = max(funding_dates)
    now = datetime.now()
    recency_weight = (now - most_recent_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)


def weight_price_source(price_source):
    if pd.isna(price_source):  # Handle NaN or missing values
        return -0.1  # Assign negative weight to missing values
    price_source_lower = price_source.lower().strip()
    
    if re.search(r'vwap', price_source_lower):
        return 1  
    elif re.search(r'iois', price_source_lower):
        return 0.5  
    elif re.search(r'primary', price_source_lower):
        return 0.2  
    else:
        return -0.1  # Negative weight for other or unrecognized types

def weight_price_issuer_tier(tier):
    if pd.isna(tier):  # Handle NaN or missing values
        return 0  # Neutral weight for missing
    tier_lower = tier.lower().strip()
    if tier_lower == 'tier_1':
        return 1  # Highest weight for TIER_1
    else:
        return 0.5  # Neutral or medium weight for other tiers


def weight_price(price):
    if pd.isna(price):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return price  # Higher value means higher weight

def weight_implied_valuation(valuation):
    if pd.isna(valuation):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return valuation  # Higher value means higher weight

#create weight functions for HAS_IOIS, if true less weight, if false more weight
def weight_has_iois(has_iois):
    return 0 if has_iois else 1

# create weight functions for NUM_TRADES, if more trades, higher weight
def weight_num_trades(num_trades):
    return num_trades if num_trades else 0

# create weight functions for LAST_CLOSED_TRADE_DATE, if more recent, higher weight
def weight_last_closed_trade_date(last_closed_trade_date):
    if pd.isna(last_closed_trade_date) or not last_closed_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_closed_trade_date = datetime.strptime(last_closed_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_closed_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# create weight functions for LAST_PENDING_TRADE_DATE, if more recent, higher weight
def weight_last_pending_trade_date(last_pending_trade_date):
    if pd.isna(last_pending_trade_date) or not last_pending_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_pending_trade_date = datetime.strptime(last_pending_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_pending_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# create weight functions for ARCHIVEDAT, if present, lower weight
def weight_archived_at(archived_at):
    return 0 if pd.isna(archived_at) else 1


# output_columns += [
#     'YEARFOUNDED',
#     'CITY'
# ]


In [None]:
# words_to_find = df_mfm_mappings['nport_name']
# word_list = df_merged_result['slug']

df_words_to_find = pd.DataFrame(words_to_find, columns=['nport_issuer_name'])

In [49]:
# Normalize 'words_to_find_2' if necessary
words_to_find = df_mfm_mappings['nport_name']
word_list = df_merged_result['slug']
print(len(word_list), len(words_to_find))

words_to_find_2 = [normalize_name(str(word)) for word in words_to_find]

5296 1000


In [51]:
# Process 'df_not_matched_2'
df_not_matched_2 = df_merged_result.copy()

# Combine text columns into 'combined_output'
output_columns = [
    'slug',
    'NAME',
    'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION',
    'LIFECYCLESTATUS',
    'BANNERMESSAGE',	
    'SUB_SECTOR',
    'SECTOR',
    'WEBSITE',
    'CRUNCHBASEURL'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(str(row[col])))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched_2['combined_output'] = df_not_matched_2.apply(combine_output_columns, axis=1)
word_list = df_not_matched_2['combined_output'].tolist()

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute embeddings
embeddings_to_find = model.encode(words_to_find_2, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)

# Apply the weight functions to 'df_not_matched_2'
df_not_matched_2['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched_2['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched_2['DOMICILESTATECODE_WEIGHT'] = df_not_matched_2['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched_2['SHARE_TYPE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched_2['FUNDING_DATE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched_2['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched_2['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched_2['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched_2['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched_2['FORGE_PRICE_WEIGHT'] = df_not_matched_2['FORGE_PRICE'].apply(weight_price)
df_not_matched_2['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched_2['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)
df_not_matched_2['HAS_IOIS_WEIGHT'] = df_not_matched_2['HAS_IOIS'].apply(weight_has_iois)
df_not_matched_2['NUM_TRADES_WEIGHT'] = df_not_matched_2['NUM_TRADES'].apply(weight_num_trades)
df_not_matched_2['LAST_CLOSED_TRADE_DATE_WEIGHT'] = df_not_matched_2['LAST_CLOSED_TRADE_DATE'].apply(weight_last_closed_trade_date)
df_not_matched_2['LAST_PENDING_TRADE_DATE_WEIGHT'] = df_not_matched_2['LAST_PENDING_TRADE_DATE'].apply(weight_last_pending_trade_date)

# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT',
    'HAS_IOIS_WEIGHT',
    'NUM_TRADES_WEIGHT',
    'LAST_CLOSED_TRADE_DATE_WEIGHT',
    'LAST_PENDING_TRADE_DATE_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched_2[weighted_feature_columns] = df_not_matched_2[weighted_feature_columns].fillna(0.0)

# Ensure the data types are numeric
df_not_matched_2[weighted_feature_columns] = df_not_matched_2[weighted_feature_columns].apply(pd.to_numeric, errors='coerce')

# Replace any remaining NaNs with zeros
df_not_matched_2[weighted_feature_columns] = df_not_matched_2[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched_2[weighted_feature_columns])

# Compute a single weighted score per candidate
weighted_scores = weighted_features.sum(axis=1)  # Sum across features
weighted_scores = weighted_scores.reshape(-1, 1)  # Shape (number_of_candidates, 1)

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(embeddings_to_find, embeddings_list)

# Adjust the cosine similarity scores using weighted scores
beta = 1.0  
adjusted_cosine_sim_matrix = cosine_sim_matrix * (1 + beta * weighted_scores.T)

# Find top matches for each word in 'words_to_find_2'
results_data = []

for idx, (word, scores) in enumerate(zip(words_to_find_2, adjusted_cosine_sim_matrix)):
    current_npot_issuer_name = words_to_find[idx]

    # Get indices sorted by score in descending order
    top_indices = scores.argsort()[::-1]
    matches = []
    seen_slugs = set()
    for index in top_indices:
        matched_row = df_not_matched_2.iloc[index]
        match_slug = matched_row.get('slug', '')
        match_score = scores[index]
        if match_slug not in seen_slugs:
            seen_slugs.add(match_slug)
            # Append the tuple (match_slug, score)
            matches.append((match_slug, match_score))
        # Stop if we have collected 5 matches
        if len(matches) >= 5:
            break
    # Prepare the result for this word
    nport_name = word
    if matches:
        first_matched_company = matches[0][0]  # First match's slug
    else:
        first_matched_company = ''
    # Format matches as list of tuples (match_slug, score)
    matches_formatted = [f"({slug}, {score:.4f})" for slug, score in matches]
    all_matches_str = ', '.join(matches_formatted)
    results_data.append({
        'nport_name': current_npot_issuer_name,
        'first_matched_company': first_matched_company,
        'all_matches': all_matches_str
    })

df_results = pd.DataFrame(results_data)

# Save the results to a CSV file
df_results.to_csv('embedding_matched.csv', index=False)





Additional weights for input text

In [52]:
words_to_find = df_mfm_mappings['nport_name']
word_list = df_merged_result['slug']
print(len(word_list), len(words_to_find))

words_to_find_3 = [normalize_name(str(word)) for word in words_to_find]

5296 1000


In [54]:
df_not_matched_3 = df_merged_result.copy()

output_columns = [
    'slug',
    'NAME',
    'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION',
    'LIFECYCLESTATUS',
    'BANNERMESSAGE',
    'SUB_SECTOR',
    'SECTOR',
    'WEBSITE',
    'CRUNCHBASEURL'
]

# Define weights for each output column
column_weights = {
    'slug': 1.0,
    'NAME': 2.0,
    'LEGALENTITYNAME': 2.0,
    'WEBSITE': 1.5,
    'SEARCHALIASES': 1.5,
    'DESCRIPTION': 1.0,
    'STRUCTURED_DESCRIPTION': 1.0,
    'LIFECYCLESTATUS': 1.0,
    'BANNERMESSAGE': 1.0,
    'SUB_SECTOR': 1.0,
    'SECTOR': 1.0,
    'CRUNCHBASEURL': 1.0
}

# Clean and store text for each output column
for col in output_columns:
    if col in df_not_matched_3.columns:
        df_not_matched_3[col + '_clean'] = df_not_matched_3[col].apply(
            lambda x: clean_text(x) if pd.notnull(x) else ''
        )

    else:
        df_not_matched_3[col + '_clean'] = ''


# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings_to_find_3 = model.encode(words_to_find_3, convert_to_numpy=True, normalize_embeddings=True)
weighted_embeddings_list_3 = []

for idx, row in df_not_matched_3.iterrows():
    combined_embedding = np.zeros(model.get_sentence_embedding_dimension())
    total_weight = 0.0

    for col in output_columns:
        col_clean = col + '_clean'
        text = row[col_clean]
        weight = column_weights.get(col, 1.0)
        if text:
            embedding = model.encode([text], convert_to_numpy=True, normalize_embeddings=True)[0]
            combined_embedding += weight * embedding
            total_weight += weight

    if total_weight > 0:
        combined_embedding /= total_weight  # Normalize by total weight
    weighted_embeddings_list_3.append(combined_embedding)

embeddings_list_3 = np.array(weighted_embeddings_list_3)

# Now, proceed to include weighted features as before
# Apply the weight functions to the DataFrame
df_not_matched_3['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched_3['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched_3['DOMICILESTATECODE_WEIGHT'] = df_not_matched_3['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched_3['SHARE_TYPE_WEIGHT'] = df_not_matched_3['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched_3['FUNDING_DATE_WEIGHT'] = df_not_matched_3['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched_3['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched_3['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched_3['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched_3['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched_3['FORGE_PRICE_WEIGHT'] = df_not_matched_3['FORGE_PRICE'].apply(weight_price)
df_not_matched_3['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched_3['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)
df_not_matched_3['HAS_IOIS_WEIGHT'] = df_not_matched_3['HAS_IOIS'].apply(weight_has_iois)
df_not_matched_3['NUM_TRADES_WEIGHT'] = df_not_matched_3['NUM_TRADES'].apply(weight_num_trades)
df_not_matched_3['LAST_CLOSED_TRADE_DATE_WEIGHT'] = df_not_matched_3['LAST_CLOSED_TRADE_DATE'].apply(weight_last_closed_trade_date)
df_not_matched_3['LAST_PENDING_TRADE_DATE_WEIGHT'] = df_not_matched_3['LAST_PENDING_TRADE_DATE'].apply(weight_last_pending_trade_date)

# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT',
    'HAS_IOIS_WEIGHT',
    'NUM_TRADES_WEIGHT',
    'LAST_CLOSED_TRADE_DATE_WEIGHT',
    'LAST_PENDING_TRADE_DATE_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched_3[weighted_feature_columns] = df_not_matched_3[weighted_feature_columns].fillna(0.0)

# Ensure the data types are numeric
df_not_matched_3[weighted_feature_columns] = df_not_matched_3[weighted_feature_columns].apply(pd.to_numeric, errors='coerce')

# Replace any remaining NaNs with zeros
df_not_matched_3[weighted_feature_columns] = df_not_matched_3[weighted_feature_columns].fillna(0.0)


# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched_3[weighted_feature_columns])

# # Combine embeddings with weighted features
# combined_embeddings_to_find_3 = np.hstack((embeddings_to_find_3, weighted_features))
# combined_embeddings_list_3 = np.hstack((embeddings_list_3, weighted_features))

# # Compute cosine similarity matrix
# cosine_sim_matrix_3 = cosine_similarity(combined_embeddings_to_find_3, combined_embeddings_list_3)

# Compute a single weighted score per candidate
weighted_scores = weighted_features.sum(axis=1)  # Sum across features
weighted_scores = weighted_scores.reshape(-1, 1)  # Shape (number_of_candidates, 1)

# Compute cosine similarity matrix
cosine_sim_matrix = cosine_similarity(embeddings_to_find_3, embeddings_list_3)

# Adjust the cosine similarity scores using weighted scores
beta = 1.0  
adjusted_cosine_sim_matrix = cosine_sim_matrix * (1 + beta * weighted_scores.T)

results_data = []

for idx, (word, scores) in enumerate(zip(words_to_find_3, adjusted_cosine_sim_matrix)):
    current_npot_issuer_name = words_to_find[idx]

    # Get indices sorted by score in descending order
    top_indices = scores.argsort()[::-1]
    matches = []
    seen_slugs = set()
    for index in top_indices:
        matched_row = df_not_matched_2.iloc[index]
        match_slug = matched_row.get('slug', '')
        match_score = scores[index]
        if match_slug not in seen_slugs:
            seen_slugs.add(match_slug)
            # Append the tuple (match_slug, score)
            matches.append((match_slug, match_score))
        # Stop if we have collected 5 matches
        if len(matches) >= 5:
            break
    # Prepare the result for this word
    nport_name = word
    if matches:
        first_matched_company = matches[0][0]  # First match's slug
    else:
        first_matched_company = ''
    # Format matches as list of tuples (match_slug, score)
    matches_formatted = [f"({slug}, {score:.4f})" for slug, score in matches]
    all_matches_str = ', '.join(matches_formatted)
    results_data.append({
        'nport_name': current_npot_issuer_name,
        'first_matched_company': first_matched_company,
        'all_matches': all_matches_str
    })

df_results = pd.DataFrame(results_data)

# Save the results to a CSV file
df_results.to_csv('weighted_embedding_matched.csv', index=False)

In [81]:
partial_matched_results_df = pd.DataFrame(partial_matched_results)
partial_matched_results_df['method'] = 'embedding_matched'

partial_matched_results_df.to_csv('data/_test.csv', index=False)

In [80]:
exact_matxches_results_df = pd.DataFrame(exact_matches)
exact_matxches_results_df.to_csv('data/_test_exact.csv', index=False)