In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz, process
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


In [2]:
df_mfm_mappings = pd.read_csv('issuer-mfm-mappings.csv')

In [3]:
stop_words_normalized = [
        "healthcare",
        "technologies",
        "therapeutics",
        "financial",
        "software",
        "holdings",
        "transportation",
        "pharmaceuticals",
        "capital",
        "copper",
        "communications",
        "biotechnology",
        "biopharmaceuticals",
        "group",
        "technology",
        "media",
        "energy",
        "industries",
        "biotherapeutics",
        "solution",
        "bioscience",
        "industries",
        "corporation",
        "systems",
        "enterprises",
        "robotics",
        "bank",
        "inc",
        "llc",
        "pp",
        "series a",
        "series seed",
        "series b",
        "series c",
        "series d",
        "series e",
        "series f",
        "series g",
        "series h",
        "series i"
    ]

def normalize_name(name):
    name = name.lower() 
    for word in stop_words_normalized:
        name = name.replace(word, '')
    return name.strip()


stop_words = set(stopwords.words('english'))
def remove_stop_words(sentence):
    words = sentence.split()
    filtered_sentence = [w for w in words if not w.lower() in stop_words]
    return ' '.join(filtered_sentence)


def remove_punctuation(sentence):
    return sentence.translate(str.maketrans('', '', string.punctuation))


def clean_text(word_to_find):
    word_to_find_normalzed = normalize_name(word_to_find)

    word_to_find_no_stop_word = remove_stop_words(word_to_find_normalzed)

    word_to_find_no_punc = remove_punctuation(word_to_find_no_stop_word)

    return word_to_find_no_punc

In [4]:
words_to_find = df_mfm_mappings.apply(lambda row: row['nport_issuer_name'] if pd.notna(row['nport_issuer_name']) else row['nport_security_name'], axis=1)
word_list = df_mfm_mappings['slug']

In [5]:
def match_company(name, master_list, top_n=5):
    name = normalize_name(name)
    
    # Extract the top N matches along with their scores
    ranked_matches = process.extract(name, master_list, scorer=fuzz.token_set_ratio, limit=top_n)
    return ranked_matches if ranked_matches else []

matched_companies = [match_company(c, word_list) for c in words_to_find]


In [6]:
matched_companies_full_record = [(c, word_to_find) for c, word_to_find in zip(matched_companies, words_to_find)]

In [233]:
x = zip(matched_companies_full_record, word_list)
f = list(x)[1]
f[0][1]
for ranked_match_list, expected in zip(matched_companies_full_record, word_list):
    print(ranked_match_list[1], ranked_match_list[0], expected)


Arcellx, Inc.,, [('arcellx', 100, 0), ('arcellx', 100, 242), ('arcellx', 100, 243), ('arcellx', 100, 244), ('arcellx', 100, 245)] arcellx
BOMBAS LLC [('bombas', 100, 1), ('compass', 62, 716), ('compass', 62, 717), ('compass', 62, 718), ('compass', 62, 719)] bombas
Bright Health Group, Inc. [('bright-health', 100, 2), ('bright-health', 100, 449), ('bright-health', 100, 450), ('bright-health', 100, 451), ('bright-health', 100, 452)] bright-health
Think & Learn Private Ltd. [('clear-street', 46, 659), ('clear-street', 46, 660), ('clear-street', 46, 661), ('lanzatech', 44, 1587), ('lanzatech', 44, 1588)] byju-s
C3Ai Inc [('c3-iot', 60, 4), ('c3-iot', 60, 480), ('c3-iot', 60, 481), ('c3-iot', 60, 482), ('care', 50, 533)] c3-iot
CARESYNTAX INC C3 [('caresyntax', 100, 5), ('caresyntax', 100, 539), ('caresyntax', 100, 540), ('caresyntax', 100, 541), ('c3-iot', 50, 4)] caresyntax
Carmot Therapeutics, Inc. [('carmot-therapeutics', 100, 6), ('clearmotion', 71, 662), ('clearmotion', 71, 663), ('cl

In [8]:
# Ranked matchings
exact_matches = 0
partial_matches = 0
total_items = len(word_list)
matched = []
partial_matched = []
not_matched = []

for ranked_match_list, expected in zip(matched_companies_full_record, word_list):
    if ranked_match_list:
        first_match = ranked_match_list[0][0][0]  # Get the first-ranked match
        nport_issuer_name = ranked_match_list[1]
        

        # Exact match
        if first_match == expected:
            exact_matches += 1
            matched.append((nport_issuer_name, expected, first_match))
            
        # Partial match (i.e., the expected company appears anywhere else in the ranked list)
        elif any(match[0] == expected for match in ranked_match_list[0]):
            partial_matches += 1
            partial_matched.append((nport_issuer_name, expected, ranked_match_list[0]))
            
        else:
            not_matched.append((nport_issuer_name, expected, ranked_match_list[0]))


exact_match_accuracy = exact_matches / total_items if total_items > 0 else 0
overall_match_accuracy = (exact_matches + partial_matches) / total_items if total_items > 0 else 0


print(f"Total items: {total_items}")
print(f"Number of exact matches: {exact_matches}")
print(f"Number of partial matches: {partial_matches}")
print(f"Exact Match Accuracy: {exact_match_accuracy:.2%}")
print(f"Overall Accuracy (including partial matches): {overall_match_accuracy:.2%}")



Total items: 3108
Number of exact matches: 2884
Number of partial matches: 45
Exact Match Accuracy: 92.79%
Overall Accuracy (including partial matches): 94.24%


In [235]:
print("Partial matched items:")
for item in partial_matched:
    print(item)

partial_matched_df = pd.DataFrame(partial_matched, columns=['nport_issuer_name', 'slug', 'rankings'])
partial_matched_df['method'] = 'fuzzy_matched'
partial_matched_df.to_csv('data/partial_matched_fuzzy.csv', index=False)


Partial matched items:
('Ant International Co', 'ant-financial', [('zipline-international', 79, 3069), ('zipline-international', 79, 3070), ('ant-financial', 61, 218), ('ant-financial', 61, 219), ('ant-financial', 61, 220)])
('Ant International', 'ant-financial', [('zipline-international', 87, 3069), ('zipline-international', 87, 3070), ('ant-financial', 67, 218), ('ant-financial', 67, 219), ('ant-financial', 67, 220)])
('ASAPP INC', 'asapp', [('asana', 75, 257), ('asana', 75, 258), ('asana', 75, 259), ('asana', 75, 260), ('asapp', 75, 261)])
('Roku, Inc.', 'boku', [('roku', 100, 2314), ('roku', 100, 2315), ('roku', 100, 2316), ('roku', 100, 2317), ('boku', 75, 415)])
('Roku Inc', 'boku', [('roku', 100, 2314), ('roku', 100, 2315), ('roku', 100, 2316), ('roku', 100, 2317), ('boku', 75, 415)])
('BOLT BIOTHERAPEUTICS INC', 'bolt-biotherapeutics', [('bolt', 100, 418), ('bolt-biotherapeutics', 67, 419), ('bolt-biotherapeutics', 67, 420), ('bolt-biotherapeutics', 67, 421), ('bolt-financial',

In [9]:
# Function to filter duplicates while keeping the highest score for each unique name
def filter_unique_matches(ranked_matches):
    unique_ranked_matches = {}
    for match in ranked_matches:
        if match[0] not in unique_ranked_matches or unique_ranked_matches[match[0]][1] < match[1]:
            unique_ranked_matches[match[0]] = match
    return list(unique_ranked_matches.values())

# Apply the filtering to create a new column with filtered matches in partial_matched
filtered_partial_matches = [
    (nport_issuer_name, expected, ranked_list, filter_unique_matches(ranked_list))
    for nport_issuer_name, expected, ranked_list in partial_matched
]

# Create DataFrame with both original and filtered matches
partial_matched_df = pd.DataFrame(filtered_partial_matches, columns=['nport_issuer_name', 'slug', 'rankings', 'filtered_rankings'])
partial_matched_df['method'] = 'fuzzy_matched'
partial_matched_df.to_csv('data/partial_matched_fuzzy.csv', index=False)

In [237]:
print(f"Number of not matches: {len(not_matched)}")
print("Not matched items:")
for item in not_matched:
    print(item)

Number of not matches: 176
Not matched items:
('Think & Learn Private Ltd.', 'byju-s', [('clear-street', 46, 659), ('clear-street', 46, 660), ('clear-street', 46, 661), ('lanzatech', 44, 1587), ('lanzatech', 44, 1588)])
('DOOR DASH', 'doordash', [('honor', 57, 1411), ('honor', 57, 1412), ('honor', 57, 1413), ('honor', 57, 1414), ('honor', 57, 1415)])
('NEOGEN CORPORATION', 'neogene-therapeutics', [('neurogene', 80, 1812), ('neurogene', 80, 1813), ('neurogene', 80, 1814), ('heliogen', 71, 1401), ('heliogen', 71, 1402)])
('RAPPORT THERAPEUTICS INC', 'rapport-therapeutics', [('rover', 60, 2323), ('rover', 60, 2324), ('rover', 60, 2325), ('rover', 60, 2326), ('rover', 60, 2327)])
('JAND INC', 'warby-parker', [('xad', 57, 3001), ('23andme', 55, 59), ('23andme', 55, 60), ('23andme', 55, 61), ('23andme', 55, 62)])
('Life Healthcare Group Holdings Ltd', '1life-healthcare', [('trisalus-life-sciences', 67, 41), ('verily-life-sciences', 67, 44), ('caris-life-sciences', 67, 546), ('caris-life-scie

In [20]:

not_matched_slugs = not_matched + partial_matched


In [239]:
# Perfect matching 
def find_matches(phrase, word_list, exact_match_weight=3):
    cleaned_words = clean_text(phrase).split()
    
    # A dictionary to keep track of match scores
    match_scores = {w: 0 for w in word_list}
    
    for word in cleaned_words:
        regex_pattern = re.escape(word)
        print(regex_pattern)
        compiled_pattern = re.compile(regex_pattern, re.IGNORECASE)
        print(compiled_pattern)
        
        # Iterate through the word list and conut matches
        for list_item in word_list:
            if compiled_pattern.search(clean_text(list_item)):
                match_scores[list_item] += 1

    # Add exact match weight if the whole phrase is found
    for list_item in word_list:
        if re.search(re.escape(clean_text(phrase)), clean_text(list_item)):
            match_scores[list_item] += exact_match_weight
    
    # Rank the results based on the number of matches
    ranked_results = sorted(match_scores.items(), key=lambda x: x[1], reverse=True)
    
    return ranked_results



In [21]:
# match with weights on first word

def find_matches_with_weights(phrase, word_list, exact_match_weight=3, first_word_weight=2, other_word_weight=1):
    cleaned_words = clean_text(phrase).split()
    match_scores = {w: 0 for w in word_list}
    
    # Handle the first word with a higher weight
    if cleaned_words:
        first_word = cleaned_words[0]
        first_word_regex = re.escape(first_word)
        # print(first_word_regex)
        compiled_first_word_pattern = re.compile(first_word_regex, re.IGNORECASE)
        # print(compiled_first_word_pattern)
        
        # Add higher weight for first word matches
        for list_item in word_list:
            if compiled_first_word_pattern.search(clean_text(list_item)):
                match_scores[list_item] += first_word_weight
    
    # Handle other words with a standard weight
    for word in cleaned_words[1:]:  
        regex_pattern = re.escape(word)
        # print(regex_pattern)
        compiled_pattern = re.compile(regex_pattern, re.IGNORECASE)       
        # print(compiled_pattern)
        
        for list_item in word_list:
            if compiled_pattern.search(clean_text(list_item)):
                match_scores[list_item] += other_word_weight

    # Add exact match weight if the whole phrase is found
    for list_item in word_list:
        if re.search(re.escape(clean_text(phrase)), clean_text(list_item)):
            match_scores[list_item] += exact_match_weight
    
    # Rank the results based on the number of matches
    ranked_results = sorted(match_scores.items(), key=lambda x: x[1], reverse=True)
    
    return ranked_results


In [30]:
correct_first_match = 0
partial_matches = 0
total_items = len(not_matched_slugs)

unmatched_items = []
partial_matched_items = []
matched_items = []
df_rows = []

for t in not_matched_slugs:
    not_matched_slug = t[1]  # slug
    not_matched_nport_issuer_name = t[0]  # nport_issuer_name
    
    # Get ranked matches
    matches = find_matches_with_weights(not_matched_nport_issuer_name, word_list)
    
    ranked_matches = [match for match in matches if match[1] > 0]
    ranked_matches_str = ', '.join([f"('{item[0]}', {item[1]})" for item in ranked_matches])
    
    match_status = ""
    
    if ranked_matches:
        # Check the first match
        first_match = ranked_matches[0][0]
        
        if first_match == not_matched_slug:
            # Exact match in the first position
            correct_first_match += 1
            match_status = "Matched"
            # add t[0], t[1], ranked_matches to list
            matched_items.append((t[0], t[1], ranked_matches[0]))
        else:
            # If second itme exists in the rest of the ranked matches
            if any(match[0] == not_matched_slug for match in ranked_matches[1:]):
                partial_matches += 1
                partial_matched_items.append((t[0], t[1], ranked_matches))
                match_status = "Partial Match"
            else:
                unmatched_items.append((t[0], t[1], ranked_matches))
                match_status = "Not Matched"
    else:
        unmatched_items.append(t)
        match_status = "Not Ranked - Not Matched"

#     df_rows.append([not_matched_slug, not_matched_nport_issuer_name, ranked_matches_str, match_status])

# _df = pd.DataFrame(df_rows, columns=['not_matched_slug', 'not_matched_nport_issuer_name', 'Ranked Matches', 'Match Status'])

# _df.to_csv('data/output.csv', index=False)

first_match_accuracy = correct_first_match / total_items if total_items > 0 else 0
overall_match_accuracy = (correct_first_match + partial_matches) / total_items if total_items > 0 else 0

print(f"First Match Accuracy: {first_match_accuracy:.2f}")
print(f"Overall Match Accuracy (including partial matches): {overall_match_accuracy:.2f}")
print("Unmatched items:", len(unmatched_items))


First Match Accuracy: 0.22
Overall Match Accuracy (including partial matches): 0.52
Unmatched items: 108


In [31]:
matched_items_df = pd.DataFrame(matched_items, columns=['nport_issuer_name', 'slug', 'rankings'])
matched_items_df.to_csv('data/_test.csv', index=False)

In [32]:
# create a df out of _df where match status is Partial Match
partial_matched_df = pd.DataFrame(partial_matched_items, columns=['nport_issuer_name', 'slug', 'rankings'])
partial_matched_df

Unnamed: 0,nport_issuer_name,slug,rankings
0,Life Healthcare Group Holdings Ltd,1life-healthcare,"[(caris-life-sciences, 20), (1life-healthcare,..."
1,Ant Group Co Ltd,ant-financial,"[(palantir, 26), (convoy, 15), (ant-financial,..."
2,Ant International Co. Limited,ant-financial,"[(palantir, 26), (convoy, 15), (ant-financial,..."
3,Ant International Co. Limited Class C Shares,ant-financial,"[(convoy, 30), (palantir, 26), (databricks, 21..."
4,AppLovin Corp.,applovin,"[(hashicorp, 10), (applovin, 8), (brain-corp, ..."
...,...,...,...
61,"TEBRA TECHNOLOGIES, INC. (F.K.A. KAREO, INC.) ...",tebra,"[(kareo, 4), (tebra, 4), (commonbond, 2)]"
62,The Honest Company Inc.,the-honest-company,"[(the-honest-company-inc, 12), (the-honest-com..."
63,"The Honest Company, Inc.",the-honest-company,"[(the-honest-company-inc, 12), (the-honest-com..."
64,THE HONEST COMPANY INC,the-honest-company,"[(the-honest-company-inc, 12), (the-honest-com..."


In [37]:
# Apply the filtering to create a new column with filtered matches in partial_matched
filtered_partial_matched = [
    (row['nport_issuer_name'], row['slug'], row['rankings'], filter_unique_matches(row['rankings']))
    for _, row in partial_matched_df.iterrows()
]
filtered_partial_matched_df = pd.DataFrame(filtered_partial_matched, columns=['nport_issuer_name', 'slug', 'rankings', 'filtered_rankings'])
filtered_partial_matched_df['method'] = 'perfect_matched'
filtered_partial_matched_df.to_csv('data/partial_matched_perfect.csv', index=False)



In [None]:
# create a pd dataframe of the unmatched items
_not_matched_list = []
for t in unmatched_items:
    _not_matched_list.append([t[1], t[0]])

_df_not_matched = pd.DataFrame(_not_matched_list, columns=['slug', 'nport_issuer_name'])
print(_df_not_matched.head())
_df_not_matched.to_csv('data/unmatched_perfect.csv', index=False)

           slug                 nport_issuer_name
0        byju-s        Think & Learn Private Ltd.
1  warby-parker                          JAND INC
2   3d-robotics           Nauticus Robotics, Inc.
3        adroll  NEXTROLL, INC. COMMON SHARES   /
4        adroll        NEXTROLL COMMON SHARES   /


In [58]:
not_matched_list = []
for t in unmatched_items:
    not_matched_list.append([t[0], t[1]])
for t in partial_matched_items:
    not_matched_list.append((t[0], t[1]))

df_not_matched = pd.DataFrame(not_matched_list, columns=['nport_issuer_name','slug'])

In [59]:

df_forge_price = pd.read_csv('data/forge_price_sample.csv')
df_funding_rounds = pd.read_csv('data/funding-rounds.csv')
df_issuers = pd.read_csv('data/issuers.csv')
df_trade_facts = pd.read_csv('data/trade_facts.csv')


df_merged_result = df_issuers.merge(df_forge_price, on='ISSUER_SLUG', how='left') \
            .merge(df_funding_rounds, on='ISSUER_SLUG', how='left') \
            .merge(df_trade_facts, on='ISSUER_SLUG', how='left')
# df_merged_result.to_csv('data/merged_result.csv', index=False)
df_merged_result = df_merged_result.rename(columns={'ISSUER_SLUG': 'slug'})

# join df_not_matched with df_merged_result on left join where df_not_matched.slug = df_merged_result.ISSUER_SLUG 
df_joined_unmatched_merged_result = df_not_matched.merge(df_merged_result, how='left')

print(df_joined_unmatched_merged_result.head())

df_joined_unmatched_merged_result.to_csv('data/joined_unmatched_merged_result.csv', index=False)

                  nport_issuer_name          slug          NAME  \
0        Think & Learn Private Ltd.        byju-s        Byju's   
1                          JAND INC  warby-parker  Warby Parker   
2           Nauticus Robotics, Inc.   3d-robotics           3DR   
3  NEXTROLL, INC. COMMON SHARES   /        adroll      NextRoll   
4        NEXTROLL COMMON SHARES   /        adroll      NextRoll   

           LEGALENTITYNAME                                      SEARCHALIASES  \
0  Think & Learn Pvt. Ltd.        [\n  "Byjus ",\n  "BYJU'S ",\n  "byju's"\n]   
1               Jand, Inc.                  [\n  "Jand",\n  "Warby Parker"\n]   
2        3D Robotics, Inc.  [\n  "3DR ",\n  "3dr ",\n  "3DRobotics ",\n  "...   
3           NextRoll, Inc.  [\n  "Adroll    ",\n  "adroll    ",\n  "Ad Rol...   
4           NextRoll, Inc.  [\n  "Adroll    ",\n  "adroll    ",\n  "Ad Rol...   

                                         DESCRIPTION STRUCTURED_DESCRIPTION  \
0  BYJU's is an ed-tech company

In [60]:
# Weight functions

from datetime import datetime

def weight_domicile_country_code(code):
    return 1 if code == 'US' else 0

def weight_domicile_state_code(code):
    return 1 if code == 'DE' else 0


funding_round_ranking = {
    'seed': 0,
    'series a': 1,
    'series b': 2,
    'series c': 3,
    'series d': 4,
    'series e': 5,
    'series f': 6,
    'series g': 7,
    'series h': 8,
    'series i': 9
}
def normalize_and_weight_series(series_types):  
    if pd.isna(series_types) or not series_types:  # Handle NaN or empty lists
        return 0  # Neutral weight for missing values or empty lists

    normalized_weights = []
    
    for series in series_types:
        series_lower = series.lower().strip()
        
        match = re.search(r'series [a-z]', series_lower)
        if match:
            normalized_type = match.group()
            weight = funding_round_ranking.get(normalized_type, 0)
            normalized_weights.append(weight)
        else:
            # Fallback for unrecognized series types (neutral)
            normalized_weights.append(0)  
    
    return max(normalized_weights)  # Return the highest weight in the list


def weight_recency(funding_dates_str):
    if pd.isna(funding_dates_str) or not funding_dates_str: # Handle NaN or empty lists
        return -99999  # Lowest weight for missing or empty values
    
    funding_dates_clean = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}', funding_dates_str)
    funding_dates = [datetime.strptime(date, '%Y-%m-%d %H:%M:%S.%f') for date in funding_dates_clean]
    most_recent_date = max(funding_dates)
    now = datetime.now()
    recency_weight = (now - most_recent_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)


def weight_price_source(price_source):
    if pd.isna(price_source):  # Handle NaN or missing values
        return -0.1  # Assign negative weight to missing values
    price_source_lower = price_source.lower().strip()
    
    if re.search(r'vwap', price_source_lower):
        return 1  
    elif re.search(r'iois', price_source_lower):
        return 0.5  
    elif re.search(r'primary', price_source_lower):
        return 0.2  
    else:
        return -0.1  # Negative weight for other or unrecognized types

def weight_price_issuer_tier(tier):
    if pd.isna(tier):  # Handle NaN or missing values
        return 0  # Neutral weight for missing
    tier_lower = tier.lower().strip()
    if tier_lower == 'tier_1':
        return 1  # Highest weight for TIER_1
    else:
        return 0.5  # Neutral or medium weight for other tiers


def weight_price(price):
    if pd.isna(price):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return price  # Higher value means higher weight

def weight_implied_valuation(valuation):
    if pd.isna(valuation):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return valuation  # Higher value means higher weight

#create weight functions for HAS_IOIS, if true less weight, if false more weight
def weight_has_iois(has_iois):
    return 0 if has_iois else 1

# create weight functions for NUM_TRADES, if more trades, higher weight
def weight_num_trades(num_trades):
    return num_trades if num_trades else 0

# create weight functions for LAST_CLOSED_TRADE_DATE, if more recent, higher weight
def weight_last_closed_trade_date(last_closed_trade_date):
    if pd.isna(last_closed_trade_date) or not last_closed_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_closed_trade_date = datetime.strptime(last_closed_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_closed_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# create weight functions for LAST_PENDING_TRADE_DATE, if more recent, higher weight
def weight_last_pending_trade_date(last_pending_trade_date):
    if pd.isna(last_pending_trade_date) or not last_pending_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_pending_trade_date = datetime.strptime(last_pending_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_pending_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# create weight functions for ARCHIVEDAT, if present, lower weight
def weight_archived_at(archived_at):
    return 0 if pd.isna(archived_at) else 1


# output_columns += [
#     'YEARFOUNDED',
#     'CITY'
# ]


In [74]:
# all-MiniLM-L6-v2

df_not_matched_2 = df_joined_unmatched_merged_result.copy()
# apply normalize_name(str(text)) to df_not_matched_2
df_not_matched_2['cleaned_nport_issuer_name'] = df_not_matched_2.apply(lambda row: normalize_name(row['nport_issuer_name']), axis=1)

output_columns = [
    'slug',
    'NAME',
    'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION',
    'LIFECYCLESTATUS',
    'BANNERMESSAGE',	
    'SUB_SECTOR',
    'SECTOR',
    'WEBSITE',
    'CRUNCHBASEURL'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched_2['combined_output'] = df_not_matched_2.apply(combine_output_columns, axis=1)

words_to_find = df_not_matched_2['cleaned_nport_issuer_name'].tolist()
word_list = df_not_matched_2['combined_output'].tolist()


model = SentenceTransformer('all-MiniLM-L6-v2')  

embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)


 
# Apply the weight functions to the DataFrame
df_not_matched_2['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched_2['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched_2['DOMICILESTATECODE_WEIGHT'] = df_not_matched_2['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched_2['SHARE_TYPE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched_2['FUNDING_DATE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched_2['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched_2['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched_2['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched_2['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched_2['FORGE_PRICE_WEIGHT'] = df_not_matched_2['FORGE_PRICE'].apply(weight_price)
df_not_matched_2['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched_2['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)

df_not_matched_2['HAS_IOIS_WEIGHT'] = df_not_matched_2['HAS_IOIS'].apply(weight_has_iois)
df_not_matched_2['NUM_TRADES_WEIGHT'] = df_not_matched_2['NUM_TRADES'].apply(weight_num_trades)
df_not_matched_2['LAST_CLOSED_TRADE_DATE_WEIGHT'] = df_not_matched_2['LAST_CLOSED_TRADE_DATE'].apply(weight_last_closed_trade_date)
df_not_matched_2['LAST_PENDING_TRADE_DATE_WEIGHT'] = df_not_matched_2['LAST_PENDING_TRADE_DATE'].apply(weight_last_pending_trade_date)


# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT',
    'HAS_IOIS_WEIGHT',
    'NUM_TRADES_WEIGHT',
    'LAST_CLOSED_TRADE_DATE_WEIGHT',
    'LAST_PENDING_TRADE_DATE_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched_2[weighted_feature_columns] = df_not_matched_2[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched_2[weighted_feature_columns])


combined_embeddings_to_find = np.hstack((embeddings_to_find, weighted_features))
combined_embeddings_list = np.hstack((embeddings_list, weighted_features))
cosine_sim_matrix = cosine_similarity(combined_embeddings_to_find, combined_embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 20 
not_matched_results = []
partial_matched_results = []
exact_matxches = []


for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    seen_matches = set()  # Reset seen_matches for each new word being processed
    ground_truth_found = False

    current_slug = df_not_matched_2.iloc[idx]['slug']
    current_npot_issuer_name = df_not_matched_2.iloc[idx]['nport_issuer_name']

    for rank, index in enumerate(top_indices):
        # Retrieve a concise match identifier like 'slug' or 'NAME' instead of full text
        matched_word = df_not_matched_2.iloc[index]['slug'] if 'slug' in df_not_matched_2.columns else word_list[index]
        match_score = scores[index]
        
        # Only add the match if it's not a duplicate within the current ranking list
        if matched_word not in seen_matches:
            matches.append({
                'rank': len(matches) + 1,  # Rank based on unique entries added
                'match_word': matched_word,
                'score': match_score
            })
            seen_matches.add(matched_word)  # Mark this word as seen within this record

        if index == idx:
            ground_truth_found = True
            if len(matches) == 1:
                correct_top1 += 1  # Ground truth is the top match
                exact_matxches.append({
                        'nport_issuer_name': current_npot_issuer_name, 
                        'slug': current_slug,
                        'rankings': matches
                        })
                break
            correct_topk += 1     # Ground truth is within top_k matches
            partial_matched_results.append({
                'nport_issuer_name': current_npot_issuer_name, 
                'slug': current_slug,
                'rankings': sorted(matches, key=lambda x: x['score'], reverse=True)  # Sort by score
            })

    if not ground_truth_found:
        not_matched_results.append({
            'nport_issuer_name': current_npot_issuer_name, 
            'slug': current_slug,
            'rankings': sorted(matches, key=lambda x: x['score'], reverse=True)  # Sort by score
        })


total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")

# save not_matched_results to csv
df_not_matched_results = pd.DataFrame(not_matched_results)
df_not_matched_results.to_csv('data/not_matched_results.csv', index=False)



Top-1 Accuracy: 87.93%
Top-20 Accuracy: 10.34%


In [75]:
partial_matched_results_df = pd.DataFrame(partial_matched_results)
partial_matched_results_df['method'] = 'embedding_matched'

partial_matched_results_df.to_csv('data/partial_matched_embedding.csv', index=False)
partial_matched_results_df

Unnamed: 0,nport_issuer_name,slug,rankings,method
0,"Nauticus Robotics, Inc.",3d-robotics,"[{'rank': 1, 'match_word': 'aura-biosciences',...",embedding_matched
1,Formagrid Inc Ser F Cvt Pfd Pp,airtable,"[{'rank': 1, 'match_word': 'carbon-health', 's...",embedding_matched
2,Formagrid Inc Ser F Cvt /Pfd/,airtable,"[{'rank': 1, 'match_word': 'carbon-health', 's...",embedding_matched
3,GROUPON INC COMMON STOCK USD.0001,groupon,"[{'rank': 1, 'match_word': 'compass-therapeuti...",embedding_matched
4,Mh Sub I Llc,internet-brands,"[{'rank': 1, 'match_word': 'ant-financial', 's...",embedding_matched
5,Mh Sub I Llc / Micro Holding Corp.,internet-brands,"[{'rank': 1, 'match_word': 'ant-financial', 's...",embedding_matched
6,ANI Technologies Pvt Ltd,ola,"[{'rank': 1, 'match_word': 'ant-financial', 's...",embedding_matched
7,ANI Technologies Pvt Ltd.,ola,"[{'rank': 1, 'match_word': 'ant-financial', 's...",embedding_matched
8,Ani Technologies Pvt,ola,"[{'rank': 1, 'match_word': 'byju-s', 'score': ...",embedding_matched
9,Ani Technologies Pvt .,ola,"[{'rank': 1, 'match_word': 'byju-s', 'score': ...",embedding_matched


In [76]:
df_not_matched_results

Unnamed: 0,nport_issuer_name,slug,rankings
0,THEMIS SOLUTIONS INC PP,clio,"[{'rank': 1, 'match_word': 'internet-brands', ..."
1,PDD HOLDINGS INC,pinduoduo,"[{'rank': 1, 'match_word': 'd-wave-systems', '..."
2,PDD Holdings Inc.,pinduoduo,"[{'rank': 1, 'match_word': 'd-wave-systems', '..."


In [None]:
exact_matxches_results_df = pd.DataFrame(exact_matxches)
exact_matxches_results_df.to_csv('data/exact_matxches_embedding.csv', index=False)

overal = 3108
- matched = 2985 - (96%)
- partial matched = 118 (3.7%)
- unmatched = 5 - (0.16%)



Additional weights for input text

In [77]:

not_matched_plus_partial = pd.concat([df_not_matched_results, partial_matched_results_df])
not_matched_plus_partial

Unnamed: 0,nport_issuer_name,slug,rankings,method
0,THEMIS SOLUTIONS INC PP,clio,"[{'rank': 1, 'match_word': 'internet-brands', ...",
1,PDD HOLDINGS INC,pinduoduo,"[{'rank': 1, 'match_word': 'd-wave-systems', '...",
2,PDD Holdings Inc.,pinduoduo,"[{'rank': 1, 'match_word': 'd-wave-systems', '...",
0,"Nauticus Robotics, Inc.",3d-robotics,"[{'rank': 1, 'match_word': 'aura-biosciences',...",embedding_matched
1,Formagrid Inc Ser F Cvt Pfd Pp,airtable,"[{'rank': 1, 'match_word': 'carbon-health', 's...",embedding_matched
2,Formagrid Inc Ser F Cvt /Pfd/,airtable,"[{'rank': 1, 'match_word': 'carbon-health', 's...",embedding_matched
3,GROUPON INC COMMON STOCK USD.0001,groupon,"[{'rank': 1, 'match_word': 'compass-therapeuti...",embedding_matched
4,Mh Sub I Llc,internet-brands,"[{'rank': 1, 'match_word': 'ant-financial', 's...",embedding_matched
5,Mh Sub I Llc / Micro Holding Corp.,internet-brands,"[{'rank': 1, 'match_word': 'ant-financial', 's...",embedding_matched
6,ANI Technologies Pvt Ltd,ola,"[{'rank': 1, 'match_word': 'ant-financial', 's...",embedding_matched


In [78]:
df_joined_not_matched_plus_partial = not_matched_plus_partial.merge(df_merged_result, how='left')

print(df_joined_not_matched_plus_partial.head())

df_joined_not_matched_plus_partial.to_csv('data/_test_joined_not_matched_plus_partial.csv', index=False)

                nport_issuer_name         slug  \
0         THEMIS SOLUTIONS INC PP         clio   
1                PDD HOLDINGS INC    pinduoduo   
2               PDD Holdings Inc.    pinduoduo   
3         Nauticus Robotics, Inc.  3d-robotics   
4  Formagrid Inc Ser F Cvt Pfd Pp     airtable   

                                            rankings             method  \
0  [{'rank': 1, 'match_word': 'internet-brands', ...                NaN   
1  [{'rank': 1, 'match_word': 'd-wave-systems', '...                NaN   
2  [{'rank': 1, 'match_word': 'd-wave-systems', '...                NaN   
3  [{'rank': 1, 'match_word': 'aura-biosciences',...  embedding_matched   
4  [{'rank': 1, 'match_word': 'carbon-health', 's...  embedding_matched   

        NAME                                    LEGALENTITYNAME  \
0       Clio                              Themis Solutions Inc.   
1  Pinduoduo  Shanghai Xun Meng Information Technology Co., ...   
2  Pinduoduo  Shanghai Xun Meng Information Tec

In [79]:
df_not_matched_3 = df_joined_not_matched_plus_partial.copy()

# Apply 'normalize_name' to 'nport_issuer_name'
df_not_matched_3['cleaned_nport_issuer_name'] = df_not_matched_3.apply(lambda row: normalize_name(row['nport_issuer_name']), axis=1)

output_columns = [
    'slug',
    'NAME',
    'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION',
    'LIFECYCLESTATUS',
    'BANNERMESSAGE',
    'SUB_SECTOR',
    'SECTOR',
    'WEBSITE',
    'CRUNCHBASEURL'
]

# Define weights for each output column
column_weights = {
    'slug': 1.0,
    'NAME': 2.0,
    'LEGALENTITYNAME': 2.0,
    'WEBSITE': 1.5,
    'SEARCHALIASES': 1.5,
    'DESCRIPTION': 1.0,
    'STRUCTURED_DESCRIPTION': 1.0,
    'LIFECYCLESTATUS': 1.0,
    'BANNERMESSAGE': 1.0,
    'SUB_SECTOR': 1.0,
    'SECTOR': 1.0,
    'CRUNCHBASEURL': 1.0
}

# Clean and store text for each output column
for col in output_columns:
    if col in df_not_matched_3.columns:
        df_not_matched_3[col + '_clean'] = df_not_matched_3[col].apply(
            lambda x: clean_text(x) if pd.notnull(x) else ''
        )

    else:
        df_not_matched_3[col + '_clean'] = ''

# Prepare 'words_to_find_3' (inputs)
words_to_find_3 = df_not_matched_3['cleaned_nport_issuer_name'].tolist()

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the input texts
embeddings_to_find_3 = model.encode(words_to_find_3, convert_to_numpy=True, normalize_embeddings=True)

# Generate and combine embeddings for output columns with weights
weighted_embeddings_list_3 = []

for idx, row in df_not_matched_3.iterrows():
    combined_embedding = np.zeros(model.get_sentence_embedding_dimension())
    total_weight = 0.0

    for col in output_columns:
        col_clean = col + '_clean'
        text = row[col_clean]
        weight = column_weights.get(col, 1.0)
        if text:
            embedding = model.encode([text], convert_to_numpy=True, normalize_embeddings=True)[0]
            combined_embedding += weight * embedding
            total_weight += weight

    if total_weight > 0:
        combined_embedding /= total_weight  # Normalize by total weight
    weighted_embeddings_list_3.append(combined_embedding)

embeddings_list_3 = np.array(weighted_embeddings_list_3)

# Now, proceed to include weighted features as before
# Apply the weight functions to the DataFrame
df_not_matched_3['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched_3['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched_3['DOMICILESTATECODE_WEIGHT'] = df_not_matched_3['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched_3['SHARE_TYPE_WEIGHT'] = df_not_matched_3['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched_3['FUNDING_DATE_WEIGHT'] = df_not_matched_3['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched_3['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched_3['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched_3['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched_3['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched_3['FORGE_PRICE_WEIGHT'] = df_not_matched_3['FORGE_PRICE'].apply(weight_price)
df_not_matched_3['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched_3['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)
df_not_matched_3['HAS_IOIS_WEIGHT'] = df_not_matched_3['HAS_IOIS'].apply(weight_has_iois)
df_not_matched_3['NUM_TRADES_WEIGHT'] = df_not_matched_3['NUM_TRADES'].apply(weight_num_trades)
df_not_matched_3['LAST_CLOSED_TRADE_DATE_WEIGHT'] = df_not_matched_3['LAST_CLOSED_TRADE_DATE'].apply(weight_last_closed_trade_date)
df_not_matched_3['LAST_PENDING_TRADE_DATE_WEIGHT'] = df_not_matched_3['LAST_PENDING_TRADE_DATE'].apply(weight_last_pending_trade_date)

# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT',
    'HAS_IOIS_WEIGHT',
    'NUM_TRADES_WEIGHT',
    'LAST_CLOSED_TRADE_DATE_WEIGHT',
    'LAST_PENDING_TRADE_DATE_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched_3[weighted_feature_columns] = df_not_matched_3[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched_3[weighted_feature_columns])

# Combine embeddings with weighted features
combined_embeddings_to_find_3 = np.hstack((embeddings_to_find_3, weighted_features))
combined_embeddings_list_3 = np.hstack((embeddings_list_3, weighted_features))

# Compute cosine similarity matrix
cosine_sim_matrix_3 = cosine_similarity(combined_embeddings_to_find_3, combined_embeddings_list_3)

# Initialize counters for accuracy calculation
correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 20
not_matched_results = []
partial_matched_results = []
exact_matches = []

for idx, (word, scores) in enumerate(zip(words_to_find_3, cosine_sim_matrix_3)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    seen_matches = set()  # Reset seen_matches for each new word being processed
    ground_truth_found = False

    current_slug = df_not_matched_3.iloc[idx]['slug']
    current_nport_issuer_name = df_not_matched_3.iloc[idx]['nport_issuer_name']

    for rank, index in enumerate(top_indices):
        # Retrieve a concise match identifier like 'slug' or 'NAME' instead of full text
        matched_word = df_not_matched_3.iloc[index]['slug'] if 'slug' in df_not_matched_3.columns else word_list[index]
        match_score = scores[index]

        # Only add the match if it's not a duplicate within the current ranking list
        if matched_word not in seen_matches:
            matches.append({
                'rank': len(matches) + 1,  # Rank based on unique entries added
                'match_word': matched_word,
                'score': match_score
            })
            seen_matches.add(matched_word)  # Mark this word as seen within this record

        if index == idx:
            ground_truth_found = True
            if len(matches) == 1:
                correct_top1 += 1  # Ground truth is the top match
                exact_matches.append({
                    'nport_issuer_name': current_nport_issuer_name,
                    'slug': current_slug,
                    'rankings': matches
                })
                break
            correct_topk += 1     # Ground truth is within top_k matches
            partial_matched_results.append({
                'nport_issuer_name': current_nport_issuer_name,
                'slug': current_slug,
                'rankings': sorted(matches, key=lambda x: x['score'], reverse=True)  # Sort by score
            })

    if not ground_truth_found:
        not_matched_results.append({
            'slug': current_slug,
            'nport_issuer_name': current_nport_issuer_name,
            'cleaned_nport_issuer_name': word,
            'matches': sorted(matches, key=lambda x: x['score'], reverse=True)  # Sort by score
        })

total = len(words_to_find_3)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")

# # Save not_matched_results to CSV
# df_not_matched_results = pd.DataFrame(not_matched_results)
# df_not_matched_results.to_csv('data/not_matched_results.csv', index=False)

Top-1 Accuracy: 80.95%
Top-20 Accuracy: 19.05%


In [81]:
partial_matched_results_df = pd.DataFrame(partial_matched_results)
partial_matched_results_df['method'] = 'embedding_matched'

partial_matched_results_df.to_csv('data/_test.csv', index=False)

In [80]:
exact_matxches_results_df = pd.DataFrame(exact_matches)
exact_matxches_results_df.to_csv('data/_test_exact.csv', index=False)