New Test Data

In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz, process
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


In [3]:
stop_words_normalized = [
        "healthcare",
        "technologies",
        "therapeutics",
        "financial",
        "software",
        "holdings",
        "transportation",
        "pharmaceuticals",
        "capital",
        "copper",
        "communications",
        "biotechnology",
        "biopharmaceuticals",
        "group",
        "technology",
        "media",
        "energy",
        "industries",
        "biotherapeutics",
        "solution",
        "bioscience",
        "industries",
        "corporation",
        "systems",
        "enterprises",
        "robotics",
        "bank",
        "inc",
        "llc",
        "pp",
        "series a",
        "series seed",
        "series b",
        "series c",
        "series d",
        "series e",
        "series f",
        "series g",
        "series h",
        "series i"
    ]

def normalize_name(name):
    name = name.lower() 
    for word in stop_words_normalized:
        name = name.replace(word, '')
    return name.strip()


stop_words = set(stopwords.words('english'))
def remove_stop_words(sentence):
    words = sentence.split()
    filtered_sentence = [w for w in words if not w.lower() in stop_words]
    return ' '.join(filtered_sentence)


def remove_punctuation(sentence):
    return sentence.translate(str.maketrans('', '', string.punctuation))


def clean_text(word_to_find):
    word_to_find_normalzed = normalize_name(word_to_find)

    word_to_find_no_stop_word = remove_stop_words(word_to_find_normalzed)

    word_to_find_no_punc = remove_punctuation(word_to_find_no_stop_word)

    return word_to_find_no_punc

In [2]:
news_test_data_df = pd.read_csv('new_test_data.csv')
words_to_find = news_test_data_df['Input string']
word_list = news_test_data_df['slug']
news_test_data_df

Unnamed: 0,Input string,slug
0,TAE Technologies,tae-technologies
1,tae,tae-technologies
2,Tri Alpha Energy,tae-technologies
3,Tri Alpha,tae-technologies
4,TAE Life Sciences,tae-technologies
5,TAE Power Management,tae-technologies
6,X.AI Corp,xai
7,Grok,xai
8,Grok AI,xai
9,Xaira Therapeutics,xaira-therapeutics


In [4]:
def match_company(name, master_list, top_n=5):
    name = normalize_name(name)
    
    # Extract the top N matches along with their scores
    ranked_matches = process.extract(name, master_list, scorer=fuzz.token_set_ratio, limit=top_n)
    return ranked_matches if ranked_matches else []

matched_companies = [match_company(c, word_list) for c in words_to_find]
matched_companies_full_record = [(c, word_to_find) for c, word_to_find in zip(matched_companies, words_to_find)]

In [5]:
# Ranked matchings
exact_matches = 0
partial_matches = 0
total_items = len(word_list)
matched = []
partial_matched = []
not_matched = []

for ranked_match_list, expected in zip(matched_companies_full_record, word_list):
    if ranked_match_list:
        first_match = ranked_match_list[0][0][0]  # Get the first-ranked match
        nport_issuer_name = ranked_match_list[1]
        

        # Exact match
        if first_match == expected:
            exact_matches += 1
            matched.append((nport_issuer_name, expected, first_match))
            
        # Partial match (i.e., the expected company appears anywhere else in the ranked list)
        elif any(match[0] == expected for match in ranked_match_list[0]):
            partial_matches += 1
            partial_matched.append((nport_issuer_name, expected, ranked_match_list[0]))
            
        else:
            not_matched.append((nport_issuer_name, expected, ranked_match_list[0]))


exact_match_accuracy = exact_matches / total_items if total_items > 0 else 0
overall_match_accuracy = (exact_matches + partial_matches) / total_items if total_items > 0 else 0


print(f"Total items: {total_items}")
print(f"Number of exact matches: {exact_matches}")
print(f"Number of partial matches: {partial_matches}")
print(f"Exact Match Accuracy: {exact_match_accuracy:.2%}")
print(f"Overall Accuracy (including partial matches): {overall_match_accuracy:.2%}")



Total items: 15
Number of exact matches: 9
Number of partial matches: 5
Exact Match Accuracy: 60.00%
Overall Accuracy (including partial matches): 93.33%


In [None]:
print("Partial matched items:")
for item in partial_matched:
    print(item)


Partial matched items:
('Tri Alpha Energy', 'tae-technologies', [('xai', 33, 6), ('xai', 33, 7), ('xai', 33, 8), ('tae-technologies', 32, 0), ('tae-technologies', 32, 1)])
('Tri Alpha', 'tae-technologies', [('xai', 33, 6), ('xai', 33, 7), ('xai', 33, 8), ('tae-technologies', 32, 0), ('tae-technologies', 32, 1)])
('TAE Life Sciences', 'tae-technologies', [('orionis-biosciences', 56, 11), ('orionis-biosciences', 56, 12), ('orionis-biosciences', 56, 13), ('tae-technologies', 55, 0), ('tae-technologies', 55, 1)])
('X.AI Corp', 'xai', [('yixia-com', 44, 14), ('xaira-therapeutics', 37, 9), ('xaira-therapeutics', 37, 10), ('xai', 33, 6), ('xai', 33, 7)])
('Orion Medicines', 'xaira-therapeutics', [('orionis-biosciences', 71, 11), ('orionis-biosciences', 71, 12), ('orionis-biosciences', 71, 13), ('xaira-therapeutics', 36, 9), ('xaira-therapeutics', 36, 10)])


In [22]:
# Function to filter duplicates while keeping the highest score for each unique name
def filter_unique_matches(ranked_matches):
    unique_ranked_matches = {}
    for match in ranked_matches:
        if match[0] not in unique_ranked_matches or unique_ranked_matches[match[0]][1] < match[1]:
            unique_ranked_matches[match[0]] = match
    return list(unique_ranked_matches.values())

# Apply the filtering to create a new column with filtered matches in partial_matched
filtered_partial_matches = [
    (nport_issuer_name, expected, ranked_list, filter_unique_matches(ranked_list))
    for nport_issuer_name, expected, ranked_list in partial_matched
]

# Create DataFrame with both original and filtered matches
partial_matched_df = pd.DataFrame(filtered_partial_matches, columns=['nport_issuer_name', 'slug', 'rankings', 'filtered_rankings'])
partial_matched_df['method'] = 'fuzzy_matched'
partial_matched_df.to_csv('data/new_test_data_partial_matched_fuzzy.csv', index=False)


In [7]:
print(f"Number of not matches: {len(not_matched)}")
print("Not matched items:")
for item in not_matched:
    print(item)

Number of not matches: 1
Not matched items:
('Grok', 'xai', [('orionis-biosciences', 17, 11), ('orionis-biosciences', 17, 12), ('orionis-biosciences', 17, 13), ('yixia-com', 15, 14), ('tae-technologies', 10, 0)])


In [8]:
not_matched_slugs = [t for t in not_matched] 
not_matched_slugs

[('Grok',
  'xai',
  [('orionis-biosciences', 17, 11),
   ('orionis-biosciences', 17, 12),
   ('orionis-biosciences', 17, 13),
   ('yixia-com', 15, 14),
   ('tae-technologies', 10, 0)])]

In [9]:
# match with weights on first word

def find_matches_with_weights(phrase, word_list, exact_match_weight=3, first_word_weight=2, other_word_weight=1):
    cleaned_words = clean_text(phrase).split()
    match_scores = {w: 0 for w in word_list}
    
    # Handle the first word with a higher weight
    if cleaned_words:
        first_word = cleaned_words[0]
        first_word_regex = re.escape(first_word)
        # print(first_word_regex)
        compiled_first_word_pattern = re.compile(first_word_regex, re.IGNORECASE)
        # print(compiled_first_word_pattern)
        
        # Add higher weight for first word matches
        for list_item in word_list:
            if compiled_first_word_pattern.search(clean_text(list_item)):
                match_scores[list_item] += first_word_weight
    
    # Handle other words with a standard weight
    for word in cleaned_words[1:]:  
        regex_pattern = re.escape(word)
        # print(regex_pattern)
        compiled_pattern = re.compile(regex_pattern, re.IGNORECASE)       
        # print(compiled_pattern)
        
        for list_item in word_list:
            if compiled_pattern.search(clean_text(list_item)):
                match_scores[list_item] += other_word_weight

    # Add exact match weight if the whole phrase is found
    for list_item in word_list:
        if re.search(re.escape(clean_text(phrase)), clean_text(list_item)):
            match_scores[list_item] += exact_match_weight
    
    # Rank the results based on the number of matches
    ranked_results = sorted(match_scores.items(), key=lambda x: x[1], reverse=True)
    
    return ranked_results


In [13]:
correct_first_match = 0
partial_matches = 0
total_items = len(not_matched_slugs)

unmatched_items = []
partial_matched_items = []
df_rows = []

for t in not_matched_slugs:
    not_matched_slug = t[1]  # slug
    not_matched_nport_issuer_name = t[0]  # nport_issuer_name
    
    # Get ranked matches
    matches = find_matches_with_weights(not_matched_nport_issuer_name, word_list)
    
    ranked_matches = [match for match in matches if match[1] > 0]
    ranked_matches_str = ', '.join([f"('{item[0]}', {item[1]})" for item in ranked_matches])
    
    match_status = ""
    
    if ranked_matches:
        # Check the first match
        first_match = ranked_matches[0][0]
        
        if first_match == not_matched_slug:
            # Exact match in the first position
            correct_first_match += 1
            match_status = "Matched"
        else:
            # If second itme exists in the rest of the ranked matches
            if any(match[0] == not_matched_slug for match in ranked_matches[1:]):
                partial_matches += 1
                partial_matched_items.append(t)
                match_status = "Partial Match"
            else:
                unmatched_items.append(t)
                match_status = "Not Matched"
    else:
        unmatched_items.append(t)
        match_status = "Not Ranked - Not Matched"

    df_rows.append([not_matched_slug, not_matched_nport_issuer_name, ranked_matches_str, match_status])

_df = pd.DataFrame(df_rows, columns=['not_matched_slug', 'not_matched_nport_issuer_name', 'Ranked Matches', 'Match Status'])

_df.to_csv('data/new_test_data_output.csv', index=False)

first_match_accuracy = correct_first_match / total_items if total_items > 0 else 0
overall_match_accuracy = (correct_first_match + partial_matches) / total_items if total_items > 0 else 0

print(f"First Match Accuracy: {first_match_accuracy:.2f}")
print(f"Overall Match Accuracy (including partial matches): {overall_match_accuracy:.2f}")
print("Unmatched items:", len(unmatched_items))


First Match Accuracy: 0.00
Overall Match Accuracy (including partial matches): 0.00
Unmatched items: 1


In [14]:
partial_matched_items_df = pd.DataFrame(partial_matched_items, columns=['nport_issuer_name', 'slug', 'rankings'])
partial_matched_items_df['method'] = 'perfect_matched'

partial_matched_items_df.to_csv('data/new_test_data_partial_matched_perfect.csv', index=False)

In [15]:
# create a pd dataframe of the unmatched items
not_matched_list = []
for t in unmatched_items:
    not_matched_list.append([t[1], t[0]])

df_not_matched = pd.DataFrame(not_matched_list, columns=['slug', 'nport_issuer_name'])
print(df_not_matched.head())
df_not_matched.to_csv('data/new_test_data_unmatched_perfect.csv', index=False)

  slug nport_issuer_name
0  xai              Grok


In [27]:
df_forge_price = pd.read_csv('data/forge_price_sample.csv')
df_funding_rounds = pd.read_csv('data/funding-rounds.csv')
df_issuers = pd.read_csv('data/issuers.csv')
df_trade_facts = pd.read_csv('data/trade_facts.csv')


df_merged_result = df_issuers.merge(df_forge_price, on='ISSUER_SLUG', how='left') \
            .merge(df_funding_rounds, on='ISSUER_SLUG', how='left') \
            .merge(df_trade_facts, on='ISSUER_SLUG', how='left')
# df_merged_result.to_csv('data/merged_result.csv', index=False)
df_merged_result = df_merged_result.rename(columns={'ISSUER_SLUG': 'slug'})

# join df_not_matched with df_merged_result on left join where df_not_matched.slug = df_merged_result.ISSUER_SLUG 
df_joined_unmatched_merged_result = df_not_matched.merge(df_merged_result, how='left')

print(df_joined_unmatched_merged_result.head())

df_joined_unmatched_merged_result.to_csv('data/new_test_data_joined_unmatched_merged_result.csv', index=False)

  slug nport_issuer_name NAME LEGALENTITYNAME SEARCHALIASES  \
0  xai              Grok  xAI        xAI Corp            []   

                                         DESCRIPTION STRUCTURED_DESCRIPTION  \
0  xAI is an AI research company which offers AI ...                          

  LIFECYCLESTATUS BANNERMESSAGE         SUB_SECTOR  ...  \
0             NaN           NaN  Data Intelligence  ...   

  FORGE_PRICE_ISSUER_TIER FORGE_PRICE FORGE_IMPLIED_VALUATION  \
0                  TIER_1       11.97            2.400000e+10   

   FORGE_PRICE_SOURCE_EXTERNAL HAS_IOIS            ARRAY_AGG(FUNDING_DATE)  \
0                         VWAP    False  [\n  "2024-05-26 00:00:00.000"\n]   

  ARRAY_AGG(FR.SHARE_TYPE) NUM_TRADES LAST_CLOSED_TRADE_DATE  \
0       [\n  "Series B"\n]       28.0                6/27/24   

   LAST_PENDING_TRADE_DATE  
0                  6/26/24  

[1 rows x 28 columns]


In [17]:
# Weight functions

from datetime import datetime

def weight_domicile_country_code(code):
    return 1 if code == 'US' else 0

def weight_domicile_state_code(code):
    return 1 if code == 'DE' else 0


funding_round_ranking = {
    'seed': 0,
    'series a': 1,
    'series b': 2,
    'series c': 3,
    'series d': 4,
    'series e': 5,
    'series f': 6,
    'series g': 7,
    'series h': 8,
    'series i': 9
}
def normalize_and_weight_series(series_types):  
    if pd.isna(series_types) or not series_types:  # Handle NaN or empty lists
        return 0  # Neutral weight for missing values or empty lists

    normalized_weights = []
    
    for series in series_types:
        series_lower = series.lower().strip()
        
        match = re.search(r'series [a-z]', series_lower)
        if match:
            normalized_type = match.group()
            weight = funding_round_ranking.get(normalized_type, 0)
            normalized_weights.append(weight)
        else:
            # Fallback for unrecognized series types (neutral)
            normalized_weights.append(0)  
    
    return max(normalized_weights)  # Return the highest weight in the list


def weight_recency(funding_dates_str):
    if pd.isna(funding_dates_str) or not funding_dates_str: # Handle NaN or empty lists
        return -99999  # Lowest weight for missing or empty values
    
    funding_dates_clean = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}', funding_dates_str)
    funding_dates = [datetime.strptime(date, '%Y-%m-%d %H:%M:%S.%f') for date in funding_dates_clean]
    most_recent_date = max(funding_dates)
    now = datetime.now()
    recency_weight = (now - most_recent_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)


def weight_price_source(price_source):
    if pd.isna(price_source):  # Handle NaN or missing values
        return -0.1  # Assign negative weight to missing values
    price_source_lower = price_source.lower().strip()
    
    if re.search(r'vwap', price_source_lower):
        return 1  
    elif re.search(r'iois', price_source_lower):
        return 0.5  
    elif re.search(r'primary', price_source_lower):
        return 0.2  
    else:
        return -0.1  # Negative weight for other or unrecognized types

def weight_price_issuer_tier(tier):
    if pd.isna(tier):  # Handle NaN or missing values
        return 0  # Neutral weight for missing
    tier_lower = tier.lower().strip()
    if tier_lower == 'tier_1':
        return 1  # Highest weight for TIER_1
    else:
        return 0.5  # Neutral or medium weight for other tiers


def weight_price(price):
    if pd.isna(price):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return price  # Higher value means higher weight

def weight_implied_valuation(valuation):
    if pd.isna(valuation):  # Handle NaN or missing values
        return 0  # Neutral weight for missing values
    return valuation  # Higher value means higher weight

#create weight functions for HAS_IOIS, if true less weight, if false more weight
def weight_has_iois(has_iois):
    return 0 if has_iois else 1

# create weight functions for NUM_TRADES, if more trades, higher weight
def weight_num_trades(num_trades):
    return num_trades if num_trades else 0

# create weight functions for LAST_CLOSED_TRADE_DATE, if more recent, higher weight
def weight_last_closed_trade_date(last_closed_trade_date):
    if pd.isna(last_closed_trade_date) or not last_closed_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_closed_trade_date = datetime.strptime(last_closed_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_closed_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# create weight functions for LAST_PENDING_TRADE_DATE, if more recent, higher weight
def weight_last_pending_trade_date(last_pending_trade_date):
    if pd.isna(last_pending_trade_date) or not last_pending_trade_date:  # Handle NaN or missing values
        return -99999  # Lowest weight for missing values
    last_pending_trade_date = datetime.strptime(last_pending_trade_date, '%m/%d/%y')
    now = datetime.now()
    recency_weight = (now - last_pending_trade_date).days
    return -recency_weight  # More recent = higher weight (less days, more weight)

# output_columns += [
    
#     'YEARFOUNDED',
#     'ARCHIVEDAT',
#     'CITY'
# ]


In [28]:
# all-MiniLM-L6-v2

df_not_matched_2 = df_joined_unmatched_merged_result.copy()
# apply normalize_name(str(text)) to df_not_matched_2
df_not_matched_2['cleaned_nport_issuer_name'] = df_not_matched_2.apply(lambda row: normalize_name(row['nport_issuer_name']), axis=1)

output_columns = [
    'slug',
    'NAME',
    'LEGALENTITYNAME',
    'SEARCHALIASES',
    'DESCRIPTION',
    'STRUCTURED_DESCRIPTION',
    'LIFECYCLESTATUS',
    'BANNERMESSAGE',	
    'SUB_SECTOR',
    'SECTOR',
    'WEBSITE',
    'CRUNCHBASEURL'
]

def combine_output_columns(row):
    texts = []
    for col in output_columns:
        if col in row and pd.notnull(row[col]):
            texts.append(clean_text(row[col]))
    combined_text = ' '.join(texts)
    return combined_text

df_not_matched_2['combined_output'] = df_not_matched_2.apply(combine_output_columns, axis=1)

words_to_find = df_not_matched_2['cleaned_nport_issuer_name'].tolist()
word_list = df_not_matched_2['combined_output'].tolist()


model = SentenceTransformer('all-MiniLM-L6-v2')  

embeddings_to_find = model.encode(words_to_find, convert_to_numpy=True, normalize_embeddings=True)
embeddings_list = model.encode(word_list, convert_to_numpy=True, normalize_embeddings=True)


 
# Apply the weight functions to the DataFrame
df_not_matched_2['DOMICILECOUNTRYCODE_WEIGHT'] = df_not_matched_2['DOMICILECOUNTRYCODE'].apply(weight_domicile_country_code)
df_not_matched_2['DOMICILESTATECODE_WEIGHT'] = df_not_matched_2['DOMICILESTATECODE'].apply(weight_domicile_state_code)
df_not_matched_2['SHARE_TYPE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FR.SHARE_TYPE)'].apply(normalize_and_weight_series)
df_not_matched_2['FUNDING_DATE_WEIGHT'] = df_not_matched_2['ARRAY_AGG(FUNDING_DATE)'].apply(weight_recency)
df_not_matched_2['FORGE_PRICE_SOURCE_WEIGHT'] = df_not_matched_2['FORGE_PRICE_SOURCE_EXTERNAL'].apply(weight_price_source)
df_not_matched_2['FORGE_PRICE_ISSUER_TIER_WEIGHT'] = df_not_matched_2['FORGE_PRICE_ISSUER_TIER'].apply(weight_price_issuer_tier)
df_not_matched_2['FORGE_PRICE_WEIGHT'] = df_not_matched_2['FORGE_PRICE'].apply(weight_price)
df_not_matched_2['FORGE_IMPLIED_VALUATION_WEIGHT'] = df_not_matched_2['FORGE_IMPLIED_VALUATION'].apply(weight_implied_valuation)

df_not_matched_2['HAS_IOIS_WEIGHT'] = df_not_matched_2['HAS_IOIS'].apply(weight_has_iois)
df_not_matched_2['NUM_TRADES_WEIGHT'] = df_not_matched_2['NUM_TRADES'].apply(weight_num_trades)
df_not_matched_2['LAST_CLOSED_TRADE_DATE_WEIGHT'] = df_not_matched_2['LAST_CLOSED_TRADE_DATE'].apply(weight_last_closed_trade_date)
df_not_matched_2['LAST_PENDING_TRADE_DATE_WEIGHT'] = df_not_matched_2['LAST_PENDING_TRADE_DATE'].apply(weight_last_pending_trade_date)


# Select and normalize the weighted features
weighted_feature_columns = [
    'DOMICILECOUNTRYCODE_WEIGHT',
    'DOMICILESTATECODE_WEIGHT',
    'SHARE_TYPE_WEIGHT',
    'FUNDING_DATE_WEIGHT',
    'FORGE_PRICE_SOURCE_WEIGHT',
    'FORGE_PRICE_ISSUER_TIER_WEIGHT',
    'FORGE_PRICE_WEIGHT',
    'FORGE_IMPLIED_VALUATION_WEIGHT',
    'HAS_IOIS_WEIGHT',
    'NUM_TRADES_WEIGHT',
    'LAST_CLOSED_TRADE_DATE_WEIGHT',
    'LAST_PENDING_TRADE_DATE_WEIGHT'
]

# Fill NaN values with zeros
df_not_matched_2[weighted_feature_columns] = df_not_matched_2[weighted_feature_columns].fillna(0.0)

# Normalize the weighted features
scaler = MinMaxScaler()
weighted_features = scaler.fit_transform(df_not_matched_2[weighted_feature_columns])


combined_embeddings_to_find = np.hstack((embeddings_to_find, weighted_features))
combined_embeddings_list = np.hstack((embeddings_list, weighted_features))
cosine_sim_matrix = cosine_similarity(combined_embeddings_to_find, combined_embeddings_list)


correct_top1 = 0  # Correct matches at rank 1
correct_topk = 0  # Correct matches within top_k
top_k = 5 
not_matched_results = []
partial_matched_results = []

for idx, (word, scores) in enumerate(zip(words_to_find, cosine_sim_matrix)):
    top_indices = scores.argsort()[-top_k:][::-1]  # Indices of top_k scores in descending order
    matches = []
    ground_truth_found = False

    current_slug = df_not_matched_2.iloc[idx]['slug']
    current_npot_issuer_name = df_not_matched_2.iloc[idx]['nport_issuer_name']

    for rank, index in enumerate(top_indices):
        matched_word = word_list[index]
        match_score = scores[index]
        matches.append({
            'rank': rank + 1,
            'match_word': matched_word,
            'score': match_score
        })

        if index == idx:
            ground_truth_found = True
            if rank == 0:
                correct_top1 += 1  # Ground truth is the top match
            correct_topk += 1     # Ground truth is within top_k matches
            partial_matched_results.append({
                    'nport_issuer_name': current_npot_issuer_name, 
                    'slug': current_slug,
                    'rankings': matches
                    })

    if not ground_truth_found:
        not_matched_results.append({
                    'slug': current_slug,
                    'nport_issuer_name': current_npot_issuer_name, 
                    'cleaned_nport_issuer_name': word,
                    'matches': matches
                    })



total = len(words_to_find)
top1_accuracy = correct_top1 / total
topk_accuracy = correct_topk / total

print(f"Top-1 Accuracy: {top1_accuracy:.2%}")
print(f"Top-{top_k} Accuracy: {topk_accuracy:.2%}")

# save not_matched_results to csv
df_not_matched_results = pd.DataFrame(not_matched_results)
df_not_matched_results.to_csv('data/new_test_data_not_matched_results.csv', index=False)



Top-1 Accuracy: 100.00%
Top-5 Accuracy: 100.00%


In [19]:
partial_matched_results_df = pd.DataFrame(partial_matched_results)
partial_matched_items_df['method'] = 'embedding_matched'

partial_matched_items_df.to_csv('data/new_test_data_partial_matched_embedding.csv', index=False)

overal = 15
- matched = 10 - (66%)
- partial matched = 5 (33%)
- unmatched = 0 - (0%)

