In [1]:
# Installs (run once in the environment used by this notebook's kernel)
# pip install pandas
# pip install fuzzywuzzy
# pip install python-Levenshtein
# pip install rapidfuzz
# pip install joblib

In [2]:
import pandas as pd                       
from rapidfuzz import fuzz                         
from joblib import Parallel, delayed             
from typing import List, Dict             

In [3]:
# Load the input CSV into `df`. The path is relative to the notebook's working
# directory and the file is decoded as Latin-1.
# Adjust path here accordingly
df = pd.read_csv('data/Master_Dataset_w_Cleaned_Company_Names.csv', encoding='latin1')

In [4]:
# Rename column Company_Name to Company Name.
# The result is reassigned rather than mutated with inplace=True, so the cell is an
# explicit "new frame from old frame" step and stays safe to re-run.
df = df.rename(columns={'Company_Name': 'Company Name'})

In [5]:
# Set desired threshold here.
# This is the minimum fuzz.ratio score two company names must reach to be treated as
# similar; the comparison in find_similar_names is inclusive (score >= threshold).
threshold = 90

In [6]:
def find_similar_names(company_name: str, company_list: pd.Series, threshold: int) -> List[Dict[str, object]]:
    """
    Find the entries of a company list that are fuzzy matches for one company name.

    Every non-missing entry of ``company_list`` that differs from ``company_name`` is
    scored against it with ``fuzz.ratio``; entries scoring at least ``threshold`` are kept.

    Parameters:
    company_name (str): The company name to compare. A missing value (None/NaN) yields no matches.
    company_list (pd.Series): The company names to compare against. Missing entries are skipped.
    threshold (int): The minimum similarity score (inclusive) to consider a match.

    Returns:
    List[Dict[str, object]]: One dictionary per match, in the order the matches appear in
    ``company_list``. Each dictionary has the keys "name" (the matching company name) and
    "similarity" (its numeric ``fuzz.ratio`` score). A name that occurs several times in
    ``company_list`` is reported once per occurrence. Names identical to ``company_name``
    are never reported.
    """
    similar_names = []

    if pd.isna(company_name):
        return similar_names

    for name in company_list:
        # Missing entries cannot be scored, and exact duplicates of the query are never
        # reported, so skip both before paying for a similarity computation.
        if pd.isna(name) or name == company_name:
            continue

        # score_cutoff lets rapidfuzz stop early on pairs that cannot reach the threshold;
        # such pairs come back as 0, and every score >= threshold is returned unchanged.
        similarity = fuzz.ratio(company_name, name, score_cutoff=threshold)
        if similarity >= threshold:
            similar_names.append({"name": name, "similarity": similarity})

    return similar_names


def process_row(index, row, company_list, threshold):
    """
    Per-row worker for the parallel similarity search.

    Forwards ``row`` (the company name taken from one row), ``company_list`` and
    ``threshold`` to ``find_similar_names`` and returns its result unchanged.
    ``index`` is accepted but not used.
    """
    matches = find_similar_names(
        company_name=row,
        company_list=company_list,
        threshold=threshold,
    )
    return matches


In [7]:
# The full 'Company Name' column; each row's name is compared against every entry of it.
company_list = df['Company Name']

In [8]:
# Getting representative company name via similarity
def get_most_representative_by_similarity(similar_names: List[Dict[str, int]]) -> str:
    """
    Pick the candidate name that carries the top similarity score.

    Parameters:
    similar_names (List[Dict[str, int]]): Candidate matches, each a dictionary with a
    "name" key and a "similarity" key.

    Returns:
    str: The "name" of the candidate with the highest "similarity" (the earliest such
    candidate when several share the top score), or None when there are no candidates.
    """
    if not similar_names:
        return None

    best = None
    for candidate in similar_names:
        # Strictly-greater comparison keeps the first candidate on ties.
        if best is None or candidate['similarity'] > best['similarity']:
            best = candidate
    return best['name']

# Run the per-row similarity search through joblib (n_jobs=-1), one task per row
df['Similar Company Names'] = Parallel(n_jobs=-1)(
    delayed(process_row)(row_label, company_name, company_list, threshold)
    for row_label, company_name in df['Company Name'].items()
)
# For each row, keep the name of its highest-scoring match (None when it has no matches)
df['Most Representative Company Name by Similarity'] = (
    df['Similar Company Names'].apply(get_most_representative_by_similarity)
)


In [9]:

# Export only the rows that have at least one similar name.
# Every cell of 'Similar Company Names' is a list, so its length tells us whether there
# are matches; this avoids converting the whole frame to strings to compare with '[]'.
df_similar_names = df.loc[
    df['Similar Company Names'].map(len) > 0,
    ['Company Name', 'Similar Company Names', 'Most Representative Company Name by Similarity'],
]
df_similar_names.to_csv('outputs/Suggest via Similar Names Example Outputs.csv', index=False)

# Choosing Representative Name by Frequency

In [10]:
def get_most_representative_by_frequency(similar_names: List[Dict[str, int]], frequency_dict: Dict[str, int]) -> str:
    """
    Pick the candidate name that appears most often according to a frequency table.

    Parameters:
    similar_names (List[Dict[str, int]]): Candidate matches, each a dictionary with a "name" key.
    frequency_dict (Dict[str, int]): Maps a company name to how often it occurs; names
    missing from the mapping count as 0.

    Returns:
    str: The "name" of the candidate with the highest frequency (the earliest such
    candidate when several share the top frequency), or None when there are no candidates.
    """
    if not similar_names:
        return None

    candidates = iter(similar_names)
    best = next(candidates)
    best_count = frequency_dict.get(best['name'], 0)
    for candidate in candidates:
        count = frequency_dict.get(candidate['name'], 0)
        # Strictly-greater comparison keeps the first candidate on ties.
        if count > best_count:
            best, best_count = candidate, count
    return best['name']

# Frequency of each company name
company_frequency = df['Company Name'].value_counts().to_dict()
# The pairwise similarity search is the expensive step. When the notebook is run top to
# bottom, 'Similar Company Names' has already been computed from the same names and
# threshold, so it is reused here and only computed if the column is missing.
# NOTE: after changing `threshold`, re-run the earlier similarity cell (or drop the
# column) before running this cell, otherwise the existing matches are reused.
if 'Similar Company Names' not in df.columns:
    # Use joblib's Parallel to parallelize the function
    df['Similar Company Names'] = Parallel(n_jobs=-1)(delayed(process_row)(i, row, company_list, threshold) for i, row in df['Company Name'].items())
# For each row, pick the similar name that occurs most often in the dataset (None when there are no matches)
df['Most Representative Company by Frequency'] = df['Similar Company Names'].apply(lambda x: get_most_representative_by_frequency(x, company_frequency))


In [11]:
# Export only the rows that have at least one similar name.
# Every cell of 'Similar Company Names' is a list, so its length tells us whether there
# are matches; this avoids converting the whole frame to strings to compare with '[]'.
df_frequency = df.loc[
    df['Similar Company Names'].map(len) > 0,
    ['Company Name', 'Similar Company Names', 'Most Representative Company by Frequency'],
]
df_frequency.to_csv('outputs/Suggest via Frequency Example Outputs.csv', index=False)