In [13]:
# Installs
# pip install fuzzywuzzy
# pip install python-Levenshtein

In [2]:
# Libraries
import pandas as pd
from fuzzywuzzy import fuzz, process

In [4]:
# Data scraped from Team 3 last semester to test the fuzzy matching algorithm  
chemnet_df = pd.read_csv('data/chemnet_contactdetails.csv')
chinachemnet_df = pd.read_csv('data/chinachemnet_all.csv', encoding='latin1')
foreign_trade_df = pd.read_csv('data/Foreign Trade with Contact Info.csv')
guidechem_df = pd.read_csv('data/guidechem_all.csv')
toocle_df = pd.read_csv('data/toocle_webscraped_id.csv')
trade_asia_df = pd.read_csv('data/TradeAsia_with_contact_info.csv')
# in trade_asia_df, rename Unique Company Names to Company Name to match other dataframes
trade_asia_df.rename(columns={'Unique Company Names': 'Company Name'}, inplace=True)

# Combine the "Company Name" columns from each dataframe and convert to a DataFrame
combined_companies = pd.concat([
    chemnet_df['Company Name'],
    chinachemnet_df['Company Name'],
    foreign_trade_df['Company Name'],
    guidechem_df['Company Name'],
    toocle_df['Company Name'],
    trade_asia_df['Company Name']
], axis=0).reset_index(drop=True)
# Convert the result to a DataFrame with a single column 'Company Name'
combined_companies = pd.DataFrame(combined_companies, columns=['Company Name'])
# Add the index as a separate column called 'id'
combined_companies['id'] = combined_companies.index
# Reorder the columns to have 'id' first
combined_companies = combined_companies[['id', 'Company Name']]
company_names = combined_companies['Company Name'].dropna()

# Fuzzy Matching using Levenshtein Distance

In [25]:
def get_fuzzy_matches(company_name: str, name_list: str, threshold: int=85) -> list[tuple[str, int]]:
    """
    Find fuzzy matches for a given company name from a list of company names.
    Only return matches that have a ratio above the threshold and are not an exact match of the input company.
    
    :param company_name: The company name to fuzzy match
    :param name_list: The list of company names to match against
    :param threshold: The similarity threshold for matches (default 85%)
    :return: A list of tuples with matching company names and similarity scores
    """
    # Get fuzzy matches using Levenshtein Distance
    fuzzy_matches = process.extract(company_name, name_list, scorer=fuzz.ratio, limit=5)
    filtered_matches = [match for match in fuzzy_matches if match[1] < 100 and match[1] >= threshold]
    return filtered_matches
    

In [26]:
example_company = company_names[0]
example_company

'Yancheng Hongtai Bioengineering Co.,Ltd.'

In [27]:
fuzzy_matches = get_fuzzy_matches(example_company, company_names, threshold=85)
print("Fuzzy Match List")
for match in fuzzy_matches:
    print(f" Match: {match[0]}, 'Similarity Score: {match[1]}")

Fuzzy Match List


In [28]:
def deduplicate_company_names(company_names: pd.Series, threshold: int=85) -> pd.Series:
    """
    Deduplicate company names by fuzzy matching similar names and retaining one representative name.
    
    :param company_names: A list of company names to deduplicate
    :param threshold: The similarity threshold for merging companies (default 85%)
    :return: A pandas Series with deduplicated company names
    """
    deduplicated_names = company_names.copy()
    processed = set()  # Track companies that are already processed
    for i, company in enumerate(company_names):
        if company in processed:
            continue
        # Find fuzzy matches for the current company
        matches = get_fuzzy_matches(company, company_names[i+1:], threshold=threshold)
        # Replace similar names with the representative name
        for match in matches:
            similar_name = match[0]
            # Replace all occurrences of the similar name with the representative company name
            deduplicated_names = deduplicated_names.replace(similar_name, company)
            processed.add(similar_name)
        processed.add(company)  

    return deduplicated_names.drop_duplicates()

In [29]:
deduplicated_company_names = deduplicate_company_names(company_names, threshold=85)

In [30]:
print(deduplicated_company_names)

0                 Yancheng Hongtai Bioengineering Co.,Ltd.
1                           Smart Chemicals Group Co. Ltd.
2        Yancheng HuaDe (DanCheng) Biological Engineeri...
3                                  Spec-Chem Industry Inc.
4                          Taixing Chemical Co., Ltd.(TCC)
                               ...                        
21792                                          zhishangbio
21794                                     China RC company
21797                   Shanghai Hanhong Trading Co., Ltd.
21800                         Kunlun International CO.,LTD
21801                                        hebeiminshang
Name: Company Name, Length: 630, dtype: object


In [31]:
print(f"Number of Companies in List before Fuzzy Matching: {len(company_names)}")
print(f"Number of Companies in List after Fuzzy Matching: {len(deduplicated_company_names)}")

Number of Companies in List before Fuzzy Matching: 21791
Number of Companies in List after Fuzzy Matching: 630


In [32]:
# Uncomment to generate csv
# deduplicated_company_names.to_csv('outputs/deduplicated_company_names.csv', index=False)

In [None]:
# Do the company refernce sheet next