In [1]:
import pandas as pd 
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from typing import List, Dict

In [2]:
# Data Processing for Aggregating Contact Information
df = pd.read_csv('data/merged_dataset.csv', encoding='latin1')
# Columns with contact information
contact_columns = [
    'Company Name',
    'Address',
    'Zip',
    'Phone',
    'Email',
    'Company Website'
]
# Create a new dataframe with only the contact information
df_contact_info = df[contact_columns].copy()
df_contact_info['Company Name'] = df_contact_info['Company Name'].str.lower().str.strip()
df_unique_names = df_contact_info.drop_duplicates(
    subset = [
        'Company Name',
        'Address',
        'Phone',
        'Email',
        'Company Website'
    ]
)
df_unique_names['Company Name'] = df_unique_names['Company Name'].str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique_names['Company Name'] = df_unique_names['Company Name'].str.title()


In [3]:
def get_best_matches(name: str, choices: List[str], threshold: int = 85) -> List[str]:
    """
    Return a list of best matches for a given company name based on fuzzy matching.
    
    Parameters:
    name (str): The company name to match.
    choices (List[str]): A list of company names to compare against.
    threshold (int): The minimum score for a match to be considered (default is 85).
    
    Returns:
    List[str]: A list of company names that match the input name based on the threshold.
    """
    matches = process.extractBests(name, choices, scorer=fuzz.token_sort_ratio, score_cutoff=threshold)
    return [match[0] for match in matches]


In [4]:
company_names = df_unique_names['Company Name'].unique()
company_names = [name for name in company_names if isinstance(name, str)]
company_map: Dict[str, str] = {}

# Maps similar company names to the base name
for name in company_names:
    if name not in company_map:
        best_matches = get_best_matches(name, company_names)
        for match in best_matches:
            company_map[match] = name  

# Replace company names in the dataframe with the mapped base names
df_unique_names['Standardized Company Name'] = df_unique_names['Company Name'].map(company_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique_names['Standardized Company Name'] = df_unique_names['Company Name'].map(company_map)


In [5]:
def aggregate_contact_info(group: pd.DataFrame) -> pd.Series:
    """
    Aggregate phone numbers, emails, addresses, and websites into a single entry for each group.
    
    Parameters:
    group (pd.DataFrame): A group of rows representing a company with similar names.
    
    Returns:
    pd.Series: A series containing the aggregated contact information for the group.
    """
    aggregated = {
        'Address(s)': '; '.join(group['Address'].dropna().unique()),
        'Phone(s)': '; '.join(group['Phone'].dropna().unique()),
        'Email(s)': '; '.join(group['Email'].dropna().unique()),
        'Company Website': '; '.join(group['Company Website'].dropna().unique())
    }
    return pd.Series(aggregated)

# Apply the aggregation to the grouped data by 'Standardized Company Name'
df_aggregated = df_unique_names.groupby('Standardized Company Name').apply(aggregate_contact_info).reset_index()

  df_aggregated = df_unique_names.groupby('Standardized Company Name').apply(aggregate_contact_info).reset_index()


In [6]:
df_aggregated.head()

Unnamed: 0,Standardized Company Name,Address(s),Phone(s),Email(s),Company Website
0,3B Scientific Corporation,,8.62E+12,john@adarchn.com,
1,3Way Pharm Inc. (Shanghai),"Room D218-219, No. 128, Xiangyin Road, Yangpu ...",86-15618982688,sales@3wpharm.com,http://www.3wpharm.com
2,"A.M Food Chemical Co., Limited","No.12406,jing shi Road.Jinan,China, , Shandong...",86-13964066237,chenli@amfoodchem.com,http://www.am-chemical.com
3,"Advanced Technology & Industrial Co., Ltd.",", , , China",(852) 23902293,sales@advtechind.com,www.advtechind.com
4,Afine Chemicals Limited,"7-601 ,Xigang Xinjie, Xihu Industrial Park, Sa...",86-571-85232125Â 85232161Â 85134551; 86-571-...,info@afinechem.com; info@afinechem.com sales...,www.afinechem.com


In [7]:
{
    "ABC Corporation": "ABC Corporation",
    "ABC Corp.": "ABC Corporation",
    "Corp ABC": "ABC Corporation",
    "XYZ Ltd": "XYZ Ltd",
    "XYZ Limited": "XYZ Ltd"
}

{'ABC Corporation': 'ABC Corporation',
 'ABC Corp.': 'ABC Corporation',
 'Corp ABC': 'ABC Corporation',
 'XYZ Ltd': 'XYZ Ltd',
 'XYZ Limited': 'XYZ Ltd'}