In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Read the two input tables (paths are relative to the working directory)
name_variations, base_names = (
    pd.read_csv(csv_path)
    for csv_path in ('name_variations.csv', 'base_names.csv')
)

In [10]:
# Preprocessing function for names
def preprocess_name(name):
    """Normalize a raw name so that it can be compared by fuzzy matching.

    The name is lower-cased, every character that is neither alphanumeric
    nor whitespace is dropped (dropped, not replaced by a space), and runs
    of whitespace are collapsed to single spaces with both ends trimmed.
    For example ``"Jennifer- Brown"`` becomes ``"jennifer brown"``.

    Parameters
    ----------
    name : str or missing value
        Raw name. Missing scalars (``None``, NaN, ``pd.NA``) are treated as
        an empty name; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The normalized name, or ``''`` for a missing value.
    """
    if not isinstance(name, str):
        # Blank CSV cells arrive as float NaN, which has no .lower();
        # map them to an empty name instead of aborting the whole .apply().
        if pd.api.types.is_scalar(name) and pd.isna(name):
            return ''
        name = str(name)
    lowered = name.lower()
    # Keep letters, digits and whitespace only (removes punctuation)
    kept = ''.join(char for char in lowered if char.isalnum() or char.isspace())
    # split()/join collapses repeated whitespace and trims the ends
    return ' '.join(kept.split())

In [11]:
# Add a normalized copy of the name column to each table
name_variations['Processed_Variation'] = (
    name_variations['Variation'].map(preprocess_name)
)
base_names['Processed_Base_Name'] = (
    base_names['Base_Name'].map(preprocess_name)
)

In [12]:
# Fuzzy matching function for names
from fuzzywuzzy import fuzz

def fuzzy_name_match(variation, base_names_df):
    """Find the base name most similar to a preprocessed name variation.

    Parameters
    ----------
    variation : str
        Name to look up; it is compared against the values of the
        ``Processed_Base_Name`` column.
    base_names_df : pandas.DataFrame
        Candidate table with the columns ``Processed_Base_Name`` (compared)
        and ``Base_Name`` (returned).

    Returns
    -------
    tuple
        ``(best_match_name, best_match_score)``: the ``Base_Name`` of the
        row with the highest ``fuzz.ratio`` score and that score. When
        several rows share the highest score the first one is used.
        ``(None, 0)`` is returned when ``base_names_df`` has no rows.
    """
    # np.argmax raises ValueError on an empty sequence, so report "no match"
    # explicitly when there are no candidates.
    if len(base_names_df) == 0:
        return None, 0

    scores = [
        fuzz.ratio(variation, candidate)
        for candidate in base_names_df['Processed_Base_Name']
    ]
    best_match_index = int(np.argmax(scores))
    # Positional lookup on the column itself avoids building a whole row first
    best_match_name = base_names_df['Base_Name'].iloc[best_match_index]
    return best_match_name, scores[best_match_index]

In [13]:
# Apply fuzzy matching to name variations.
# Collect the (best match, score) pairs first and build each column from its
# own list: unpacking zip(*pairs) into two targets raises ValueError when
# name_variations has no rows.
match_results = [
    fuzzy_name_match(variation, base_names)
    for variation in name_variations['Processed_Variation']
]
name_variations['Best_Match'] = [best_match for best_match, _ in match_results]
name_variations['Match_Score'] = [score for _, score in match_results]

In [14]:
# Final matching results: the selected columns, rendered as the cell output
name_variations.loc[
    :,
    ['Variation', 'Matches_With_Base_Name', 'Best_Match', 'Match_Score'],
]

Unnamed: 0,Variation,Matches_With_Base_Name,Best_Match,Match_Score
0,Thomas King,Thomas King,Thomas King,100
1,ThomasKing,Thomas King,Thomas King,95
2,Maria Garcia,Maria Garcia,Maria Garcia,100
3,MaryLewis,Mary Lewis,Mary Lewis,95
4,Nancy W.,Nancy Wright,Nancy Wright,74
...,...,...,...,...
95,Jennifer- Brown,Jennifer Brown,Jennifer Brown,100
96,Daniel- Scott,Daniel Scott,Daniel Scott,100
97,David M.,David Martinez,David Martinez,67
98,Paul Allen.,Paul Allen,Paul Allen,100
