In [1]:
import pandas as pd
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from metaphone import doublemetaphone
from sklearn.cluster import AgglomerativeClustering
from Levenshtein import distance as levenshtein_distance
import numpy as np
import tensorflow

In [8]:
# Read the raw name list from disk into a DataFrame.
file_path = "male.csv"  # Replace with your file path
df = pd.read_csv(file_path)

# Every later step reads df["names"], so stop right away when that column is absent.
if not {"names"}.issubset(df.columns):
    raise ValueError("The dataset must have a column named 'names'.")

In [9]:
# Step 1: Normalize the names -- trim surrounding whitespace first, then lowercase.
trimmed_names = df["names"].str.strip()
df["Normalized_Names"] = trimmed_names.str.lower()

In [10]:
# Step 2: Transliterate names to Tamil (if necessary)
# Comment this block if names are already in Tamil
def _to_tamil(name):
    """Transliterate one normalized name, read as ITRANS, into the Tamil scheme."""
    # NOTE(review): the input was lowercased in the normalization step; if ITRANS
    # treats letter case as significant this may change the result -- confirm.
    return transliterate(name, sanscript.ITRANS, sanscript.TAMIL)


df["Tamil_Names"] = df["Normalized_Names"].map(_to_tamil)

In [11]:
# Step 3: Generate phonetic keys using Double Metaphone
def _primary_metaphone(name):
    """Return the first element of the Double Metaphone result for ``name``."""
    primary_key, *_ = doublemetaphone(name)
    return primary_key


df["Phonetic_Keys"] = df["Normalized_Names"].map(_primary_metaphone)

In [12]:
# Step 4: Compute pairwise Levenshtein distances between phonetic keys
#
# The result, dist_matrix, is a dense (num_names x num_names) float array where
# entry [i, j] is the edit distance between the phonetic keys of rows i and j.
#
# Levenshtein distance is symmetric and zero for identical strings, and several
# names can share one phonetic key. So the distance is evaluated only once per
# unordered pair of *distinct* keys, then expanded back to one row/column per name.
phonetic_keys = df["Phonetic_Keys"].tolist()
num_names = len(phonetic_keys)

# Distinct keys in order of first appearance, plus a key -> position lookup.
unique_keys = list(dict.fromkeys(phonetic_keys))
key_position = {key: position for position, key in enumerate(unique_keys)}
num_unique = len(unique_keys)

# Fill the strict upper triangle and mirror it; the diagonal stays 0.
unique_dist = np.zeros((num_unique, num_unique))
for i in range(num_unique):
    for j in range(i + 1, num_unique):
        pair_distance = levenshtein_distance(unique_keys[i], unique_keys[j])
        unique_dist[i, j] = pair_distance
        unique_dist[j, i] = pair_distance

# Expand to per-name resolution: row/column k takes the values of name k's key.
key_codes = np.array([key_position[key] for key in phonetic_keys], dtype=np.intp)
dist_matrix = unique_dist[np.ix_(key_codes, key_codes)]

In [13]:
# Step 5: Perform clustering
# dist_matrix is handed over as a ready-made distance matrix (metric="precomputed").
# n_clusters=None leaves the number of clusters open, so the distance_threshold
# setting is what determines where average-linkage merging stops.
clustering = AgglomerativeClustering(
    distance_threshold=2.0,
    n_clusters=None,
    linkage="average",
    metric="precomputed",
)
clustering.fit(dist_matrix)
labels = clustering.labels_

In [14]:
# Step 6: Group names into clusters and find a representative name
df["Cluster_Labels"] = labels

# Bucket the original (un-normalized) names by their cluster label.
clusters = {}
for cluster_id, member in zip(labels, df["names"]):
    if cluster_id not in clusters:
        clusters[cluster_id] = []
    clusters[cluster_id].append(member)

# The representative of a cluster is its shortest member; on a tie min() keeps
# the one that appears first in the cluster list.
representative_names = {}
for cluster_id, members in clusters.items():
    representative_names[cluster_id] = min(members, key=len)

# Attach to every row the representative of the cluster it belongs to.
df["Representative_Name"] = df["Cluster_Labels"].map(representative_names)

In [10]:
# Write the annotated frame to a new CSV file (UTF-8, without the index column).
output_file_path = "processed_male_names_dataset.csv"
df.to_csv(output_file_path, encoding="utf-8", index=False)
print(f"Processed dataset saved to {output_file_path}")

# Optional: list every cluster together with its members and chosen representative.
print("\nClusters and Representative Names:")
for label in clusters:
    cluster = clusters[label]
    print(f"Cluster {label}: {cluster} -> Representative: {representative_names[label]}")

Processed dataset saved to processed_male_names_dataset.csv

Clusters and Representative Names:
Cluster 46: ['Aaron', 'Adrian', 'Adrien', 'Aharon', 'Arne', 'Arnie', 'Aron', 'Arron', 'Barn', 'Barney', 'Barnie', 'Barny', 'Baron', 'Barron', 'Bernie', 'Bjorn', 'Bjorne', 'Brian', 'Bruno', 'Bryan', 'Bryn', 'Bryon', 'Byron', 'Darby', 'Daren', 'Darian', 'Darien', 'Darin', 'Dario', 'Darrel', 'Darrell', 'Darren', 'Darrin', 'Darryl', 'Darth', 'Darwin', 'Daryl', 'Daryle', 'Derby', 'Derrin', 'Derrol', 'Derron', 'Deryl', 'Dorian', 'Dory', 'Drew', 'Dru', 'Erin', 'Ernie', 'Erny', 'Erwin', 'Eugen', 'Eugene', 'Hadrian', 'Irwin', 'Lauren', 'Loren', 'Lorne', 'Marion', 'Marwin', 'Merwin', 'Myron', 'Oberon', 'Oran', 'Oren', 'Orin', 'Orion', 'Orren', 'Orrin', 'Sherwin', 'Stearn', 'Stearne', 'Stern', 'Sterne', 'Terrel', 'Terrell', 'Terri', 'Terrill', 'Terry', 'Tharen', 'Thorn', 'Thornie', 'Thorny', 'Tirrell', 'Tore', 'Torey', 'Torin', 'Torr', 'Torre', 'Torrey', 'Torrin', 'Torry', 'Tray', 'Tre', 'Trey', 'Trip'