This notebook contains the solution to the ML project.

## Importing the data

In [88]:
import pandas as pd 


In [89]:
# Read the semicolon-delimited first-name file into a DataFrame
df = pd.read_csv('nat2022.csv', delimiter=';')
df.head()

Unnamed: 0,sexe,preusuel,annais,nombre
0,1,_PRENOMS_RARES,1900,1249
1,1,_PRENOMS_RARES,1901,1342
2,1,_PRENOMS_RARES,1902,1330
3,1,_PRENOMS_RARES,1903,1286
4,1,_PRENOMS_RARES,1904,1430


## Data cleaning

In [90]:
# Rename the French source columns to English.
# The mapping is by column name rather than by position, so a change in the
# file's column order (or an extra column) cannot silently mislabel the data.
df = df.rename(columns={
    'sexe': 'sex',
    'preusuel': 'name',
    'annais': 'year',
    'nombre': 'count',
})
df.head()

Unnamed: 0,sex,name,year,count
0,1,_PRENOMS_RARES,1900,1249
1,1,_PRENOMS_RARES,1901,1342
2,1,_PRENOMS_RARES,1902,1330
3,1,_PRENOMS_RARES,1903,1286
4,1,_PRENOMS_RARES,1904,1430


In [91]:
# Remove every row whose name is the '_PRENOMS_RARES' label
is_rare_label = df['name'] == '_PRENOMS_RARES'
df = df[~is_rare_label]
df.head()

Unnamed: 0,sex,name,year,count
123,1,A,1980,3
124,1,A,1998,3
125,1,A,XXXX,21
126,1,AADAM,2009,4
127,1,AADAM,2014,3


In [92]:
# Remove hyphenated names.
# na=True flags rows with a missing name as well, so they are dropped too.
has_hyphen = df['name'].str.contains('-', na=True)
df = df[~has_hyphen]
df.head()

Unnamed: 0,sex,name,year,count
123,1,A,1980,3
124,1,A,1998,3
125,1,A,XXXX,21
126,1,AADAM,2009,4
127,1,AADAM,2014,3


In [93]:
# Collapse the per-sex rows: one total count per (name, year) pair
df = df.groupby(['name', 'year'], as_index=False)['count'].sum()
df.head()

Unnamed: 0,name,year,count
0,A,1980,3
1,A,1998,3
2,A,XXXX,21
3,AADAM,2009,4
4,AADAM,2014,3


In [94]:
# Keep only the rows whose year is written with numeric characters
# (this discards entries such as 'XXXX')
year_is_numeric = df['year'].str.isnumeric()
df = df.loc[year_is_numeric]
df.head()

Unnamed: 0,name,year,count
0,A,1980,3
1,A,1998,3
3,AADAM,2009,4
4,AADAM,2014,3
5,AADAM,2015,3


In [95]:
# Convert the year column from strings to integers.
# assign() builds a new frame instead of writing a column into the frame
# obtained by boolean filtering, which can raise SettingWithCopyWarning.
df = df.assign(year=df['year'].astype(int))
df.head()

Unnamed: 0,name,year,count
0,A,1980,3
1,A,1998,3
3,AADAM,2009,4
4,AADAM,2014,3
5,AADAM,2015,3


In [96]:
# Order the rows by year first, then alphabetically by name within a year
sort_keys = ['year', 'name']
df = df.sort_values(by=sort_keys)
df.head()

Unnamed: 0,name,year,count
4389,ABDON,1900,4
5400,ABEL,1900,428
5550,ABELINE,1900,3
5593,ABELLE,1900,3
6626,ABRAHAM,1900,19


## Regrouping similar names using Levenshtein distance

In [97]:
from Levenshtein import distance
from sklearn.cluster import DBSCAN
from collections import defaultdict
import numpy as np


In [104]:
def group_similar_names(df, max_distance=2, distance_fn=None):
    """
    Group similar names using an edit distance across all years,
    then sum counts by year for each name group.

    Clustering is greedy and order-dependent: names are visited from the
    most to the least frequent (total count over all years). Each name that
    has not been assigned yet becomes the representative of a new cluster
    and absorbs every still-unassigned name whose distance to it is
    <= max_distance. Members of a cluster are therefore only guaranteed to
    be close to their representative, not to each other.

    Note: max_distance is an absolute threshold, so short names can absorb
    many unrelated names (2 edits is half of a 4-letter name). Lower
    max_distance or pass a length-aware distance_fn if clusters are too broad.

    Parameters:
    - df: DataFrame with columns 'name', 'year', 'count'
    - max_distance: Maximum distance to consider names similar
    - distance_fn: Optional callable taking two names and returning their
      distance. Defaults to the Levenshtein `distance` function imported at
      module level.

    Returns:
    - DataFrame with columns 'name' (cluster representative), 'year',
      'count' (summed over the cluster) and 'similar_names' (list of the
      other names merged into the representative), sorted by year and name.

    Side effects:
    - Prints whether the total count is unchanged by the grouping.
    """
    if distance_fn is None:
        distance_fn = distance

    # Total frequency of each name across all years
    name_frequencies = df.groupby('name')['count'].sum().to_dict()

    # Most frequent names first, so they become the cluster representatives
    sorted_names = sorted(
        df['name'].unique(),
        key=lambda name: name_frequencies.get(name, 0),
        reverse=True,
    )

    name_mapping = {}        # original name -> representative name
    similar_names_dict = {}  # representative name -> other names in its cluster
    n_names = len(sorted_names)

    for position, name in enumerate(sorted_names):
        if name in name_mapping:
            continue  # already absorbed by a more frequent representative

        name_mapping[name] = name
        members = []
        # Every name before `position` is already assigned (it was visited
        # earlier as a representative or as a member), so only the tail of
        # the list can still contain candidates.
        for other_position in range(position + 1, n_names):
            other_name = sorted_names[other_position]
            if other_name in name_mapping:
                continue
            if distance_fn(name, other_name) <= max_distance:
                name_mapping[other_name] = name
                members.append(other_name)
        similar_names_dict[name] = members

    # Apply mapping and group by representative name and year
    df_with_mapping = df.copy()
    df_with_mapping['representative_name'] = df_with_mapping['name'].map(name_mapping)

    # Sum the counts of all names that share a representative, per year
    result = df_with_mapping.groupby(['representative_name', 'year'], as_index=False)['count'].sum()

    # Attach the list of merged names to every row of the representative
    result['similar_names'] = result['representative_name'].apply(
        lambda name: similar_names_dict.get(name, [])
    )

    # Rename representative_name back to name for consistency
    result = result.rename(columns={'representative_name': 'name'})

    # Validation: grouping must not lose or duplicate any count
    original_total = df['count'].sum()
    result_total = result['count'].sum()

    if original_total != result_total:
        print(f"WARNING: Count totals don't match! Original: {original_total}, Result: {result_total}")
    else:
        print(f"All counts accounted for. Total: {original_total}")
    result = result.sort_values(['year', 'name'])
    return result


In [106]:
# Merge names that are within a distance of 2 of a more frequent name
cluster = group_similar_names(df, max_distance=2)
cluster.head()

All counts accounted for. Total: 82844134


Unnamed: 0,name,year,count,similar_names
1935,ACHILLE,1900,266,"[RACHELLE, ACHILE, ACHILLES]"
2177,ADAM,1900,26,"[AMAR, AKIM, ADAMA, VADIM, AIDA, AKRAM, AYDAN,..."
2306,ADELAIDE,1900,197,"[ADÉLAÏDE, ADELIE, ADELAÏDE, ADÉLAIDE, ADELAIS..."
2429,ADELHEID,1900,3,[]
2442,ADELIA,1900,30,"[ADELIN, ODELIA, ACELYA, DELHIA, ADELINO, AZEL..."


In [107]:
# Number of names merged into each representative
# (the representative itself is not counted)
cluster["cluster_size"] = cluster['similar_names'].map(len)
cluster.head(20)

Unnamed: 0,name,year,count,similar_names,cluster_size
1935,ACHILLE,1900,266,"[RACHELLE, ACHILE, ACHILLES]",3
2177,ADAM,1900,26,"[AMAR, AKIM, ADAMA, VADIM, AIDA, AKRAM, AYDAN,...",126
2306,ADELAIDE,1900,197,"[ADÉLAÏDE, ADELIE, ADELAÏDE, ADÉLAIDE, ADELAIS...",6
2429,ADELHEID,1900,3,[],0
2442,ADELIA,1900,30,"[ADELIN, ODELIA, ACELYA, DELHIA, ADELINO, AZEL...",44
2580,ADHEMAR,1900,5,"[ADHÉMAR, SHEMAR]",2
2687,ADONIS,1900,22,"[ADONAÏ, ADONYS, ADONAI, ADONAY, DIONIS, APONI...",10
2843,ADRIEN,1900,1880,"[ADRIENNE, HADRIEN, ADRIAN, ADRIANA, ADRIANO, ...",31
3003,AGATHE,1900,115,"[AGATHA, AGATE, AGATA, AGATHON, AMATH, AMANTHE...",13
3157,AGLAE,1900,23,"[AGLAÉ, AHLAME, AGLAEE, ADLANE, ATLAS, ABLAYE,...",12


* Feature engineering 
  

Unnamed: 0,name,year,count,similar_names,cluster_size
22695,SEYDINA-MOUHAMED,2021,4,[SEYDINA-MOHAMED],1
22828,SIRAJEDDINE,2021,4,[],0
22851,SOFIA-MARIA,2021,5,[],0
22855,SOILAHOUDINE,2021,4,[],0
23011,SOSTHENE,2021,4,"[SOSTHENES, SOSTHÈNE]",2
...,...,...,...,...,...
23924,WANDRILLE,2022,20,[],0
24096,WILHELMINE,2022,4,[WILHELMINA],1
24255,YAVUZ-SELIM,2022,4,[],0
24297,YUNUS-EMRE,2022,3,[YUNUSEMRE],1
