This notebook is for the resolution of the ML project 

*Importing the data 

In [88]:
import pandas as pd 


In [89]:
df = pd.read_csv('nat2022.csv', sep=';')
df.head()

Unnamed: 0,sexe,preusuel,annais,nombre
0,1,_PRENOMS_RARES,1900,1249
1,1,_PRENOMS_RARES,1901,1342
2,1,_PRENOMS_RARES,1902,1330
3,1,_PRENOMS_RARES,1903,1286
4,1,_PRENOMS_RARES,1904,1430


*start of data cleaning

In [90]:
#renaming columns
df.columns = ["sex", "name", "year", 'count']
df.head()

Unnamed: 0,sex,name,year,count
0,1,_PRENOMS_RARES,1900,1249
1,1,_PRENOMS_RARES,1901,1342
2,1,_PRENOMS_RARES,1902,1330
3,1,_PRENOMS_RARES,1903,1286
4,1,_PRENOMS_RARES,1904,1430


In [91]:
#exclude all _PRENOMS_RARES
df = df[df['name'] != '_PRENOMS_RARES']
df.head()

Unnamed: 0,sex,name,year,count
123,1,A,1980,3
124,1,A,1998,3
125,1,A,XXXX,21
126,1,AADAM,2009,4
127,1,AADAM,2014,3


In [92]:
#exclude all names with '-'
df = df[df['name'].str.contains('-') == False]
df.head()

Unnamed: 0,sex,name,year,count
123,1,A,1980,3
124,1,A,1998,3
125,1,A,XXXX,21
126,1,AADAM,2009,4
127,1,AADAM,2014,3


In [93]:
#regroup all the name while ignoring sex
df = df.groupby(['name', 'year'])['count'].sum().reset_index()
df.head()

Unnamed: 0,name,year,count
0,A,1980,3
1,A,1998,3
2,A,XXXX,21
3,AADAM,2009,4
4,AADAM,2014,3


In [94]:
# delete all rows that have a non-numeric value in the year column
df = df[df['year'].str.isnumeric()]
df.head()

Unnamed: 0,name,year,count
0,A,1980,3
1,A,1998,3
3,AADAM,2009,4
4,AADAM,2014,3
5,AADAM,2015,3


In [95]:
#change type of year column to int
df['year'] = df['year'].astype(int)
df.head()

Unnamed: 0,name,year,count
0,A,1980,3
1,A,1998,3
3,AADAM,2009,4
4,AADAM,2014,3
5,AADAM,2015,3


In [96]:
# sort by year and name
df = df.sort_values(['year', 'name'])
df.head()

Unnamed: 0,name,year,count
4389,ABDON,1900,4
5400,ABEL,1900,428
5550,ABELINE,1900,3
5593,ABELLE,1900,3
6626,ABRAHAM,1900,19


*Regrouping similar names by using Levenshtein distance

In [97]:
from Levenshtein import distance
from sklearn.cluster import DBSCAN
from collections import defaultdict
import numpy as np


In [104]:
def group_similar_names(df, max_distance=2):
    """
    Group similar names using Levenshtein distance across all years,
    then sum counts by year for each name group.
    
    Parameters:
    - df: DataFrame with columns 'name', 'year', 'count'
    - max_distance: Maximum Levenshtein distance to consider names similar
    
    Returns:
    - DataFrame with grouped names and their combined counts by year
    """
    # Get unique names across all years
    unique_names = list(df['name'].unique())
    
    # Calculate total frequency for each name (across all years)
    name_frequencies = df.groupby('name')['count'].sum().to_dict()
    
    # Sort names by their total frequency (most frequent first)
    sorted_names = sorted(unique_names, key=lambda name: name_frequencies.get(name, 0), reverse=True)
    
    # Create name clusters
    name_clusters = []
    processed_names = set()
    
    for name in sorted_names:
        if name not in processed_names:
            cluster = [name]
            processed_names.add(name)
            
            # Find similar names to add to this cluster
            for other_name in sorted_names:
                if other_name != name and other_name not in processed_names:
                    if distance(name, other_name) <= max_distance:
                        cluster.append(other_name)
                        processed_names.add(other_name)
            
            name_clusters.append(cluster)
    
    # Create mapping from original name to representative name
    name_mapping = {}
    for cluster in name_clusters:
        representative = cluster[0]  # Use most frequent name as representative
        for name in cluster:
            name_mapping[name] = representative
    
    # Store similar names for each representative
    similar_names_dict = {}
    for cluster in name_clusters:
        if len(cluster) > 1:
            representative = cluster[0]
            similar_names_dict[representative] = cluster[1:]
        else:
            similar_names_dict[cluster[0]] = []
    
    # Apply mapping and group by representative name and year
    df_with_mapping = df.copy()
    df_with_mapping['representative_name'] = df_with_mapping['name'].map(name_mapping)
    
    # Group by representative name and year, summing the counts
    result = df_with_mapping.groupby(['representative_name', 'year'], as_index=False)['count'].sum()
    
    # Add similar names information
    result['similar_names'] = result['representative_name'].apply(
        lambda name: similar_names_dict.get(name, [])
    )
    
    # Rename representative_name back to name for consistency
    result = result.rename(columns={'representative_name': 'name'})
    
    # Check if all names are accounted for (for validation)
    original_total = df['count'].sum()
    result_total = result['count'].sum()
    
    if original_total != result_total:
        print(f"WARNING: Count totals don't match! Original: {original_total}, Result: {result_total}")
    else:
        print(f"All counts accounted for. Total: {original_total}")
    result = result.sort_values(['year', 'name'])
    return result


In [106]:
cluster= group_similar_names(df, max_distance=2)
cluster.head()

All counts accounted for. Total: 82844134


Unnamed: 0,name,year,count,similar_names
1935,ACHILLE,1900,266,"[RACHELLE, ACHILE, ACHILLES]"
2177,ADAM,1900,26,"[AMAR, AKIM, ADAMA, VADIM, AIDA, AKRAM, AYDAN,..."
2306,ADELAIDE,1900,197,"[ADÉLAÏDE, ADELIE, ADELAÏDE, ADÉLAIDE, ADELAIS..."
2429,ADELHEID,1900,3,[]
2442,ADELIA,1900,30,"[ADELIN, ODELIA, ACELYA, DELHIA, ADELINO, AZEL..."


In [107]:
cluster["cluster_size"] = cluster['similar_names'].apply(len)
cluster.head(20)

Unnamed: 0,name,year,count,similar_names,cluster_size
1935,ACHILLE,1900,266,"[RACHELLE, ACHILE, ACHILLES]",3
2177,ADAM,1900,26,"[AMAR, AKIM, ADAMA, VADIM, AIDA, AKRAM, AYDAN,...",126
2306,ADELAIDE,1900,197,"[ADÉLAÏDE, ADELIE, ADELAÏDE, ADÉLAIDE, ADELAIS...",6
2429,ADELHEID,1900,3,[],0
2442,ADELIA,1900,30,"[ADELIN, ODELIA, ACELYA, DELHIA, ADELINO, AZEL...",44
2580,ADHEMAR,1900,5,"[ADHÉMAR, SHEMAR]",2
2687,ADONIS,1900,22,"[ADONAÏ, ADONYS, ADONAI, ADONAY, DIONIS, APONI...",10
2843,ADRIEN,1900,1880,"[ADRIENNE, HADRIEN, ADRIAN, ADRIANA, ADRIANO, ...",31
3003,AGATHE,1900,115,"[AGATHA, AGATE, AGATA, AGATHON, AMATH, AMANTHE...",13
3157,AGLAE,1900,23,"[AGLAÉ, AHLAME, AGLAEE, ADLANE, ATLAS, ABLAYE,...",12


#Feature engineering 
  

In [113]:
from tqdm import tqdm
from scipy import stats

In [117]:
def engineer_features(df):
    """
    Create features for predicting when a name will become popular again.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with columns 'name', 'year', 'count'
    
    Returns:
    pandas.DataFrame: DataFrame with engineered features
    """
    # Make a copy of the dataframe to avoid modifying the original
    result_df = df.copy()
    
    # Group by name to process each name's time series
    name_groups = []
    
    for name, group in df.groupby('name'):
        # Sort by year to ensure chronological order
        group = group.sort_values('year')
        
        # --- Trend-based features ---
        
        # Rolling averages
        group['rolling_avg_3yr'] = group['count'].rolling(window=3, min_periods=1).mean()
        group['rolling_avg_5yr'] = group['count'].rolling(window=5, min_periods=1).mean()
        group['rolling_avg_10yr'] = group['count'].rolling(window=10, min_periods=1).mean()
        
        # Rate of change (year-over-year)
        group['yoy_change'] = group['count'].pct_change(periods=1)
        
        # Acceleration (change in rate of change)
        group['acceleration'] = group['yoy_change'].pct_change(periods=1)
        
        # Peak detection (1 if local maximum, 0 otherwise)
        # A point is a peak if it's greater than both neighbors
        group['is_peak'] = 0
        for i in range(1, len(group) - 1):
            if group['count'].iloc[i] > group['count'].iloc[i-1] and \
               group['count'].iloc[i] > group['count'].iloc[i+1]:
                group.loc[group.index[i], 'is_peak'] = 1
        
        # --- Cyclical features ---
        
        # Years since last peak
        group['years_since_peak'] = 0
        last_peak_idx = -1
        for i in range(len(group)):
            if group['is_peak'].iloc[i] == 1:
                last_peak_idx = i
            if last_peak_idx >= 0:
                group.loc[group.index[i], 'years_since_peak'] = i - last_peak_idx
        
        # Find local minimums (troughs)
        group['is_trough'] = 0
        for i in range(1, len(group) - 1):
            if group['count'].iloc[i] < group['count'].iloc[i-1] and \
               group['count'].iloc[i] < group['count'].iloc[i+1]:
                group.loc[group.index[i], 'is_trough'] = 1
        
        # Years since last trough
        group['years_since_trough'] = 0
        last_trough_idx = -1
        for i in range(len(group)):
            if group['is_trough'].iloc[i] == 1:
                last_trough_idx = i
            if last_trough_idx >= 0:
                group.loc[group.index[i], 'years_since_trough'] = i - last_trough_idx
        
        # --- Statistical features ---
        
        # Z-score of current count relative to historical data
        group['count_zscore'] = stats.zscore(group['count'], nan_policy='omit')
        
        # Percentile rank of current year compared to previous years
        group['percentile_rank'] = group['count'].rank(pct=True)
        
        # Volatility (rolling standard deviation)
        group['volatility_5yr'] = group['count'].rolling(window=5, min_periods=1).std()
        group['volatility_10yr'] = group['count'].rolling(window=10, min_periods=1).std()
        
        # --- Historical pattern features ---
        
        # Historical maximum up to current point
        group['historical_max'] = group['count'].cummax()
        
        # Ratio of current count to historical maximum
        group['ratio_to_max'] = group['count'] / group['historical_max']
        
        # Count trajectory (1: increasing, -1: decreasing, 0: stable)
        group['trajectory'] = np.sign(group['yoy_change'])
        
        # Moving average trajectory (smoother signal, less noise)
        group['ma_trajectory'] = np.sign(group['rolling_avg_5yr'].diff())
        
        # Cumulative years of consecutive increase/decrease
        group['consecutive_trend'] = 0
        current_trend = 0
        for i in range(1, len(group)):
            if group['yoy_change'].iloc[i] > 0:
                if current_trend > 0:
                    current_trend += 1
                else:
                    current_trend = 1
            elif group['yoy_change'].iloc[i] < 0:
                if current_trend < 0:
                    current_trend -= 1
                else:
                    current_trend = -1
            group.loc[group.index[i], 'consecutive_trend'] = current_trend
        
        # Generation length proxy (typically 20-30 years)
        # Hypothesis: Names might repeat in popularity every generation
        group['generation_cycle'] = np.sin(2 * np.pi * group['years_since_peak'] / 25)
        
        # Add to our collection
        name_groups.append(group)
    
    # Combine all processed groups back into a single DataFrame
    result_df = pd.concat(name_groups)
    
    # Fill NaN values that might have been created during calculations
    result_df = result_df.fillna(0)
    result_df = result_df.sort_values(['year', 'name'])
    return result_df

In [118]:
test = engineer_features(cluster)
test.head()

  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pct_change(periods=1)
  group['acceleration'] = group['yoy_change'].pc

Unnamed: 0,name,year,count,similar_names,cluster_size,rolling_avg_3yr,rolling_avg_5yr,rolling_avg_10yr,yoy_change,acceleration,...,count_zscore,percentile_rank,volatility_5yr,volatility_10yr,historical_max,ratio_to_max,trajectory,ma_trajectory,consecutive_trend,generation_cycle
1935,ACHILLE,1900,266,"[RACHELLE, ACHILE, ACHILLES]",3,266.0,266.0,266.0,0.0,0.0,...,0.526405,0.723577,0.0,0.0,266,1.0,0.0,0.0,0,0.0
2177,ADAM,1900,26,"[AMAR, AKIM, ADAMA, VADIM, AIDA, AKRAM, AYDAN,...",126,26.0,26.0,26.0,0.0,0.0,...,-0.541217,0.182927,0.0,0.0,26,1.0,0.0,0.0,0,0.0
2306,ADELAIDE,1900,197,"[ADÉLAÏDE, ADELIE, ADELAÏDE, ADÉLAIDE, ADELAIS...",6,197.0,197.0,197.0,0.0,0.0,...,0.869613,0.825203,0.0,0.0,197,1.0,0.0,0.0,0,0.0
2429,ADELHEID,1900,3,[],0,3.0,3.0,3.0,0.0,0.0,...,-0.824485,0.307692,0.0,0.0,3,1.0,0.0,0.0,0,0.0
2442,ADELIA,1900,30,"[ADELIN, ODELIA, ACELYA, DELHIA, ADELINO, AZEL...",44,30.0,30.0,30.0,0.0,0.0,...,-0.362074,0.583333,0.0,0.0,30,1.0,0.0,0.0,0,0.0


In [119]:
test.tail()


Unnamed: 0,name,year,count,similar_names,cluster_size,rolling_avg_3yr,rolling_avg_5yr,rolling_avg_10yr,yoy_change,acceleration,...,count_zscore,percentile_rank,volatility_5yr,volatility_10yr,historical_max,ratio_to_max,trajectory,ma_trajectory,consecutive_trend,generation_cycle
147989,ÉPHRAÏM,2022,3,[],0,5.666667,6.0,4.777778,-0.727273,-1.272727,...,-0.65871,0.333333,3.464102,2.862594,11,0.272727,-1.0,0.0,-1,0.24869
147999,ÉVIE,2022,32,"[ÉLIEL, ÉMMIE]",2,24.333333,21.4,15.4,0.777778,-4.577778,...,2.067904,1.0,7.021396,8.461678,32,1.0,1.0,1.0,1,0.481754
148009,ÉZÉQUIEL,2022,8,[],0,8.0,8.8,8.5,-0.272727,-1.227273,...,-0.286299,0.45,2.387467,1.840894,11,0.727273,-1.0,1.0,-1,0.24869
148049,ÖMER,2022,63,"[MEIR, MEÏR, UMEYR]",3,60.0,63.0,59.1,0.145455,-2.288312,...,1.758831,0.9,5.147815,12.4762,76,0.828947,1.0,-1.0,1,0.684547
148052,ÖZGÜR,2022,3,[],0,3.0,3.0,3.0,0.0,0.0,...,0.0,0.666667,0.0,0.0,3,1.0,0.0,0.0,0,0.0
