In [8]:
import json
import pandas as pd

from google.oauth2 import service_account

# Lire les données depuis BigQuery

In [9]:
with open("ensai-2024-630e074aa45c.json") as source:
    info = json.load(source)

credentials = service_account.Credentials.from_service_account_info(info)

prenoms = pd.read_gbq("ml.prenoms", project_id="ensai-2024", credentials=credentials)

In [10]:
prenoms.head()

Unnamed: 0,sexe,preusuel,annais,dpt,nombre
0,1,_PRENOMS_RARES,1900,13,37
1,1,_PRENOMS_RARES,1900,57,37
2,1,_PRENOMS_RARES,1900,59,44
3,1,_PRENOMS_RARES,1900,62,34
4,1,_PRENOMS_RARES,1900,75,45


# Pre-processing

1. Exclure les "_PRENOMS_RARES"
2. Aggréger les "annais" et les "dpt" pour avoir la fréquence par preusuel
3. Déterminer le genre d'un prénom (optionnel)

In [64]:
preprocessing = prenoms[prenoms["preusuel"] != '_PRENOMS_RARES']
preprocessing = prenoms[prenoms["preusuel"].str.len() > 2]
preprocessing["nombre"] = preprocessing["nombre"].astype(int)
preprocessing = preprocessing.groupby(["preusuel", "sexe"]).agg({"nombre": sum}).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessing["nombre"] = preprocessing["nombre"].astype(int)


In [68]:
preprocessing.head()

Unnamed: 0,preusuel,sexe,nombre
0,AADAM,1,32
1,AADEL,1,57
2,AADHIRA,2,22
3,AADIL,1,190
4,AAHIL,1,28


In [69]:
data = (
    preprocessing
        .iloc[preprocessing.groupby("preusuel")["nombre"].idxmax()]
        .drop(["nombre"], axis=1)
)

In [70]:
data.head()

Unnamed: 0,preusuel,sexe
0,AADAM,1
1,AADEL,1
2,AADHIRA,2
3,AADIL,1
4,AAHIL,1


# Feature engineering

1. Encoder les prénoms dans un vecteur d'apparition des lettres (appelé X)
2. a -> 1, b -> 2, c -> 3 etc.
3. on fait quoi avec les - ? et les accents ? et les majuscules ?

In [86]:
def encode_prenom(prenom: str) -> pd.Series:
    """
        This function encode a given name into a pd.Series.
        
        For instance alain is encoded [1, 0, 0, 0, 0 ... 1, 0 ...].
    """
    alphabet = "abcdefghijklmnopqrstuvwxyzé-'"
    prenom = prenom.lower()
    
    return pd.Series([letter in prenom for letter in alphabet]).astype(int)

X = data["preusuel"].apply(encode_prenom)
#encode_prenom("Christophe")

In [91]:
X
Y = (data["sexe"].astype(int) - 1)

In [92]:
X.head(), Y.head()

(   0   1   2   3   4   5   6   7   8   9   ...  19  20  21  22  23  24  25  \
 0   1   0   0   1   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   
 1   1   0   0   1   1   0   0   0   0   0  ...   0   0   0   0   0   0   0   
 2   1   0   0   1   0   0   0   1   1   0  ...   0   0   0   0   0   0   0   
 3   1   0   0   1   0   0   0   0   1   0  ...   0   0   0   0   0   0   0   
 4   1   0   0   0   0   0   0   1   1   0  ...   0   0   0   0   0   0   0   
 
    26  27  28  
 0   0   0   0  
 1   0   0   0  
 2   0   0   0  
 3   0   0   0  
 4   0   0   0  
 
 [5 rows x 29 columns],
 0    0
 1    0
 2    1
 3    0
 4    0
 Name: sexe, dtype: int64)

# Modélisation

In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [94]:
regr = LogisticRegression()

In [95]:
regr.fit(X, Y)

In [111]:
regr.predict([encode_prenom("daeeeei")])

array([1])

In [113]:
y_pred = regr.predict(X)

In [114]:
accuracy_score(Y, y_pred)

0.6460831279594605

In [115]:
import joblib

### Dump du modèle

In [116]:
joblib.dump(regr, "model.v1.bin")

['model.v1.bin']

### Reload du modèle (exemple pour l'API)

In [119]:
regr_loaded = joblib.load("model.v1.bin")

In [120]:
regr_loaded.predict([encode_prenom("Toto")])

array([0])