In [1]:
import os

import pandas as pd

from google.cloud import bigquery
from google.oauth2 import service_account

# Chargement des données

In [2]:
# Location of the service-account JSON key used to authenticate.
# Read it from the GOOGLE_APPLICATION_CREDENTIALS environment variable when
# set, so the notebook is not tied to one machine; the original local path is
# kept as the fallback so existing setups keep working unchanged.
KEY_FILE_PATH = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    '/Users/blef/Downloads/moonlit-palace-338110-e5f03943d3d1.json',
)

In [3]:
# Load service-account credentials from the key file, requesting the
# cloud-platform OAuth scope.
credentials = service_account.Credentials.from_service_account_file(
    filename=KEY_FILE_PATH,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

In [28]:
# Sum of `nombre` for every (preusuel, sexe) pair, largest totals first.
query = """
    SELECT preusuel, sexe, sum(nombre) as total
    FROM ml.prenoms
    GROUP BY preusuel, sexe
    ORDER BY total DESC
"""

# Run the query and keep every row whose name is not the "_PRENOMS_RARES"
# placeholder (presumably an aggregate bucket for rare names - TODO confirm).
# The original row index is preserved, so it has gaps where rows were removed.
df = pd.read_gbq(
    query=query, project_id=credentials.project_id, credentials=credentials
).loc[lambda frame: frame["preusuel"].ne("_PRENOMS_RARES")]

In [29]:
# Preview the first five rows (the query sorts by total, descending).
df.head(n=5)

Unnamed: 0,preusuel,sexe,total
0,MARIE,2,2232238
1,JEAN,1,1914060
2,PIERRE,1,891611
4,MICHEL,1,820353
6,ANDRÉ,1,711950


# Modélisation

In [30]:
# Number of (name, sex) rows left after filtering.
df.shape[0]

37242

In [186]:
from unidecode import unidecode

In [187]:
def encoder(names):
    """Encode each name as per-character occurrence counts.

    Every name is transliterated with ``unidecode`` and upper-cased, then the
    occurrences of each character of ``A``-``Z`` plus ``-`` are counted.

    Parameters
    ----------
    names : pandas.Series
        Series of name strings.

    Returns
    -------
    pandas.DataFrame
        One integer column per alphabet character (27 columns, in alphabet
        order), indexed like ``names``.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ-"

    # Normalise once: this step does not depend on the letter being counted,
    # so doing it inside the loop repeated the same work 27 times.
    normalized = names.apply(unidecode).str.upper()

    # `str.count` interprets its argument as a regular expression; every
    # character in `alphabet` matches itself literally, so no escaping is
    # needed here.
    return pd.DataFrame(
        {letter: normalized.str.count(letter).astype(int) for letter in alphabet}
    )

In [188]:
# Features: per-character counts of each name.
X = encoder(df["preusuel"])
# Target: 1 where `sexe` equals 1, otherwise 0.
y = df["sexe"].eq(1).astype(int)

In [189]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [190]:
# Shuffle, then hold out 30% of the rows for evaluation; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.3,
)

In [191]:
# Logistic-regression classifier, constructed with no arguments (library
# defaults for every hyper-parameter).
regr = LogisticRegression()

In [192]:
# Fit the classifier on the training split.
regr.fit(X=X_train, y=y_train)

LogisticRegression()

In [193]:
# Sanity check on a single name; a prediction of 1 means `sexe == 1`.
regr.predict(
    X=encoder(
        names=pd.Series(["christian"]),
    ),
)

array([0])

In [194]:
# Mean accuracy on the held-out split.
regr.score(X=X_test, y=y_test)

0.6814642441600286

# Export du modèle

In [195]:
import joblib

In [196]:
# Persist the fitted classifier to disk in the current working directory.
joblib.dump(value=regr, filename="model.v1.pickle")

['model.v1.pickle']