# Train_model.py

In [1]:
import os
import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
import sklearn 
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from mlflow.models.signature import infer_signature

# Importation des données nettoyées

In [2]:
# Training data: read the cleaned CSV, report its dimensions, then preview it.
train_csv_path = "/home/sacko/Documents/ProjetAchats/Donnees/df_train_cleaned.csv"
df_train_cleaned = pd.read_csv(train_csv_path)
print(df_train_cleaned.shape)
# Show the first rows for a quick look at the structure and the first values.
df_train_cleaned.head()

(667179, 12)


Unnamed: 0,gender,marital_status,employment_status,education_level,annual_income,number_of_children,country,subscription_type,age_group,children_per_age,log_annual_income,account_status
0,Male,Single,Self-employed,Bachelor,154424.71,0,Poland,Basic,65+,0.0,11.947468,Active
1,Male,Single,Unemployed,Other,172544.12,0,Hong Kong,Enterprise,65+,0.0,12.058414,Suspended
2,Male,Single,Retired,Master,23341.82,5,Australia,Premium,25-34,0.147059,10.058045,Active
3,Female,Married,Unemployed,Other,180506.33,3,Samoa,Basic,65+,0.034483,12.103527,Suspended
4,Other,Single,Employed,Other,25122.3,3,Zambia,Premium,25-34,0.115385,10.131551,Inactive


In [3]:
# Test data: read the cleaned CSV, report its dimensions, then preview it.
test_csv_path = "/home/sacko/Documents/ProjetAchats/Donnees/df_test_cleaned.csv"
df_test_cleaned = pd.read_csv(test_csv_path)
print(df_test_cleaned.shape)
# Show the first rows for a quick look at the structure and the first values.
df_test_cleaned.head()

(167049, 12)


Unnamed: 0,gender,marital_status,employment_status,education_level,annual_income,number_of_children,country,subscription_type,age_group,children_per_age,log_annual_income,account_status
0,Male,Single,Unemployed,Master,42779.98,1,Albania,Basic,35-49,0.028571,10.663849,Active
1,Other,Single,Self-employed,Other,32041.68,1,Cyprus,Free,35-49,0.021739,10.374824,Active
2,Male,Divorced,Employed,High School,49148.52,1,Romania,Premium,35-49,0.027027,10.802622,Active
3,Other,Divorced,Retired,High School,25696.3,2,Sudan,Enterprise,65+,0.027397,10.154141,Inactive
4,Male,Divorced,Retired,PhD,146930.23,1,Belarus,Enterprise,35-49,0.020408,11.89772,Suspended


# Modélisation

In [5]:
# Split each dataset into the target (y) and the explanatory variables (X).
y_train = df_train_cleaned["account_status"]
X_train = df_train_cleaned.drop(columns=["account_status"])

y_test = df_test_cleaned["account_status"]
X_test = df_test_cleaned.drop(columns=["account_status"])

# Smoothed target encoding
def target_encode_smooth(df, col, target, alpha=40):
    """Encode a categorical column as smoothed per-class target frequencies.

    For every target class ``y`` and every category ``c`` of ``col`` the
    encoded value is::

        (n_cy + alpha * p_y) / (n_c + alpha)

    where ``n_cy`` is the number of rows of ``df`` with category ``c`` and
    class ``y``, ``n_c`` the number of rows with category ``c`` and ``p_y``
    the overall frequency of class ``y`` in ``df``.

    Args:
        df: DataFrame containing both ``col`` and ``target``.
        col: Name of the categorical column to encode.
        target: Name of the target column.
        alpha: Smoothing strength; larger values pull each category towards
            the global class frequencies.

    Returns:
        A DataFrame sharing ``df``'s index with one column
        ``f"{col}_enc_{cls}"`` per target class, in order of first appearance
        of the class in ``df[target]``.
    """
    # Missing labels are not a class: value_counts() ignores them, so looking
    # them up in the global frequencies would raise a KeyError.
    classes = df[target].dropna().unique()
    global_probas = df[target].value_counts(normalize=True)

    # Category x class contingency table (rows with a missing category or a
    # missing label are left out by groupby / value_counts).
    stats = df.groupby(col)[target].value_counts().unstack(fill_value=0)
    totals = stats.sum(axis=1)

    encoded = pd.DataFrame(index=df.index)

    for cls in classes:
        # A class can be absent from the table when all its rows have a
        # missing category.
        n_cy = stats[cls] if cls in stats.columns else 0
        p_y = global_probas[cls]
        smooth = (n_cy + alpha * p_y) / (totals + alpha)
        # Rows whose category has no statistics (e.g. missing value) fall back
        # to the global prior, i.e. the value of the formula with n_c == 0,
        # instead of propagating NaN into the feature matrix.
        encoded[f"{col}_enc_{cls}"] = df[col].map(smooth).fillna(p_y)

    return encoded


def encode_features(df, target_col='account_status', alpha=10):
    """Build the model-ready frame from a cleaned dataset.

    Three groups of columns are produced and concatenated side by side:
    one-hot indicators for the low-cardinality categorical columns, smoothed
    target-encoding columns for ``country`` (computed on ``df`` itself), and
    every remaining column passed through with integer columns cast to
    ``float64``. The target column is appended last.

    Args:
        df: DataFrame holding the feature columns and ``target_col``.
        target_col: Name of the target column.
        alpha: Smoothing strength forwarded to ``target_encode_smooth``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    one_hot_columns = ['gender', 'marital_status', 'employment_status',
                       'education_level', 'subscription_type', 'age_group']
    frame = df.copy()

    one_hot_part = pd.get_dummies(frame[one_hot_columns], prefix=one_hot_columns)
    country_part = target_encode_smooth(frame, col='country', target=target_col, alpha=alpha)

    # Everything that is neither one-hot encoded, target encoded nor the target.
    excluded = one_hot_columns + ['country', target_col]
    passthrough = frame.drop(columns=excluded).copy()
    int_columns = passthrough.select_dtypes('int').columns
    passthrough = passthrough.astype(dict.fromkeys(int_columns, 'float64'))

    encoded = pd.concat([one_hot_part, country_part, passthrough], axis=1)
    encoded[target_col] = frame[target_col]

    return encoded


# Create the output directories used further down (saved model and reports).
os.makedirs("artifacts", exist_ok=True)
os.makedirs("reports", exist_ok=True)

# Encode both datasets (one-hot columns, country target encoding, numerics).
train_encoded = encode_features(X_train.assign(account_status=y_train), target_col='account_status')
test_encoded = encode_features(X_test.assign(account_status=y_test), target_col='account_status')

X_train_encoded = train_encoded.drop(columns='account_status')
y_train_encoded = train_encoded['account_status']
X_test_encoded = test_encoded.drop(columns='account_status')
y_test_encoded = test_encoded['account_status']

# Give the test matrix exactly the training columns, in the same order;
# indicator columns absent from the test set are filled with 0.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# encode_features() computes the country target encoding from the frame it is
# given, so the test-set values above were derived from the *test labels*.
# That leaks the target into the evaluation and cannot be reproduced at
# inference time, where labels are unknown. Replace those columns with the
# country -> encoding mapping learned on the training set only.
country_enc_cols = [c for c in X_train_encoded.columns if c.startswith("country_enc_")]
# The encoding is constant within a country, so the first row of each group
# is the learned value for that country.
country_lookup = X_train_encoded[country_enc_cols].groupby(X_train["country"]).first()
# Countries never seen in training fall back to the training class prior,
# which is the value of the smoothing formula for a category with zero rows.
prior_by_column = {
    f"country_enc_{cls}": prior
    for cls, prior in y_train.value_counts(normalize=True).items()
}
for enc_col in country_enc_cols:
    X_test_encoded[enc_col] = (
        X_test["country"].map(country_lookup[enc_col]).fillna(prior_by_column[enc_col])
    )


# Training, evaluation and model registration tracked with MLflow
model = RandomForestClassifier(n_estimators=100, random_state=42)
mlflow.set_experiment("account_status_prediction")

with mlflow.start_run():
    model.fit(X_train_encoded, y_train_encoded)
    preds = model.predict(X_test_encoded)

    # Per-class precision/recall/F1 plus overall accuracy, as a plain dict.
    report = classification_report(y_test_encoded, preds, output_dict=True)
    acc = report['accuracy']

    with open("reports/evaluation_report.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=4)
    mlflow.log_artifact("reports/evaluation_report.json")

    input_example = X_train_encoded.iloc[:1]
    # The signature only records column names/types and the output type, so a
    # small sample is enough; predicting on the whole training set again just
    # to infer it is wasted work.
    signature_sample = X_train_encoded.iloc[:100]
    signature = infer_signature(signature_sample, model.predict(signature_sample))

    mlflow.log_param("model_type", "RandomForest")
    # Record the hyper-parameters so the run can be reproduced from tracking.
    mlflow.log_param("n_estimators", model.n_estimators)
    mlflow.log_param("random_state", model.random_state)
    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        input_example=input_example,
        signature=signature,
        registered_model_name="account_status_rf"
    )

    # Also save the model under artifacts/ to match the GitHub Actions workflow.
    joblib.dump(model, "artifacts/model.joblib")


# Sample payload for the API (example).
# NOTE(review): the cleaned datasets loaded earlier in this file also carry an
# `annual_income` column, which this payload does not include — confirm
# whether the API expects it.
data = dict(
    gender="Male",
    marital_status="Single",
    employment_status="Employed",
    education_level="Bachelor",
    subscription_type="Standard",
    age_group="25-34",
    number_of_children=2,
    children_per_age=0.5,
    log_annual_income=10.5,
    country="France",
)


Registered model 'account_status_rf' already exists. Creating a new version of this model...
Created version '2' of model 'account_status_rf'.
