In [48]:
import pandas as pd
import numpy as np
from pathlib import Path
# !pip install scikit-learn==1.7.2
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import sklearn
print(sklearn.__version__)

1.6.1


In [34]:
# ---- Config ----
DATA_PATH = Path("dataset_ml.csv")  # généré par aggregate_for_ml.py
MODEL_DIR = Path("model")
MODEL_DIR.mkdir(exist_ok=True)

RANDOM_STATE = 42

In [35]:
# ---- Load data ----
df = pd.read_csv("../data/dataset_ml.csv")

In [50]:
df.tail()

Unnamed: 0,user_id,depenses_mensuelles,profil,salaire,loyer,ville,objectif_financier,comportement,habitudes_sorties,habitudes_courses
2395,200,2146.14,jeune_salarie,2560,588,Nantes,investir,économe,244,411
2396,200,1709.23,jeune_salarie,2560,588,Nantes,investir,économe,244,411
2397,200,2026.29,jeune_salarie,2560,588,Nantes,investir,économe,244,411
2398,200,1422.2,jeune_salarie,2560,588,Nantes,investir,économe,244,411
2399,200,1217.93,jeune_salarie,2560,588,Nantes,investir,économe,244,411


In [36]:
# Si 'mois' existe, on peut supprimer. On prédit dépenses_mensuelles.
if "mois" in df.columns:
    df = df.drop(columns=["mois"])

In [37]:
# Drop rows with missing target
df = df.dropna(subset=["depenses_mensuelles"]).reset_index(drop=True)

In [38]:
# Simple feature selection: garder colonnes utiles
# On suppose que df contient: user_id, depenses_mensuelles, profil, salaire, loyer, ville, objectif_financier, comportement, habitudes_sorties, habitudes_courses
features = [
    "profil", "salaire", "loyer", "ville",
    "objectif_financier", "comportement",
    "habitudes_sorties", "habitudes_courses"
]
target = "depenses_mensuelles"

In [39]:
# Safety: garder seulement colonnes existantes
features = [f for f in features if f in df.columns]

X = df[features].copy()
y = df[target].astype(float)

In [40]:
# Train / test split (grouped by user_id to avoid leakage would be better — here simple split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [41]:
# Column types
categorical_cols = [c for c in X_train.columns if X_train[c].dtype == "object"]
numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

In [42]:
# Preprocessing pipelines
cat_pipe = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

num_pipe = Pipeline([
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, categorical_cols)
], remainder="drop")

In [43]:
# Full pipeline
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1))
])

In [44]:
# Train
print("Training model...")
pipeline.fit(X_train, y_train)

Training model...


In [45]:
# Eval
preds = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)
print(f"Evaluation -> MAE: {mae:.2f} €, R2: {r2:.3f}")

Evaluation -> MAE: 605.85 €, R2: 0.360


In [46]:
# Save pipeline
model_path = MODEL_DIR / "finance_pipeline.pkl"
joblib.dump(pipeline, model_path)
print(f"Saved pipeline to {model_path}")

Saved pipeline to model\finance_pipeline.pkl


In [47]:
# Save simple feature metadata for the app
meta = {
    "features": features,
    "numeric_cols": numeric_cols,
    "categorical_cols": categorical_cols
}
joblib.dump(meta, MODEL_DIR / "meta.pkl")
print("Saved metadata.")

Saved metadata.
