In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [5]:
# --- 1. Charger le dataset ---
df = pd.read_csv("df_complet.csv")  # adapter le chemin si besoin

In [6]:
df.head()

Unnamed: 0,type_dossier,cumul_crd_immo,nbr_credit_immo,cumul_crd_conso,nbr_credit_conso,nbr_enfants,emprunteur_salaire,coemprunteur_salaire,emprunteur.situation_familiale.libelle,emprunteur.intitule,...,coemprunteur.anciennete,coemprunteur.nombre_rejets,coemprunteur.estimation_retraite,est_encaisse,emprunteur_subventions,coemprunteur_subventions,type_contrat_menage,CategorieDPT,age_cat,co_age_cat
0,proprietaire,0.0,0.0,68367.6,4.0,0,1900.0,0.0,Veuf/Veuve,Madame,...,0.0,0.0,0.0,non_encaisse,0.0,0.0,RETRAITE,2,70+,10
1,proprietaire,350000.0,1.0,20000.0,1.0,0,4500.0,0.0,Marié(e),Monsieur,...,0.0,0.0,0.0,non_encaisse,0.0,0.0,CDI,2,35-40,10
2,proprietaire,0.0,0.0,22500.0,1.0,2,0.0,0.0,Célibataire,Monsieur,...,0.0,0.0,0.0,non_encaisse,0.0,0.0,CDI,4,45-50,10
3,proprietaire,80000.0,1.0,9000.0,2.0,2,2302.0,0.0,Célibataire,Madame,...,0.0,0.0,0.0,non_encaisse,0.0,0.0,CDI,1,40-45,10
4,locataire,0.0,0.0,50900.0,5.0,0,1481.0,0.0,Célibataire,Monsieur,...,0.0,0.0,0.0,non_encaisse,0.0,0.0,INCONNU,0,30-35,10


In [7]:
# --- 2. Identifier la target ---
target_col = "est_encaisse"
if target_col not in df.columns:
    raise ValueError(f"Colonne cible '{target_col}' non trouvée dans le CSV.")

In [8]:
# --- 3. Nettoyage simple ---
# Exemples généraux — adapte selon ton jeu de données :
df = df.copy()
# Supprimer lignes entièrement vides
df.dropna(how="all", inplace=True)

In [11]:
# Séparer X / y
X = df.drop(columns=[target_col])
# Convertir la target en binaire 0/1 (valeur 'encaisse' -> 1, autre -> 0)
y = (df[target_col] == 'encaisse').astype(int)

In [13]:
# Détecter colonnes numériques/catégorielles
num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object","category","bool"]).columns.tolist()

In [14]:
# Imputer valeurs manquantes simples
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

In [17]:
# Pipeline de préprocessing
numeric_pipeline = Pipeline(steps=[
    ("imputer", num_imputer),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", cat_imputer),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])

In [20]:
# Pipeline complet avec modèle
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

In [21]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [22]:
# Entraînement
clf.fit(X_train, y_train)

In [24]:
# Évaluation
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]


In [26]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9811802232854865


In [28]:
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Confusion matrix:
 [[3076    2]
 [  57    0]]


In [29]:
print("Classification report:\n", classification_report(y_test, y_pred))

Classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3078
           1       0.00      0.00      0.00        57

    accuracy                           0.98      3135
   macro avg       0.49      0.50      0.50      3135
weighted avg       0.96      0.98      0.97      3135



In [30]:
# Interpréter coefficients (approximation : pour variables encodées, noms générés)
# Récupérer noms des features après preprocessor
def get_feature_names(column_transformer):
    # pour sklearn >= 1.0
    feature_names = []
    for name, trans, cols in column_transformer.transformers_:
        if name == "remainder":
            continue
        if hasattr(trans, 'named_steps') and 'onehot' in trans.named_steps:
            ohe = trans.named_steps['onehot']
            in_cols = cols
            ohe_names = ohe.get_feature_names_out(in_cols)
            feature_names.extend(ohe_names.tolist())
        elif hasattr(trans, 'named_steps') and 'scaler' in trans.named_steps:
            feature_names.extend(cols.tolist())
        else:
            feature_names.extend(cols.tolist())
    return feature_names

In [31]:
feat_names = get_feature_names(clf.named_steps['preprocessor'])
coefs = clf.named_steps['model'].coef_[0]
coef_df = pd.DataFrame({"feature": feat_names, "coef": coefs})
coef_df = coef_df.reindex(coef_df.coef.abs().sort_values(ascending=False).index)

AttributeError: 'list' object has no attribute 'tolist'