In [62]:
# %%
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [63]:
# Charger les données
# Chargement des données
df = pd.read_csv("credit_risk_dataset.csv", sep=";")  # Remplace par le bon chemin si nécessaire

# Aperçu des données
print(df.head())
print(df.info())
print(df.describe())


   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y                           3  


In [64]:
# Gestion des valeurs manquantes
df.fillna(df.median(numeric_only=True), inplace=True)

In [65]:
# Liste des colonnes catégorielles et numériques
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
numerical_columns = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']


In [66]:
# Préprocessing des données
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_columns),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_columns)
    ])

In [67]:
# %%
# Séparation des données
X = df.drop(columns=['loan_status'])
y = df['loan_status']

# %%
# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [68]:
# %%
# Créer le pipeline avec un classificateur (par exemple RandomForest)
# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', RandomForestClassifier())
# ])
# Reduce the number of trees and/or set a max depth
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42))  # Set lower values
])

# Entraînement du modèle
model.fit(X_train, y_train)

In [69]:
# %%
# Évaluation du modèle
y_pred_rf = model.predict(X_test)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
print("AUC-ROC:", roc_auc_score(y_test, y_pred_rf))

Random Forest Performance:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      5072
           1       0.97      0.70      0.81      1445

    accuracy                           0.93      6517
   macro avg       0.95      0.85      0.88      6517
weighted avg       0.93      0.93      0.92      6517

AUC-ROC: 0.8461775348476744


In [70]:
# %%
# Sauvegarder le modèle complet (incluant le préprocesseur et le classificateur)
# joblib.dump(model, 'model.joblib')
joblib.dump(model, 'model.joblib', compress=3) 

# %%

['model.joblib']