In [1]:
# 📚 Importations
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib


In [None]:
# 1. Load the data
# low_memory=False makes pandas parse the file in a single pass instead of
# chunk by chunk when inferring column dtypes.
csv_path = 'accepted.csv'
df = pd.read_csv(csv_path, low_memory=False)
print(f"Shape initial: {df.shape}")

In [None]:
# 2. Basic cleaning
# Keep only the rows with at least 70% of their columns populated
# (`thresh` is the minimum number of non-null values a row must have).
threshold = int(0.7 * df.shape[1])
df = df.dropna(axis=0, thresh=threshold)

# Discard the identifier column when the file provides one.
df = df.drop(columns=['member_id'], errors='ignore')

# Columns kept for modelling; the target ('loan_status') comes last.
features = [
    'loan_amnt', 'term', 'int_rate', 'installment',
    'grade', 'sub_grade', 'emp_length', 'home_ownership',
    'annual_inc', 'verification_status', 'purpose', 'dti',
    'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
    'revol_bal', 'revol_util', 'total_acc', 'last_pymnt_amnt',
    'loan_status',
]

# Shrink the dataset to the selected columns (raises KeyError if one is absent).
df = df[features]
print(f"Shape après réduction: {df.shape}")

In [None]:
# 3. Binary target
# Only loans with a final outcome are kept: 'Fully Paid' -> 1, 'Charged Off' -> 0.
target_map = {'Fully Paid': 1, 'Charged Off': 0}

# .copy() detaches the filtered rows from the frame they were sliced from, so the
# column assignment below (and in later cells) writes to an independent DataFrame
# instead of triggering pandas' SettingWithCopyWarning.
df = df[df['loan_status'].isin(list(target_map))].copy()
df['loan_status'] = df['loan_status'].map(target_map)

In [None]:
# 4. Encode the categorical variables
cat_features = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status', 'purpose']

# One fitted encoder per column, keyed by column name. Re-fitting a single shared
# LabelEncoder would overwrite its classes_ on every iteration, leaving no way to
# encode new records consistently (or decode predictions' inputs) afterwards.
encoders = {}
for col in cat_features:
    le = LabelEncoder()
    # Cast to string first so every value is comparable by the encoder.
    # NOTE(review): with object-dtype columns this presumably turns missing
    # values into the literal string 'nan', i.e. its own category - confirm
    # that this is the intended treatment.
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le
# After the loop `le` still refers to the encoder of the last column, as before.

In [None]:
# 5. Missing-value handling
# Visit only the columns that contain at least one null. Object-typed columns
# receive their most frequent value; every other column receives its median.
# NOTE(review): these statistics are computed on the whole frame, before the
# train/test split performed further down - confirm this is acceptable.
cols_with_nulls = [name for name in df.columns if df[name].isnull().any()]
for name in cols_with_nulls:
    column = df[name]
    fill_value = column.mode()[0] if column.dtype == 'object' else column.median()
    df[name] = column.fillna(fill_value)

In [None]:
# 6. Separate the target from the predictors
target_col = 'loan_status'
y = df[target_col]                   # target vector
X = df.drop(columns=[target_col])    # every remaining column is a predictor


In [None]:

# 7. Train/test split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# stratify=y keeps the proportion of each target class the same in the training and
# test partitions, so the evaluation is not skewed by an unlucky draw of the classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 8. Random Forest model
# 100 trees with a fixed seed for reproducibility. n_jobs=-1 builds the trees on
# all available CPU cores; the seed still fixes the per-tree randomness, so only
# the training time is expected to change.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# 9. Evaluation
# Predict on the held-out rows, then print the per-class text report
# produced by classification_report.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# 10. Persist the model
# Serialize the fitted estimator to disk with joblib.
model_path = 'loan_model.pkl'
joblib.dump(model, model_path)
print("\n✅ Modèle entraîné et sauvegardé sous loan_model.pkl")