Nous allons voir ici l'ensemble des méthodes pour gérer les valeurs manquantes lors du preprocessing dans le processus de Machine Learning 

In [0]:
import numpy as np 
import pandas as pd 

In [0]:
# Chargement des données 
data = pd.read_csv('Loan_Default.csv')

In [0]:
# Identification du nombre de vides sur l'ensemble des colonnes de notre dataset 
data.isna().sum()

ID                               0
year                             0
loan_limit                    3344
Gender                           0
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
Interest_rate_spread         36639
Upfront_charges              39642
term                            41
Neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                        9150
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                            200
submission_of_applic

In [0]:
# 1ère méthode : Suppression simple des vides dans notre dataset

data_suppress = data.dropna()
# Vérification
data_suppress.isna().sum()

ID                           0
year                         0
loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
property_value               0
construction_type            0
occupancy_type               0
Secured_by                   0
total_units                  0
income                       0
credit_type                  0
Credit_Score                 0
co-applicant_credit_type     0
age                          0
submission_of_application    0
LTV                          0
Region                       0
Security_Type                0
Status  

In [0]:
# 2ème méthode : Séparation des données catégorielles et numériques 

# Variables catégoriques 
data_cat = data.select_dtypes(include=['object'])
# Variables numériques 
data_num = data.select_dtypes(include=['float64','int64'])


In [0]:
# Imputation des valeurs arbitraires sur les données manquantes dans les variables catégorielles et numériques par une constante 

data_cat.fillna('Unknown',inplace=True)
data_num.fillna(0,inplace=True)

In [0]:
data_imputation = pd.concat([data_cat,data_num],axis=1)
# Vérification sur l'ensemble du dataset 
data_imputation.isna().sum()

loan_limit                   0
Gender                       0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
construction_type            0
occupancy_type               0
Secured_by                   0
total_units                  0
credit_type                  0
co-applicant_credit_type     0
age                          0
submission_of_application    0
Region                       0
Security_Type                0
ID                           0
year                         0
loan_amount                  0
rate_of_interest             0
Interest_rate_spread         0
Upfront_charges              0
term                         0
property_value               0
income                       0
Credit_Score                 0
LTV                          0
Status  

In [0]:
data_imputation_stat = data.copy()

In [0]:
#  3ème méthode : Imputation des valeurs arbitraires sur les données manquantes dans les variables catégorielles et numériques par une statistique

# Pour les variables numériques on remplace les vides de la colonne par la moyenne de celle-ci

data_imputation_stat['term'] = data_imputation_stat['term'].fillna(data_imputation_stat['term'].mean())
data_imputation_stat['property_value'] = data_imputation_stat['property_value'].fillna(data_imputation_stat['property_value'].mean())
data_imputation_stat['income'] = data_imputation_stat['income'].fillna(data_imputation_stat['income'].mean())
data_imputation_stat['LTV'] = data_imputation_stat['LTV'].fillna(data_imputation_stat['LTV'].mean())
data_imputation_stat['dtir1'] = data_imputation_stat['dtir1'].fillna(data_imputation_stat['dtir1'].mean())

# Pour les variables catégorielles on remplace les vides de la colonne par l'élément le plus présent dans celle-ci
data_imputation_stat['Neg_ammortization'] = data_imputation_stat['Neg_ammortization'].fillna(data_imputation_stat['Neg_ammortization'].mode()[0])
data_imputation_stat['age'] = data_imputation_stat['age'].fillna(data_imputation_stat['age'].mode()[0])
data_imputation_stat['submission_of_application'] = data_imputation_stat['submission_of_application'].fillna(data_imputation_stat['submission_of_application'].mode()[0])

# Vérification sur l'ensemble du dataset 
data_imputation_stat.isna().sum()

ID                               0
year                             0
loan_limit                    3344
Gender                           0
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
Interest_rate_spread         36639
Upfront_charges              39642
term                             0
Neg_ammortization                0
interest_only                    0
lump_sum_payment                 0
property_value                   0
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                           0
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                              0
submission_of_applic

In [0]:
data_regr_imput = data.copy()

In [0]:
%pip install xgboost

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
data_num_1 = data_regr_imput.select_dtypes(include=['float64','int64'])
data_cat_1 = data_regr_imput.select_dtypes(include=['object'])

In [0]:
# 4ème méthode : Imputation des valeurs manquantes basées sur les relations entre les variables (régression d'imputation)
# Cette méthode s'applique exclusivement sur les données numériques 

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor  # ⚠️ Correction ici
import pandas as pd

# XGBRegressor (et non XGBClassifier) car l'imputation prédit des valeurs continues
xgb_estimator = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

imputer = IterativeImputer(
    estimator=None,
    max_iter=10,          
    random_state=42,
    verbose=2            
)

data_regr_imputed = imputer.fit_transform(data_num_1)

data_regr_imputed = pd.DataFrame(data_regr_imputed, columns=data_num_1.columns)

[IterativeImputer] Completing matrix with shape (148670, 13)
[IterativeImputer] Ending imputation round 1/10, elapsed time 3.69
[IterativeImputer] Change: 4260709.7269555945, scaled tolerance: 16508.0 
[IterativeImputer] Ending imputation round 2/10, elapsed time 6.50
[IterativeImputer] Change: 63008.531766758555, scaled tolerance: 16508.0 
[IterativeImputer] Ending imputation round 3/10, elapsed time 10.60
[IterativeImputer] Change: 37357.83035659518, scaled tolerance: 16508.0 
[IterativeImputer] Ending imputation round 4/10, elapsed time 13.70
[IterativeImputer] Change: 29751.300449213933, scaled tolerance: 16508.0 
[IterativeImputer] Ending imputation round 5/10, elapsed time 17.30
[IterativeImputer] Change: 26129.702612112334, scaled tolerance: 16508.0 
[IterativeImputer] Ending imputation round 6/10, elapsed time 20.74
[IterativeImputer] Change: 23026.283265433456, scaled tolerance: 16508.0 
[IterativeImputer] Ending imputation round 7/10, elapsed time 24.30
[IterativeImputer] Cha

In [0]:
# Vérification de la suppresion des vides sur les colonnes numériques de notre dataset 
data_regr_imputed.isna().sum()


ID                      0
year                    0
loan_amount             0
rate_of_interest        0
Interest_rate_spread    0
Upfront_charges         0
term                    0
property_value          0
income                  0
Credit_Score            0
LTV                     0
Status                  0
dtir1                   0
dtype: int64

In [0]:
data_knn_imput = data.copy()

In [0]:
data_num_2 = data_knn_imput.select_dtypes(include=['float64','int64'])
data_cat_2 = data_knn_imput.select_dtypes(include=['object'])

In [0]:
data_num_2 = list(data_num_2)

In [0]:
# 5ème méthode : Mise en place de la méthode des plus proches voisins pour pallier aux valeurs manquantes dans notre Dataset
# Cette méthode s'applique exclusivement sur les données numériques

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

scaler = StandardScaler()
data_knn_num = data[data_num_2]
data_knn_num_scaled = scaler.fit_transform(data_knn_num)

knn_imputer = KNNImputer(n_neighbors=5) 
knn_imputer.fit(data_knn_num_scaled)
 
data_knn_num_imputed = knn_imputer.fit_transform(data_knn_num_scaled)

In [0]:
# Vérification de la suppresion des vides sur les colonnes numériques de notre dataset 
data_knn_num_final = pd.DataFrame(data_knn_num_imputed, columns=data_num_2)
data_knn_num_final.isna().sum()

ID                      0
year                    0
loan_amount             0
rate_of_interest        0
Interest_rate_spread    0
Upfront_charges         0
term                    0
property_value          0
income                  0
Credit_Score            0
LTV                     0
Status                  0
dtir1                   0
dtype: int64