In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from scipy.stats import uniform, randint
import gc
from contextlib import contextmanager


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.dummy import DummyClassifier


from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


import joblib
import mlflow
import mlflow.sklearn
import shap


from imblearn.over_sampling import SMOTE


from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
application_test = pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/application_test.csv', sep=',', encoding='ISO-8859-1')
application_train = pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/application_train.csv', sep=',', encoding='ISO-8859-1')

In [3]:
application_test["type_data"]="test"
application_train["type_data"]="train"

In [4]:
def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    # Exclure la colonne 'Type_data' des colonnes catégorielles
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object' and col != 'type_data']
    # Encodage one-hot pour les autres colonnes catégorielles
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

def application_train_test(num_rows=None, nan_as_category=False):
    # Read data and merge
    df = application_train.copy()
    test_df = application_test.copy()

    # Ensure 'TARGET' column exists in test data by filling it with NaN
    test_df['TARGET'] = np.nan

    # Combine train and test datasets
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = pd.concat([df, test_df], axis=0).reset_index(drop=True)

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])

    # Categorical features with One-Hot encode (excluding 'Type_data')
    df, cat_cols = one_hot_encoder(df, nan_as_category)

    # Assurer que 'Type_data' est bien une colonne catégorielle
    df['type_data'] = df['type_data'].astype('category')

    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)

    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    # Clean up
    del test_df
    gc.collect()

    return df


# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    bureau = pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/bureau.csv', sep=',', encoding='ISO-8859-1')
    bb = pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/bureau_balance.csv', sep=',', encoding='ISO-8859-1')
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()
    
    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
    
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    prev = pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/previous_application.csv', sep=',', encoding='ISO-8859-1')
    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
    
    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg


# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    pos= pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/POS_CASH_balance.csv', sep=',', encoding='ISO-8859-1')
    pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    ins = pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/installments_payments.csv', sep=',', encoding='ISO-8859-1')
    ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    cc= pd.read_csv('/Users/amira/Documents/OPC/Projet8/data/credit_card_balance.csv', sep=',', encoding='ISO-8859-1')
    cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg


In [5]:
train_test_df = application_train_test()           # application_train.csv et application_test.csv
bureau_agg_df = bureau_and_balance()               # bureau.csv et bureau_balance.csv
prev_app_agg_df = previous_applications()          # previous_application.csv
pos_cash_agg_df = pos_cash()                       # POS_CASH_balance.csv
installments_agg_df = installments_payments()      # installments_payments.csv
credit_card_agg_df = credit_card_balance()         # credit_card_balance.csv

# Le DataFrame principal est `train_test_df`, et nous fusionnons chaque ensemble basé sur `SK_ID_CURR`.
train_test_df = train_test_df.merge(bureau_agg_df, on='SK_ID_CURR', how='left')
train_test_df = train_test_df.merge(prev_app_agg_df, on='SK_ID_CURR', how='left')
train_test_df = train_test_df.merge(pos_cash_agg_df, on='SK_ID_CURR', how='left')
train_test_df = train_test_df.merge(installments_agg_df, on='SK_ID_CURR', how='left')
train_test_df = train_test_df.merge(credit_card_agg_df, on='SK_ID_CURR', how='left')

Train samples: 307511, test samples: 48744


In [6]:
train_test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [7]:
# Séparer le DataFrame selon la colonne 'Type_data'
train_df = train_test_df[train_test_df['type_data'] == 'train']
train_df = train_df.drop(columns=['type_data'])

In [8]:
# Calculer les corrélations avec la variable TARGET
corr_with_target = train_df.corr()['TARGET'].sort_values(ascending=False)

# Filtrer les variables ayant une corrélation significative avec TARGET
threshold = 0.05  
significant_vars = corr_with_target[abs(corr_with_target) > threshold].index.tolist()

# Créer un sous-ensemble du DataFrame avec ces variables
df_significant = train_df[significant_vars]

# Calculer la matrice de corrélation pour ces variables
corr_matrix = df_significant.corr().abs()

# Supprimer les variables corrélées entre elles 
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Trouver les variables ayant une corrélation supérieure au seuil
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.7)]

# Supprimer les variables corrélées entre elles
df_reduced = df_significant.drop(columns=to_drop)

# Créer un DataFrame sans valeurs manquantes pour le calcul du VIF
X = df_reduced.dropna().assign(constant=1)

# Calculer les VIF pour chaque variable sur les données sans valeurs manquantes
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Filtrer les variables ayant un VIF élevé 
final_vars = vif_data[vif_data['VIF'] < 5]['feature'].tolist()

# Supprimer la constante du résultat final si présente
if 'constant' in final_vars:
    final_vars.remove('constant')

# Les variables finales sélectionnées
print("Variables sélectionnées :", final_vars)

# Créer un dataframe qui regroupe les variables sélectionnées
train_df_2 = train_df[final_vars]

# Ajouter SK_ID_CURR au DataFrame final
if 'SK_ID_CURR' in train_df.columns:
    train_df_2['SK_ID_CURR'] = train_df['SK_ID_CURR']

Variables sélectionnées : ['TARGET', 'CC_CNT_DRAWINGS_ATM_CURRENT_MEAN', 'CC_CNT_DRAWINGS_CURRENT_MAX', 'BURO_DAYS_CREDIT_MEAN', 'CC_AMT_BALANCE_MEAN', 'DAYS_BIRTH', 'PREV_NAME_CONTRACT_STATUS_Refused_MEAN', 'BURO_CREDIT_ACTIVE_Active_MEAN', 'DAYS_EMPLOYED', 'REFUSED_DAYS_DECISION_MAX', 'CC_AMT_BALANCE_MIN', 'ACTIVE_DAYS_CREDIT_MEAN', 'CC_CNT_DRAWINGS_ATM_CURRENT_MAX', 'CC_MONTHS_BALANCE_MEAN', 'BURO_STATUS_1_MEAN_MEAN', 'CC_CNT_DRAWINGS_ATM_CURRENT_VAR', 'REGION_RATING_CLIENT_W_CITY', 'CC_AMT_DRAWINGS_CURRENT_MEAN', 'NAME_INCOME_TYPE_Working', 'PREV_NAME_PRODUCT_TYPE_walk-in_MEAN', 'PREV_CODE_REJECT_REASON_SCOFR_MEAN', 'DAYS_LAST_PHONE_CHANGE', 'APPROVED_DAYS_DECISION_MIN', 'DAYS_ID_PUBLISH', 'REG_CITY_NOT_WORK_CITY', 'REFUSED_HOUR_APPR_PROCESS_START_MIN', 'CODE_GENDER', 'BURO_STATUS_C_MEAN_MEAN', 'NAME_EDUCATION_TYPE_Higher education', 'PREV_NAME_CONTRACT_STATUS_Approved_MEAN', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_2['SK_ID_CURR'] = train_df['SK_ID_CURR']


In [15]:
train_df_2

Unnamed: 0,TARGET,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_MAX,BURO_DAYS_CREDIT_MEAN,CC_AMT_BALANCE_MEAN,DAYS_BIRTH,PREV_NAME_CONTRACT_STATUS_Refused_MEAN,BURO_CREDIT_ACTIVE_Active_MEAN,DAYS_EMPLOYED,REFUSED_DAYS_DECISION_MAX,...,REG_CITY_NOT_WORK_CITY,REFUSED_HOUR_APPR_PROCESS_START_MIN,CODE_GENDER,BURO_STATUS_C_MEAN_MEAN,NAME_EDUCATION_TYPE_Higher education,PREV_NAME_CONTRACT_STATUS_Approved_MEAN,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,SK_ID_CURR
0,1.0,,,-874.000000,,-9461,0.000000,0.250000,-637.0,,...,0,,0,0.175426,0,1.000000,0.083037,0.262949,0.139376,100002
1,0.0,,,-1400.750000,,-16765,0.000000,0.250000,-1188.0,,...,0,,1,,1,1.000000,0.311267,0.622246,,100003
2,0.0,,,-867.000000,,-19046,0.000000,0.000000,-225.0,,...,0,,0,,0,1.000000,,0.555912,0.729567,100004
3,0.0,,0.0,,0.0,-19005,0.111111,,-3039.0,-181.0,...,0,15.0,1,,0,0.555556,,0.650442,,100006
4,0.0,,,-1149.000000,,-19932,0.000000,0.000000,-3038.0,,...,1,,0,,0,1.000000,,0.322738,,100007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,0.0,,,,,-9327,0.000000,,-236.0,,...,0,,0,,0,1.000000,0.145570,0.681632,,456251
307507,0.0,,,,,-20775,0.000000,,,,...,0,,1,,0,1.000000,,0.115992,,456252
307508,0.0,,,-867.500000,,-14966,0.000000,0.500000,-7921.0,,...,1,,1,0.459677,1,1.000000,0.744026,0.535722,0.218859,456253
307509,1.0,,,-1104.000000,,-11961,0.000000,0.000000,-4786.0,,...,1,,1,0.783784,0,1.000000,,0.514163,0.661024,456254


In [10]:
quantitative_columns = train_df_2.select_dtypes(include=[np.number]).columns
quantitative_columns = quantitative_columns.drop('SK_ID_CURR')

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Supposons que train_df_2 contient déjà la variable 'TARGET' et 'SK_ID_CURR'

# Faire le split directement sur le DataFrame complet avec 'TARGET'
df_train, test_df = train_test_split(train_df_2, test_size=0.2, random_state=42)

# Ensuite, on sépare les colonnes après le split
X_test = test_df.drop(columns='TARGET')
y_test = test_df['TARGET']

# Sélectionner uniquement les colonnes quantitatives, en excluant SK_ID_CURR
quantitative_columns = X_test.select_dtypes(include=[np.number]).columns
quantitative_columns = quantitative_columns.drop('SK_ID_CURR', errors='ignore')

# Initialiser l'imputer pour remplacer NaN par 0
imputer = SimpleImputer(strategy='constant', fill_value=0)

# Fit l'imputer sur les données d'entraînement
imputer.fit(df_train[quantitative_columns])

# Appliquer l'imputer sur les colonnes quantitatives des jeux de données de test
X_test[quantitative_columns] = imputer.transform(X_test[quantitative_columns])

# Standardiser les données avec MinMaxScaler
scaler = MinMaxScaler()

# Fit le scaler sur les colonnes quantitatives des données d'entraînement
scaler.fit(df_train[quantitative_columns])

# **Appliquer la standardisation uniquement sur les colonnes quantitatives**
X_test_scaled = scaler.transform(X_test[quantitative_columns])

# Créer un DataFrame à partir de X_test_scaled
X_test_df = pd.DataFrame(X_test_scaled, columns=quantitative_columns)

# Ajouter la colonne 'SK_ID_CURR' sans prétraitement
X_test_df['SK_ID_CURR'] = test_df['SK_ID_CURR'].values

# Ajouter la colonne 'TARGET' à partir de test_df
X_test_df['TARGET'] = test_df['TARGET'].values

# Renommer les colonnes si nécessaire
X_test_df.rename(columns={
    'PREV_NAME_PRODUCT_TYPE_walk-in_MEAN': 'PREV_NAME_PRODUCT_TYPE_walk_in_MEAN',
    'NAME_EDUCATION_TYPE_Higher education': 'NAME_EDUCATION_TYPE_Higher_education'
}, inplace=True)

# Enregistrer les 5 premières lignes dans un fichier CSV
X_test_df.head(5).to_csv('data_illustration_dashboard.csv', index=False)

print("Le jeu de test a été enregistré sous 'data_illustration_dashboard.csv'.")


Le jeu de test a été enregistré sous 'data_illustration_dashboard.csv'.


In [20]:
data = pd.read_csv('data_illustration_dashboard.csv', sep=',')

In [21]:
data

Unnamed: 0,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_MAX,BURO_DAYS_CREDIT_MEAN,CC_AMT_BALANCE_MEAN,DAYS_BIRTH,PREV_NAME_CONTRACT_STATUS_Refused_MEAN,BURO_CREDIT_ACTIVE_Active_MEAN,DAYS_EMPLOYED,REFUSED_DAYS_DECISION_MAX,CC_AMT_BALANCE_MIN,...,REFUSED_HOUR_APPR_PROCESS_START_MIN,CODE_GENDER,BURO_STATUS_C_MEAN_MEAN,NAME_EDUCATION_TYPE_Higher_education,PREV_NAME_CONTRACT_STATUS_Approved_MEAN,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,SK_ID_CURR,TARGET
0,0.0,0.0,0.608878,0.003145,0.672604,0.6,0.428571,0.957459,0.558562,0.230011,...,0.478261,0.0,0.153917,0.0,0.4,0.705732,0.707479,0.0,384575,0.0
1,0.025,0.054545,0.711978,0.031674,0.589121,0.166667,0.4,0.9363,0.741438,0.230011,...,0.26087,0.5,0.0,1.0,0.833333,0.44422,0.497486,0.794687,214010,0.0
2,0.0,0.0,0.705966,0.003145,0.41274,0.142857,0.5,0.964326,0.851027,0.230011,...,0.652174,0.5,0.462302,0.0,0.428571,0.547108,0.621942,0.231648,142232,0.0
3,0.0,0.0,0.171458,0.003145,0.31584,0.0,0.0,0.610205,1.000685,0.230011,...,0.0,0.5,0.0,0.0,1.0,-0.015547,0.811136,0.685538,389171,0.0
4,0.0,0.0,0.583961,0.003145,0.276325,0.333333,0.333333,0.93831,0.595205,0.230011,...,0.434783,0.0,0.0,0.0,0.555556,0.683325,0.655778,0.710063,283617,0.0
