### Importation des librairies

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#Traitement des messages d'erreur :
warnings.filterwarnings('ignore')

import sklearn
#Librairie pour créer un dataset de train et un de test
from sklearn.model_selection import train_test_split
#Librairie pour l'encodage des variables catégorielles
from sklearn.preprocessing import LabelEncoder
#Librairies pour réaliser un pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

### Fonctions utilisées dans le notebook

In [2]:
#Fonction pour préparer le pipeline qui traitera les variables numériques
def Preprocessing (numeric):
    """Fonction pour préparer le pipeline afin de pouvoir traiter les variables numériques"""
    #On prend la médiane concernant le traitement des Nan et on standardise les données à l'aide de RobustScaler
    #Ce Scaler supprime la médiane et met à l'échelle les données en fonction de l'intervalle des quantiles
    numeric_transfs = [('imputer',SimpleImputer(missing_values= np.NAN, strategy= 'median')),('scaler', RobustScaler())]
    numeric_pipeline = Pipeline(numeric_transfs)
    all_transfs = [("numeric",numeric_pipeline,numeric)]
    full_preprocessor = ColumnTransformer(all_transfs, remainder='passthrough')
    return full_preprocessor

### Lecture du dataset

In [3]:
path = 'Projet_7/'
data_work = pd.read_csv(path + "data_final.csv")

In [4]:
display(data_work.head())
print("Voici la taille du dataset : ", data_work.shape)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,DAYS_DECISION,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_LAST_DUE,DAYS_TERMINATION,NAME_CONTRACT_STATUS,CODE_REJECT_REASON,NAME_CLIENT_TYPE,CNT_INSTALMENT_FUTURE,SK_DPD_y
0,100002,1,Cash loans,M,0,202500.0,406597.5,Unaccompanied,working,low_level_education,...,-606.0,24.0,365243.0,-25.0,-17.0,Approved,XAP,New,15.0,0.0
1,100003,0,Cash loans,F,0,270000.0,1293502.5,Family,working,high_level_education,...,-1305.0,10.0,365243.0,-1054.333333,-1047.333333,Approved,XAP,Refreshed,4.909091,0.0
2,100004,0,Revolving loans,M,0,67500.0,135000.0,Unaccompanied,working,low_level_education,...,-815.0,4.0,365243.0,-724.0,-714.0,Approved,XAP,New,0.0,0.0
3,100006,0,Cash loans,F,0,135000.0,312682.5,Unaccompanied,working,low_level_education,...,-272.444444,23.0,365243.0,182477.5,182481.75,Approved,XAP,Repeater,8.65,0.0
4,100007,0,Cash loans,M,0,121500.0,513000.0,Unaccompanied,working,low_level_education,...,-1222.833333,20.666667,365243.0,72136.2,72143.8,Approved,XAP,Repeater,11.666667,0.0


Voici la taille du dataset :  (307511, 47)


### Préparation des différents datasets

In [5]:
#On crée une variable pour stocker le nombre de lignes du dataset
train_len = data_work.shape[0]
print(train_len)

307511


In [6]:
#On crée un dataset qui stockera la variable SK_ID_CURR et on la supprimera du dataset de train
train_dataset = data_work[:train_len]
train_ids = train_dataset['SK_ID_CURR']
train_dataset.drop(columns=['SK_ID_CURR'], axis = 1, inplace=True)

In [7]:
#On crée 2 datasets, un pour les variables utilisées pour la modélisation et un autre pour stocker la variable
#que l'on souhaite prédire
train_dataset['TARGET'] = train_dataset['TARGET'].astype(int)
target = train_dataset['TARGET']
features = train_dataset.drop(columns=['TARGET'], axis = 1)
print('x_train data shape: ', features.shape)
print('y_train data shape: ', target.shape)

#Export des dataset target et features en CSV pour utilisation de la fonction SMOTE ultérieurement
target.to_csv(path + 'target.csv',index=False)
features.to_csv(path + 'features.csv',index=False)

x_train data shape:  (307511, 45)
y_train data shape:  (307511,)


In [98]:
target.info()

<class 'pandas.core.series.Series'>
RangeIndex: 307511 entries, 0 to 307510
Series name: TARGET
Non-Null Count   Dtype
--------------   -----
307511 non-null  int32
dtypes: int32(1)
memory usage: 1.2 MB


In [99]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   NAME_CONTRACT_TYPE           307511 non-null  object 
 1   CODE_GENDER                  307511 non-null  object 
 2   CNT_CHILDREN                 307511 non-null  int64  
 3   AMT_INCOME_TOTAL             307511 non-null  float64
 4   AMT_CREDIT_x                 307511 non-null  float64
 5   NAME_TYPE_SUITE              306219 non-null  object 
 6   NAME_INCOME_TYPE             307511 non-null  object 
 7   NAME_EDUCATION_TYPE          307511 non-null  object 
 8   NAME_FAMILY_STATUS           307511 non-null  object 
 9   REGION_POPULATION_RELATIVE   307511 non-null  float64
 10  DAYS_BIRTH                   307511 non-null  int64  
 11  DAYS_EMPLOYED                307511 non-null  int64  
 12  OWN_CAR_AGE                  104582 non-null  float64
 13 

### Preprocessing

In [100]:
#On commence par diviser le dataset en un dataset de train et un dataset de test avec une proportion de 80% / 20%
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [101]:
#On remet la variable concernant une période sous un format positif
X_test['DAYS_INSTALMENT_delay'] = X_test['DAYS_INSTALMENT_delay'].mul(-1)

In [102]:
#On remet la variable concernant une période sous un format positif
X_train['DAYS_INSTALMENT_delay'] = X_train['DAYS_INSTALMENT_delay'].mul(-1)

In [103]:
#Création de dataframe avec X_test, y_train et y_test que l'on sauvegarde au format .CSV
pd.DataFrame(X_test).to_csv(path + 'X_test.csv',index=False)
pd.DataFrame(y_train).to_csv(path + 'y_train.csv',index=False)
pd.DataFrame(y_test).to_csv(path + 'y_test.csv',index=False)

In [104]:
#Création d'une variable avec la liste des colonnes catégorielles du dataset features
data_categ = list(features.select_dtypes(exclude=["number"]).columns)
#Création d'une variable avec la liste des colonnes numériques du dataset features
data_num = list(features.select_dtypes(exclude=["bool_","object_"]).columns)

In [105]:
print("Voici la liste des colonnes catégorielles : ")
data_categ

Voici la liste des colonnes catégorielles : 


['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'OCCUPATION_TYPE',
 'CREDIT_ACTIVE',
 'NAME_CONTRACT_STATUS',
 'CODE_REJECT_REASON',
 'NAME_CLIENT_TYPE']

In [106]:
print("Voici la liste des colonnes numériques : ")
data_num

Voici la liste des colonnes numériques : 


['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT_x',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'OWN_CAR_AGE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_CITY_NOT_WORK_CITY',
 'TOTALAREA_MODE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'FLAG_DOCUMENT_3',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'DAYS_CREDIT',
 'AMT_CREDIT_SUM',
 'AMT_BALANCE',
 'AMT_PAYMENT_CURRENT',
 'SK_DPD_x',
 'SK_DPD_DEF',
 'DAYS_INSTALMENT_delay',
 'AMT_INSTALMENT_delta',
 'AMT_ANNUITY',
 'AMT_CREDIT_y',
 'AMT_DOWN_PAYMENT',
 'DAYS_DECISION',
 'CNT_PAYMENT',
 'DAYS_FIRST_DRAWING',
 'DAYS_LAST_DUE',
 'DAYS_TERMINATION',
 'CNT_INSTALMENT_FUTURE',
 'SK_DPD_y']

In [107]:
#On va maintenant encoder les variables catégorielles
encoder = LabelEncoder()

for col in data_categ:
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.fit_transform(X_test[col])

display(X_train)

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,...,DAYS_DECISION,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_LAST_DUE,DAYS_TERMINATION,NAME_CONTRACT_STATUS,CODE_REJECT_REASON,NAME_CLIENT_TYPE,CNT_INSTALMENT_FUTURE,SK_DPD_y
182078,0,0,0,130500.0,1170000.0,6,1,1,0,0.035792,...,-1899.000000,10.000000,365243.000000,-1582.000000,-1577.000000,0,7,2,,
171974,0,0,1,225000.0,432661.5,6,1,1,1,0.031329,...,-1229.250000,9.333333,365243.000000,120299.333333,120303.000000,0,7,2,11.000000,0.0
197505,0,0,0,157500.0,342000.0,6,1,1,0,0.018209,...,-785.916667,8.000000,313024.428571,103756.714286,103765.285714,0,7,2,10.700000,0.0
131372,0,0,0,360000.0,1305000.0,6,1,1,0,0.020246,...,-1313.571429,16.571429,365243.000000,51203.857143,51209.571429,0,7,2,15.272727,0.0
140700,0,0,0,108000.0,450000.0,6,1,1,1,0.011657,...,-1677.800000,2.800000,242682.000000,-1572.333333,120890.000000,0,7,2,5.090909,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180786,0,1,1,90000.0,807534.0,6,1,1,0,0.020246,...,-688.000000,16.000000,365243.000000,-293.000000,-287.500000,0,7,2,7.307692,0.0
21579,1,0,0,67500.0,202500.0,6,1,0,0,0.014520,...,-1496.000000,24.000000,365243.000000,-895.000000,-891.000000,0,7,1,,
96283,0,0,2,90000.0,550489.5,6,1,1,0,0.010500,...,-1638.666667,11.333333,365243.000000,-1376.666667,-1371.666667,0,7,2,8.400000,0.0
118942,0,0,0,121500.0,239850.0,6,1,1,1,0.024610,...,-351.833333,19.250000,365243.000000,145734.600000,145740.400000,0,7,2,10.846154,0.0


In [108]:
#Utilisation de la fonction avec le pipeline traitant les variables numériques
#Entrainement des variables numériques
preprocessor_fitted = Preprocessing(data_num).fit(X_train)
X_train_transformed = preprocessor_fitted.transform(X_train)
X_test_transformed = preprocessor_fitted.transform(X_test)

In [109]:
#On vérifie maintenant que les différentes transformations des types des colonnes ont bien eu lieu
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246008 entries, 182078 to 134878
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   NAME_CONTRACT_TYPE           246008 non-null  int32  
 1   CODE_GENDER                  246008 non-null  int32  
 2   CNT_CHILDREN                 246008 non-null  int64  
 3   AMT_INCOME_TOTAL             246008 non-null  float64
 4   AMT_CREDIT_x                 246008 non-null  float64
 5   NAME_TYPE_SUITE              246008 non-null  int32  
 6   NAME_INCOME_TYPE             246008 non-null  int32  
 7   NAME_EDUCATION_TYPE          246008 non-null  int32  
 8   NAME_FAMILY_STATUS           246008 non-null  int32  
 9   REGION_POPULATION_RELATIVE   246008 non-null  float64
 10  DAYS_BIRTH                   246008 non-null  int64  
 11  DAYS_EMPLOYED                246008 non-null  int64  
 12  OWN_CAR_AGE                  83633 non-null   float64

On voit que cela est conforme car nous n'avons plus que des variables numériques

In [110]:
print("Voici la taille du dataset X_test_transformed : ", X_test_transformed.shape)

Voici la taille du dataset X_test_transformed :  (61503, 45)


In [111]:
print("Voici la taille du dataset X_train_transformed : ", X_train_transformed.shape)

Voici la taille du dataset X_train_transformed :  (246008, 45)


In [112]:
#Création de dataframe avec X_train_transformed et X_test_transformed que l'on sauvegarde au format .CSV
pd.DataFrame(X_train_transformed).to_csv(path + 'X_train_transformed.csv',index=False)
pd.DataFrame(X_test_transformed).to_csv(path + 'X_test_transformed.csv',index=False)

### Conclusion

On a donc pu réaliser un split avec un dataset de train et un de test que nous avons sauvegarder en CSV.
Les datasets X_train_transformed et X_test_transformed permmettent d'avoir des datasets avec des variables catégorielles encodées pour préparer l'entrainement des modèles.