**Compétition Home Credit : risque de défaut de crédit**





# **Importation des données**

Nous utilisons une pile typique de science des données : numpy, pandas, sklearn, matplotlib.

In [None]:
# Mount Google Drive at /content/drive so the files stored there can be read
# from this notebook (uses the google.colab helper module).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Imports libs
import pandas as pd
import numpy as np
import os, warnings, csv, pickle
warnings.filterwarnings('ignore')
from joblib import dump, load

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split


In [None]:
# Folder on the mounted Drive that holds the project data files
path = "/content/drive/MyDrive/Data.Projet3/Projet7/Projet+credit/"

# Read the training table, using SK_ID_CURR as the row index
train_csv = os.path.join(path, "application_train.csv")
app_train = pd.read_csv(train_csv, index_col="SK_ID_CURR")
print('Training data shape: ', app_train.shape)

Training data shape:  (307511, 121)


In [None]:
# --- Feature engineering on the application table ---------------------------

# AGE: DAYS_BIRTH divided by -365, truncated to whole years.
app_train['AGE'] = (app_train['DAYS_BIRTH'] / -365).astype('int64')

# The value 365243 in DAYS_EMPLOYED is mapped to 0
# (presumably a placeholder for "no employment" -- TODO confirm with the data dictionary).
# The result is assigned back to the frame: calling replace(..., inplace=True) on the
# selected column is a chained in-place update, which pandas warns about and which
# does not modify app_train once Copy-on-Write is enabled.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: 0})

# astype('int64') already yields whole years, so no additional rounding is needed.
app_train['YEARS_EMPLOYED'] = (app_train['DAYS_EMPLOYED'] / -365).astype('int64')
app_train['YEARS_EMPLOYED_PERCENT'] = app_train['YEARS_EMPLOYED'] / app_train['AGE']

# The raw day counters are replaced by the year-based columns above.
app_train = app_train.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

# Ratios between the credit amount, annuity, income and goods price.
app_train["CREDIT_ANNUITY_RATIO"] = app_train["AMT_CREDIT"] / app_train["AMT_ANNUITY"]
app_train["INCOME_ANNUITY_RATIO"] = app_train["AMT_INCOME_TOTAL"] / app_train["AMT_ANNUITY"]
app_train["INCOME_CREDIT_RATIO"] = app_train["AMT_INCOME_TOTAL"] / app_train["AMT_CREDIT"]
app_train["CREDIT_GOODS_PRICE_RATIO"] = app_train["AMT_CREDIT"] / app_train["AMT_GOODS_PRICE"]
app_train["CREDIT_DOWNPAYMENT"] = app_train["AMT_GOODS_PRICE"] / app_train["AMT_CREDIT"]
app_train["CREDIT_INCOME_PERCENT"] = app_train["AMT_CREDIT"] / app_train["AMT_INCOME_TOTAL"]
app_train["ANNUITY_INCOME_PERCENT"] = app_train["AMT_ANNUITY"] / app_train["AMT_INCOME_TOTAL"]
# NOTE(review): same expression as CREDIT_GOODS_PRICE_RATIO above. Both columns are
# kept because the column-name lists pickled later in the notebook include them.
app_train["RATIO_CREDIT_GOODS_PRICE"] = app_train["AMT_CREDIT"] / app_train["AMT_GOODS_PRICE"]
app_train["DIFF_GOODS_PRICE_CREDIT"] = app_train["AMT_CREDIT"] - app_train["AMT_GOODS_PRICE"]
app_train['CREDIT_TERM'] = app_train['AMT_ANNUITY'] / app_train['AMT_CREDIT']

In [None]:
def sampling(df, sample_size, random_state=None):
    """Draw the same fraction of rows from each TARGET class (0 and 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``TARGET`` column. Only rows whose TARGET is 0 or 1
        are sampled. The input frame is not modified.
    sample_size : float
        Fraction of each class to keep, between 0 and 1 inclusive.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the draw can be reproduced.
        The default ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled class-0 rows followed by the sampled class-1 rows, with a
        fresh 0..n-1 index (the original index is discarded).

    Raises
    ------
    ValueError
        If ``sample_size`` is outside [0, 1].
    """
    if not 0 <= sample_size <= 1:
        raise ValueError(
            "sample_size must be a fraction between 0 and 1, got %r" % (sample_size,)
        )

    print('la taill de dataset initial : ', df.shape)
    # reset_index returns a new frame, so the caller's DataFrame is left untouched
    df = df.reset_index(drop = True)
    df_sample0 = df[df['TARGET'] == 0]
    df_sample1 = df[df['TARGET'] == 1]
    # Number of rows to keep per class
    n0 = round(len(df_sample0)*sample_size)
    n1 = round(len(df_sample1)*sample_size)
    print('le nombre de lignes de df_sample0: ', n0)
    print('le nombre de lignes de df_sample1: ', n1)

    df_sample0 = df_sample0.sample(n = n0, random_state = random_state)
    df_sample1 = df_sample1.sample(n = n1, random_state = random_state)

    df_sample = pd.concat([df_sample0, df_sample1], axis=0)
    df_sample = df_sample.reset_index(drop=True)
    print('Random under-sampling:')
    print(df_sample.shape)
    print(df_sample.TARGET.value_counts())
    return df_sample

def split_stratified(df, test_size, random_state=0):
    """Split ``df`` into train and test parts that keep the TARGET class ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``TARGET`` column used as the label.
    test_size : float or int
        Forwarded to ``StratifiedShuffleSplit`` as the size of the test part.
    random_state : int, optional
        Seed of the shuffle (default 0).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X parts hold every
        column except TARGET and the y parts hold the TARGET column.
    """
    X = df.drop(columns = ['TARGET'])
    y = df['TARGET']
    # Only one partition is returned, so only one is generated.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_index, test_index = next(sss.split(X, y))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    return X_train, X_test, y_train, y_test
# Stratified train/test split of the engineered table (test_size = 0.25).
# NOTE(review): the following cell assigns these same four names again with
# train_test_split, so the result computed here is superseded.
X_train, X_test, y_train, y_test = split_stratified(app_train, test_size = 0.25)

In [None]:
# Hold out 25% of the rows for testing.
# stratify keeps the TARGET class proportions identical in both parts, and
# random_state makes the split (and every artefact fitted on it) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    app_train.drop(columns = ['TARGET']),
    app_train['TARGET'],
    test_size = 0.25,
    stratify = app_train['TARGET'],
    random_state = 0,
)

In [None]:
# FEATURES
# Numeric columns (float64 or int64) in alphabetical order, without the label
col_qunti = sorted(app_train.select_dtypes(include=['float64', 'int64']).columns)
col_qunti.remove("TARGET")

# Columns that receive a polynomial expansion
poly_features = sorted(['AGE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])

# Every other numeric column
col_qunti_except_poly = [name for name in col_qunti if name not in poly_features]

# Object-dtype columns, split between ordinal encoding and one-hot encoding
col_category = sorted(app_train.select_dtypes(include='object').columns)
label_encode_features = sorted(["FLAG_OWN_CAR", "FLAG_OWN_REALTY", "NAME_CONTRACT_TYPE"])
one_hot_encode_features = [name for name in col_category if name not in label_encode_features]

In [None]:
# preprocessor_dashboard
# Numeric columns: missing values are filled with the column median
quanti_processor_dashboard = Pipeline(steps=[
    ('impute_quanti', SimpleImputer(missing_values=np.nan, strategy='median')),
])

# Categorical columns: missing values are filled with the constant "NC"
category_preprocessor_dashboard = Pipeline(steps=[
    ('impute_category', SimpleImputer(missing_values=np.nan,
                                      strategy="constant",
                                      fill_value="NC")),
])

# Apply each imputer to its own group of columns
preprocessor_dashboard = ColumnTransformer(
    transformers=[
        ('quanti_processor', quanti_processor_dashboard, col_qunti),
        ('category_processor', category_preprocessor_dashboard, col_category),
    ],
    verbose=False,
)
pretraitement_dashboard = Pipeline(steps=[('preprocessor', preprocessor_dashboard)])

# col_name for dataframe - dashboard: numeric names first, then categorical ones
dashboard_col_name = col_qunti + col_category
with open('dashboard_col_name.pkl', 'wb') as f:
    pickle.dump(dashboard_col_name, f)

In [None]:
# Select the training columns in the order of dashboard_col_name, fit the
# imputation-only dashboard pipeline on them and save it to disk with joblib.
X_train = X_train[dashboard_col_name]
pretraitement_dashboard.fit(X_train, y_train)
dump(pretraitement_dashboard, 'pretraitement_dashboard.joblib')

['pretraitement_dashboard.joblib']

In [None]:
#preprocessor_prediction
# Numeric columns (outside the polynomial group): median imputation, then
# min-max scaling to [0, 1]. The step keeps its historical name 'standard'.
quanti_processor_prediction = Pipeline(steps=[('impute_quanti', SimpleImputer(missing_values = np.nan,
                                                                                  strategy='median')),
                                              ('standard', MinMaxScaler(feature_range = (0, 1)))])
# Polynomial group: median imputation, degree-3 expansion, then min-max scaling.
# The scaling step matters because the expansion is built from unscaled inputs
# (AGE is in whole years), so its higher-order terms can be orders of magnitude
# larger than the [0, 1] columns produced by the other branches. The classifier
# trained later in this notebook uses the 'sag' solver, for which scikit-learn
# only guarantees fast convergence on features of similar scale.
# The 'polynomial' step name is unchanged, so lookups by that name still work.
poly_processor = Pipeline(steps=[('impute_quanti', SimpleImputer(missing_values = np.nan,
                                                                     strategy='median')),
                                 ('polynomial', PolynomialFeatures(degree = 3)),
                                 ('scale', MinMaxScaler(feature_range = (0, 1)))])
# Ordinal-encoded categoricals: missing values become "NC", then integer codes.
label_encode_preprocessor = Pipeline(steps=[('impute_category', SimpleImputer(missing_values = np.nan,
                                                                              strategy = "constant",
                                                                              fill_value = "NC")),
                                            ('label_encode', OrdinalEncoder())])
# One-hot-encoded categoricals: missing values become "NC"; categories not seen
# during fit are ignored at transform time (handle_unknown='ignore').
one_hot_encode_preprocessor = Pipeline(steps=[('impute_category', SimpleImputer(missing_values = np.nan,
                                                                                strategy = "constant",
                                                                                fill_value = "NC")),
                                              ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))])
# The transformer order below defines the column order of the output matrix.
preprocessor_prediction = ColumnTransformer(
    transformers=[
        ('quanti_processor', quanti_processor_prediction, col_qunti_except_poly),
        ('poly_processor', poly_processor, poly_features),
        ('label_encode_processor', label_encode_preprocessor, label_encode_features),
        ('one_hot_encode_processor', one_hot_encode_preprocessor, one_hot_encode_features)
    ], verbose = False)

pretraitement_prediction = Pipeline(steps=[('preprocessor', preprocessor_prediction)])

In [None]:
# Fit the prediction preprocessor on the training data and save it to disk.
pretraitement_prediction.fit(X_train, y_train)
dump(pretraitement_prediction, 'pretraitement_prediction.joblib')


def _feature_names(transformer, input_features):
    """Return the output column names of a fitted transformer as a list.

    Uses ``get_feature_names_out`` when the installed scikit-learn provides it
    and falls back to the older ``get_feature_names`` otherwise, because recent
    scikit-learn releases no longer ship the old method.
    """
    getter = getattr(transformer, 'get_feature_names_out', None)
    if getter is None:
        getter = transformer.get_feature_names
    return list(getter(input_features))


#col_name for dataframe - prediction
# Fitted sub-pipelines are looked up by name rather than by position, so the
# lookup does not depend on the order of the ColumnTransformer entries.
fitted_transformers = pretraitement_prediction.named_steps['preprocessor'].named_transformers_
poly_col = _feature_names(
    fitted_transformers['poly_processor'].named_steps['polynomial'], poly_features)
one_hot_encode_col = _feature_names(
    fitted_transformers['one_hot_encode_processor'].named_steps['one_hot_encode'],
    one_hot_encode_features)
# Same order as the transformers of the ColumnTransformer
col_name = [*col_qunti_except_poly, *poly_col, *label_encode_features, *one_hot_encode_col]
with open('col_name_list.pkl', 'wb') as f:
    pickle.dump(col_name, f)

In [None]:
# Class-weighted, L2-regularised logistic regression wrapped in a one-step pipeline.
# - multi_class is left at its default: recent scikit-learn releases deprecate the
#   argument, and for a two-class label the default resolves to the same
#   one-vs-rest fit (assumes TARGET is binary -- TODO confirm).
# - random_state seeds the stochastic 'sag' solver so the saved model is repeatable.
model = Pipeline(steps=[('classifier', LogisticRegression(max_iter=1000,
                                                          solver='sag',
                                                          class_weight='balanced',
                                                          C = 0.1,
                                                          penalty = "l2",
                                                          random_state = 0))])
# Reuse the preprocessor exactly as it was fitted earlier in the notebook
# (the instance that was saved to disk) instead of fitting it a second time,
# so the persisted preprocessor and the persisted model always match.
X_train_prediction = pretraitement_prediction.transform(X_train)
model.fit(X_train_prediction, y_train)
dump(model, 'model.joblib')

['model.joblib']

In [None]:
# Export the first 2500 rows of the engineered table, index included, to CSV
sample_data = app_train.head(2500)
sample_data.to_csv('sample_data.csv', index=True)