<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Objectif" data-toc-modified-id="Objectif-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Objectif</a></span></li><li><span><a href="#Chargement-des-packages" data-toc-modified-id="Chargement-des-packages-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Chargement des packages</a></span></li><li><span><a href="#Chargement-des-datasets" data-toc-modified-id="Chargement-des-datasets-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Chargement des datasets</a></span></li><li><span><a href="#Fonctions-personnalisées" data-toc-modified-id="Fonctions-personnalisées-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Fonctions personnalisées</a></span></li><li><span><a href="#Découverte-des-datasets" data-toc-modified-id="Découverte-des-datasets-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Découverte des datasets</a></span></li><li><span><a href="#Séparation-train_test" data-toc-modified-id="Séparation-train_test-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Séparation train_test</a></span></li></ul></div>

# Objectif

# Chargement des packages

In [17]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits import mplot3d
import pandas as pd
import seaborn as sns
import os
import missingno as msno
import warnings
import time
from urllib import request
from zipfile import ZipFile
from sklearn.model_selection import GroupShuffleSplit

pd.options.mode.chained_assignment = None  # turn off pandas' chained-assignment warning
# Ignore every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# ----- Parameters -----

## Name of the data folder
path = "./data"
## Name of the zip file (stored inside the data folder)
filepath = path + "/prj7_data.zip"
## Source of the zip file
## NOTE(review): this is a signed link (it carries `Expires=` and `Signature=`
## query parameters), so it presumably stops working after the expiry time —
## confirm and consider reading the URL from configuration instead.
zip_url = "https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/9120/860599/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1620896494&Signature=sExz0pdzSGvmb6V0LSOB27qrh%2BIB2ZaVOYLGj2wAds8eSSdEn%2FhQEN%2FXh1vPXEiXLYwoVM1QLc5E6t7ujJnJuspXqhqRD9zUSSar6y0F%2Fi1gDMk1klzSCL0mLUNr1%2BkLohz3L%2BNikWROA7rXceZH7Hi5nL7QBluzkL2G8bNseDn6YFp%2FlRkK%2FY18Plyxx2mU3%2FuDmAeamc6NZENSDcPgKoxWxz0wWBw%2BXFcsIBvcLx3tSs%2Bdg4HDzi0xyFtBtXMyiACmFTZyLcwkDfqFAAUZSBYsV%2Bvp%2Foe7aPC0jYLYo7O9Nv40wjfhj3QRa95ipqDOxB4cSXW%2FqlT0Ai4isYNuPA%3D%3D&response-content-disposition=attachment%3B+filename%3Dhome-credit-default-risk.zip"

# Chargement des datasets

In [2]:
# Create the data folder (no-op when it already exists).
os.makedirs(path, exist_ok=True)

# Download and unzip the datasets; skipped when the archive is already on disk.
if not os.path.exists(filepath):
    # The notebook imports `request` from urllib (not the third-party
    # `requests` package), so the download goes through `request.urlopen`.
    # The payload is streamed to a temporary name and only renamed once the
    # transfer has finished, so an interrupted download never leaves a
    # truncated archive that later runs would mistake for a complete one.
    partial_path = filepath + ".part"
    with request.urlopen(zip_url) as response, open(partial_path, "wb") as zip_file:
        # Copy in 1 MiB chunks to avoid holding the whole archive in memory.
        for chunk in iter(lambda: response.read(1024 * 1024), b""):
            zip_file.write(chunk)
    os.replace(partial_path, filepath)

    # Extract every member of the archive into the data folder.
    with ZipFile(filepath, "r") as zipObj:
        zipObj.extractall(path)

In [4]:
# Load every CSV from the configured data folder (`path`) rather than a
# hard-coded "./data" prefix, so changing `path` in the parameters cell is
# enough to relocate the data.
application_test = pd.read_csv(os.path.join(path, "application_test.csv"))
application_train = pd.read_csv(os.path.join(path, "application_train.csv"))
bureau = pd.read_csv(os.path.join(path, "bureau.csv"))
bureau_balance = pd.read_csv(os.path.join(path, "bureau_balance.csv"))
credit_card_balance = pd.read_csv(os.path.join(path, "credit_card_balance.csv"))
# This file is read with the Python parsing engine, as in the original cell
# (presumably the default C engine fails on it — TODO confirm).
HomeCredit_columns_description = pd.read_csv(
    os.path.join(path, "HomeCredit_columns_description.csv"), engine="python"
)
installments_payments = pd.read_csv(os.path.join(path, "installments_payments.csv"))
POS_CASH_balance = pd.read_csv(os.path.join(path, "POS_CASH_balance.csv"))
previous_application = pd.read_csv(os.path.join(path, "previous_application.csv"))

# Fonctions personnalisées

In [35]:
def describe_dataset(dataset):
    """Print a short textual overview of a tabular dataset.

    The report contains, in order: the number of rows and columns, the list
    of column names (only when there are at most 20 columns), and the count
    of missing values per column.

    Parameters
    ----------
    dataset : object exposing ``shape``, ``columns`` and ``isna()``
        (a pandas DataFrame in this notebook).

    Returns
    -------
    None
        Everything is written to standard output.
    """
    dims = dataset.shape
    n_rows = dims[0]
    n_cols = dims[1]

    print("Le dataset contient {} observations et {} variables".format(n_rows, n_cols))
    print("\n")

    # Only list the column names when the list stays short enough to read.
    show_columns = n_cols <= 20
    if show_columns:
        column_names = dataset.columns.tolist()
        print("Variables du dataset :")
        print(column_names)
        print("\n")

    missing_per_column = dataset.isna().sum()
    print("Vue d'ensemble des valeurs manquantes :")
    print(missing_per_column)
    


# Découverte des datasets

In [7]:
# Single name -> frame mapping; the two parallel lists used further down are
# derived from it so names and frames cannot drift out of order.
datasets_by_name = {
    "application_test": application_test,
    "application_train": application_train,
    "bureau": bureau,
    "bureau_balance": bureau_balance,
    "credit_card_balance": credit_card_balance,
    "HomeCredit_columns_description": HomeCredit_columns_description,
    "installments_payments": installments_payments,
    "POS_CASH_balance": POS_CASH_balance,
    "previous_application": previous_application,
}
dataset_list = list(datasets_by_name.values())
dataset_list_names = list(datasets_by_name.keys())

In [11]:
# Print the overview of every loaded dataset, separated by a dashed rule.
separator = "---------------------------------"
for position, dataset_name in enumerate(dataset_list_names):
    current_dataset = dataset_list[position]
    print("Exploration du dataset {}".format(dataset_name))
    describe_dataset(current_dataset)
    # Two blank lines, the rule, then two blank lines (print appends the last newline).
    print("\n\n" + separator + "\n\n")

Exploration du dataset application_test
Le dataset contient 48744 observations et 121 variables


Vue d'ensemble des valeurs manquantes :
SK_ID_CURR                       0
NAME_CONTRACT_TYPE               0
CODE_GENDER                      0
FLAG_OWN_CAR                     0
FLAG_OWN_REALTY                  0
                              ... 
AMT_REQ_CREDIT_BUREAU_DAY     6049
AMT_REQ_CREDIT_BUREAU_WEEK    6049
AMT_REQ_CREDIT_BUREAU_MON     6049
AMT_REQ_CREDIT_BUREAU_QRT     6049
AMT_REQ_CREDIT_BUREAU_YEAR    6049
Length: 121, dtype: int64


---------------------------------


Exploration du dataset application_train
Le dataset contient 307511 observations et 122 variables


Vue d'ensemble des valeurs manquantes :
SK_ID_CURR                        0
TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
                              ...  
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU

In [14]:
# Export the column descriptions to an Excel workbook, unless one is already there.
databook_path = "./databook.xlsx"
databook_missing = not os.path.exists(databook_path)
if databook_missing:
    HomeCredit_columns_description.to_excel(databook_path)

In [15]:
# Show the column index of the application_test frame.
application_test.columns

Index(['SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=121)

In [16]:
# Show the column index of the application_train frame.
application_train.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=122)

# Séparation train_test

In [43]:
# Grouped shuffle split keyed on SK_ID_CURR: 70% of the groups go to the
# training side, with a fixed seed. Only the first generated split is used.
splitter = GroupShuffleSplit(train_size=0.7, random_state=42)
split_generator = splitter.split(
    application_train,
    groups=application_train["SK_ID_CURR"],
)
inTrain, inTest = next(split_generator)
def trainset(dataset, subsetter=inTrain):
    """Return the rows of ``dataset`` located at the positions in ``subsetter``.

    Parameters
    ----------
    dataset : object exposing ``.iloc`` (DataFrame or Series in this notebook).
    subsetter : positional indexer passed to ``.iloc``; defaults to the value
        ``inTrain`` had when this function was defined.

    Returns
    -------
    The positional selection ``dataset.iloc[subsetter]``.
    """
    selected_rows = dataset.iloc[subsetter]
    return selected_rows

In [44]:
# Split the training table into features (everything but TARGET) and the
# target column, then take the train / test portions of the target.
target = application_train["TARGET"]
X = application_train.drop(columns="TARGET")
y_train = trainset(target)
y_test = target.iloc[inTest]

In [45]:
# Preview the first rows of X (application_train without the TARGET column).
X.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,100007,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Preview the first rows of application_train itself, for comparison with X.
application_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Shape of the full feature frame (not yet restricted to the training rows).
X.shape

(307511, 121)

In [48]:
# Number of target values selected by the inTrain positions.
y_train.shape

(215257,)

In [49]:
# Number of target values selected by the inTest positions.
y_test.shape

(92254,)

In [50]:
# Shape of X restricted to the training rows (trainset uses inTrain by default).
trainset(X).shape

(215257, 121)