In [2]:
# On affiche les graphiques dans le notebook en statique
%matplotlib inline

In [3]:
"""
Le code pour la construction des données a été repris (et adapté) de :
https://www.kaggle.com/code/jsaguiar/lightgbm-with-simple-features

La création de nouvelles features a aussi été reprise en partie de :
https://github.com/js-aguiar/home-credit-default-competition

"""

import cupy as cp
import gc
import os
import numpy as np
import pandas as pd
import joblib
import warnings


warnings.simplefilter(action="ignore", category=FutureWarning)
## np.warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered")
np.seterr(all="ignore", invalid="ignore")

{'divide': 'ignore', 'over': 'ignore', 'under': 'ignore', 'invalid': 'ignore'}

In [None]:
from src.modeling.p7_simple_kernel import DataSimple
from src.modeling.p7_util import timer
from src.modeling.p7_file import files_list_pattern, make_dir
from src.modeling.p7_file import download_file
from src.modeling.p7_file import dezip

from src.modeling.p7_constantes import (
    DATA_URL, 
    FILE_ZIP,
    DATA_DIR,
    DATA_BASE,
    DATA_INTERIM,
    DATA_CLEAN_DIR,
    MODEL_DIR,
)

%load_ext autoreload
%autoreload 2
#autoreload 2 = Reload all modules (except those excluded by %aimport)
# every time before executing the Python code typed.

# Télécharger les données

## Liens de téléchargement et fichier à télécharger

In [5]:
DATA_URL

'https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/Parcours_data_scientist/Projet+-+Impl%C3%A9menter+un+mod%C3%A8le+de+scoring/Projet+Mise+en+prod+-+home-credit-default-risk.zip'

In [6]:
FILE_ZIP

'Projet+Mise+en+prod+-+home-credit-default-risk.zip'

## Création de directories si nécessaire

In [7]:
# Si les répertoires pour stocker les données n'existent pas, on les crée
dir_to_create = [DATA_DIR, DATA_BASE, DATA_CLEAN_DIR, MODEL_DIR]
make_dir(dir_to_create)

data/ dossier déjà existant
data/base/ dossier déjà existant
data/cleaned/ dossier déjà existant
models/ dossier déjà existant


## Téléchargement et dézippage

In [8]:
# Télécharge le fichier archive du premier jeu de données
download_file(url_download=DATA_URL, dir_output=DATA_BASE, to_download=FILE_ZIP)

Projet+Mise+en+prod+-+home-credit-default-risk.zip téléchargé dans le répertoire data/base/
Archive zip dézippée dans le répertoire data/base/


In [9]:
pattern = f"{DATA_BASE}*.csv"
files = files_list_pattern(pattern)
print(f"{len(files)}  fichier dézippés :")
for file in files:
    print(f"{file}")

10  fichier dézippés :
data/base/bureau_balance.csv
data/base/credit_card_balance.csv
data/base/installments_payments.csv
data/base/HomeCredit_columns_description.csv
data/base/application_train.csv
data/base/POS_CASH_balance.csv
data/base/application_test.csv
data/base/bureau.csv
data/base/previous_application.csv
data/base/sample_submission.csv


# Construction des données avec Simple Kernel

In [10]:
data_builder = DataSimple()

## Configuration

In [11]:
data_builder.init_config(debug=False)

Paramètres ['debug'] mis à jour


In [12]:
data_builder.print_config()

Configuration :
	dataset_num : 01
	debug : False
	n_rows : None
	drop_first : True
	frac_test : 0.25
	test_is_stratified : True
	input_dir : data/base/
	output_dir : data/interim/
	train_name : 01_v0_built_train.csv
	test_name : 01_v0_built_test.csv
	na_value : nan
	features_label_encoding : []
	features_oh_encoded : []
	random_state : 42


## Aggrégation des données

In [13]:
with timer(f"Pipeline d'agrégation des données sur {data_builder._device.upper()}"):
    df = data_builder.get_data()

Nombre de lignes Application train + test: 307511
Process Application - duration (hh:mm:ss) : 0:00:01
Bureau df shape: (305811, 112)
Process bureau and bureau_balance - duration (hh:mm:ss) : 0:00:01
Previous applications df shape: (338857, 235)
Process previous_applications - duration (hh:mm:ss) : 0:00:02
Pos-cash balance df shape: (337252, 17)
Process POS-CASH balance - duration (hh:mm:ss) : 0:00:01
Installments payments df shape: (339587, 26)
Process installments payments - duration (hh:mm:ss) : 0:00:01
Credit card balance df shape: (103558, 122)
Process credit card balance - duration (hh:mm:ss) : 0:00:01
All data shape : (307511, 751)
Pipeline d'agrégation des données sur CUDA - duration (hh:mm:ss) : 0:00:07


In [14]:
data_builder.features_oh_encoded

['NAME_CONTRACT_TYPE_Revolving loans',
 'NAME_TYPE_SUITE_Family',
 'NAME_TYPE_SUITE_Group of people',
 'NAME_TYPE_SUITE_Other_A',
 'NAME_TYPE_SUITE_Other_B',
 'NAME_TYPE_SUITE_Spouse, partner',
 'NAME_TYPE_SUITE_Unaccompanied',
 'NAME_TYPE_SUITE_<NA>',
 'NAME_INCOME_TYPE_Commercial associate',
 'NAME_INCOME_TYPE_Maternity leave',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_INCOME_TYPE_State servant',
 'NAME_INCOME_TYPE_Student',
 'NAME_INCOME_TYPE_Unemployed',
 'NAME_INCOME_TYPE_Working',
 'NAME_EDUCATION_TYPE_Higher education',
 'NAME_EDUCATION_TYPE_Incomplete higher',
 'NAME_EDUCATION_TYPE_Lower secondary',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'NAME_FAMILY_STATUS_Married',
 'NAME_FAMILY_STATUS_Separated',
 'NAME_FAMILY_STATUS_Single / not married',
 'NAME_FAMILY_STATUS_Widow',
 'NAME_FAMILY_STATUS_<NA>',
 'NAME_HOUSING_TYPE_House / apartment',
 'NAME_HOUSING_TYPE_Municipal apartment',
 'NAME_HOUSING_TYPE_Office apartment',
 'NAME_HOUSING_TYPE_Rented apartment',
 'NAME_HOU

In [15]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CC_NAME_CONTRACT_STATUS_Refused_MEAN,CC_NAME_CONTRACT_STATUS_Refused_SUM,CC_NAME_CONTRACT_STATUS_Refused_VAR,CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN,CC_NAME_CONTRACT_STATUS_Sent proposal_SUM,CC_NAME_CONTRACT_STATUS_Sent proposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_COUNT
25936,130161,0,1,0,1,1,157500.0,339948.0,26437.5,315000.0,...,,,,,,,,,,
25937,130162,0,0,0,1,0,108000.0,657702.0,21343.5,549000.0,...,,,,,,,,,,
25938,130163,0,1,0,0,0,157500.0,765261.0,30478.5,684000.0,...,,,,,,,,,,
25939,130164,0,0,1,0,0,270000.0,1125000.0,37309.5,1125000.0,...,,,,,,,,,,
25941,130166,0,0,1,0,1,315000.0,270000.0,13500.0,270000.0,...,,,,,,,,,,


## Nettoyage basique

In [16]:
df = data_builder.clean_data(df)

Renommage des colonnes

Traitement des valeurs inf
4 features comportent des valeurs infinies :
['REFUSED_APP_CREDIT_PERC_MAX', 'INSTAL_PAYMENT_PERC_MAX', 'INSTAL_PAYMENT_PERC_MEAN', 'INSTAL_PAYMENT_PERC_SUM']
Aucun doublon dans la colonne SK_ID_CURR

Création des index Train / Test 25.0%

Suppression des colonnes de variance nulle sur le Train
4 features de variance nulle dans le Train supprimées
['BURO_CREDIT_TYPE_Mobileoperatorloan_MEAN', 'PREV_NAME_GOODS_CATEGORY_HouseConstruction_MEAN', 'CC_SK_DPD_MIN', 'CC_SK_DPD_DEF_MIN']

Suppression des colonnes vides sur le Train
Aucune colonne complètement vide sur le Train

all_data.info :
<class 'cudf.core.dataframe.DataFrame'>
Index: 307511 entries, 25936 to 295365
Columns: 747 entries, SK_ID_CURR to CC_COUNT
dtypes: bool(124), float64(518), int32(1), int64(100), int8(4)
memory usage: 1.5 GB


## Cast et optimisation mémoire pour CUDA

In [17]:
df = data_builder.cast_and_optimize(df)

34 features binaires castées en bool (exclusion _SUM, _MAX, _MIN):
['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

Cast des autres types pour optimisation
Utilisation mémoire du cuDF 1446.29 Mo
Consommation Mémoire après optimisation : 802.86 Mo
Réduction de 44.5%


## Récupérer les données de Train et de test - Sauvegarde

In [18]:
train, test = data_builder.get_train_and_test(df)


Séparation réelle du train et du test
Train : 75.0% des données - Forme : (230634, 747) - Mem : 602.16 Mo
Test : 25.0% des données - Forme : (76877, 747) - Mem : 200.72 Mo


In [19]:
train, test = data_builder.save_train_and_test(train, test)

Train enregistré dans data/interim/01_v0_built_train.csv. Forme : (230634, 747)

Informations sur le jeu de Train :
<class 'cudf.core.dataframe.DataFrame'>
Index: 230634 entries, 46603 to 207914
Columns: 747 entries, SK_ID_CURR to CC_COUNT
dtypes: bool(158), float32(497), float64(21), int32(56), int64(15)
memory usage: 602.2 MB

Test enregistré dans data/interim/01_v0_built_test.csv. Forme : (76877, 747)


In [20]:
# On sauvegarde aussi le data buider pour les noms des features en OneHot et les First Categories
joblib.dump(data_builder, os.path.join(DATA_INTERIM, "01_v0_built_data_builder.pkl"))

['data/interim/01_v0_built_data_builder.pkl']

In [21]:
del df
del test
gc.collect()
cp._default_memory_pool.free_all_blocks()

In [22]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CC_NAME_CONTRACT_STATUS_Refused_MEAN,CC_NAME_CONTRACT_STATUS_Refused_SUM,CC_NAME_CONTRACT_STATUS_Refused_VAR,CC_NAME_CONTRACT_STATUS_Sentproposal_MEAN,CC_NAME_CONTRACT_STATUS_Sentproposal_SUM,CC_NAME_CONTRACT_STATUS_Sentproposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_COUNT
46603,153986,0,1,True,False,0,225000.0,298512.0,31801.5,270000.0,...,,,,,,,,,,
94032,209194,0,0,True,False,1,337500.0,724261.5,39420.0,576000.0,...,,,,,,,,,,
142448,265165,0,1,False,True,0,112500.0,134775.0,5836.5,112500.0,...,,,,,,,,,,
54650,163323,0,0,True,False,0,202500.0,123637.5,11470.5,112500.0,...,,,,,,,,,,
66209,176788,0,1,False,False,0,135000.0,770292.0,30676.5,688500.0,...,,,,,,,,,,


In [23]:
del train
gc.collect()
cp._default_memory_pool.free_all_blocks()