In [1]:
# Render matplotlib figures as static images inside the notebook
%matplotlib inline

In [2]:
"""
LightGBM code taken (and adapted) from:
https://www.kaggle.com/code/jsaguiar/lightgbm-7th-place-solution

KAGGLE HOME CREDIT DEFAULT RISK COMPETITION
Adapted from one of the models used in 7th place solution ensemble.
For more details about our solution please check this discussion:
https://www.kaggle.com/c/home-credit-default-risk/discussion/64580

Another similar version is also available at GitHub:
https://github.com/js-aguiar/home-credit-default-competition

This model uses LightGBM with GOSS and label encoding for the application's
categorical features. Other tables use one-hot encoding with mean,
sum and a few other functions for aggregation. The main idea was to add
more time-related features, such as last-application and last-X-months aggregations.
There are also aggregations for specific loan types and statuses, as well as
ratios between tables. Configurations are in line 785
(the line number presumably refers to the original Kaggle script, not to this notebook).
"""

import os
import gc
import numpy as np
import pandas as pd
import joblib
import warnings


# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)
# Disabled filter kept for reference (targets "All-NaN slice/axis encountered" messages):
## np.warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered")
# Ignore every floating-point error category; `all` already covers `invalid`.
# As the last expression of the cell, the call displays the previous settings.
np.seterr(all="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
from src.p7_full_kernel import get_full_data, kfold_lightgbm_sklearn
from src.p7_simple_kernel import get_simple_data
from src.p7_simple_kernel import CONFIG_SIMPLE
from src.p7_util import timer
from src.p7_constantes import (
    NUM_THREADS,
    DATA_BASE,
    DATA_INTERIM,
)

%load_ext autoreload
%autoreload 2
# autoreload mode 2: reload all modules (except those excluded by %aimport)
# every time before executing the Python code typed.

# Configuration

In [4]:
# Show the thread count imported from src.p7_constantes.
print("NUM_THREADS :", NUM_THREADS)


NUM_THREADS : 16


In [5]:
# Read the raw application_train table only to report its (rows, columns) shape;
# the loaded frame itself is not kept.
application_train_path = os.path.join(DATA_BASE, "application_train.csv")
shape_application_train = pd.read_csv(application_train_path).shape
print(shape_application_train)

(307511, 122)


# Full Kernel

## One Hot uniquement

Joindre et nettoyer les données

In [6]:
# Run the full-kernel data pipeline with ohe=True, timed under the label "Pipeline total time".
# The recorded run took about 8 minutes and produced a 787-column frame.
with timer("Pipeline total time"):
    df = get_full_data(debug=False, ohe=True)

  df[feature_name] = eval("np.{}".format(function_name))(


Application dataframe shape:  (356250, 211)
application_train and application_test - duration (hh:mm:ss) : 0:00:06
Bureau dataframe shape:  (305811, 156)
Bureau and bureau_balance data - duration (hh:mm:ss) : 0:00:14
Previous dataframe shape:  (338857, 225)
previous_application - duration (hh:mm:ss) : 0:00:20
Pos-cash dataframe shape:  (337252, 24)
Installments dataframe shape:  (339587, 101)
Credit card dataframe shape:  (103558, 59)
previous applications balances - duration (hh:mm:ss) : 0:07:42
Initial df memory usage is 1806.09 MB for 787 columns
Final memory usage is: 805.54 MB - decreased by 55.4%
Pipeline total time - duration (hh:mm:ss) : 0:08:26


Information

In [7]:
# Index range, column count, dtype breakdown and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356250 entries, 0 to 356249
Columns: 787 entries, SK_ID_CURR to DAYS_CREDIT_MEAN_TO_EMPLOYED
dtypes: float16(482), float32(235), float64(56), int16(2), int32(1), int8(11)
memory usage: 805.5 MB


In [8]:
# (rows, columns) of the frame.
df.shape

(356250, 787)

Sauvegarde

In [9]:
# Persist the one-hot-encoded full-kernel frame to the interim data directory.
# NOTE(review): the index is written as an extra unnamed first column (to_csv default);
# consider index=False once every reader of this file has been checked.
full_ohe_csv_path = os.path.join(DATA_INTERIM, "all_data_full_kernel_ohe.csv")
df.to_csv(full_ohe_csv_path)

## Label Encoding in application table

Joindre et nettoyer les données

In [10]:
# Run the full-kernel data pipeline again with ohe=False (the section title calls this
# label encoding in the application table), timed under "Pipeline total time".
# The recorded run took about 8 minutes and produced a 659-column frame.
# NOTE(review): this rebinds `df`, replacing the frame built with ohe=True.
with timer("Pipeline total time"):
    df = get_full_data(debug=False, ohe=False)

  df[feature_name] = eval("np.{}".format(function_name))(


Application dataframe shape:  (356250, 83)
application_train and application_test - duration (hh:mm:ss) : 0:00:06
Bureau dataframe shape:  (305811, 156)
Bureau and bureau_balance data - duration (hh:mm:ss) : 0:00:14
Previous dataframe shape:  (338857, 225)
previous_application - duration (hh:mm:ss) : 0:00:21
Pos-cash dataframe shape:  (337252, 24)
Installments dataframe shape:  (339587, 101)
Credit card dataframe shape:  (103558, 59)
previous applications balances - duration (hh:mm:ss) : 0:07:35
Initial df memory usage is 1791.14 MB for 659 columns
Final memory usage is: 714.49 MB - decreased by 60.1%
Pipeline total time - duration (hh:mm:ss) : 0:08:19


Information

In [11]:
# Index range, column count, dtype breakdown and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356250 entries, 0 to 356249
Columns: 659 entries, SK_ID_CURR to DAYS_CREDIT_MEAN_TO_EMPLOYED
dtypes: float16(342), float32(235), float64(56), int16(2), int32(1), int8(23)
memory usage: 714.5 MB


In [12]:
# (rows, columns) of the frame.
df.shape

(356250, 659)

In [13]:
# Collect the names of the columns still stored with the `object` dtype.
# NOTE(review): the recorded df.info() lists only float/int dtypes, so this selection
# is empty (recorded output: []). If the goal is the list of label-encoded categorical
# columns, it presumably has to come from the encoding step instead — TODO confirm.
object_columns = df.select_dtypes(include='object').columns
cat_features = object_columns.tolist()
cat_features

[]

In [14]:
# Persist the frame and the categorical-feature list to the interim data directory.
# NOTE(review): the index is written as an extra unnamed first column (to_csv default).
full_le_csv_path = os.path.join(DATA_INTERIM, "all_data_full_kernel_le.csv")
cat_features_path = os.path.join(DATA_INTERIM, "cat_features_full_kernel_le.pkl")
df.to_csv(full_le_csv_path)
# Left as the last expression so the cell displays what joblib.dump returns.
joblib.dump(cat_features, cat_features_path)

['data/interim/cat_features_full_kernel_le.pkl']

# Simple Kernel (OHE)

Configuration

In [6]:
# Display the configuration imported from src.p7_simple_kernel.
CONFIG_SIMPLE

{'debug': False,
 'nan_as_cat': True,
 'data_output_dir': 'data/interim/',
 'data_filename': 'all_data_simple_kernel_ohe.csv',
 'generate_submission_files': True,
 'model_dir': 'models/',
 'model_subdir': 'light_simple/',
 'importance_filename': 'feature_importance.csv',
 'submission_filename': 'lightgbm_simple_submission.csv',
 'num_threads': 16,
 'stratified_kfold': True,
 'num_folds': 10,
 'early_stopping': 100,
 'random_seed': 1001}

In [7]:
# Work on a shallow copy: a plain assignment would only alias the imported dict, so
# edits such as config["debug"] = ... would silently mutate the CONFIG_SIMPLE constant
# of src.p7_simple_kernel. The displayed config holds only scalar values, so a shallow
# copy is enough.
config = dict(CONFIG_SIMPLE)
print("Debug =", config["debug"])
print("NaN as category =", config["nan_as_cat"])
print("Data filepath =", os.path.join(config["data_output_dir"], config["data_filename"]))

Debug = False
NaN as category = True
Data filepath = data/interim/all_data_simple_kernel_ohe.csv


In [8]:
# Explicit debug switch for this run; False matches the value printed above.
# NOTE(review): presumably True triggers a reduced run — confirm in get_simple_data.
config["debug"] = False

Joindre et nettoyer les données

In [23]:
# Build the simple-kernel dataset from `config`; the call returns two frames, bound to
# `df` and `test`, and is timed under the label "Pipeline total time".
# Per the recorded log, 246008 train rows + 61503 test rows = the 307511 data samples,
# so `test` looks like a hold-out split of the labelled data — TODO confirm in get_simple_data.
# NOTE(review): `df` is the same name used for the full-kernel frames earlier in the notebook.
with timer("Pipeline total time"):
    df, test = get_simple_data(config)

data/interim/ dossier déjà existant
Data samples: 307511
Application df shape: (307511, 258)
Process Application - duration (hh:mm:ss) : 0:00:02
Bureau df shape: (305811, 116)
Process bureau and bureau_balance - duration (hh:mm:ss) : 0:00:12
Previous applications df shape: (338857, 249)
Process previous_applications - duration (hh:mm:ss) : 0:00:11
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - duration (hh:mm:ss) : 0:00:06
Installments payments df shape: (339587, 26)
Process installments payments - duration (hh:mm:ss) : 0:00:14
Credit card balance df shape: (103558, 125)
0 variables à inclure correspondant au motif 'Unnamed' : []
0 variables à exclure correspondant au motif 'None' : []
0 variables sélectionnées : []
Process credit card balance - duration (hh:mm:ss) : 0:00:09
write data
Test shape : (61503, 792), saved in data/interim/test.csv
Train shape : (246008, 792), saved in data/interim/train.csv
Pipeline total time - duration (hh:mm:ss) : 0:02:02


Information

In [24]:
# Index range, column count, dtype breakdown and memory footprint of the train frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 246008 entries, 302149 to 42906
Columns: 792 entries, SK_ID_CURR to TARGET
dtypes: bool(144), float64(606), int64(42)
memory usage: 1.2 GB


In [25]:
# (rows, columns) of the train frame.
df.shape

(246008, 792)

In [27]:
# Value distribution of CODE_GENDER, counting missing values too (dropna=False).
df['CODE_GENDER'].value_counts(dropna=False)

CODE_GENDER
1.0    161963
0.0     84043
NaN         2
Name: count, dtype: int64

In [22]:
# Are there duplicated feature names? Print the total count, then the distinct count;
# the two numbers match when every column name is unique.
# NOTE(review): the recorded output (794) does not match the 792 columns reported by
# df.shape, so it presumably comes from an earlier, out-of-order run — re-run to refresh.
features = df.columns.tolist()
n_features = len(features)
n_unique_features = len(set(features))
print(n_features)
print(n_unique_features)

794
794


Sauvegarde

In [13]:
# Disabled: the recorded get_simple_data log already reports saving the train and test CSVs.
# NOTE(review): "data_filepath" is not among the CONFIG_SIMPLE keys displayed earlier;
# confirm the key exists before re-enabling, or delete this cell.
#df.to_csv(config["data_filepath"])

In [20]:
# Preview the first rows of the train frame.
df.head()

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Sentproposal_SUM,CC_NAME_CONTRACT_STATUS_Sentproposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT,TARGET
202026,334191,0,1,0,0,238477.5,157500.0,7875.0,157500.0,0.01885,...,,,,,,,,,,0
93913,209052,1,0,0,0,180000.0,450000.0,21109.5,450000.0,0.018209,...,,,,,,,,,,0
132834,254074,1,0,0,0,112500.0,269550.0,10264.5,225000.0,0.010643,...,,,,,,,,,,0
156041,280892,0,1,0,0,112500.0,271066.5,21861.0,234000.0,0.018029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0
271996,415335,1,0,1,3,180000.0,204858.0,16555.5,171000.0,0.014464,...,,,,,,,,,,1
