In [144]:
import pandas as pd
import numpy as np
import requests
import optbinning as optbin
import psutil
import json
import os
import datetime as dt
import logging
import re
import joblib
import gc
import warnings
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Show up to 300 columns/rows when a frame is displayed.
# The full option keys are used on purpose: the short patterns 'max_columns' /
# 'max_rows' also match the 'styler.render.*' options in pandas >= 1.4 and
# make set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): with no category given this ignores every warning raised from
# here on, including deprecation warnings from numpy/pandas — consider
# narrowing the filter to the specific categories that are known noise.
warnings.filterwarnings('ignore')

### 1. Load data

In [145]:
# Load the modelling dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# come from a trusted source.
data = pd.read_pickle('dataset.pkl')


### 2. Train/test split

In [146]:
%%time
y = data['target']
X = data.drop(['target','id','default_date'],axis = 1 )
X_1, X_2, y_1, y_2 = train_test_split(X, y,  
                                      test_size=0.3,
                                      random_state=23)
print(X_1.shape, X_2.shape, sum(y_1), sum(y_2))


(2742, 260) (1176, 260) 153 70
Wall time: 23 ms


In [147]:
 train = X_1.join(y_1).reset_index(drop=True)
test = X_2.join(y_2).reset_index(drop=True)

### 3. Primary selection of numerical features

In [148]:
def attributes_list(data, columns):
    """Summarise cardinality and missingness for the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    columns : list
        Column labels of ``data`` to describe.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``columns`` with the fields ``attribute``,
        ``count_unique`` (distinct non-null values), ``count_miss`` (number
        of nulls) and ``persent_miss`` (nulls as a percentage of the row
        count of ``data``, rounded to 2 decimals).
    """
    summary = pd.DataFrame()
    summary['attribute'] = columns

    # One pass over the requested columns, wrapped in a progress bar:
    # (null count, distinct count) per column.
    per_column = [
        (data[name].isna().sum(), data[name].nunique())
        for name in tqdm(summary['attribute'])
    ]

    summary['count_unique'] = [n_unique for _, n_unique in per_column]
    summary['count_miss'] = [n_miss for n_miss, _ in per_column]
    summary['persent_miss'] = (100 * summary['count_miss'] / data.shape[0]).round(2)
    gc.collect()

    return summary

In [149]:
# Missing-value / cardinality statistics for every column of the training frame
attribute_list = attributes_list(train, train.columns.tolist())

100%|██████████████████████████████████████████████████████████████████████████████| 261/261 [00:00<00:00, 3077.30it/s]


In [150]:
# Columns whose values are missing in at least 99% of the training rows
miss_threshold = 0.99 * train.shape[0]
too_sparse = attribute_list['count_miss'] >= miss_threshold
attribute_list_many_misses = attribute_list.loc[too_sparse, 'attribute'].tolist()

In [151]:
# Remove the almost-empty columns from the training frame.
# errors='ignore' makes the cell idempotent: `train` is reassigned in place,
# so re-running the cell used to raise KeyError because the columns were
# already gone.
attributes2drop = attribute_list_many_misses
train = train.drop(columns=attributes2drop, errors='ignore')

Defining binning (classing) parameters for the variables

Using state-of-the-art methods in credit scoring: <br>
feature selection by Information Value (IV) <br>
encoding with a Weight of Evidence (WoE) encoder <br>

In [152]:
# Label column, kept as a one-element list
target_col = ['target']

# Statistics of the columns that survived the missing-value filter
kept_mask = ~attribute_list['attribute'].isin(attributes2drop)
attribute_list_stage_2 = attribute_list[kept_mask]

# Columns with at most 4 distinct values are treated as categorical ...
is_low_cardinality = attribute_list_stage_2['count_unique'] <= 4
categorical_cols = attribute_list_stage_2.loc[is_low_cardinality, 'attribute'].tolist()

# ... and every column not in that list is treated as numerical
is_numerical = ~attribute_list_stage_2['attribute'].isin(categorical_cols)
numerical_cols = attribute_list_stage_2.loc[is_numerical, 'attribute'].tolist()

In [153]:
# Per-variable binning settings table: one row per numerical column; the
# setting columns are filled in by the following cells.
classing_parameters = pd.DataFrame(index=numerical_cols)

In [154]:
def calc_approx_num_prebins(x):
    """Estimate a pre-bin count for one variable from its histogram.

    Null values are dropped, then NumPy's square-root rule (``bins='sqrt'``)
    is used to compute the histogram bin edges.

    Parameters
    ----------
    x : pandas.Series
        Values of a single numerical variable.

    Returns
    -------
    int
        Number of bin *edges* produced by the rule, i.e. one more than the
        number of histogram bins.
    """
    observed = x.dropna()
    edges = np.histogram_bin_edges(observed, bins='sqrt')
    return edges.shape[0]

In [155]:
# For numerical columns we approximate the number of pre-bins using the
# histogram method. The builtin `int` replaces the `np.int` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on current
# NumPy); `np.int` was the builtin int, so the resulting dtype is unchanged.
num_max_n_prebins = np.array(
    [calc_approx_num_prebins(train[col]) for col in tqdm(numerical_cols)], dtype=int
)
# Keep every estimate within the [20, 500] range
num_max_n_prebins = np.clip(num_max_n_prebins, 20, 500)

100%|██████████████████████████████████████████████████████████████████████████████| 245/245 [00:00<00:00, 3149.55it/s]


In [156]:
# Store the clipped pre-bin estimate of every numerical variable in its row
classing_parameters.loc[numerical_cols, 'max_n_prebins'] = num_max_n_prebins

In [157]:
# Binning settings shared by all variables; each key becomes a column of
# classing_parameters in a later cell.
_parameters = {
    'prebinning_method': 'mdlp',
    'divergence': 'iv', 
    #'min_prebin_size': 0.07, # Keep the default 0.05, because larger values tend to produce fewer bins
    'min_n_bins': 2, # At least two bins; check the resulting binning later
    'max_n_bins': 6, # More bins will cause overfitting, so keep it small 
    #'min_bin_size': 0.10, # To force pre-bins to merge together  
    'min_bin_n_nonevent': 10, # Tune carefully: larger values tend to produce fewer bins, smaller values more
    'min_bin_n_event': 10, # Tune carefully: larger values tend to produce fewer bins, smaller values more
    'monotonic_trend': 'auto_asc_desc', # Variable trend, let it be selected automatically
    'max_pvalue_policy': 'all', # Compare all bins
    #'outlier_detector': 'range', # Helps to get a more robust solution, but not this time 
    'cat_cutoff': 0.02, # Larger values tend to produce fewer bins, smaller values more
    'gamma': 0.02, # Tune it carefully
    'split_digits': 2, # The significant digits of the split points.
    'time_limit': 60*20   # = 1200; presumably seconds (20 minutes) — confirm against the optbinning docs
    }

In [158]:
# Broadcast every shared setting to all variables (one column per setting)
for name, parameter in _parameters.items():
    classing_parameters[name] = parameter

# Ensure dtypes are proper. The builtin `int` replaces the `np.int` alias,
# which was removed in NumPy 1.24 and raises AttributeError there.
classing_parameters['max_n_prebins'] = classing_parameters['max_n_prebins'].astype(int)

In [159]:
# classing_parameters.to_excel('coarse_classing_parameters_full.xlsx', index=False)

##### IV. Classing parameters for model

In [160]:
# Nested mapping {variable name: {setting name: value}} built from the settings table
_classing_params = classing_parameters.to_dict(orient='index')

In [161]:
# Binning process over all numerical variables, configured per variable
coarse_classing_est = optbin.BinningProcess(
    variable_names=numerical_cols,        # every variable that takes part in the classing
    binning_fit_params=_classing_params,  # per-variable fit settings
    max_pvalue_policy='all',              # repeated at process level as a safeguard
    verbose=True,                         # log progress information
    n_jobs=-1,                            # use all available cores
)

In [162]:
# Feature matrix and target used to fit the binning process.
# `np.nan`, `int` and `float` replace `np.NaN` (removed in NumPy 2.0) and the
# `np.int` / `np.float` aliases (removed in NumPy 1.24); they are the same
# objects the aliases pointed to, so the resulting dtypes are unchanged.
X_train, y_train = (
    train[numerical_cols].fillna(np.nan),
    train[target_col].squeeze().astype(int)
)

# Dtype conversion: make every feature column float
X_train[numerical_cols] = X_train[numerical_cols].astype(float)

gc.collect();

In [163]:
# Fit the binning process; every warning is silenced for the duration of the
# fit only (the previous filter state is restored when the block exits).
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore')
    coarse_classing_est.fit(X_train, y_train);

2021-10-09 17:03:20,455 | INFO : Binning process started.
2021-10-09 17:03:20,457 | INFO : Options: check parameters.
2021-10-09 17:03:20,459 | INFO : Dataset: number of samples: 2742.
2021-10-09 17:03:20,460 | INFO : Dataset: number of variables: 245.
2021-10-09 17:03:20,462 | INFO : Options: number of jobs (cores): 8.
2021-10-09 17:03:21,821 | INFO : Binning process variable selection...
2021-10-09 17:03:23,130 | INFO : Binning process terminated. Time: 2.6752s


In [164]:
# Per-variable summary of the fitted binning process
all_vars_cls_summary = coarse_classing_est.summary()

# Candidate variables: IV above 0.1 and more than one resulting bin
is_candidate = (all_vars_cls_summary['iv'] > 0.1) & (all_vars_cls_summary['n_bins'] > 1)
possible_vars_cls_summary = all_vars_cls_summary[is_candidate]
possible_vars_names = possible_vars_cls_summary['name'].tolist()

In [165]:
# Get train encodings: transform the numerical columns with metric='woe' and
# keep only the pre-selected variables. .copy() makes the selection an
# independent frame, so adding the target column below is not a write to a
# slice of the transform output.
train_encoded = coarse_classing_est.transform(
    train[numerical_cols],
    metric='woe',
    metric_missing='empirical',
    show_digits=0, check_input=True
)[possible_vars_names].copy()

# Add the target column. The builtin `int` replaces the `np.int` alias,
# which was removed in NumPy 1.24.
train_encoded['target'] = train[target_col[0]].to_numpy().astype(int)

In [166]:
# Get test encodings with the binning fitted on the training data: transform
# the numerical columns with metric='woe' and keep only the pre-selected
# variables. .copy() makes the selection an independent frame, so adding the
# target column below is not a write to a slice of the transform output.
test_encoded = coarse_classing_est.transform(
    test[numerical_cols],
    metric='woe',
    metric_missing='empirical',
    show_digits=0, check_input=True
)[possible_vars_names].copy()

# Add the target column. The builtin `int` replaces the `np.int` alias,
# which was removed in NumPy 1.24.
test_encoded['target'] = test[target_col[0]].to_numpy().astype(int)

#### Datasets for validation

In [167]:
# Persist the encoded train and test frames (selected encoded variables plus
# the target column) as pickle files in the working directory.
train_encoded.to_pickle('train_encoded.pkl')
test_encoded.to_pickle('test_encoded.pkl')