In [2]:
import os
import pickle

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from ydata_profiling import ProfileReport

SEED = 20020906

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
dx = pd.read_csv('../data/credit_train.csv')

In [7]:
ProfileReport(dx).to_file('../data/report.html')

Summarize dataset: 100%|██████████| 474/474 [01:37<00:00,  4.84it/s, Completed]               
Generate report structure: 100%|██████████| 1/1 [00:13<00:00, 13.19s/it]
Render HTML: 100%|██████████| 1/1 [00:18<00:00, 18.73s/it]
Export report to file: 100%|██████████| 1/1 [00:01<00:00,  1.84s/it]


In [22]:
data = dx.copy()

In [23]:
data.shape

(25000, 24)

In [24]:
data.drop_duplicates(inplace = True)

In [25]:
data.shape

(24978, 24)

In [26]:
drop_cols = []
for col in data.columns:
    x = data[col]
    az = len(x.loc[x.values == 0])
    crit = az/len(data)*100
    if crit > 45:
        drop_cols.append(col)

drop_cols[:-1]

['X6', 'X7', 'X8', 'X9', 'X10', 'X11']

In [28]:
data = data.drop(drop_cols[:-1], axis = 1)

In [29]:
x = data.drop('Y', axis = 1)
y = data.Y

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = SEED)

In [63]:
x

Unnamed: 0,X1,X2,X3,X4,X5,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
0,20000,2,2,2,44,18578,19928,18677,18503,17231,15410,2000,2000,2000,1100,1586,506
1,100000,2,2,1,49,5532,5325,5423,29628,13252,3723,5325,5423,29628,0,0,3513
2,210000,2,2,1,29,42140,31938,45502,41704,40509,41305,2000,20020,1672,1600,1600,2000
3,120000,2,2,2,28,200,200,0,0,0,0,0,0,0,0,0,0
4,50000,2,2,2,47,50883,51117,51125,50497,50544,50555,2200,5300,2000,2000,2000,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,90000,2,3,2,22,68663,65244,52452,35001,29675,28667,4341,2068,3315,1029,1334,1673
24996,100000,1,2,2,35,8102,10252,5607,59265,62554,55392,2500,0,59265,4100,2500,0
24997,50000,2,2,1,42,0,0,0,0,0,0,0,0,0,0,0,0
24998,240000,1,1,2,29,64826,72951,62848,60645,57554,57375,10589,34,4400,0,2000,1900


In [54]:
from sklearn.metrics import precision_score
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import HistGradientBoostingClassifier

In [55]:
def objective(params):
    # Train model
    booster = HistGradientBoostingClassifier(**params).fit(x_train, y_train)

    # Predict in the val dataset
    y_pred = booster.predict(x_test)

    # Calculate metric
    ps = precision_score(y_test, y_pred)
    rmse = 1 - ps

    return {'loss': rmse, 'status': STATUS_OK}

In [60]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'max_leaf_nodes': scope.int(hp.quniform('max_leaf_nodes', 4, 100, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 4, 100, 1)),
    'max_bins': scope.int(hp.quniform('max_bins', 2, 255, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'l2_regularization': hp.loguniform('l2_regularization', -6, -1),
    'random_state': SEED
}

best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=Trials()
)

best_params["max_depth"] = int(best_params["max_depth"])
best_params["max_leaf_nodes"] = int(best_params["max_leaf_nodes"])
best_params["max_bins"] = int(best_params["max_bins"])
best_params["min_samples_leaf"] = int(best_params["min_samples_leaf"])
best_params["random_state"] = SEED

100%|██████████| 100/100 [01:26<00:00,  1.15trial/s, best loss: 0.3746898263027295]


In [62]:
best_params

{'l2_regularization': np.float64(0.023155505334482192),
 'learning_rate': np.float64(0.050253208819036124),
 'max_bins': 225,
 'max_depth': 53,
 'max_leaf_nodes': 21,
 'random_state': 20020906}