In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

plt.style.use('seaborn')
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, classification_report

import os

# Работа с данными

## Иморт данных

In [2]:
data = pd.read_csv('bank-full.csv', sep=';')

In [3]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
data.shape

(45211, 17)

## Подготовка данных

### Чистка данных

Уберем данные с нулевой длительностью звонка:

In [5]:
data.drop(data[data.duration == 0].index, inplace=True)

## Кодировка строковых данных

Закодируем строковые данные стандартным образом:

In [6]:
col_names = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
    'month', 'poutcome', 'y'
]

for col in col_names:
    data[col] = data[col].astype('category')
    data[col] = data[col].cat.codes

In [7]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [8]:
data.shape

(45208, 17)

## Балансирование выборки

Разделим данные на предикторы и отклики:

In [9]:
X = data.iloc[:, :-1].values
y = data.iloc[:, 16].values

Разделим данные на обучающее и тестовое множество:

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=123)

Сбалансирована ли выборка?

In [12]:
from collections import Counter

In [13]:
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({0: 26746, 1: 3543})
Testing target statistics: Counter({0: 13173, 1: 1746})


Выборка не сбалансирована. Проведем **undersampling**:

In [14]:
from imblearn.under_sampling import RandomUnderSampler

In [15]:
under_sampler = RandomUnderSampler(random_state=123)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)

print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({0: 3543, 1: 3543})
Testing target statistics: Counter({0: 13173, 1: 1746})


## Стандартизация данных

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# XGBoost

In [18]:
from xgboost import XGBClassifier

In [31]:
xgb_model = XGBClassifier()

In [41]:
parameters = {
    'nthread': [4],
    'objective': ['binary:logistic'],
    'learning_rate': [0.05, 0.1, 0.3],
    'max_depth': [3, 4, 6],
    'min_child_weight': [11],
    'silent': [1],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'n_estimators': [100, 300, 500, 800, 1000],
    'missing': [-999],
    'seed': [1337]
}

clf = GridSearchCV(xgb_model,
                   parameters,
                   n_jobs=5,
                   cv=2,
                   scoring='f1',
                   verbose=2,
                   refit=True)

In [49]:
%%capture --no-display
clf.fit(X_train, y_train)

GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,...
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             n_jobs=5,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.05, 0.1, 0.3]

In [50]:
best_hyperparams = clf.best_params_
print('Лучшие значения гиперпараметров:\n', best_hyperparams)

Лучшие значения гиперпараметров:
 {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 11, 'missing': -999, 'n_estimators': 300, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 1337, 'silent': 1, 'subsample': 0.8}


In [51]:
# Лучшая модель
best_model = clf.best_estimator_

## Оценка классификации

In [52]:
y_pred_test = best_model.predict(X_test)

In [58]:
print('Таблица сопряженности для тестовой выборки:')
conf_mat = confusion_matrix(y_test, y_pred_test)
conf_mat_test = pd.DataFrame(conf_mat,
                             index=best_model.classes_,
                             columns=best_model.classes_)
conf_mat_test

Таблица сопряженности для тестовой выборки:


Unnamed: 0,0,1
0,10997,2176
1,229,1517


In [54]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.98      0.83      0.90     13173
           1       0.41      0.87      0.56      1746

    accuracy                           0.84     14919
   macro avg       0.70      0.85      0.73     14919
weighted avg       0.91      0.84      0.86     14919



In [55]:
print('XGBoost weighted avg F1-score:',
      round(f1_score(y_test, y_pred_test, average='weighted'), 3))

XGBoost weighted avg F1-score: 0.861


# Калибровка модели

In [80]:
from sklearn.calibration import CalibratedClassifierCV

In [81]:
model_sigmoid = CalibratedClassifierCV(best_model, cv=2, method='sigmoid')

In [63]:
%%capture --no-display
# Calibrate probabilities
model_sigmoid.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=XGBClassifier(base_score=0.5,
                                                    booster='gbtree',
                                                    colsample_bylevel=1,
                                                    colsample_bynode=1,
                                                    colsample_bytree=0.7,
                                                    enable_categorical=False,
                                                    gamma=0, gpu_id=-1,
                                                    importance_type=None,
                                                    interaction_constraints='',
                                                    learning_rate=0.1,
                                                    max_delta_step=0,
                                                    max_depth=3,
                                                    min_child_weight=11,
                                                    missing=

In [64]:
probas = model_sigmoid.predict_proba(X_test)

In [76]:
x = len(list(filter(lambda proba: proba[0] > 0.8 or proba[1] > 0.8, probas)))

In [77]:
y = len(probas)

Доля наблюдений в тестовой выборке, для которых есть класс, вероятность принадлежать которому больше **0.8**:

In [85]:
round(x/y, 2)

0.78