In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline

# Подготовим данные

In [2]:
# Load the semicolon-delimited bank-marketing CSV into a DataFrame;
# "," is configured as the decimal separator. Preview the first rows.
bank = pd.read_csv(
    'bank/bank-full.csv',
    sep=';',
    decimal=",",
)
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every string-valued column, including the target `y`.
#
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so the integer codes can be mapped back to the original strings later, e.g.
# label_encoders['job'].inverse_transform(bank['job']). Re-fitting one shared
# encoder for each column would discard every mapping except the last one.
categorical_columns = ['job', 'marital', 'education', 'default', 'housing',
                       'loan', 'contact', 'month', 'poutcome', 'y']

label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    bank[column] = encoder.fit_transform(bank[column])
    label_encoders[column] = encoder

# Kept for backward compatibility: this name refers to the encoder fitted on
# the last encoded column, the target `y`.
labelencoder = label_encoders['y']

bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [4]:
# Separate the feature matrix from the target vector.
# The target is selected by name rather than by position, so X and y cannot
# silently drift apart if columns are added, removed, or reordered upstream.
# NOTE(review): `duration` stays among the features; the dataset description
# reportedly warns that it is only known after the call ends — confirm whether
# it should be excluded for a realistic model.
X = bank.drop(columns='y')
y = bank['y']

In [5]:
# Split into training and test sets: 33% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1234,
)

# Строим модель градиентный бустинг

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

Согласно результатам GridSearch из Lab_4, лучшими значениями гиперпараметров являются: 
    'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 15, 'n_estimators': 500

In [7]:
# Gradient-boosting classifier used for the rest of the notebook.
model = GradientBoostingClassifier(
    # Tree-shape hyperparameters: the values this notebook attributes to the
    # Lab_4 grid search.
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=6,
    min_samples_split=15,
    # Each tree is fitted on a 66% row subsample and considers sqrt(n_features)
    # candidate features per split.
    subsample=0.66,
    max_features='sqrt',
    learning_rate=0.01,
    criterion='friedman_mse',
    min_impurity_decrease=0.001,
    # `loss` is deliberately left at its default. The former spelling
    # 'deviance' was deprecated and later removed from scikit-learn in favour
    # of 'log_loss'; the default selects that same loss under either name, so
    # omitting the argument keeps this cell runnable on old and new versions.
    random_state=42,
    verbose=0,
)

In [8]:
# Fit the gradient-boosting model on the training split.
model.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.01, max_depth=6, max_features='sqrt',
                           min_impurity_decrease=0.001, min_samples_leaf=6,
                           min_samples_split=15, n_estimators=500,
                           random_state=42, subsample=0.66)

In [9]:
# Hard class predictions for both splits; they feed the accuracy scores and
# confusion matrices computed in the following cells.
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

In [10]:
# Accuracy on each split, expressed as a percentage.
train_accuracy_pct = metrics.accuracy_score(y_train, y_pred_train) * 100
test_accuracy_pct = metrics.accuracy_score(y_test, y_pred_test) * 100

print("Accuracy on train set is ", train_accuracy_pct)
print("Accuracy on test set is ", test_accuracy_pct)

Accuracy on train set is  92.34095936086626
Accuracy on test set is  90.78418230563003


In [11]:
from sklearn import metrics

# Confusion matrix for the test split, labelled with the model's class values
# (scikit-learn convention: rows are true classes, columns are predicted ones).
conf_mat = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred_test),
    index=model.classes_,
    columns=model.classes_,
)
conf_mat

Unnamed: 0,0,1
0,12801,377
1,998,744


In [12]:
# Confusion matrix for the training split, labelled with the model's class
# values (scikit-learn convention: rows are true classes, columns predicted).
conf_mat = pd.DataFrame(
    metrics.confusion_matrix(y_train, y_pred_train),
    index=model.classes_,
    columns=model.classes_,
)
conf_mat

Unnamed: 0,0,1
0,26220,524
1,1796,1751


In [13]:
#  Class-membership probabilities for both splits: one row per observation,
#  one column per class (displayed rows sum to 1).

y_pred_train2 = model.predict_proba(X_train)
y_pred_test2 = model.predict_proba(X_test)

y_pred_test2

array([[0.96761534, 0.03238466],
       [0.53930545, 0.46069455],
       [0.93365602, 0.06634398],
       ...,
       [0.9844331 , 0.0155669 ],
       [0.98556791, 0.01443209],
       [0.98497171, 0.01502829]])

In [14]:
# Display the training-split class probabilities for comparison with the test split.
y_pred_train2

array([[0.9642535 , 0.0357465 ],
       [0.97505296, 0.02494704],
       [0.99094038, 0.00905962],
       ...,
       [0.97852741, 0.02147259],
       [0.96166253, 0.03833747],
       [0.4470205 , 0.5529795 ]])

# Калибровка

In [15]:
from sklearn.calibration import CalibratedClassifierCV

In [16]:
# Wrap the gradient-boosting model in a probability calibrator that uses the
# 'sigmoid' method with 2-fold cross-validation.
model_sigmoid = CalibratedClassifierCV(model, cv=2, method='sigmoid')

In [17]:
# Fit the calibrated classifier on the training split.
model_sigmoid.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(learning_rate=0.01,
                                                                 max_depth=6,
                                                                 max_features='sqrt',
                                                                 min_impurity_decrease=0.001,
                                                                 min_samples_leaf=6,
                                                                 min_samples_split=15,
                                                                 n_estimators=500,
                                                                 random_state=42,
                                                                 subsample=0.66),
                       cv=2)

In [18]:
# Calibrated class probabilities for the test split; compare them with the
# uncalibrated `y_pred_test2` shown earlier.
model_sigmoid.predict_proba(X_test)

array([[0.97783144, 0.02216856],
       [0.4394268 , 0.5605732 ],
       [0.94869416, 0.05130584],
       ...,
       [0.98896633, 0.01103367],
       [0.9896549 , 0.0103451 ],
       [0.98928769, 0.01071231]])

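На самом деле калибровка здесь была не особенно нужна, но всё равно видно, что вероятности изменились: для большинства показанных наблюдений незначительно, хотя для отдельных сдвиг заметен (например, для второго наблюдения вероятность класса 1 выросла с 0.46 до 0.56 и перешла порог 0.5).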

In [19]:
# Keep the calibrated test-split probabilities for the threshold analysis below.
calibration = model_sigmoid.predict_proba(X_test)
# Counter of observations whose class-0 probability exceeds the threshold.
k = 0

In [20]:
# Sanity check: first column of the first row, i.e. the class-0 probability
# of the first test observation.
calibration[0][0]

0.9778314415568243

In [21]:
# Count the test observations whose calibrated probability of class 0
# (y = "no") is strictly greater than 0.8.
#
# `k` is assigned here instead of being incremented, so the cell is idempotent:
# re-running it can no longer inflate a counter initialised in another cell.
# The vectorised comparison replaces the per-row Python loop.
k = int((calibration[:, 0] > 0.8).sum())

In [23]:
# Share of test observations counted in `k` (class-0 probability above 0.8).
k / len(calibration)

0.8217828418230563

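0.82 — доля наблюдений в тестовой выборке, для которых откалиброванная вероятность отнесения к нулевому (y = "no") классу больше 0.8.  
Вывод: примерно для 82% клиентов тестовой выборки модель уверенно (с вероятностью выше 0.8) предсказывает отказ от срочного депозита. Это не фактическая доля отказов: согласно матрице ошибок на тестовой выборке, класс "no" составляет 13178 из 14920 наблюдений, то есть около 88%. Результат логичен — большинство клиентов отказывается.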