In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

# The seaborn styles bundled with matplotlib were renamed to 'seaborn-v0_8*'
# in matplotlib 3.6 and the bare 'seaborn' alias was removed later, so use the
# new name when the installed matplotlib offers it and fall back otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available
              else 'seaborn')
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, classification_report

import os

# Работа с данными

## Импорт данных

In [2]:
# Load the semicolon-delimited dataset into a DataFrame.
data = pd.read_csv('bank-full.csv', delimiter=';')

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# (rows, columns) of the freshly loaded frame.
data.shape

(45211, 17)

## Подготовка данных

### Чистка данных

Уберем данные с нулевой длительностью звонка:

In [5]:
# Remove every row whose call duration is zero; `data` is modified in place,
# so re-running this cell is a no-op once the rows are gone.
data.drop(index=data.loc[data.duration == 0].index,
          inplace=True)

### Кодировка строковых данных

Закодируем строковые данные стандартным образом:

In [6]:
# String-valued columns (including the target 'y') that get replaced by
# integer category codes.
col_names = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome', 'y',
]

# Convert each column to pandas' categorical dtype and keep only its codes.
for col in col_names:
    data[col] = data[col].astype('category').cat.codes

In [7]:
# Preview the frame after the string columns were replaced by integer codes.
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [8]:
# (rows, columns) after dropping the zero-duration rows.
data.shape

(45208, 17)

### Балансирование выборки

Разделим данные на предикторы и отклики:

In [9]:
# Separate predictors from the response by column name, so the split does not
# rely on 'y' sitting at a hard-coded position (previously index 16).
X = data.drop(columns='y').values
y = data['y'].values

Разделим данные на обучающее и тестовое множество:

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 33% of the rows as the test split; the seed is fixed.
# NOTE(review): the split is not stratified although the classes turn out to
# be imbalanced -- consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.33)

Сбалансирована ли выборка?

In [12]:
from collections import Counter

In [13]:
# Count how many samples of each class ended up in each split.
train_counts = Counter(y_train)
test_counts = Counter(y_test)
print(f"Training target statistics: {train_counts}")
print(f"Testing target statistics: {test_counts}")

Training target statistics: Counter({0: 26746, 1: 3543})
Testing target statistics: Counter({0: 13173, 1: 1746})


Выборка не сбалансирована. Проведем **undersampling**:

In [14]:
from imblearn.under_sampling import RandomUnderSampler

In [15]:
# Resample the training split only; the test split is left untouched.
# The cell overwrites X_train / y_train, so it is not idempotent on re-run.
under_sampler = RandomUnderSampler(random_state=123)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)

train_counts = Counter(y_train)
test_counts = Counter(y_test)
print(f"Training target statistics: {train_counts}")
print(f"Testing target statistics: {test_counts}")

Training target statistics: Counter({0: 3543, 1: 3543})
Testing target statistics: Counter({0: 13173, 1: 1746})


### Стандартизация данных

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Learn the scaling statistics from the training split only, then apply the
# very same transform to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Деревья классификации CART

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Base CART estimator. max_depth, min_samples_leaf and min_samples_split are
# only starting values: the grid search below tries its own candidates.
model_cart = DecisionTreeClassifier(
    # split selection
    criterion='gini',
    splitter='best',
    max_features=None,
    # tree size limits
    max_depth=6,
    max_leaf_nodes=None,
    min_samples_split=10,
    min_samples_leaf=8,
    min_weight_fraction_leaf=0,
    min_impurity_decrease=0.001,
    # misc
    class_weight=None,
    random_state=42,
)

В этом и последующих методах будем подбирать оптимальные значения параметров с помощью **GridSearchCV**, опираясь на метрику **f1-score**.

In [20]:
# Candidate hyperparameter values to search over
params_set = dict(
    max_depth=[3, 4, 6],
    min_samples_leaf=[2, 4, 8],
    min_samples_split=[8, 10, 14],
)

# 5-fold cross-validated grid search scored by F1, using all CPU cores
grid_CV_cart = GridSearchCV(model_cart,
                            params_set,
                            cv=5,
                            scoring='f1',
                            n_jobs=-1)

# Fit on the training split
grid_CV_cart.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=DecisionTreeClassifier(max_depth=6,
                                              min_impurity_decrease=0.001,
                                              min_samples_leaf=8,
                                              min_samples_split=10,
                                              min_weight_fraction_leaf=0,
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [3, 4, 6], 'min_samples_leaf': [2, 4, 8],
                         'min_samples_split': [8, 10, 14]},
             scoring='f1')

In [21]:
# Hyperparameter combination selected by the CART grid search.
best_hyperparams_cart = grid_CV_cart.best_params_
print('Лучшие значения гиперпараметров:\n', best_hyperparams_cart)

Лучшие значения гиперпараметров:
 {'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 8}


In [22]:
# Best model found by the CART grid search
best_model_cart = grid_CV_cart.best_estimator_

## Оценка классификации

In [23]:
# Predict class labels for the test split with the selected CART model.
# Note: the name y_pred_test is reused by the later model sections.
y_pred_test = best_model_cart.predict(X_test)

In [24]:
# Confusion matrix of the CART predictions on the test split, labelled with
# the model's class labels on both axes.
print('Таблица сопряженности для тестовой выборки:')
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
conf_mat_test = pd.DataFrame(data=conf_mat,
                             columns=best_model_cart.classes_,
                             index=best_model_cart.classes_)
conf_mat_test

Таблица сопряженности для тестовой выборки:


Unnamed: 0,0,1
0,10450,2723
1,339,1407


In [25]:
# Per-class precision / recall / F1 of the CART model on the test split.
report_cart = classification_report(y_test, y_pred_test)
print(report_cart)

              precision    recall  f1-score   support

           0       0.97      0.79      0.87     13173
           1       0.34      0.81      0.48      1746

    accuracy                           0.79     14919
   macro avg       0.65      0.80      0.68     14919
weighted avg       0.90      0.79      0.83     14919



In [26]:
# NOTE(review): the grid search optimises scoring='f1', while this reports the
# support-weighted average over both classes -- confirm this is the intended
# headline metric.
cart_weighted_f1 = f1_score(y_test, y_pred_test, average='weighted')
print('CART weighted avg F1-score:', round(cart_weighted_f1, 3))

CART weighted avg F1-score: 0.826


# Random forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Base random-forest estimator. n_estimators, max_depth, max_features and
# min_samples_split are only starting values: the grid search below tries its
# own candidates.
# max_features uses 'sqrt' instead of 'auto': for classifiers the two are
# equivalent, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# verbose=1 makes every fit/predict emit joblib progress lines.
model_rfc = RandomForestClassifier(random_state=42,
                                   n_estimators=80,
                                   criterion='gini',
                                   max_depth=5,
                                   max_features='sqrt',
                                   max_leaf_nodes=None,
                                   min_impurity_decrease=0.001,
                                   min_samples_leaf=10,
                                   min_samples_split=15,
                                   min_weight_fraction_leaf=0.0,
                                   verbose=1,
                                   oob_score=True,
                                   warm_start=False,
                                   class_weight=None)

In [29]:
# Candidate hyperparameter values to search over
params_set = dict(
    n_estimators=[30, 40, 50, 100],
    max_depth=[4, 5, 6],
    max_features=[1, 3, 10],
    min_samples_split=[3, 5, 10],
)

# NOTE(review): cv=2 here, whereas the CART search uses cv=5.
grid_CV_rfc = GridSearchCV(estimator=model_rfc,
                           param_grid=params_set,
                           scoring='f1',
                           cv=2,
                           refit=True,
                           error_score='raise',
                           pre_dispatch=None,
                           verbose=0)

In [30]:
%%capture --no-display

# Fit on the training split. The %%capture magic above hides the progress
# lines that the verbose=1 forest prints during each fit.
grid_CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
             estimator=RandomForestClassifier(max_depth=5,
                                              min_impurity_decrease=0.001,
                                              min_samples_leaf=10,
                                              min_samples_split=15,
                                              n_estimators=80, oob_score=True,
                                              random_state=42, verbose=1),
             param_grid={'max_depth': [4, 5, 6], 'max_features': [1, 3, 10],
                         'min_samples_split': [3, 5, 10],
                         'n_estimators': [30, 40, 50, 100]},
             pre_dispatch=None, scoring='f1')

In [31]:
# Hyperparameter combination selected by the random-forest grid search.
best_hyperparams_rfc = grid_CV_rfc.best_params_
print('Лучшие значения гиперпараметров:\n', best_hyperparams_rfc)

Лучшие значения гиперпараметров:
 {'max_depth': 6, 'max_features': 3, 'min_samples_split': 3, 'n_estimators': 100}


In [32]:
# Best model found by the random-forest grid search
best_model_rfc = grid_CV_rfc.best_estimator_

## Оценка классификации

In [33]:
# Predict class labels for the test split with the selected random forest
# (overwrites the CART predictions stored under the same name).
y_pred_test = best_model_rfc.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [34]:
# Confusion matrix of the random-forest predictions on the test split.
# The axis labels come from the random-forest model itself; the cell used to
# borrow them from best_model_cart, a copy-paste slip that made this section
# depend on the CART section having been run.
print('Таблица сопряженности для тестовой выборки:')
conf_mat = confusion_matrix(y_test, y_pred_test)
conf_mat_test = pd.DataFrame(conf_mat,
                             index=best_model_rfc.classes_,
                             columns=best_model_rfc.classes_)
conf_mat_test

Таблица сопряженности для тестовой выборки:


Unnamed: 0,0,1
0,10427,2746
1,244,1502


In [35]:
# Per-class precision / recall / F1 of the random forest on the test split.
report_rfc = classification_report(y_test, y_pred_test)
print(report_rfc)

              precision    recall  f1-score   support

           0       0.98      0.79      0.87     13173
           1       0.35      0.86      0.50      1746

    accuracy                           0.80     14919
   macro avg       0.67      0.83      0.69     14919
weighted avg       0.90      0.80      0.83     14919



In [36]:
# Support-weighted average F1 of the random forest on the test split.
rfc_weighted_f1 = f1_score(y_test, y_pred_test, average='weighted')
print('RFC weighted avg F1-score:', round(rfc_weighted_f1, 3))

RFC weighted avg F1-score: 0.831


# Gradient boosting

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
# Base gradient-boosting estimator. n_estimators, max_depth and learning_rate
# are only starting values: the grid search below tries its own candidates.
# `loss` is deliberately left at its default: the explicit 'deviance' spelling
# was deprecated in scikit-learn 1.1 and removed in 1.3, while its replacement
# 'log_loss' is rejected by older releases. The default is that same loss on
# every version.
model_gbc = GradientBoostingClassifier(random_state=42,
                                       subsample=0.66,
                                       max_features='sqrt',
                                       n_estimators=500,
                                       learning_rate=0.01,
                                       criterion='friedman_mse',
                                       min_impurity_decrease=0.001,
                                       min_samples_leaf=5,
                                       min_samples_split=10,
                                       max_depth=5,
                                       verbose=0)

In [39]:
# Candidate hyperparameter values to search over
params_set = dict(
    n_estimators=[100, 300, 500, 800, 1000],
    max_depth=[3, 4, 6],
    learning_rate=[0.05, 0.1, 0.3],
)

# NOTE(review): cv=2 here, whereas the CART search uses cv=5.
grid_CV_gbc = GridSearchCV(estimator=model_gbc,
                           param_grid=params_set,
                           scoring='f1',
                           cv=2,
                           refit=True,
                           error_score='raise',
                           pre_dispatch=None,
                           verbose=0)

# Fit on the training split
grid_CV_gbc.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
             estimator=GradientBoostingClassifier(learning_rate=0.01,
                                                  max_depth=5,
                                                  max_features='sqrt',
                                                  min_impurity_decrease=0.001,
                                                  min_samples_leaf=5,
                                                  min_samples_split=10,
                                                  n_estimators=500,
                                                  random_state=42,
                                                  subsample=0.66),
             param_grid={'learning_rate': [0.05, 0.1, 0.3],
                         'max_depth': [3, 4, 6],
                         'n_estimators': [100, 300, 500, 800, 1000]},
             pre_dispatch=None, scoring='f1')

In [40]:
# Hyperparameter combination selected by the gradient-boosting grid search.
best_hyperparams_gbc = grid_CV_gbc.best_params_
print('Лучшие значения гиперпараметров:\n', best_hyperparams_gbc)

Лучшие значения гиперпараметров:
 {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 800}


In [41]:
# Best model found by the gradient-boosting grid search
best_model_gbc = grid_CV_gbc.best_estimator_

## Оценка классификации

In [42]:
# Predict class labels for the test split with the selected gradient-boosting
# model (overwrites the previous model's predictions stored under this name).
y_pred_test = best_model_gbc.predict(X_test)

In [43]:
# Confusion matrix of the gradient-boosting predictions on the test split.
# The axis labels come from the gradient-boosting model itself; the cell used
# to borrow them from best_model_cart, a copy-paste slip that made this
# section depend on the CART section having been run.
print('Таблица сопряженности для тестовой выборки:')
conf_mat = confusion_matrix(y_test, y_pred_test)
conf_mat_test = pd.DataFrame(conf_mat,
                             index=best_model_gbc.classes_,
                             columns=best_model_gbc.classes_)
conf_mat_test

Таблица сопряженности для тестовой выборки:


Unnamed: 0,0,1
0,11043,2130
1,230,1516


In [44]:
# Per-class precision / recall / F1 of gradient boosting on the test split.
report_gbc = classification_report(y_test, y_pred_test)
print(report_gbc)

              precision    recall  f1-score   support

           0       0.98      0.84      0.90     13173
           1       0.42      0.87      0.56      1746

    accuracy                           0.84     14919
   macro avg       0.70      0.85      0.73     14919
weighted avg       0.91      0.84      0.86     14919



In [45]:
# Support-weighted average F1 of gradient boosting on the test split.
gbc_weighted_f1 = f1_score(y_test, y_pred_test, average='weighted')
print('GBC weighted avg F1-score:', round(gbc_weighted_f1, 3))

GBC weighted avg F1-score: 0.864


# Вывод

Результаты методов классификации на тестовом множестве:
1. **Деревья решений CART** weighted avg F1-score: 0.826
2. **Random forest** weighted avg F1-score: 0.831
3. **Gradient boosting** weighted avg F1-score: 0.864

Исходя из полученных результатов, можно сделать вывод, что лучше всего с задачей справляется метод **Gradient boosting** со значением метрики **weighted avg F1-score = 0.864**.