In [15]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import svm

from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

In [2]:
df = pd.read_csv("datasets/earLobe.csv")
df.sample(5)

Unnamed: 0,patientID,has_DM2,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var3152,Var3153,Var3154,Var3155,Var3156,Var3157,Var3158,Var3159,Var3160,Var3161
10,DM210,1.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,...,0,0,0,0,0,0,0,0,0,0
4,DM204,1.0,290.166667,290.166667,290.166667,290.166667,290.166667,290.166667,290.166667,290.166667,...,0,0,0,0,0,0,0,0,0,0
1,DM201,1.0,181.8,181.8,181.8,181.8,181.8,181.8,181.8,181.8,...,0,0,0,0,0,0,0,0,0,0
20,Ctrl09,0.0,256.6,256.6,256.6,256.6,256.6,256.6,256.6,256.6,...,0,0,0,0,0,0,0,0,0,0
15,Ctrl04,0.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,...,0,0,0,0,0,0,0,0,0,0


Вырезаем колонку с 'patientID'

In [3]:
patientID = df.pop('patientID')

Удаляем строку с частотами

In [4]:
df.drop(0, inplace=True)

Разделяем данные на матрицу с признаками X и на столбец с целевой переменной (метками)

In [5]:
X, y = df.drop('has_DM2', axis=1), df['has_DM2']

#### Оценку качества модели (accuracy)  будем проводить методом отложенной выборки (hold-out set)
При таком подходе мы оставляем какую-то долю обучающей выборки (как правило от 20% до 40%), обучаем модель на остальных данных (60-80% исходной выборки) и считаем некоторую метрику качества модели (например, самое простое – долю правильных ответов в задаче классификации) на отложенной выборке.

In [6]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=.3, random_state=7)

In [7]:
y.value_counts(normalize=True)

1.0    0.55
0.0    0.45
Name: has_DM2, dtype: float64

#### Точность предсказания должна быть лучше как минимум чем 55%

### XGBoost

In [8]:
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train);

### Качество моделей (оценка на основе отложенной выборки)

In [9]:
pred_holdout_xgbc = xgbc.predict(X_holdout)
XGB_accuracy = accuracy_score(y_holdout, pred_holdout_xgbc)
XGB_accuracy

0.8333333333333334

### Кросс-валидация

In [10]:
skf = StratifiedKFold(n_splits = 5, random_state=1, shuffle = True)

In [11]:
xgb_cvs = cross_val_score(xgbc, X_train, y_train, scoring='accuracy', cv = skf).mean()

In [16]:
xgb_cvs

0.5

In [18]:
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}

In [19]:
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}

In [21]:
optimized_GBM = GridSearchCV(XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5, n_jobs = -1) 

In [22]:
optimized_GBM.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [32]:
print(optimized_GBM.best_params_)
print(optimized_GBM.best_score_)

{'max_depth': 3, 'min_child_weight': 3}
0.5


In [33]:
cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7,0.8,0.9]}
ind_params = {'n_estimators': 1000, 'seed':0, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1}


optimized_GBM = GridSearchCV(XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_GBM.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.01], 'subsample': [0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [34]:
print(optimized_GBM.best_params_)
print(optimized_GBM.best_score_)

{'learning_rate': 0.1, 'subsample': 0.7}
0.42857142857142855


In [35]:
xgbc_opt = XGBClassifier(max_depth=3, min_child_weight=3, learning_rate=0.1, subsample=0.7)
xgbc_opt.fit(X_train, y_train);

In [36]:
pred_holdout_xgbc_opt = xgbc_opt.predict(X_holdout)
accuracy_score(y_holdout, pred_holdout_xgbc)

0.8333333333333334

### Classification Report

In [37]:
print(classification_report(y_holdout, pred_holdout_xgbc_opt, target_names=['0 - healthy','1 - diabet ']))

              precision    recall  f1-score   support

 0 - healthy       0.33      1.00      0.50         2
 1 - diabet        0.00      0.00      0.00         4

   micro avg       0.33      0.33      0.33         6
   macro avg       0.17      0.50      0.25         6
weighted avg       0.11      0.33      0.17         6



  'precision', 'predicted', average, warn_for)


### Матрица ошибок (confusion matrix) 
<img src = 'confusion_matrix.png'>

In [38]:
print(confusion_matrix(y_holdout, pred_holdout_xgbc_opt))

[[2 0]
 [4 0]]
