In [56]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import svm

from sklearn.model_selection import cross_val_score, StratifiedKFold

In [2]:
# Load the earLobe dataset from disk and preview five randomly chosen rows.
df = pd.read_csv(filepath_or_buffer="datasets/earLobe.csv")
df.sample(n=5)

Unnamed: 0,patientID,has_DM2,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var3152,Var3153,Var3154,Var3155,Var3156,Var3157,Var3158,Var3159,Var3160,Var3161
19,Ctrl08,0.0,228.666667,228.666667,228.666667,228.666667,228.666667,228.666667,228.666667,228.666667,...,0,0,0,0,0,0,0,0,0,0
0,ramanShift,,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,...,3150,3151,3152,3153,3154,3155,3156,3157,3158,3159
11,DM211,1.0,105.8,105.8,105.8,105.8,105.8,105.8,105.8,105.8,...,0,0,0,0,0,0,0,0,0,0
12,Ctrl01,0.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,...,0,0,0,0,0,0,0,0,0,0
8,DM208,1.0,55.833333,55.833333,55.833333,55.833333,55.833333,55.833333,55.833333,55.833333,...,0,0,0,0,0,0,0,0,0,0


Вырезаем колонку с 'patientID'

In [3]:
# Detach the patient identifier column: `pop` removes it from `df` in place
# and returns it, so it is kept aside and does not end up among the features.
patientID = df.pop(item="patientID")

Удаляем строку с частотами

In [4]:
# Drop the row with index label 0 (the "ramanShift" row, which has no
# has_DM2 label and holds axis values rather than a patient measurement).
# Rebinding `df` instead of using `inplace=True`, together with
# `errors="ignore"`, makes this cell safe to re-run: a second execution no
# longer raises KeyError because label 0 is already gone.
df = df.drop(index=0, errors="ignore")

Разделяем данные на матрицу с признаками X и на столбец с целевой переменной (метками)

In [5]:
# Target labels: the `has_DM2` column.
y = df["has_DM2"]
# Feature matrix: every remaining column.
X = df.drop(columns="has_DM2")

#### Оценку качества модели (accuracy) будем проводить методом отложенной выборки (hold-out set)
При таком подходе мы откладываем некоторую долю обучающей выборки (как правило, от 20% до 40%), обучаем модель на остальных данных (60–80% исходной выборки) и считаем некоторую метрику качества модели (например, самое простое – долю правильных ответов в задаче классификации) на отложенной выборке.

In [6]:
# Set aside 30% of the rows as the hold-out set; the fixed seed makes the
# split repeatable.
# NOTE(review): the split is not stratified, so the class ratio in the
# hold-out set may differ from the full data - consider `stratify=y`.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [7]:
# Relative class frequencies of the target; the largest share is the accuracy
# that a constant "always predict the majority class" model would reach.
y.value_counts(normalize=True)

1.0    0.55
0.0    0.45
Name: has_DM2, dtype: float64

#### Точность предсказания должна быть как минимум выше 55% (доли самого частого класса)

### Logistic Regression

In [40]:
# Logistic-regression baseline, fitted on the training part of the split.
logreg = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=17,
)
logreg.fit(X_train, y_train);

In [10]:
# Predict the hold-out labels with the fitted logistic regression and
# measure the share of correct answers.
pred_holdout_logreg = logreg.predict(X_holdout)
logreg_accuracy = accuracy_score(y_true=y_holdout, y_pred=pred_holdout_logreg)
logreg_accuracy

0.6666666666666666

### Random Forest

In [11]:
# Random forest of 10 trees, trained with two parallel jobs and a fixed seed.
rf_clf = RandomForestClassifier(
    n_estimators=10,
    random_state=7,
    n_jobs=2,
)
rf_clf.fit(X_train, y_train);

In [12]:
# Hold-out predictions and accuracy of the random forest.
pred_holdout_rf_clf = rf_clf.predict(X_holdout)
rf_accuracy = accuracy_score(y_true=y_holdout, y_pred=pred_holdout_rf_clf)
rf_accuracy

0.6666666666666666

### XGBoost

In [13]:
# Gradient-boosted trees with the library's default hyper-parameters.
xgbc = XGBClassifier()
xgbc.fit(X=X_train, y=y_train);

In [14]:
# Hold-out predictions and accuracy of the XGBoost classifier.
pred_holdout_xgbc = xgbc.predict(X_holdout)
XGB_accuracy = accuracy_score(y_true=y_holdout, y_pred=pred_holdout_xgbc)
XGB_accuracy

0.8333333333333334

### Support Vector Machines

In [15]:
# Support-vector classifier with the 'scale' gamma setting; every other
# hyper-parameter is left at its default.
clf_svm = svm.SVC(gamma="scale")
clf_svm.fit(X=X_train, y=y_train);

In [16]:
# Hold-out predictions and accuracy of the SVM.
pred_holdout_svm = clf_svm.predict(X_holdout)
svm_accuracy = accuracy_score(y_true=y_holdout, y_pred=pred_holdout_svm)
svm_accuracy

0.6666666666666666

### Качество моделей (оценка на основе отложенной выборки)

In [17]:
# One-row summary table: hold-out accuracy of each of the four models.
accuracy = pd.DataFrame(
    data={
        "xgboost": [XGB_accuracy],
        "logreg": [logreg_accuracy],
        "random forest": [rf_accuracy],
        "SVM": [svm_accuracy],
    },
    index=["accuracy"],
)

In [18]:
# Show the table of hold-out accuracies.
accuracy

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.833333,0.666667,0.666667,0.666667


### Кросс-валидация

In [20]:
# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds
# identical for every model evaluated with it.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

In [25]:
def _mean_cv_accuracy(estimator):
    """Return the mean accuracy of `estimator` over the `skf` folds of the training set."""
    fold_scores = cross_val_score(estimator, X_train, y_train, scoring='accuracy', cv=skf)
    return fold_scores.mean()


# Mean cross-validated accuracy for each model, all using the same folds.
svm_cvs = _mean_cv_accuracy(clf_svm)
xgb_cvs = _mean_cv_accuracy(xgbc)
logreg_cvs = _mean_cv_accuracy(logreg)
rf_cvs = _mean_cv_accuracy(rf_clf)

In [26]:
# One-row table with the mean cross-validation accuracy of each model,
# using the same column names as the hold-out table.
accuracy_cvs = pd.DataFrame(
    data={
        "xgboost": [xgb_cvs],
        "logreg": [logreg_cvs],
        "random forest": [rf_cvs],
        "SVM": [svm_cvs],
    },
    index=["cvs_mean"],
)

In [27]:
# Add the cross-validation row(s) to the accuracy table.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so `pd.concat` is used instead. Rows already carrying the labels of
# `accuracy_cvs` (left over from an earlier execution of this cell) are
# dropped first, so re-running the cell replaces the row instead of
# duplicating it.
accuracy = pd.concat(
    [accuracy.drop(index=accuracy_cvs.index, errors="ignore"), accuracy_cvs]
)

In [28]:
# Show the combined table: hold-out accuracy and mean cross-validation accuracy.
accuracy

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.833333,0.666667,0.666667,0.666667
cvs_mean,0.5,0.5,0.55,0.45
cvs_mean,0.5,0.5,0.55,0.45


### Classification Report

In [29]:
# Per-class precision / recall / F1 of the logistic regression on the hold-out set.
print(
    '**Logistic Regression**\n',
    classification_report(
        y_true=y_holdout,
        y_pred=pred_holdout_logreg,
        target_names=['0 - healthy', '1 - diabet '],
    ),
)

**Logistic Regression**
               precision    recall  f1-score   support

 0 - healthy       0.50      0.50      0.50         2
 1 - diabet        0.75      0.75      0.75         4

   micro avg       0.67      0.67      0.67         6
   macro avg       0.62      0.62      0.62         6
weighted avg       0.67      0.67      0.67         6



In [30]:
# Per-class precision / recall / F1 of the random forest on the hold-out set.
print(
    '**Random Forest**\n',
    classification_report(
        y_true=y_holdout,
        y_pred=pred_holdout_rf_clf,
        target_names=['0 - healthy', '1 - diabet '],
    ),
)

**Random Forest**
               precision    recall  f1-score   support

 0 - healthy       0.50      1.00      0.67         2
 1 - diabet        1.00      0.50      0.67         4

   micro avg       0.67      0.67      0.67         6
   macro avg       0.75      0.75      0.67         6
weighted avg       0.83      0.67      0.67         6



### Матрица ошибок (confusion matrix) 
<img src = 'confusion_matrix.png'>

In [31]:
# Confusion matrix of the logistic regression on the hold-out set
# (rows: true class, columns: predicted class).
print(
    '**Logistic Regression**\n',
    confusion_matrix(y_true=y_holdout, y_pred=pred_holdout_logreg),
)

**Logistic Regression**
 [[1 1]
 [1 3]]


In [32]:
# Confusion matrix of the random forest on the hold-out set
# (rows: true class, columns: predicted class).
print(
    '**Random Forest**\n',
    confusion_matrix(y_true=y_holdout, y_pred=pred_holdout_rf_clf),
)

**Random Forest**
 [[2 0]
 [2 2]]


In [42]:
from sklearn.model_selection import GridSearchCV

In [49]:
# Hyper-parameter search space for logistic regression: seven log-spaced
# values of C from 1e-3 to 1e3, with the L2 penalty only.
grid = {
    "C": np.logspace(start=-3, stop=3, num=7),
    "penalty": ["l2"],
}

In [50]:
# Exhaustive search over `grid` for the logistic regression, scored with
# 4-fold cross-validation on the training set. The fitted search object is
# the last expression, so its repr is displayed.
logreg_cv = GridSearchCV(
    estimator=logreg,
    param_grid=grid,
    cv=4,
)
logreg_cv.fit(X_train, y_train)



GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [51]:
# Parameter combination that achieved the best mean cross-validation score.
logreg_cv.best_params_

{'C': 0.001, 'penalty': 'l2'}

In [52]:
# Mean cross-validation score of the best parameter combination.
logreg_cv.best_score_

0.5714285714285714

In [54]:
# Hold-out predictions of the logistic regression refitted with the best
# parameters found by the grid search.
best_logreg = logreg_cv.best_estimator_
pred_holdout_logreg_cv = best_logreg.predict(X_holdout)

In [55]:
# Hold-out accuracy of the tuned logistic regression.
accuracy_score(y_true=y_holdout, y_pred=pred_holdout_logreg_cv)

0.6666666666666666