In [37]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import svm

from sklearn.model_selection import cross_val_score, StratifiedKFold

In [2]:
df = pd.read_csv("datasets/earLobe.csv")

In [3]:
df.head()

Unnamed: 0,patientID,has_DM2,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,...,Var3152,Var3153,Var3154,Var3155,Var3156,Var3157,Var3158,Var3159,Var3160,Var3161
0,ramanShift,,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,...,3150,3151,3152,3153,3154,3155,3156,3157,3158,3159
1,DM201,1.0,181.8,181.8,181.8,181.8,181.8,181.8,181.8,181.8,...,0,0,0,0,0,0,0,0,0,0
2,DM202,1.0,162.8,162.8,162.8,162.8,162.8,162.8,162.8,162.8,...,0,0,0,0,0,0,0,0,0,0
3,DM203,1.0,107.4,107.4,107.4,107.4,107.4,107.4,107.4,107.4,...,0,0,0,0,0,0,0,0,0,0
4,DM204,1.0,290.166667,290.166667,290.166667,290.166667,290.166667,290.166667,290.166667,290.166667,...,0,0,0,0,0,0,0,0,0,0


Вырезаем колонку с 'patientID'

In [4]:
patientID = df.pop('patientID')

Удаляем строку с частотами

In [5]:
df.drop(0, inplace=True)

Разделяем данные на матрицу с признаками X и на столбец с целевой переменной (метками)

In [6]:
X, y = df.drop('has_DM2', axis=1), df['has_DM2']

#### Оценку качества модели (accuracy)  будем проводить методом отложенной выборки (hold-out set)
При таком подходе мы оставляем какую-то долю обучающей выборки (как правило от 20% до 40%), обучаем модель на остальных данных (60-80% исходной выборки) и считаем некоторую метрику качества модели (например, самое простое – долю правильных ответов в задаче классификации) на отложенной выборке.

In [7]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=.3, random_state=7)

In [8]:
y.value_counts(normalize=True)

1.0    0.55
0.0    0.45
Name: has_DM2, dtype: float64

#### Точность предсказания должна быть лучше как минимум чем 55%

### Logistic Regression

In [46]:
logreg = LogisticRegression(random_state=17, solver='lbfgs', max_iter=1000)
logreg.fit(X_train, y_train);

In [47]:
pred_holdout_logreg = logreg.predict(X_holdout)
logreg_accuracy = accuracy_score(y_holdout, pred_holdout_logreg)
logreg_accuracy

0.6666666666666666

### Random Forest

In [44]:
rf_clf = RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=7)
rf_clf.fit(X_train, y_train);

In [41]:
pred_holdout_rf_clf = rf_clf.predict(X_holdout)
rf_accuracy = accuracy_score(y_holdout, pred_holdout_rf_clf)
rf_accuracy

0.6666666666666666

### XGBoost

In [13]:
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train);

In [14]:
pred_holdout_xgbc = xgbc.predict(X_holdout)
XGB_accuracy = accuracy_score(y_holdout, pred_holdout_xgbc)
XGB_accuracy

0.8333333333333334

### Support Vector Machines

In [15]:
clf_svm = svm.SVC(gamma='scale')
clf_svm.fit(X_train, y_train);

In [16]:
pred_holdout_svm = clf_svm.predict(X_holdout)
svm_accuracy = accuracy_score(y_holdout, pred_holdout_svm)
svm_accuracy

0.6666666666666666

### Качество моделей (оценка на основе отложенной выборки)

In [24]:
accuracy = pd.DataFrame({
    'xgboost': [XGB_accuracy],
    'logreg': [logreg_accuracy],
    'random forest': [rf_accuracy],
    'SVM': [svm_accuracy]
    }, index=['accuracy'])

In [25]:
accuracy

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.833333,0.666667,0.666667,0.666667


### Кросс-валидация

In [42]:
skf = StratifiedKFold(n_splits = 5, random_state=1, shuffle = True)

In [48]:
svm_cvs = cross_val_score(clf_svm, X, y, scoring='accuracy', cv = skf).mean()
xgb_cvs = cross_val_score(xgbc, X, y, scoring='accuracy', cv = skf).mean()
logreg_cvs = cross_val_score(logreg, X, y, scoring='accuracy', cv = skf).mean()
rf_cvs = cross_val_score(rf_clf, X, y, scoring='accuracy', cv = skf).mean()

In [49]:
accuracy_cvs = pd.DataFrame({
    'xgboost': [xgb_cvs],
    'logreg': [logreg_cvs],
    'random forest': [rf_cvs],
    'SVM': [svm_cvs]
    }, index=['accuracy'])

In [50]:
accuracy_cvs

Unnamed: 0,xgboost,logreg,random forest,SVM
accuracy,0.613333,0.436667,0.536667,0.553333
