# Задание
* Применить алгоритм балансировки классов к набору данных, использовавшемуся в заданиях 1 - 3.
* Оценить качество классификаторов из заданий 1 - 3 на сбалансированном наборе данных.
* Сравнить качество классификаторов, разработанных с применением несбалансированного и сбалансированного набора данных.

In [99]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

Загружаем датасет.

In [100]:
data = pd.read_csv('./Data/hepatitis.data', index_col=False, names=['Class', 'Age', 'Sex', 'Steroid', 'Antivrals', 'Fatigue', 'Malaise', 'Anorexia',
                                                                    'Liver big', 'Liver firm', 'Spleen palpable', 'Spiders', 'Ascites', 'Varices', 'Bilirubin', 'Alk phosphate', 'Sgot', 'Albumin', 'Protime', 'Histology'])

In [101]:
data.head()

Unnamed: 0,Class,Age,Sex,Steroid,Antivrals,Fatigue,Malaise,Anorexia,Liver big,Liver firm,Spleen palpable,Spiders,Ascites,Varices,Bilirubin,Alk phosphate,Sgot,Albumin,Protime,Histology
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,?,1
3,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,?,200,4.0,?,1


Удаляем столбец классов, заменяем пропущенные данные средним значением по столбцу и переназначаем идентификаторы классов.

In [124]:
x = data.drop(['Class'], axis=1)
y = data['Class']
x = x.replace('?', np.nan)
y = y.replace(1, 0)
y = y.replace(2, 1)
for (colName, colData) in x.iteritems():
    x[colName] = x[colName].astype(float)
    x[colName] = x[colName].fillna(x[colName].mean())
    
scaler = MinMaxScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns = x.columns)

Разбиваем выборку на обучающую и тестовую.

In [291]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=10)

## KNN без балансировки

In [292]:
knn_model = KNeighborsClassifier()

neighbours = [i+1 for i in range(1, 15)]
algorithm = ['ball_tree', 'kd_tree', 'brute']
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']

gs_clf = GridSearchCV(knn_model, param_grid={'n_neighbors': neighbours, 'algorithm': algorithm,
                                             'weights': weights, 'metric': metric})

gs_clf.fit(X_train, y_train)
best_cv_err = 1 - gs_clf.best_score_
best_param_neighbours = gs_clf.best_estimator_.n_neighbors
best_param_algorithm = gs_clf.best_estimator_.algorithm
best_param_weights = gs_clf.best_estimator_.weights
best_param_metric = gs_clf.best_estimator_.metric

knn_model = KNeighborsClassifier(n_neighbors=best_param_neighbours, algorithm=best_param_algorithm,
                                 weights=best_param_weights, metric=best_param_metric)
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

KNN_acc = accuracy_score(y_test, y_pred)
KNN_conf = confusion_matrix(y_test, y_pred)
KNN_rec = recall_score(y_test, y_pred)
print(KNN_acc)
print(KNN_conf)
print(KNN_rec)

0.8297872340425532
[[ 4  5]
 [ 3 35]]
0.9210526315789473


## SVM без балансировки

In [293]:
parameters = {'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'C': [
    0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}
clf = GridSearchCV(svm.SVC(), parameters)
clf.fit(X_train, y_train)

sv = clf.best_estimator_

sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)

SVM_acc = accuracy_score(y_test, y_pred)
SVM_conf = confusion_matrix(y_test, y_pred)
SVM_rec = recall_score(y_test, y_pred)
print(SVM_acc)
print(SVM_conf)
print(SVM_rec)

0.851063829787234
[[ 6  3]
 [ 4 34]]
0.8947368421052632


## Random Forest без балансировки

In [294]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier())
])
params = {
    'rf__n_estimators': [120, 140],
    'rf__max_depth': [30, 50],
    'rf__min_samples_split': [2, 3],
    'rf__min_samples_leaf': [3, 5],
}

RF_gs = GridSearchCV(pipe, param_grid=params, scoring='roc_auc', cv=3)
RF_gs.fit(X_train, y_train)

RF = RF_gs.best_estimator_
RF.fit(X_train, y_train)

y_pred = RF.predict(X_test)

RF_acc = accuracy_score(y_test, y_pred)
RF_conf = confusion_matrix(y_test, y_pred)
RF_rec = recall_score(y_test, y_pred)
print(RF_acc)
print(RF_conf)
print(RF_rec)

0.8297872340425532
[[ 4  5]
 [ 3 35]]
0.9210526315789473


## Производим балансировку классов.

In [295]:
smt = SMOTE(random_state=10)
X_train, y_train = smt.fit_sample(X_train, y_train)

### KNN с балансировкой

In [296]:
gs_clf.fit(X_train, y_train)
best_cv_err = 1 - gs_clf.best_score_
best_param_neighbours = gs_clf.best_estimator_.n_neighbors
best_param_algorithm = gs_clf.best_estimator_.algorithm
best_param_weights = gs_clf.best_estimator_.weights
best_param_metric = gs_clf.best_estimator_.metric

knn_model = KNeighborsClassifier(n_neighbors=best_param_neighbours, algorithm=best_param_algorithm,
                                 weights=best_param_weights, metric=best_param_metric)
knn_model.fit(X_train, y_train)

KNN_acc_b = accuracy_score(y_test, y_pred)
KNN_conf_b = confusion_matrix(y_test, y_pred)
KNN_rec_b = recall_score(y_test, y_pred)
print(KNN_acc)
print(KNN_conf)
print(KNN_rec)

0.8297872340425532
[[ 4  5]
 [ 3 35]]
0.9210526315789473


### SVM с балансировкой

In [297]:
clf.fit(X_train, y_train)

sv = clf.best_estimator_

sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)

SVM_acc_b = accuracy_score(y_test, y_pred)
SVM_conf_b = confusion_matrix(y_test, y_pred)
SVM_rec_b = recall_score(y_test, y_pred)
print(SVM_acc)
print(SVM_conf)
print(SVM_rec)

0.851063829787234
[[ 6  3]
 [ 4 34]]
0.8947368421052632


### Random Forest с балансировкой

In [298]:
RF_gs.fit(X_train, y_train)

RF = RF_gs.best_estimator_
RF.fit(X_train, y_train)

y_pred = RF.predict(X_test)

RF_acc_b = accuracy_score(y_test, y_pred)
RF_conf_b = confusion_matrix(y_test, y_pred)
RF_rec_b = recall_score(y_test, y_pred)
print(RF_acc)
print(RF_conf)
print(RF_rec)

0.8297872340425532
[[ 4  5]
 [ 3 35]]
0.9210526315789473


## Подготовим данные и  сравним результаты

In [299]:
result_dic = {"Default RF": pd.Series([RF_acc, RF_conf, RF_rec], index=['accuracy_score', 'confusion_matrix', 'recall_score']), 
      "Balansed RF": pd.Series([RF_acc_b, RF_conf_b, RF_rec_b], index=['accuracy_score', 'confusion_matrix', 'recall_score']),
      "Default SVM": pd.Series([SVM_acc, SVM_conf, SVM_rec], index=['accuracy_score', 'confusion_matrix', 'recall_score']),
      "Balansed SVM": pd.Series([SVM_acc_b, SVM_conf_b, SVM_rec_b], index=['accuracy_score', 'confusion_matrix', 'recall_score']),   
      "Default KNN": pd.Series([KNN_acc, KNN_conf, KNN_rec], index=['accuracy_score', 'confusion_matrix', 'recall_score']),   
      "Balansed KNN": pd.Series([KNN_acc_b, KNN_conf_b, KNN_rec_b], index=['accuracy_score', 'confusion_matrix', 'recall_score'])}

In [300]:
result = pd.DataFrame(result_dic)
result

Unnamed: 0,Default RF,Balansed RF,Default SVM,Balansed SVM,Default KNN,Balansed KNN
accuracy_score,0.829787,0.808511,0.851064,0.808511,0.829787,0.829787
confusion_matrix,"[[4, 5], [3, 35]]","[[7, 2], [7, 31]]","[[6, 3], [4, 34]]","[[3, 6], [3, 35]]","[[4, 5], [3, 35]]","[[4, 5], [3, 35]]"
recall_score,0.921053,0.815789,0.894737,0.921053,0.921053,0.921053


**Вывод:** как видно из таблицы - балансировка классов улучшила результаты только для алгоритма SVM.