In [10]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from KNN import KNN
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

import sys
sys.path.append('./models/')
import warnings
warnings.simplefilter("ignore")

In [5]:
def print_error_validation(accuracy, precission, roc_auc):
    print('Ошибки на кросс валидации:')
    print(f'Accuracy  = {accuracy}')
    print(f'Precision = {precision}')
    print(f'Roc auc   = {roc_auc}')
    
def retrain(model, x_train, x_test, y_train, y_test):
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train = np.column_stack((x_train, y_train))
    
    model.fit(x_train, y_train)
    
    y_pred_test = model.predict(x_train, x_test)
    y_pred_train = model.predict(x_train, x_train)
    
    acc = accuracy_score(y_pred_train, y_train), accuracy_score(y_pred_test, y_test)
    
    prec = precision_score(y_pred_train, y_train, pos_label='positive', average='micro'), precision_score(y_pred_test, y_test, pos_label='positive', average='micro')
    
    roc = roc_auc_score(pd.get_dummies(y_train), pd.get_dummies(y_pred_train), multi_class='ovr'), roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred_test), multi_class='ovr')
    
    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
def sklearn_retrain(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    
    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    acc = accuracy_score(y_pred_train, y_train), accuracy_score(y_pred_test, y_test)
    
    prec = precision_score(y_pred_train, y_train, pos_label='positive', average='micro'), precision_score(y_pred_test, y_test, pos_label='positive', average='micro')
    
    roc = roc_auc_score(pd.get_dummies(y_train), pd.get_dummies(y_pred_train), multi_class='ovr'), roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred_test), multi_class='ovr')
    
    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
def search_n_neighbors(x_train, y_train, x_test, y_test, n_neighbors_list):
    best_n = None
    best_roc = -10
    for n_n in n_neighbors_list:
        model = KNN(n_n, 5)
        accuracy, precision, roc_auc = cross_val_score(model, x_train.values, y_train.values, x_test.values, y_test.values)
        if roc_auc > best_roc:
            best_n = n_n

In [6]:
def cross_val_score(model, x_train, y_train, x_test, y_test):
    acc_scores = []
    prec_scores = []
    roc_scores = []

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train = np.column_stack((x_train, y_train))
    
    sc = model.fit(x_train, y_train)
    
    print(f'CV scores: {sc}')
    mn = sum(sc) / float(len(sc))
    print(f'Train data accuracy: {mn}')
    
    y_pred = model.predict(x_train, x_test)
    
    acc_scores.append(accuracy_score(y_test, y_pred))
    prec_scores.append(precision_score(y_test, y_pred, pos_label='positive', average='micro'))
    roc_scores.append(roc_auc_score(pd.get_dummies(y_test), 
                                    pd.get_dummies(y_pred), multi_class='ovr'))
    
    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score

# Подготовка данных

In [8]:
data = pd.read_csv('models/StudentsPerformance.csv.xls')

In [11]:
label_encoder = LabelEncoder()

mapped_education = pd.Series(label_encoder.fit_transform(data['parental level of education']))
mapped_education.value_counts().plot.barh()
print(dict(enumerate(label_encoder.classes_)))

{0: "associate's degree", 1: "bachelor's degree", 2: 'high school', 3: "master's degree", 4: 'some college', 5: 'some high school'}


In [12]:
categorical_columns = data.columns[data.dtypes == 'object'].union(['gender'])
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,1,1,1,72,72,74
1,0,2,4,1,0,69,90,88
2,0,1,3,1,1,90,95,93
3,1,0,0,0,1,47,57,44
4,1,2,4,1,1,76,78,75


In [13]:
X, Y = data.drop(['test preparation course'], axis=1), data['test preparation course']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# KNN 

In [14]:
n_neighbors = search_n_neighbors(x_train, y_train, x_test, y_test, np.arange(3,12,2))
print(f'Best n: {n_neighbors}')

CV scores: [0.6492537313432836, 0.6268656716417911, 0.6492537313432836, 0.582089552238806, 0.5671641791044776]
Train data accuracy: 0.6149253731343284
CV scores: [0.7014925373134329, 0.6119402985074627, 0.582089552238806, 0.6492537313432836, 0.6268656716417911]
Train data accuracy: 0.6343283582089552
CV scores: [0.5895522388059702, 0.6791044776119403, 0.6791044776119403, 0.6268656716417911, 0.6716417910447762]
Train data accuracy: 0.6492537313432836
CV scores: [0.6716417910447762, 0.6119402985074627, 0.664179104477612, 0.6567164179104478, 0.7014925373134329]
Train data accuracy: 0.6611940298507463
CV scores: [0.6492537313432836, 0.6343283582089553, 0.6716417910447762, 0.6492537313432836, 0.7014925373134329]
Train data accuracy: 0.6611940298507463
Best n: None


In [15]:
model = KNN(5)
accuracy, precision, roc_auc = cross_val_score(model, x_train.values, y_train.values, x_test.values, y_test.values)
print_error_validation(accuracy, precision, roc_auc)
# model.fit(x_train, y_train)
# y_pred = model.predict(x_test)

CV scores: [0.6194029850746269, 0.6343283582089553, 0.7388059701492538, 0.6567164179104478, 0.5895522388059702]
Train data accuracy: 0.6477611940298507
Ошибки на кросс валидации:
Accuracy  = 0.6393939393939394
Precision = 0.6393939393939394
Roc auc   = 0.5793650793650793


In [16]:
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.7313432835820896  |  0.6393939393939394
Precision = 0.7313432835820896  |  0.6393939393939394
Roc auc   = 0.6698452999527633  |  0.5793650793650793


# KNN с Sklearn

In [17]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)

neigh.fit(x_train, y_train)
y_pred = neigh.predict(x_test)

print(f'Ошибка на выборках:\nAccuracy = {accuracy_score(y_test, y_pred)}')

Ошибка на выборках:
Accuracy = 0.6515151515151515


In [18]:
sklearn_retrain(neigh, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.7373134328358208  |  0.6515151515151515
Precision = 0.7373134328358208  |  0.6515151515151515
Roc auc   = 0.6744115099984255  |  0.5997899159663865
