In [1]:
from sklearn import linear_model, tree, ensemble, svm, neighbors
from sklearn import metrics
from sklearn.datasets.samples_generator import make_classification
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd

import pickle
import os

In [2]:
seed = 223

# Load Data

In [3]:
train = pd.read_csv('../data/train(classification).csv')

# Preprocessing

In [4]:
X = train.drop(['id','y'],axis=1)
y = train['y']

## Split data

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

print('x_train.shape: ',x_train.shape)
print('x_test.shape: ',x_test.shape)

x_train.shape:  (4900, 10)
x_test.shape:  (2100, 10)


# Building Models

In [6]:
ols = linear_model.LogisticRegression()
ridge = linear_model.RidgeClassifier(random_state=seed)

dt = tree.DecisionTreeClassifier(random_state=seed)
rf = ensemble.RandomForestClassifier(random_state=seed)
ada = ensemble.AdaBoostClassifier(random_state=seed)
gt = ensemble.GradientBoostingClassifier(random_state=seed)

svc = svm.SVC(probability=True, random_state=seed)
knn = neighbors.KNeighborsClassifier()

# Training

In [7]:
ols.fit(x_train,y_train)
ridge.fit(x_train,y_train)

dt.fit(x_train,y_train)
rf.fit(x_train,y_train)
ada.fit(x_train,y_train)
gt.fit(x_train,y_train)

svc.fit(x_train,y_train)
knn.fit(x_train,y_train)



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# Evaluation

In [8]:
ols_prob = ols.predict_proba(x_test)[:,1]
ridge_prob = ridge.predict(x_test)

dt_prob = dt.predict_proba(x_test)[:,1]
rf_prob = rf.predict_proba(x_test)[:,1]
ada_prob = ada.predict_proba(x_test)[:,1]
gt_prob = gt.predict_proba(x_test)[:,1]

svc_prob = svc.predict_proba(x_test)[:,1]
knn_prob = knn.predict_proba(x_test)[:,1]

In [9]:
model_name = ['ols','ridge','dt','rf','ada','gt','svc','knn']
model = [ols, ridge, dt, rf, ada, gt, svc, knn]
prob_lst = [ols_prob, ridge_prob, dt_prob, rf_prob, ada_prob, gt_prob, svc_prob, knn_prob]
pred_lst = [m.predict(x_test) for m in model]

In [10]:
def model_eval(y_true, y_pred, y_prob):
    acc = round(metrics.accuracy_score(y_true, y_pred), 3)
    precision = round(metrics.precision_score(y_true, y_pred), 3)
    recall = round(metrics.recall_score(y_true, y_pred), 3)
    f1_score = round(metrics.f1_score(y_true, y_pred), 3)
    auc = round(metrics.roc_auc_score(y_true, y_prob), 3)
    
    return acc, precision, recall, f1_score, auc

In [11]:
eval_df = pd.DataFrame(dict([model_name[i],model_eval(y_test, pred_lst[i], prob_lst[i])] for i in range(len(model_name))),
                       index=['Accuracy','Precision','Recall','F1_score','AUC'])

In [12]:
eval_df.T.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1_score,AUC
gt,0.924,0.907,0.947,0.927,0.972
rf,0.923,0.917,0.933,0.925,0.959
ols,0.918,0.905,0.936,0.92,0.97
ridge,0.91,0.874,0.961,0.916,0.91
svc,0.91,0.899,0.927,0.913,0.966
ada,0.906,0.89,0.93,0.909,0.968
knn,0.896,0.885,0.913,0.899,0.942
dt,0.886,0.887,0.888,0.888,0.886


# Save & Load

## Save

In [13]:
for i in range(len(model_name)):
    with open(f'../save/notebook_classification/{model_name[i]}_nb_cls.pkl', 'wb') as file:
        pickle.dump(model[i], file)

In [14]:
eval_df.T.sort_values(by='Accuracy', ascending=False).to_csv('../save/notebook_classification/metrics.csv')

## Load

In [15]:
load_model = []
for i in range(len(model_name)):
    with open(f'../save/notebook_classification/{model_name[i]}_nb_cls.pkl', 'rb') as file:
        load_model.append(pickle.load(file))

In [16]:
ols_prob = load_model[0].predict_proba(x_test)[:,1]
ridge_prob = load_model[1].predict(x_test)

dt_prob = load_model[2].predict_proba(x_test)[:,1]
rf_prob = load_model[3].predict_proba(x_test)[:,1]
ada_prob = load_model[4].predict_proba(x_test)[:,1]
gt_prob = load_model[5].predict_proba(x_test)[:,1]

svc_prob = load_model[6].predict_proba(x_test)[:,1]
knn_prob = load_model[7].predict_proba(x_test)[:,1]

load_prob = [ols_prob, ridge_prob, dt_prob, rf_prob, ada_prob, gt_prob, svc_prob, knn_prob]

In [17]:
load_eval_df = pd.DataFrame(dict([model_name[i],
                                  model_eval(y_test, load_model[i].predict(x_test), load_prob[i])] 
                                  for i in range(len(model_name))),
                            index=['Accuracy','Precision','Recall','F1_score','AUC'])
load_eval_df.T.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1_score,AUC
gt,0.924,0.907,0.947,0.927,0.972
rf,0.923,0.917,0.933,0.925,0.959
ols,0.918,0.905,0.936,0.92,0.97
ridge,0.91,0.874,0.961,0.916,0.91
svc,0.91,0.899,0.927,0.913,0.966
ada,0.906,0.89,0.93,0.909,0.968
knn,0.896,0.885,0.913,0.899,0.942
dt,0.886,0.887,0.888,0.888,0.886


In [18]:
load_eval_df.T.sort_values(by='Accuracy', ascending=False) == eval_df.T.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1_score,AUC
gt,True,True,True,True,True
rf,True,True,True,True,True
ols,True,True,True,True,True
ridge,True,True,True,True,True
svc,True,True,True,True,True
ada,True,True,True,True,True
knn,True,True,True,True,True
dt,True,True,True,True,True
