In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import linear_model, tree, ensemble, svm, neighbors
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Prefer the public import path; the private `samples_generator` module only
# exists in older scikit-learn releases and is kept as a fallback.
try:
    from sklearn.datasets import make_classification
except ImportError:
    from sklearn.datasets.samples_generator import make_classification

In [2]:
# Random seed passed as `random_state` to the train/test split and to the seeded estimators.
seed = 223

# Load Data

In [4]:
# Synthetic classification data: 1000 samples with 10 features each.
# The notebook-wide `seed` is reused instead of repeating the literal, so the
# whole run is controlled from a single constant.
X, y = make_classification(n_samples=1000, n_features=10, random_state=seed)
print('X.shape: ',X.shape)

X.shape:  (1000, 10)


In [8]:
# Class-balance check: the distinct labels in y and how many samples carry each.
np.unique(y, return_counts=True)

(array([0, 1]), array([497, 503]))

## Split data

In [9]:
# Hold out 30% of the samples for testing; the split is seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
)

# Report the size of each partition.
print('x_train.shape: ', x_train.shape)
print('x_test.shape: ', x_test.shape)

x_train.shape:  (700, 10)
x_test.shape:  (300, 10)


# Building Models

In [28]:
# Every estimator that accepts `random_state` receives the shared seed so a
# Restart & Run All reproduces the same fitted models and scores.

# Linear models. NOTE: `ols` is a logistic regression despite its name; the
# name is kept because later cells refer to it.
ols = linear_model.LogisticRegression(random_state=seed)
ridge = linear_model.RidgeClassifier(random_state=seed)

# Tree-based models and ensembles.
dt = tree.DecisionTreeClassifier(random_state=seed)
rf = ensemble.RandomForestClassifier(random_state=seed)
ada = ensemble.AdaBoostClassifier(random_state=seed)
gt = ensemble.GradientBoostingClassifier(random_state=seed)

# probability=True enables predict_proba; the calibration behind it shuffles
# the data, so it must be seeded for the probabilities to be repeatable.
svc = svm.SVC(probability=True, random_state=seed)
# k-nearest neighbours takes no random_state.
knn = neighbors.KNeighborsClassifier()

# Training

In [29]:
# Fit each classifier on the training partition, in the order they were built.
for estimator in (ols, ridge, dt, rf, ada, gt, svc):
    estimator.fit(x_train, y_train)

# Fitted last and left as the final expression so the cell displays its repr.
knn.fit(x_train, y_train)



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# Evaluation

In [43]:
# Positive-class probability (column 1 of predict_proba) on the test set for
# every model that is scored through predict_proba.
ols_prob, dt_prob, rf_prob, ada_prob, gt_prob, svc_prob, knn_prob = (
    clf.predict_proba(x_test)[:, 1]
    for clf in (ols, dt, rf, ada, gt, svc, knn)
)

# The ridge model is scored with predict(), so `ridge_prob` holds predicted
# class labels rather than probabilities; its AUC downstream is therefore
# computed from those labels.
ridge_prob = ridge.predict(x_test)

In [56]:
# Parallel lists: position k in each one refers to the same classifier.
model_name = ['ols', 'ridge', 'dt', 'rf', 'ada', 'gt', 'svc', 'knn']
model = [ols, ridge, dt, rf, ada, gt, svc, knn]
prob_lst = [
    ols_prob, ridge_prob,
    dt_prob, rf_prob, ada_prob, gt_prob,
    svc_prob, knn_prob,
]
# Turn each score vector into 0/1 predictions with a 0.5 cut-off.
pred_lst = [(scores > 0.5).astype(int).tolist() for scores in prob_lst]

In [68]:
def model_eval(y_true, y_pred, y_prob, ndigits=3):
    """Score one classifier and return its rounded metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels; used for accuracy, precision, recall and F1.
    y_prob : array-like
        Per-sample score passed to ``roc_auc_score``; used only for the AUC.
    ndigits : int, default 3
        Number of decimal places each metric is rounded to.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, auc)``, in that order.
    """
    # Metrics that compare predicted labels against the ground truth.
    label_scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    label_metrics = tuple(
        round(scorer(y_true, y_pred), ndigits) for scorer in label_scorers
    )

    # ROC AUC is the only metric computed from the scores.
    auc = round(metrics.roc_auc_score(y_true, y_prob), ndigits)

    return label_metrics + (auc,)

In [90]:
# One column per model, one row per metric (row order matches model_eval's return order).
eval_df = pd.DataFrame(
    {
        name: model_eval(y_test, preds, scores)
        for name, preds, scores in zip(model_name, pred_lst, prob_lst)
    },
    index=['Accuracy','Precision','Recall','F1_score','AUC'],
)

In [91]:
# Show one row per model, ranked from highest to lowest accuracy.
eval_df.transpose().sort_values('Accuracy', ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1_score,AUC
ols,0.963,0.936,0.985,0.96,0.985
rf,0.963,0.956,0.963,0.959,0.98
gt,0.963,0.942,0.978,0.96,0.987
ridge,0.957,0.923,0.985,0.953,0.959
svc,0.95,0.922,0.97,0.945,0.98
dt,0.947,0.928,0.955,0.941,0.947
ada,0.94,0.926,0.94,0.933,0.98
knn,0.923,0.872,0.97,0.919,0.972


# Save & Load

In [76]:
# Create the output directory together with any missing parent directories.
# exist_ok=True keeps the cell idempotent: re-running the notebook no longer
# raises FileExistsError when the directory is already there.
os.makedirs('../save/notebook_classification', exist_ok=True)

## Save

In [77]:
# Pickle every fitted estimator to its own file, named after its short label.
for name, estimator in zip(model_name, model):
    with open(f'../save/notebook_classification/{name}_nb_cls.pkl', 'wb') as fh:
        pickle.dump(estimator, fh)

In [83]:
# Write the accuracy-ranked metrics table (one row per model) next to the pickled models.
eval_df.transpose().sort_values('Accuracy', ascending=False).to_csv(
    '../save/notebook_classification/metrics.csv'
)

## Load

In [78]:
# Read the pickled estimators back, in the same order as `model_name`.
# NOTE: unpickling executes code stored in the file, so only load pickles
# from a trusted source.
load_model = []
for name in model_name:
    with open(f'../save/notebook_classification/{name}_nb_cls.pkl', 'rb') as fh:
        restored = pickle.load(fh)
    load_model.append(restored)

In [80]:
# Re-score the test set with the reloaded estimators, in `model_name` order.
# The ridge entry is scored with predict() (class labels); every other entry
# uses the positive-class column of predict_proba.
load_prob = [
    estimator.predict(x_test) if label == 'ridge'
    else estimator.predict_proba(x_test)[:, 1]
    for label, estimator in zip(model_name, load_model)
]

# Rebind the per-model names to the freshly computed scores.
(ols_prob, ridge_prob,
 dt_prob, rf_prob, ada_prob, gt_prob,
 svc_prob, knn_prob) = load_prob

In [86]:
# Evaluate the reloaded models with the same decision rule as the original
# evaluation: scores thresholded at 0.5. Using estimator.predict() here would
# not be like-for-like, because predict() and predict_proba() can disagree for
# some estimators (e.g. SVC with probability=True), and a mismatch could then
# be wrongly blamed on the pickle round trip.
load_eval_df = pd.DataFrame(
    {
        name: model_eval(y_test, list(map(int, scores > 0.5)), scores)
        for name, scores in zip(model_name, load_prob)
    },
    index=['Accuracy','Precision','Recall','F1_score','AUC'],
)
# One row per model, ranked from highest to lowest accuracy.
load_eval_df.T.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1_score,AUC
ols,0.963,0.936,0.985,0.96,0.985
rf,0.963,0.956,0.963,0.959,0.98
gt,0.963,0.942,0.978,0.96,0.987
ridge,0.957,0.923,0.985,0.953,0.959
svc,0.95,0.922,0.97,0.945,0.98
dt,0.947,0.928,0.955,0.941,0.947
ada,0.94,0.926,0.94,0.933,0.98
knn,0.923,0.872,0.97,0.919,0.972


In [85]:
# Element-wise comparison of the reloaded-model metrics against the original
# ones; every cell should be True if the pickle round trip preserved the models.
(
    load_eval_df.transpose().sort_values('Accuracy', ascending=False)
    == eval_df.transpose().sort_values('Accuracy', ascending=False)
)

Unnamed: 0,Accuracy,Precision,Recall,F1_score,AUC
ols,True,True,True,True,True
rf,True,True,True,True,True
gt,True,True,True,True,True
ridge,True,True,True,True,True
svc,True,True,True,True,True
dt,True,True,True,True,True
ada,True,True,True,True,True
knn,True,True,True,True,True
