# This notebook documents and contains code for training and saving traditional classification models on the engineered data

First, let's try logistic regression.

In [None]:
# necessary imports
!pip install pandas 
!pip install numpy
!pip install scikit-learn

In [11]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the feature-engineered table and convert it to one NumPy array.
df = pd.read_csv('data/post_FE.csv')
data = df.to_numpy()

# Every column except the last is used as a feature; the last column is the label.
X, y = data[:, :-1], data[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
print(X.shape, y.shape)

(303, 18) (303,)


In [12]:
import numpy as np

def accuracy(y_pred, y_act):
    """Return the fraction of predictions that match the actual labels.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_act : array-like
        Ground-truth labels; must have the same shape as ``y_pred``.

    Returns
    -------
    numpy.float64
        Share of positions where both inputs agree (``nan`` for empty input).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain Python lists with == yields a single
    # bool instead of an element-wise mask, which would silently give a wrong score.
    y_pred = np.asarray(y_pred)
    y_act = np.asarray(y_act)
    if y_pred.shape != y_act.shape:
        raise ValueError(
            f"shape mismatch: y_pred has shape {y_pred.shape}, y_act has shape {y_act.shape}"
        )
    return np.mean(y_pred == y_act)

# No separate intercept is fitted: per the original author's note, the
# engineered data already contains an intercept term.
model = LogisticRegression(
    fit_intercept=False,
    max_iter=500,
    random_state=50,
)
fit = model.fit(X_train, y_train)

# Score the fitted model on the held-out split.
preds = fit.predict(X_test)
test_accuracy = accuracy(preds, y_test)

print(test_accuracy)

0.8032786885245902


In [13]:
# Element-wise mask: True wherever the prediction equals the held-out label.
np.equal(preds, y_test)

array([ True,  True,  True, False, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True, False, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True])

These results are promising: roughly 80% of the held-out samples are classified correctly. To check that this is not an artifact of one particular train/test split, let's cross-validate.

In [14]:
from sklearn.model_selection import cross_val_score

# Evaluate an unfitted logistic regression with 10-fold cross-validation on the full data.
model = LogisticRegression(max_iter=500, fit_intercept=False, random_state=0)
scores = cross_val_score(estimator=model, X=X, y=y, cv=10)
mean_score = np.mean(scores)

print(scores)
print('average test accuracy', mean_score)

[0.83870968 0.74193548 0.80645161 0.86666667 0.83333333 0.73333333
 0.86666667 0.73333333 0.83333333 0.83333333]
average test accuracy 0.808709677419355


As we can see, the logistic regression model works reasonably well on our dataset (about 81% mean accuracy across the 10 folds). Let's summarize the per-class performance of the model fitted earlier, using its predictions on the held-out test split:

In [15]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the predictions on the held-out split.
report = classification_report(
    y_test,
    preds,
    labels=[0, 1],
    target_names=['low prob', 'high prob'],
)
print(report)

              precision    recall  f1-score   support

    low prob       0.84      0.79      0.81        33
   high prob       0.77      0.82      0.79        28

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.81      0.80      0.80        61



I want to try Support Vector Machines and classification trees as well before moving on to deep learning models:

In [16]:
from sklearn import svm
# Compare SVC kernels by mean 10-fold cross-validation score.
kernels = ['rbf', 'poly', 'sigmoid']

# Expand into (kernel, degree) trials. Only the polynomial kernel is swept over
# degrees 0-4; a degree of None means "leave SVC's default degree untouched".
trials = [
    (kernel, deg)
    for kernel in kernels
    for deg in (range(5) if kernel == 'poly' else [None])
]

for kernel, deg in trials:
    swept = deg is not None
    svm_classifier = svm.SVC(kernel=kernel, degree=deg) if swept else svm.SVC(kernel=kernel)
    scores = cross_val_score(svm_classifier, X, y, cv=10)
    label = (kernel, 'of degree', deg) if swept else (kernel,)
    print('average test accuracy for', *label, np.mean(scores))

average test accuracy for rbf 0.8053763440860214
average test accuracy for poly of degree 0 0.5445161290322581
average test accuracy for poly of degree 1 0.8218279569892474
average test accuracy for poly of degree 2 0.8316129032258065
average test accuracy for poly of degree 3 0.8283870967741936
average test accuracy for poly of degree 4 0.8218279569892474
average test accuracy for sigmoid 0.7759139784946236


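So, the 2nd-degree polynomial kernel performs best, although degrees 1–4 and the RBF kernel are all within a few percentage points of each other. (A degree-0 polynomial kernel is constant and carries no information about the inputs, hence its near-chance score.) Let's try a decision tree instead!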

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Compare both split criteria with 10-fold cross-validation.
# A fixed random_state makes the fitted trees, and therefore the scores,
# reproducible between runs; without it the features considered at each split
# are randomly permuted and the numbers change on every execution.
# Results are printed in this order: gini first, then entropy.
for criterion in ('gini', 'entropy'):
    tree = DecisionTreeClassifier(criterion=criterion, random_state=0)
    scores = cross_val_score(tree, X, y, cv=10)

    print('average test accuracy', np.mean(scores))

average test accuracy 0.7661290322580644
average test accuracy 0.7665591397849463


After testing all these models, the Support Vector Machine with a polynomial kernel of degree 2 has the highest mean cross-validation accuracy (about 83%). So, let's train that model on the FULL dataset and save it to a file for later use.

In [18]:
from pickle import dump

# Refit the best cross-validated configuration (degree-2 polynomial SVC) on the
# WHOLE dataset, so no held-out rows remain for evaluating this final model.
final_model = svm.SVC(kernel='poly', degree=2)
final_fit = final_model.fit(X, y)
print(X.shape)

# Pickle the fitted model inside a context manager so the file handle is
# flushed and closed as soon as the dump finishes.
with open('model/final_simple.sav', 'wb') as model_file:
    dump(final_fit, model_file)

(303, 18)
