In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# NOTE(review): N_JOBS is used below but never imported by name; presumably it
# arrives through this star import — confirm and import it explicitly.
from classifiers import *
import utility
from constants import RANDOM_STATE, CV
from get_data import get_structured_data

In [2]:
# Load the structured rows once; `rows` is reused by later cells.
rows = get_structured_data()
# Split the rows into a feature matrix and a label array.
all_X, all_y = utility.get_all_data(rows)
# Display both shapes as the cell output (recorded run: (319, 12) and (319, 1)).
all_X.shape, all_y.shape

((319, 12), (319, 1))

In [3]:
# The data is already loaded by the previous cell; validate it here instead of
# fetching and splitting the rows a second time.
assert all_X.shape[0] == all_y.shape[0], 'all_X and all_y must have the same number of rows'
all_X.shape, all_y.shape

((319, 12), (319, 1))

In [4]:
# Build a second feature/label pair from the same `rows` via utility.get_perfect_data.
# NOTE(review): the selection criterion for "perfect" rows lives in utility — confirm there.
perfect_x, perfect_y = utility.get_perfect_data(rows)
# Display both shapes as the cell output (recorded run: (25, 12) and (25, 1)).
perfect_x.shape, perfect_y.shape

((25, 12), (25, 1))

In [5]:
# Classifier specs consumed by train_all_classifiers, which unpacks each entry
# as (class_name, classifier, params).
classifiers = [
    get_linear_svm_classifier(),
    get_rbf_svm_classifier(),
    get_naive_bayes_classifier(),
    get_random_forest_classifier(),
]

In [6]:
def train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=True):
    """Grid-search one classifier on the training split and score it on the test split.

    Args:
        class_name: Display name used in the log lines and passed to
            ``utility.save_model``.
        classifier: Estimator (or pipeline) handed to ``GridSearchCV``.
        params: Mapping of parameter name to candidate values (the search grid).
        train_x: Training features.
        train_y: Training labels.
        test_x: Held-out features.
        test_y: Held-out labels; a 1-D array or a single-column 2-D array.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        save_model: When true, persist the best estimator via
            ``utility.save_model`` and print the resulting path.

    Returns:
        tuple: ``(num_wrong, n_tests)`` — the number of misclassified test
        samples and the number of test samples.
    """
    print('Optimizing parameters...')
    param_keys = '\n'.join(params)
    print(f'List of parameters: \n{param_keys}')
    grid = GridSearchCV(classifier, params, cv=cv, n_jobs=N_JOBS)
    grid.fit(train_x, train_y)
    # GridSearchCV refits the best parameter set on the full training split by
    # default (refit=True), so best_estimator_ is already trained; fitting it
    # again would only repeat that work.
    model = grid.best_estimator_
    print('Optimal Parameters Found to be: ')
    print('\n'.join([f'{k}: {v}' for k, v in grid.best_params_.items()]))

    print(f'Testing Classifier with {test_x.shape[0]} samples.')
    # Flatten both sides before comparing: a (n,) prediction compared against
    # a (n, 1) label column would broadcast to (n, n) and inflate the count.
    y_pred = np.ravel(model.predict(test_x))
    y_true = np.ravel(test_y)
    n_tests = y_true.shape[0]
    num_wrong = (y_pred != y_true).sum()

    if save_model:
        filepath, _ = utility.save_model(class_name, model)
        print(f'Model was saved to: {filepath}')

    # Guard the percentage so an empty test split reports 0% instead of dividing by zero.
    error_rate = num_wrong / n_tests if n_tests else 0.0
    print(f'Got {num_wrong}/{n_tests} ({error_rate:.2%}) tests wrong for {class_name}.')

    return num_wrong, n_tests

In [7]:
def train_all_classifiers(X, y, train_percent, save_models=True, max_split_attempts=100):
    """Split the data once, then tune and test every entry of the global ``classifiers`` list.

    The split is retried with seeds ``RANDOM_STATE``, ``RANDOM_STATE + 1``, ...
    for as long as ``utility.get_max_cv`` returns 1 for the training labels.

    Args:
        X: Feature matrix.
        y: Labels; flattened to 1-D before training.
        train_percent: Forwarded to ``train_test_split`` as ``train_size``.
        save_models: Forwarded to ``train_classifier`` as ``save_model``.
        max_split_attempts: Upper bound on the number of splits tried before
            giving up.

    Returns:
        list[dict]: One record per classifier with keys ``classifier``,
        ``n_wrong``, ``n_tests`` and ``params`` (the search grid that was
        passed in, not the selected values).

    Raises:
        ValueError: If every attempted split yields a cv of 1.
    """
    # Bounded retry: the original unbounded loop would spin forever when no
    # seed produces a split with cv != 1.
    for attempt in range(max_split_attempts):
        train_x, test_x, train_y, test_y = train_test_split(
            X, y, train_size=train_percent, random_state=RANDOM_STATE + attempt)
        cv = utility.get_max_cv(train_y)
        if cv != 1:
            break
    else:
        raise ValueError(
            f'No split with cv > 1 found in {max_split_attempts} attempts '
            f'(train_size={train_percent}).')
    print(f'Using CV: {cv}')
    history = []

    # np.ravel flattens ndarray labels and also accepts other array-likes.
    train_y = np.ravel(train_y)
    test_y = np.ravel(test_y)

    for class_name, classifier, params in classifiers:
        print(f'Training/Testing {class_name}')
        num_wrong, n_tests = train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=save_models)
        history.append({'classifier': class_name, 'n_wrong': num_wrong, 'n_tests': n_tests, 'params': params})
        print('')

    print('\n'.join(['\n'.join([f'{k}: {v}' for k, v in result.items()]) for result in history]))
    # Hand the records back so callers can aggregate them instead of parsing the printout.
    return history

In [8]:
# Sweep training fractions 0.1 through 0.8. The fractions are built from an
# integer range because a float-step np.arange can gain or lose its last
# element through rounding.
training_sizes = np.arange(1, 9) / 10
for training_size in training_sizes:
    train_all_classifiers(all_X, all_y, training_size)

Using CV: 3
Training/Testing Linear SVM
Optimizing parameters...
List of parameters: 
linearsvc__C
linearsvc__fit_intercept
linearsvc__tol
Optimal Parameters Found to be: 
linearsvc__C: 0.125
linearsvc__fit_intercept: False
linearsvc__tol: 1e-06
Testing Classifier with 288 samples.
Model was saved to: ./models/LinearSVM_20201105_163559.mdl
Got 104/288 (36.11%) tests wrong for Linear SVM.

Training/Testing RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 288 samples.
Model was saved to: ./models/RBFSVM_20201105_163601.mdl
Got 97/288 (33.68%) tests wrong for RBF SVM.

Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 1e-12
Testing Classifier with 288 samples.
Model was saved to: ./models/NaiveBayes_20201105_163601.mdl
Got 120/288 (4

Optimal Parameters Found to be: 
ccp_alpha: 0.01
max_depth: 3
max_features: None
n_estimators: 80
Testing Classifier with 192 samples.
Model was saved to: ./models/RandomForest_20201105_164636.mdl
Got 63/192 (32.81%) tests wrong for Random Forest.

classifier: Linear SVM
n_wrong: 69
n_tests: 192
params: {'linearsvc__C': array([0.125, 0.15 , 0.175, 0.2  , 0.225, 0.25 , 0.275, 0.3  , 0.325,
       0.35 ]), 'linearsvc__fit_intercept': [True, False], 'linearsvc__tol': array([1.e-06, 6.e-06])}
classifier: RBF SVM
n_wrong: 69
n_tests: 192
params: {'svc__gamma': ['scale', 'auto'], 'svc__shrinking': [True, False], 'svc__tol': array([1.0e-05, 1.5e-05, 2.0e-05, 2.5e-05, 3.0e-05, 3.5e-05, 4.0e-05,
       4.5e-05]), 'svc__C': array([0.125, 0.15 , 0.175, 0.2  , 0.225, 0.25 , 0.275, 0.3  , 0.325,
       0.35 ])}
classifier: Naive Bayes
n_wrong: 107
n_tests: 192
params: {'var_smoothing': array([1.0e-12, 6.0e-12, 1.1e-11, 1.6e-11, 2.1e-11, 2.6e-11, 3.1e-11,
       3.6e-11, 4.1e-11, 4.6e-11, 5.1e-11, 5

Optimal Parameters Found to be: 
linearsvc__C: 0.125
linearsvc__fit_intercept: True
linearsvc__tol: 1e-06
Testing Classifier with 64 samples.
Model was saved to: ./models/LinearSVM_20201105_165405.mdl
Got 22/64 (34.38%) tests wrong for Linear SVM.

Training/Testing RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 64 samples.
Model was saved to: ./models/RBFSVM_20201105_165407.mdl
Got 22/64 (34.38%) tests wrong for RBF SVM.

Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 3.6e-11
Testing Classifier with 64 samples.
Model was saved to: ./models/NaiveBayes_20201105_165408.mdl
Got 37/64 (57.81%) tests wrong for Naive Bayes.

Training/Testing Random Forest
Optimizing parameters...
List of parameters: 
ccp_alpha
max_features
max_depth