In [1]:
from get_data import get_structured_data
import utility
from classifiers import *
from sklearn.model_selection import train_test_split
from constants import RANDOM_STATE, CV
from sklearn.model_selection import GridSearchCV

In [2]:
rows = get_structured_data()
all_X, all_y = utility.get_all_data(rows)
all_X.shape, all_y.shape

((310, 12), (310, 1))

In [3]:
perfect_x, perfect_y = utility.get_perfect_data(rows)
perfect_x.shape, perfect_y.shape

((25, 12), (25, 1))

In [4]:
classifiers = [get_linear_svm_classifier(), get_rbf_svm_classifier(), get_naive_bayes_classifier(), get_random_forest_classifier()]

In [5]:
def train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=True):
    print(f'Optimizing parameters...')
    param_keys = '\n'.join([k for k in params.keys()])
    print(f'List of parameters: \n{param_keys}')
    grid = GridSearchCV(classifier, params, cv=cv)
    grid.fit(train_x, train_y)
    model = grid.best_estimator_
    print(f'Optimal Parameters Found to be: ')
    print('\n'.join([f'{k}: {v}' for k, v in grid.best_params_.items()]))
    
    model.fit(train_x, train_y)
    
    print(f'Testing Classifier with {test_x.shape[0]} samples.')
    y_pred = model.predict(test_x)
    n_tests = test_y.shape[0]
    num_wrong = (y_pred != test_y).sum()
    
    if save_model:
        filepath, _ = utility.save_model(class_name, model)
        print(f'Model was saved to: {filepath}')

    print(f'Got {num_wrong}/{n_tests} ({num_wrong / n_tests:.2%}) tests wrong for {class_name}.')
    
    return num_wrong, n_tests

In [6]:
def train_all_classifiers(X, y, train_percent, save_models=True):
    train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=train_percent, random_state=RANDOM_STATE)
    cv = min(utility.get_max_cv(train_y), utility.get_max_cv(test_y))
    print(f'Using CV: {cv}')
    history = []
    
    train_y = train_y.reshape(-1)
    test_y = test_y.reshape(-1)
    
    for class_name, classifier, params in classifiers:
        print(f'Training/Testing {class_name}')
        num_wrong, n_tests = train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=save_models)
        history.append({'classifier': class_name, 'n_wrong': num_wrong, 'n_tests': n_tests, 'params': params})
        print('')
    
    print('\n'.join([[f'{k}: {v}\n' for k, v in result.items()] for result in history]))

In [7]:
train_all_classifiers(perfect_x, perfect_y, .25)

Using CV: 2
Training/Testing Linear SVM
Optimizing parameters...
List of parameters: 
linearsvc__C
linearsvc__fit_intercept
linearsvc__tol
Optimal Parameters Found to be: 
linearsvc__C: 0.125
linearsvc__fit_intercept: True
linearsvc__tol: 1e-06
Testing Classifier with 19 samples.
Model was saved to: ./models/LinearSVM_2020_10_28__142849.mdl
Got 9/19 (47.37%) tests wrong for Linear SVM.

Training/Testing RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 19 samples.
Model was saved to: ./models/RBFSVM_2020_10_28__142851.mdl
Got 10/19 (52.63%) tests wrong for RBF SVM.

Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 1e-12
Testing Classifier with 19 samples.
Model was saved to: ./models/NaiveBayes_2020_10_28__142851.mdl
Got 11/19 (57

TypeError: sequence item 0: expected str instance, list found