In [1]:
from get_data import get_structured_data
import utility
from classifiers import *
from sklearn.model_selection import train_test_split
from constants import RANDOM_STATE, CV
from sklearn.model_selection import GridSearchCV

In [2]:
rows = get_structured_data()
all_X, all_y = utility.get_all_data(rows)
all_X.shape, all_y.shape

((319, 12), (319, 1))

In [3]:
rows = get_structured_data()
all_X, all_y = utility.get_all_data(rows)
all_X.shape, all_y.shape

((319, 12), (319, 1))

In [4]:
perfect_x, perfect_y = utility.get_perfect_data(rows)
perfect_x.shape, perfect_y.shape

((25, 12), (25, 1))

In [5]:
classifiers = [get_linear_svm_classifier(), get_rbf_svm_classifier(), get_naive_bayes_classifier(), get_random_forest_classifier()]

In [6]:
def train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=True):
    print(f'Optimizing parameters...')
    param_keys = '\n'.join([k for k in params.keys()])
    print(f'List of parameters: \n{param_keys}')
    grid = GridSearchCV(classifier, params, cv=cv, n_jobs=N_JOBS)
    grid.fit(train_x, train_y)
    model = grid.best_estimator_
    print(f'Optimal Parameters Found to be: ')
    print('\n'.join([f'{k}: {v}' for k, v in grid.best_params_.items()]))
    
    model.fit(train_x, train_y)
    
    print(f'Testing Classifier with {test_x.shape[0]} samples.')
    y_pred = model.predict(test_x)
    n_tests = test_y.shape[0]
    num_wrong = (y_pred != test_y).sum()
    
    if save_model:
        filepath, _ = utility.save_model(class_name, model)
        print(f'Model was saved to: {filepath}')

    print(f'Got {num_wrong}/{n_tests} ({num_wrong / n_tests:.2%}) tests wrong for {class_name}.')
    
    return num_wrong, n_tests

In [26]:
def train_all_classifiers(X, y, train_percent, save_models=True):
    cv = 1
    n_iterations = 0
    while cv == 1:
        train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=train_percent, random_state=RANDOM_STATE + n_iterations)
        cv = utility.get_max_cv(train_y)
        n_iterations += 1
    print(f'Using CV: {cv}')
    history = []
    
    train_y = train_y.reshape(-1)
    test_y = test_y.reshape(-1)
    
    for class_name, classifier, params in classifiers:
        print(f'Training/Testing {class_name}')
        num_wrong, n_tests = train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=save_models)
        best_params = {k: v for k, v in classifier.get_params().items() if k in params.keys()}
        last_result = {'classifier': class_name, 'n_wrong': int(num_wrong), 'n_tests': int(n_tests), 'best_params': best_params, 'train_size': float(train_percent)}
        history.append(last_result)
    print('\n'.join(['\n'.join([f'{k}: {v}' for k, v in result.items()]) for result in history]))
    return history

In [27]:
training_sizes = np.arange(.1, .91, .1)
results = []
for training_size in training_sizes:
    curr_results = train_all_classifiers(all_X, all_y, training_size)
    results.extend(curr_results)

utility.save_results(results)

Using CV: 3
Training/Testing Linear SVM
Optimizing parameters...
List of parameters: 
linearsvc__C
linearsvc__fit_intercept
linearsvc__tol
Optimal Parameters Found to be: 
linearsvc__C: 0.125
linearsvc__fit_intercept: False
linearsvc__tol: 1e-06
Testing Classifier with 288 samples.
Model was saved to: ./models/LinearSVM_20201118_160431.mdl
Got 104/288 (36.11%) tests wrong for Linear SVM.
Training/Testing RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 288 samples.
Model was saved to: ./models/RBFSVM_20201118_160433.mdl
Got 97/288 (33.68%) tests wrong for RBF SVM.
Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 1e-12
Testing Classifier with 288 samples.
Model was saved to: ./models/NaiveBayes_20201118_160433.mdl
Got 120/288 (41.

Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 160 samples.
Model was saved to: ./models/RBFSVM_20201118_161444.mdl
Got 57/160 (35.62%) tests wrong for RBF SVM.
Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 1e-12
Testing Classifier with 160 samples.
Model was saved to: ./models/NaiveBayes_20201118_161444.mdl
Got 106/160 (66.25%) tests wrong for Naive Bayes.
Training/Testing Random Forest
Optimizing parameters...
List of parameters: 
ccp_alpha
max_features
max_depth
n_estimators
Optimal Parameters Found to be: 
ccp_alpha: 0.019999999999999997
max_depth: 3
max_features: auto
n_estimators: 80
Testing Classifier with 160 samples.
Model was saved to: ./models/RandomForest_20201118_161652.mdl
Got 49/160 (30.63%) tests wrong for Random Forest.
classifier: Linear SVM
n_wrong: 57
n_tests: 160
best_params: {'linearsvc__C': 1.0, 

Optimal Parameters Found to be: 
ccp_alpha: 0.01
max_depth: None
max_features: auto
n_estimators: 140
Testing Classifier with 32 samples.
Model was saved to: ./models/RandomForest_20201118_162758.mdl
Got 12/32 (37.50%) tests wrong for Random Forest.
classifier: Linear SVM
n_wrong: 12
n_tests: 32
best_params: {'linearsvc__C': 1.0, 'linearsvc__fit_intercept': True, 'linearsvc__tol': 0.0001}
train_size: 0.9
classifier: RBF SVM
n_wrong: 12
n_tests: 32
best_params: {'svc__C': 1.0, 'svc__gamma': 'scale', 'svc__shrinking': True, 'svc__tol': 0.001}
train_size: 0.9
classifier: Naive Bayes
n_wrong: 17
n_tests: 32
best_params: {'var_smoothing': 1e-09}
train_size: 0.9
classifier: Random Forest
n_wrong: 12
n_tests: 32
best_params: {'ccp_alpha': 0.0, 'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}
train_size: 0.9
Saved results to ./results/results_20201118_162758.json
