In [1]:
from get_data import get_structured_data
import utility
from classifiers import *
from sklearn.model_selection import train_test_split
from constants import RANDOM_STATE, CV
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the structured rows and let the project helper split them into the
# feature matrix X and the target array y used by the cells below.
rows = get_structured_data()
X, y = utility.get_all_data(rows)
# Bare last expression: the notebook displays both shapes as a sanity check.
# NOTE(review): y appears to be a 2-D column (n_samples, 1) rather than 1-D,
# which is why later cells call .reshape(-1) on it -- confirm in utility.get_all_data.
X.shape, y.shape

((310, 12), (310, 1))

In [3]:
# Candidate models to compare. Each entry is later unpacked as a
# (display name, estimator, parameter grid) triple by the evaluation loop.
# NOTE(review): the get_*_classifier factories presumably come from the
# `classifiers` star import -- confirm.
classifiers = [
    get_linear_svm_classifier(),
    get_rbf_svm_classifier(),
    get_naive_bayes_classifier(),
    get_random_forest_classifier(),
]

# Hold out 25% of the samples for final testing; the fixed seed keeps the
# split reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

In [4]:
# Tune, evaluate and report every candidate classifier in turn.
# Each entry of `classifiers` is a (display name, estimator, parameter grid) triple.
for class_name, classifier, params in classifiers:
    print(f'Classifier: {class_name}')

    print('Optimizing parameters...')
    param_keys = '\n'.join(params.keys())
    print(f'List of parameters: \n{param_keys}')

    # Cross-validated exhaustive search over the grid. The targets are
    # flattened with reshape(-1) before being handed to the estimator.
    grid = GridSearchCV(classifier, params, cv=CV)
    grid.fit(train_x, train_y.reshape(-1))

    # GridSearchCV defaults to refit=True, so best_estimator_ has already been
    # retrained on the whole training split with the winning parameters.
    # Fitting it a second time would only repeat that work (and, for
    # estimators without a fixed seed, silently replace the selected model).
    model = grid.best_estimator_
    print('Optimal Parameters Found to be: ')
    print('\n'.join(f'{k}: {v}' for k, v in grid.best_params_.items()))

    # Score on the held-out split by counting misclassified samples.
    print(f'Testing Classifier with {test_x.shape[0]} samples.')
    y_pred = model.predict(test_x)
    n_tests = test_y.shape[0]
    num_wrong = (y_pred != test_y.reshape(-1)).sum()

    print(f'Got {num_wrong}/{n_tests} ({num_wrong / n_tests:.2%}) tests wrong.')
    print('')

Classifier: Linear SVM
Optimizing parameters...
List of parameters: 
linearsvc__C
linearsvc__fit_intercept
linearsvc__tol
Optimal Parameters Found to be: 
linearsvc__C: 0.25
linearsvc__fit_intercept: True
linearsvc__tol: 1e-05
Testing Classifier with 78 samples.
Got 24/78 (30.77%) tests wrong.

Classifier: RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.25
svc__gamma: scale
svc__shrinking: True
svc__tol: 5e-05
Testing Classifier with 78 samples.
Got 23/78 (29.49%) tests wrong.

Classifier: Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 1e-10
Testing Classifier with 78 samples.
Got 51/78 (65.38%) tests wrong.

Classifier: Random Forest
Optimizing parameters...
List of parameters: 
ccp_alpha
max_features
max_depth
n_estimators
Optimal Parameters Found to be: 
ccp_alpha: 0.02
max_depth: 5
max_features: auto
n_estimators: 100
Te