In [2]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from get_data import get_structured_data
import utility
from classifiers import *
from constants import RANDOM_STATE, CV, MIN_NUM_PRICES

In [3]:
# Load the structured rows and derive the arrays used as X / y by the training cells below.
# The middle value returned by get_all_data is discarded here.
rows = get_structured_data()
all_X, _, all_y = utility.get_all_data(rows)
# Last expression = cell output: shapes of both arrays as a quick sanity check.
all_X.shape, all_y.shape

((319, 12), (319, 1))

In [3]:
# NOTE(review): this cell repeats the code of the preceding cell (same execution count);
# running both loads the data twice. Consider deleting one of them.
rows = get_structured_data()
all_X, _, all_y = utility.get_all_data(rows)
# Last expression = cell output: shapes of both arrays as a quick sanity check.
all_X.shape, all_y.shape

((319, 12), (319, 1))

In [4]:
# Fetch the subset selected by utility.get_perfect_data; what qualifies a row as
# "perfect" is decided inside `utility` and is not visible from this notebook.
# NOTE(review): perfect_x / perfect_y are not referenced again in the visible cells.
perfect_x, _, perfect_y = utility.get_perfect_data(rows)
# Last expression = cell output: shapes of both arrays.
perfect_x.shape, perfect_y.shape

((25, 12), (25, 1))

In [5]:
# Candidate models for the sweep. Each factory result is unpacked by the training
# loop as (display name, estimator, parameter grid).
classifiers = [
    get_linear_svm_classifier(),
    get_rbf_svm_classifier(),
    get_naive_bayes_classifier(),
    get_random_forest_classifier(),
]

In [6]:
def train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=True):
    """Tune one classifier by grid search, score it on the held-out split, optionally save it.

    Progress and results are reported with ``print``.

    Args:
        class_name: Display name used in the log lines and passed to ``utility.save_model``.
        classifier: Estimator (or pipeline) handed to ``GridSearchCV``. After the search,
            the winning values for the parameters named in ``params`` are written back
            onto this object so that callers inspecting ``classifier.get_params()`` see
            the tuned configuration.
        params: Parameter grid, a dict mapping parameter name to candidate values.
        train_x, train_y: Training features and labels used for the grid search.
        test_x, test_y: Held-out features and labels used for the error count.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        save_model: When true, persist the tuned model through ``utility.save_model``.

    Returns:
        tuple: ``(num_wrong, n_tests)`` - number of misclassified test samples and the
        number of test samples.

    Raises:
        ValueError: If ``test_y`` cannot be reshaped to the shape of the predictions.
    """
    print('Optimizing parameters...')
    param_keys = '\n'.join(params.keys())
    print(f'List of parameters: \n{param_keys}')
    # NOTE(review): N_JOBS is not imported explicitly in the visible import cell;
    # presumably it arrives through `from classifiers import *` - confirm.
    grid = GridSearchCV(classifier, params, cv=cv, n_jobs=N_JOBS)
    grid.fit(train_x, train_y)
    # With the default refit=True, best_estimator_ has already been refit on the whole
    # training split, so no additional fit call is needed here.
    model = grid.best_estimator_
    print('Optimal Parameters Found to be: ')
    print('\n'.join([f'{k}: {v}' for k, v in grid.best_params_.items()]))

    # GridSearchCV only ever modifies clones of `classifier`. Copy the winning values
    # back so the estimator the caller passed in reflects the tuned parameters.
    classifier.set_params(**grid.best_params_)

    print(f'Testing Classifier with {test_x.shape[0]} samples.')
    y_pred = model.predict(test_x)
    n_tests = test_y.shape[0]
    # Align the labels with the prediction shape: comparing an (n, 1) column against
    # (n,) predictions would broadcast to an n-by-n matrix and inflate the error count.
    y_true = np.asarray(test_y).reshape(y_pred.shape)
    num_wrong = (y_pred != y_true).sum()

    if save_model:
        filepath, _ = utility.save_model(class_name, model)
        print(f'Model was saved to: {filepath}')

    print(f'Got {num_wrong}/{n_tests} ({num_wrong / n_tests:.2%}) tests wrong for {class_name}.')

    return num_wrong, n_tests

In [7]:
def train_all_classifiers(X, y, train_percent, save_models=True, max_split_attempts=1000):
    """Split the data once, then tune and evaluate every entry of ``classifiers``.

    The train/test split is re-drawn with an incremented seed until
    ``utility.get_max_cv`` returns something other than 1 for the training labels;
    that value is then used as the ``cv`` argument for every grid search.

    Args:
        X: Feature array passed to ``train_test_split``.
        y: Label array passed to ``train_test_split``; both halves are flattened with
            ``reshape(-1)`` before training.
        train_percent: Value forwarded as ``train_size`` to ``train_test_split``.
        save_models: Forwarded to ``train_classifier`` as ``save_model``.
        max_split_attempts: Upper bound on the number of splits tried before giving up.

    Returns:
        list[dict]: One record per classifier with the keys ``classifier``,
        ``min_num_prices``, ``n_wrong``, ``n_tests``, ``best_params`` and ``train_size``.

    Raises:
        RuntimeError: If no split with a CV value other than 1 is found within
            ``max_split_attempts`` attempts.
    """
    cv = 1
    n_iterations = 0
    while cv == 1:
        # Bound the retry loop: without a limit, data that can never yield a usable
        # CV value would keep this spinning forever.
        if n_iterations >= max_split_attempts:
            raise RuntimeError(
                f'No train/test split with a usable CV value found after '
                f'{max_split_attempts} attempts (train_size={train_percent}).'
            )
        # Shifting the seed keeps every retry reproducible.
        train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=train_percent, random_state=RANDOM_STATE + n_iterations)
        cv = utility.get_max_cv(train_y)
        n_iterations += 1
    print(f'Using CV: {cv}')
    history = []

    # Estimators expect 1-D label arrays.
    train_y = train_y.reshape(-1)
    test_y = test_y.reshape(-1)

    for class_name, classifier, params in classifiers:
        print(f'Training/Testing {class_name}')
        num_wrong, n_tests = train_classifier(class_name, classifier, params, train_x, train_y, test_x, test_y, cv, save_model=save_models)
        # NOTE(review): these values are read from the estimator object that was handed
        # to train_classifier. GridSearchCV tunes clones, so they only reflect the tuned
        # values if train_classifier copies the winners back onto `classifier` - verify.
        best_params = {k: v for k, v in classifier.get_params().items() if k in params}
        last_result = {'classifier': class_name, 'min_num_prices': MIN_NUM_PRICES, 'n_wrong': int(num_wrong), 'n_tests': int(n_tests), 'best_params': best_params, 'train_size': float(train_percent)}
        history.append(last_result)
        print('')
    print('\n'.join(['\n'.join([f'{k}: {v}' for k, v in result.items()]) for result in history]))
    return history

In [8]:
# Sweep the training fraction from 10% to 90% in steps of 10 points.
# linspace states the number of points explicitly; a float-step arange needs a fudged
# end value and yields fractions such as 0.30000000000000004, which would then be
# written into the saved `train_size` field. Rounding gives clean one-decimal values.
training_sizes = np.round(np.linspace(0.1, 0.9, 9), 1)
results = []
for training_size in training_sizes:
    curr_results = train_all_classifiers(all_X, all_y, training_size)
    results.extend(curr_results)
    print('')

# Persist the combined records of every training size.
utility.save_results(results)

Using CV: 3
Training/Testing Linear SVM
Optimizing parameters...
List of parameters: 
linearsvc__C
linearsvc__fit_intercept
linearsvc__tol
Optimal Parameters Found to be: 
linearsvc__C: 0.175
linearsvc__fit_intercept: False
linearsvc__tol: 1e-06
Testing Classifier with 288 samples.
Model was saved to: ./models/LinearSVM_20201121_164009.mdl
Got 100/288 (34.72%) tests wrong for Linear SVM.

Training/Testing RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 288 samples.
Model was saved to: ./models/RBFSVM_20201121_164009.mdl
Got 97/288 (33.68%) tests wrong for RBF SVM.

Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 1e-12
Testing Classifier with 288 samples.
Model was saved to: ./models/NaiveBayes_20201121_164009.mdl
Got 120/288 (4

Optimal Parameters Found to be: 
linearsvc__C: 0.125
linearsvc__fit_intercept: True
linearsvc__tol: 1e-06
Testing Classifier with 160 samples.
Model was saved to: ./models/LinearSVM_20201121_164820.mdl
Got 57/160 (35.62%) tests wrong for Linear SVM.

Training/Testing RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 160 samples.
Model was saved to: ./models/RBFSVM_20201121_164821.mdl
Got 57/160 (35.62%) tests wrong for RBF SVM.

Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 1e-12
Testing Classifier with 160 samples.
Model was saved to: ./models/NaiveBayes_20201121_164821.mdl
Got 111/160 (69.38%) tests wrong for Naive Bayes.

Training/Testing Random Forest
Optimizing parameters...
List of parameters: 
ccp_alpha
max_features
max_

Optimal Parameters Found to be: 
linearsvc__C: 0.125
linearsvc__fit_intercept: True
linearsvc__tol: 1e-06
Testing Classifier with 32 samples.
Model was saved to: ./models/LinearSVM_20201121_165650.mdl
Got 12/32 (37.50%) tests wrong for Linear SVM.

Training/Testing RBF SVM
Optimizing parameters...
List of parameters: 
svc__gamma
svc__shrinking
svc__tol
svc__C
Optimal Parameters Found to be: 
svc__C: 0.125
svc__gamma: scale
svc__shrinking: True
svc__tol: 1e-05
Testing Classifier with 32 samples.
Model was saved to: ./models/RBFSVM_20201121_165652.mdl
Got 12/32 (37.50%) tests wrong for RBF SVM.

Training/Testing Naive Bayes
Optimizing parameters...
List of parameters: 
var_smoothing
Optimal Parameters Found to be: 
var_smoothing: 3.1e-11
Testing Classifier with 32 samples.
Model was saved to: ./models/NaiveBayes_20201121_165652.mdl
Got 21/32 (65.62%) tests wrong for Naive Bayes.

Training/Testing Random Forest
Optimizing parameters...
List of parameters: 
ccp_alpha
max_features
max_depth