
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Training Models </h2>	

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

data_file_1 = Path("data/NB_1", "cleaned_data.hdf")
data_from_nb1 = pd.read_hdf(data_file_1, "starting_data")

data_file = Path("data/NB_2", "cleaned_data.hdf")
full_feature_frame = pd.read_hdf(data_file, "full_feature")
full_feature_frame['encoding'] = data_from_nb1['encoding']

In [2]:
from sklearn.model_selection import train_test_split


X = (full_feature_frame[['ft'+str(i) for i in range(18)]]).fillna(0)
y = full_feature_frame['encoding']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state =42)


<h3>  Train models using all of the following methods below. Be sure to drop the actual image column, and the encoding</h3>	Take note of the differences in accuracy, and methods.


Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

parameters = {'max_iter': np.arange(4000,7000,500), 'tol':[0.01, 0.001, 0.0001]}
clf_lr = clf = GridSearchCV(cv=10, param_grid=parameters, 
                            estimator=LogisticRegression(random_state=42, solver='lbfgs', 
                                               multi_class='multinomial')
                            , n_jobs=4)
clf_lr.fit(X_train, y_train)


print('Logistic Regression')
print("The test accuracy is: %.2f"%(clf_lr.score(X_test, y_test)*100))
print("The train accuracy is: %.2f"%(clf_lr.score(X_train, y_train)*100))
print('The Best parameters are: ', clf_lr.best_params_)

Logistic Regression
The test accuracy is: 27.24
The train accuracy is: 27.33
The Best parameters are:  {'max_iter': 6500, 'tol': 0.01}


K-nearest Neighbors

In [4]:
from sklearn.neighbors import NearestNeighbors
from sklearn import neighbors
parameters = {'n_neighbors': np.arange(7,12,1), 'leaf_size':np.arange(27, 35, 1), 'p':[1,2,3]}
clf_knn = GridSearchCV(cv=10, param_grid=parameters, 
                            estimator=neighbors.KNeighborsClassifier()
                       , n_jobs=4, verbose = True)
clf_knn.fit(X_train, y_train)

print('K-nearest Neighbours')
print("The test accuracy is: %.2f"%(clf_knn.score(X_test, y_test)*100))
print("The train accuracy is: %.2f"%(clf_knn.score(X_train, y_train)*100))
print('The Best parameters are: ', clf_knn.best_params_)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=4)]: Done 488 tasks      | elapsed:    5.2s


K-nearest Neighbours
The test accuracy is: 22.92
The train accuracy is: 37.08
The Best parameters are:  {'leaf_size': 27, 'n_neighbors': 11, 'p': 1}


[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:   13.4s finished


Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

parameters = {'n_estimators': np.arange(100,121,6), 'min_samples_leaf':np.arange(8,12,1), 'max_depth': np.arange(7,12,1)}
clf_rf = GridSearchCV(cv=10, param_grid=parameters, 
                          estimator=RandomForestClassifier(criterion='entropy', bootstrap=False,random_state=42)
                   , n_jobs=4, verbose=True)

clf_rf.fit(X_train,y_train)

print('Random Forest')
print("The test accuracy is: %.2f"%(clf_rf.score(X_test, y_test)*100))
print("The train accuracy is: %.2f"%(clf_rf.score(X_train, y_train)*100))
print('The Best parameters are: ', clf_rf.best_params_)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   26.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  8.5min
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:  8.6min finished


Random Forest
The test accuracy is: 40.53
The train accuracy is: 78.08
The Best parameters are:  {'max_depth': 8, 'min_samples_leaf': 8, 'n_estimators': 106}


Support Vector Machine

In [6]:
from sklearn import svm

parameters = {'gamma':[1e-6,1e-7,1e-8,1e-9], 'C':[1,1.5,2,2.5,3]}
clf_svm = GridSearchCV(cv=5, param_grid=parameters, 
                          estimator=svm.SVC(random_state=42, kernel='rbf')
                       , n_jobs=4, verbose=True)
clf_svm.fit(X_train,y_train)

print('SVM')
print("The test accuracy is: %.2f"%(clf_svm.score(X_test, y_test)*100))
print("The train accuracy is: %.2f"%(clf_svm.score(X_train, y_train)*100))
print('The Best parameters are: ', clf_svm.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    6.8s finished


SVM
The test accuracy is: 24.25
The train accuracy is: 42.00
The Best parameters are:  {'C': 3, 'gamma': 1e-09}


In [7]:
#Read Validation Input
val_data_file = Path("data/NB_2", "val_cleaned_data.hdf")
val_feature_frame = pd.read_hdf(val_data_file, "val_full_feature")

#Build model using best params found using cross validation
clf_rf_final = RandomForestClassifier(n_estimators=106,max_depth=8,min_samples_leaf=8,
                                      criterion='entropy', bootstrap=False,random_state=42)
#Train final model on entire data
clf_rf_final.fit(X,y)
# Predict using model
res = clf_rf_final.predict(val_feature_frame.fillna(0))

#Outputting the to a csv
output_file_name = 'validation_output_random_forest'
pd.DataFrame(res).to_csv(output_file_name, header=None)
print('Saved validation output to: ', output_file_name)

Saved validation output to:  validation_output_random_forest
