In [9]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,ShuffleSplit

# Directory holding the pickled inputs, relative to the notebook's working
# directory. Built with pathlib so the separator is chosen by the OS; the
# previous hard-coded 'pickles\\...' strings only resolved on Windows.
PICKLE_DIR = Path('pickles')


def _load_pickle(filename):
    """Return the object unpickled from ``PICKLE_DIR / filename``.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced by this project.
    with open(PICKLE_DIR / filename, 'rb') as data:
        return pickle.load(data)


# One object per pickle; the variable names mirror the file names.
df = _load_pickle('df.pickle')
features_train = _load_pickle('features_train.pickle')
features_test = _load_pickle('features_test.pickle')
labels_train = _load_pickle('labels_train.pickle')
labels_test = _load_pickle('labels_test.pickle')

In [5]:
# Candidate values for each SVC hyper-parameter; the randomized search draws
# combinations from these lists.
random_grid = dict(
    C=[.0001, .001, .01],
    kernel=['linear', 'rbf', 'poly'],
    gamma=[.0001, .001, .01, .1, 1, 10, 100],
    degree=[1, 2, 3, 4, 5],
    probability=[True],
)

# Base estimator whose parameters are overridden per candidate.
svc = svm.SVC(random_state=8)

# Search settings: 50 sampled candidates, each scored by accuracy with cv=3,
# with a fixed seed so the same candidates are drawn on every run.
random_search_options = dict(
    n_iter=50,
    scoring='accuracy',
    cv=3,
    verbose=1,
    random_state=8,
)

random_search = RandomizedSearchCV(
    estimator=svc,
    param_distributions=random_grid,
    **random_search_options,
)

In [6]:
# Run the randomized search on the training split: every sampled candidate is
# cross-validated (50 candidates x cv=3, as configured above).
random_search.fit(features_train, labels_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  9.8min finished


RandomizedSearchCV(cv=3, estimator=SVC(random_state=8), n_iter=50,
                   param_distributions={'C': [0.0001, 0.001, 0.01],
                                        'degree': [1, 2, 3, 4, 5],
                                        'gamma': [0.0001, 0.001, 0.01, 0.1, 1,
                                                  10, 100],
                                        'kernel': ['linear', 'rbf', 'poly'],
                                        'probability': [True]},
                   random_state=8, scoring='accuracy', verbose=1)

In [7]:
# Parameter combination that scored best in the randomized search.
random_search.best_params_

{'probability': True, 'kernel': 'poly', 'gamma': 10, 'degree': 4, 'C': 0.01}

In [8]:
# Cross-validated accuracy achieved by that best combination.
random_search.best_score_

0.9280716759322147

In [10]:
# Exhaustive grid search with cross-validation to pick the final parameters.
# Each kernel gets its own sub-grid so that only the parameters listed for
# that kernel are varied.

c_values = [.0001, .001, .01, .1]

linear_grid = {'C': c_values, 'kernel': ['linear'], 'probability': [True]}
poly_grid = {'C': c_values, 'kernel': ['poly'], 'degree': [3, 4, 5], 'probability': [True]}
rbf_grid = {'C': c_values, 'kernel': ['rbf'], 'gamma': [1, 10, 100], 'probability': [True]}

param_grid = [linear_grid, poly_grid, rbf_grid]

# Fresh base estimator for this search.
svc = svm.SVC(random_state=8)

# Three random train/validation splits, each holding out 33% of the rows;
# seeded so the splits are reproducible.
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)

grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
)

In [11]:
# Run the grid search on the training split: every combination in param_grid
# is evaluated on each of the ShuffleSplit folds.
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:  5.7min finished


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             estimator=SVC(random_state=8),
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1], 'kernel': ['linear'],
                          'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1], 'degree': [3, 4, 5],
                          'kernel': ['poly'], 'probability': [True]},
                         {'C': [0.0001, 0.001, 0.01, 0.1],
                          'gamma': [1, 10, 100], 'kernel': ['rbf'],
                          'probability': [True]}],
             scoring='accuracy', verbose=1)

In [12]:
# Parameter combination that scored best in the grid search.
grid_search.best_params_

{'C': 0.1, 'kernel': 'linear', 'probability': True}

In [13]:
# Cross-validated accuracy achieved by that best combination.
# NOTE(review): this score comes from ShuffleSplit folds, whereas the
# randomized search used cv=3, so the two best_score_ values were measured
# on different splits.
grid_search.best_score_

0.9445333333333333

In [14]:
# Take the estimator selected by the grid search as the final model.
# This binds the same object held by grid_search, not a copy.

best_svc = grid_search.best_estimator_

In [16]:
# Display the selected estimator and its parameters.
best_svc

SVC(C=0.1, kernel='linear', probability=True, random_state=8)


In [17]:
# Fit the selected model on the full training set.
# NOTE(review): GridSearchCV is constructed without `refit=`; with
# scikit-learn's default (refit=True) best_estimator_ has already been trained
# on this same data, so this call likely repeats that work - confirm.

best_svc.fit(features_train, labels_train)

SVC(C=0.1, kernel='linear', probability=True, random_state=8)

In [18]:
# Predict labels for the held-out test set with the fitted model.

pred = best_svc.predict(features_test)

In [19]:
# Accuracy on the same data the model was fitted on, for comparison with the
# test accuracy reported in the next cell.
train_pred = best_svc.predict(features_train)
train_accuracy = accuracy_score(labels_train, train_pred)
print("The training accuracy is: ", train_accuracy, sep="\n")

The training accuracy is: 
0.9629825489159175


In [20]:
# Accuracy of the test-set predictions computed earlier in `pred`.
test_accuracy = accuracy_score(labels_test, pred)
print("The test accuracy is: ", test_accuracy, sep="\n")

The test accuracy is: 
0.9580838323353293


In [22]:
# Per-class precision / recall / F1 for the test-set predictions.
report = classification_report(labels_test, pred)
print("Classification report", report, sep="\n")

Classification report
              precision    recall  f1-score   support

           0       0.96      0.97      0.97        80
           1       0.98      0.93      0.96        58
           2       0.93      0.94      0.93        53
           3       0.96      0.98      0.97        82
           4       0.95      0.95      0.95        61

    accuracy                           0.96       334
   macro avg       0.96      0.96      0.96       334
weighted avg       0.96      0.96      0.96       334



In [24]:
# Confusion matrix of true test labels (first argument) against predictions.
conf_matrix = confusion_matrix(labels_test, pred)

In [25]:
# Display the matrix; per scikit-learn's convention rows are true labels and
# columns are predicted labels.
conf_matrix

array([[78,  0,  0,  0,  2],
       [ 1, 54,  2,  1,  0],
       [ 2,  0, 50,  0,  1],
       [ 0,  0,  2, 80,  0],
       [ 0,  1,  0,  2, 58]], dtype=int64)

In [26]:
# Persist the final classifier and both fitted search objects.
# Paths are built with pathlib so the separator is chosen by the OS; the
# previous hard-coded 'models\\...' strings only resolved on Windows.
MODEL_DIR = Path('models')

# Create the output directory on first run instead of failing in open().
MODEL_DIR.mkdir(parents=True, exist_ok=True)

for filename, artefact in (
    ('svm.pickle', best_svc),
    ('randomsearch.pickle', random_search),
    ('gridsearch.pickle', grid_search),
):
    with open(MODEL_DIR / filename, 'wb') as output:
        pickle.dump(artefact, output)