## Reading Libraries and dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.HDFStore('train_data.h5')
y_train = train['labels']
X_train = train['rpkm']
idx1 = y_train.index
X_train= X_train.loc[idx1, :]

In [3]:
test = pd.HDFStore('test_data.h5')
y_test = test['labels']
X_test = test['rpkm']

In [4]:
from sklearn.decomposition import PCA
pca = PCA(n_components=200)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [5]:
X_train_best_pca = X_train_pca[:, :120]
X_test_best_pca = X_test_pca[:, :120]

In [6]:
param_grid = {'C': [0.1, 1, 10],
              'gamma': [1, 0.1, 0.01],
              'kernel': ['rbf']}

In [7]:
from sklearn.svm import SVC
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv=3)

In [8]:
grid.fit(X_train_best_pca, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END .....................C=0.1, gamma=1, kernel=rbf; total time= 2.3min
[CV 2/3] END .....................C=0.1, gamma=1, kernel=rbf; total time= 2.3min
[CV 3/3] END .....................C=0.1, gamma=1, kernel=rbf; total time= 2.3min
[CV 1/3] END ...................C=0.1, gamma=0.1, kernel=rbf; total time= 2.3min
[CV 2/3] END ...................C=0.1, gamma=0.1, kernel=rbf; total time= 2.3min
[CV 3/3] END ...................C=0.1, gamma=0.1, kernel=rbf; total time= 2.3min
[CV 1/3] END ..................C=0.1, gamma=0.01, kernel=rbf; total time= 2.3min
[CV 2/3] END ..................C=0.1, gamma=0.01, kernel=rbf; total time= 2.3min
[CV 3/3] END ..................C=0.1, gamma=0.01, kernel=rbf; total time= 2.3min
[CV 1/3] END .......................C=1, gamma=1, kernel=rbf; total time= 2.3min
[CV 2/3] END .......................C=1, gamma=1, kernel=rbf; total time= 2.3min
[CV 3/3] END .......................C=1, gamma=1,

GridSearchCV(cv=3, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01],
                         'kernel': ['rbf']},
             verbose=3)

In [9]:
print(grid.best_params_)

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [10]:
print(grid.best_estimator_)

SVC(C=10, gamma=0.01)


In [11]:
print(grid.cv_results_)

{'mean_fit_time': array([ 93.78452293,  93.47567908,  93.99906206,  96.26721708,
        99.18580111, 103.08994063, 112.07218949, 112.53452452,
       103.23494116]), 'std_fit_time': array([0.07900785, 1.1699189 , 0.98268615, 0.79689733, 2.92337956,
       6.46893598, 4.6591351 , 7.18367736, 0.91272936]), 'mean_score_time': array([43.12421894, 43.00000342, 43.16448633, 44.09950447, 43.62405698,
       44.38933118, 46.19340611, 46.22610521, 44.05206593]), 'std_score_time': array([0.15720622, 0.04720504, 0.2286653 , 0.86674308, 0.33683141,
       1.25269089, 0.61238674, 1.68675401, 0.60436858]), 'param_C': masked_array(data=[0.1, 0.1, 0.1, 1, 1, 1, 10, 10, 10],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[1, 0.1, 0.01, 1, 0.1, 0.01, 1, 0.1, 0.01],
             mask=[False, False, False, False, False, False, False, False,
                   False],
 

### SVM with best parameters

In [18]:
best_model = SVC(C=10, gamma=0.01)

In [19]:
best_model.fit(X_train_best_pca, y_train)

SVC(C=10, gamma=0.01)

In [20]:
y_pred_bm = best_model.predict(X_test_best_pca)

In [21]:
accuracy_score(y_test, y_pred_bm)

0.0

In [25]:
y_pred_bm.shape

(2855,)

In [26]:
X_test_pca.shape

(2855, 200)

In [27]:
X_test_best_pca.shape

(2855, 120)

In [29]:
X_train_best_pca.shape

(21389, 120)