In [19]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import torch
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import numpy as np

# Load the feature matrix and the label vector from a single serialized file.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load this
# file from a trusted source. Newer torch releases default to weights_only=True,
# which may reject this payload — TODO confirm against the installed version.
data, y = torch.load('data/data_v2.pkl')
# The first len(y) rows have a matching label; the remaining rows do not
# (presumably the unlabeled hold-out set — confirm against the data prep step).
train_data, test_data = data[:len(y)], data[len(y):]
# Global seed kept so any later code that draws from NumPy's global RNG stays seeded.
np.random.seed(42)
# Pass the seed explicitly so the split does not depend on hidden global RNG
# state (e.g. other cells having consumed random numbers before this call).
X_train, X_test, y_train, y_test = train_test_split(
    train_data, y, train_size=.8, random_state=42
)

In [26]:
# Baseline: mean 10-fold cross-validated score of a standardize -> SVC pipeline
# (default SVC hyper-parameters), evaluated on the training split only.
cross_val_score(
    estimator=make_pipeline(StandardScaler(), SVC()),
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
    verbose=1,
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


0.8328834115805946

In [14]:
# Candidate grid: try every PCA dimensionality from 1 up to the number of
# columns in `data`.
params = {
    'pca__n_components': [*range(1, data.shape[1] + 1)]
}
# Standardize -> project with PCA -> classify with an SVC.
pipe = make_pipeline(StandardScaler(), PCA(), SVC())
# Exhaustive 10-fold search over the grid; train scores are recorded as well
# so they can be compared with the validation scores in cv_results_.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=10,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('pca', PCA()), ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'pca__n_components': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                               11, 12, 13, 14, 15, 16, 17, 18,
                                               19, 20, 21, 22, 23, 24, 25]},
             return_train_score=True, verbose=2)

In [17]:
# Show the outcome of the PCA grid search as one tuple:
(
    grid_search.best_score_,   # best mean cross-validated score
    grid_search.best_params_,  # parameter setting that achieved it
    grid_search.cv_results_,   # full per-candidate timing and score arrays
)

(0.8328834115805946,
 {'pca__n_components': 20},
 {'mean_fit_time': array([0.03660524, 0.02842088, 0.03360591, 0.0350409 , 0.03589549,
         0.03340685, 0.03860898, 0.03200674, 0.03620927, 0.04555655,
         0.03843138, 0.03910251, 0.03790712, 0.03570898, 0.03690693,
         0.03760917, 0.03610642, 0.03650703, 0.04261067, 0.03800778,
         0.03760769, 0.03356726, 0.0379585 , 0.03482518, 0.03520782]),
  'std_fit_time': array([0.00361026, 0.00470871, 0.00279948, 0.00709112, 0.00436316,
         0.00162374, 0.01062432, 0.00161266, 0.00604831, 0.03390732,
         0.00696771, 0.0096911 , 0.00537513, 0.00596721, 0.00543178,
         0.00928867, 0.00370045, 0.00473959, 0.00792895, 0.00525412,
         0.0043655 , 0.00200697, 0.0074197 , 0.0034265 , 0.00227256]),
  'mean_score_time': array([0.00787735, 0.0070595 , 0.0090035 , 0.00815313, 0.00930188,
         0.00800273, 0.00880065, 0.00780449, 0.01760316, 0.01080265,
         0.00817149, 0.00940289, 0.01070318, 0.00890284, 0.0084024 

In [29]:
# Sanity check: display the (rows, columns) dimensions of the training split.
X_train.shape

(712, 25)

In [30]:
# Supervised dimensionality reduction: fit a Linear Discriminant Analysis model
# on the training split and display the samples projected onto a single
# discriminant axis. (cross_val_score is already imported in the notebook's
# import cell and is not used here, so it is not re-imported.)
lda = LinearDiscriminantAnalysis(n_components=1)
lda.fit_transform(X_train, y_train)

array([[-0.18865839],
       [-0.56584063],
       [-1.21870177],
       [-1.46259808],
       [-0.2950938 ],
       [ 0.51778658],
       [ 0.53657366],
       [-0.48990456],
       [-1.22209207],
       [-0.00547836],
       [ 2.01625951],
       [-1.58908617],
       [-0.68277565],
       [-1.38866691],
       [-1.15133018],
       [ 1.82624419],
       [ 2.62590363],
       [ 2.67991106],
       [-0.6820445 ],
       [ 2.72615078],
       [-0.34791374],
       [-1.57218741],
       [ 0.90460395],
       [ 1.34228985],
       [ 1.27411477],
       [-0.81423546],
       [ 1.52741784],
       [-1.1516765 ],
       [-0.59472508],
       [-0.80723491],
       [-1.40577317],
       [ 0.21320202],
       [-1.21509594],
       [ 2.12302992],
       [-1.05224583],
       [ 0.04527955],
       [ 0.37752467],
       [ 1.11484434],
       [-0.04190818],
       [ 2.09034388],
       [-1.23888764],
       [-1.22209207],
       [ 1.50825882],
       [-1.43396716],
       [-0.03535863],
       [-1