In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
df_train = pd.read_csv('../data/train_chunks_features_absdiff.csv')

In [3]:
features = df_train.columns.drop('different_author')

X = df_train[features]
y = list(map(lambda y: int(y), df_train['different_author']))

In [4]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('var_filter', VarianceThreshold()),
    ('selector', SelectKBest()),
    ('clf', SVC()),
])

In [5]:
parameters = {
    'selector__k': (150, 200, 250),
    'clf__gamma': np.logspace(-4, -2, 15),
}

In [6]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=10)
grid_search

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('var_filter', VarianceThreshold(threshold=0.0)), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x11387ae18>)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'selector__k': (150, 200, 250), 'clf__gamma': array([0.0001 , 0.00014, 0.00019, 0.00027, 0.00037, 0.00052, 0.00072,
       0.001  , 0.00139, 0.00193, 0.00268, 0.00373, 0.00518, 0.0072 ,
       0.01   ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [7]:
grid_search.fit(X, y)

Fitting 3 folds for each of 45 candidates, totalling 135 fits
[CV] clf__gamma=0.0001, selector__k=150 ..............................
[CV] clf__gamma=0.0001, selector__k=150 ..............................
[CV] clf__gamma=0.0001, selector__k=150 ..............................
[CV] clf__gamma=0.0001, selector__k=200 ..............................
[CV]  clf__gamma=0.0001, selector__k=150, score=0.6615853658536586, total=   8.9s
[CV] clf__gamma=0.0001, selector__k=200 ..............................
[CV]  clf__gamma=0.0001, selector__k=150, score=0.6453252032520326, total=   9.0s
[CV]  clf__gamma=0.0001, selector__k=150, score=0.6598984771573604, total=   9.0s
[CV] clf__gamma=0.0001, selector__k=200 ..............................
[CV] clf__gamma=0.0001, selector__k=250 ..............................
[CV]  clf__gamma=0.0001, selector__k=200, score=0.665989847715736, total=  10.8s
[CV] clf__gamma=0.0001, selector__k=250 ..............................
[CV]  clf__gamma=0.0001, selector__k=200, s

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   30.1s


[CV] clf__gamma=0.00013894954943731373, selector__k=150 ..............
[CV]  clf__gamma=0.0001, selector__k=250, score=0.6690355329949239, total=  12.2s
[CV] clf__gamma=0.00013894954943731373, selector__k=150 ..............
[CV]  clf__gamma=0.0001, selector__k=250, score=0.6565040650406504, total=  11.9s
[CV] clf__gamma=0.00013894954943731373, selector__k=150 ..............
[CV]  clf__gamma=0.00013894954943731373, selector__k=150, score=0.665989847715736, total=   7.8s
[CV] clf__gamma=0.00013894954943731373, selector__k=200 ..............
[CV]  clf__gamma=0.00013894954943731373, selector__k=150, score=0.6529471544715447, total=   8.0s
[CV] clf__gamma=0.00013894954943731373, selector__k=200 ..............


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   46.0s


[CV]  clf__gamma=0.00013894954943731373, selector__k=150, score=0.6717479674796748, total=   7.7s
[CV] clf__gamma=0.00013894954943731373, selector__k=200 ..............
[CV]  clf__gamma=0.0001, selector__k=250, score=0.6666666666666666, total=  12.2s
[CV] clf__gamma=0.00013894954943731373, selector__k=250 ..............
[CV]  clf__gamma=0.00013894954943731373, selector__k=200, score=0.6695431472081218, total=   9.6s
[CV] clf__gamma=0.00013894954943731373, selector__k=250 ..............
[CV]  clf__gamma=0.00013894954943731373, selector__k=200, score=0.654979674796748, total=   9.8s
[CV] clf__gamma=0.00013894954943731373, selector__k=250 ..............
[CV]  clf__gamma=0.00013894954943731373, selector__k=200, score=0.6686991869918699, total=   9.8s
[CV] clf__gamma=0.00019306977288832496, selector__k=150 ..............
[CV]  clf__gamma=0.00013894954943731373, selector__k=250, score=0.6736040609137056, total=  11.5s
[CV] clf__gamma=0.00019306977288832496, selector__k=150 ..............
[CV

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.3min


[CV]  clf__gamma=0.00019306977288832496, selector__k=150, score=0.6539634146341463, total=   7.5s
[CV] clf__gamma=0.00019306977288832496, selector__k=200 ..............
[CV]  clf__gamma=0.00013894954943731373, selector__k=250, score=0.6702235772357723, total=  11.8s
[CV] clf__gamma=0.00019306977288832496, selector__k=200 ..............
[CV]  clf__gamma=0.00019306977288832496, selector__k=150, score=0.6697154471544715, total=   7.7s
[CV] clf__gamma=0.00019306977288832496, selector__k=250 ..............
[CV]  clf__gamma=0.00019306977288832496, selector__k=200, score=0.6720812182741117, total=   9.4s
[CV] clf__gamma=0.00019306977288832496, selector__k=250 ..............
[CV]  clf__gamma=0.00019306977288832496, selector__k=200, score=0.6626016260162602, total=   9.2s
[CV] clf__gamma=0.00019306977288832496, selector__k=250 ..............
[CV]  clf__gamma=0.00019306977288832496, selector__k=200, score=0.6681910569105691, total=   9.2s
[CV] clf__gamma=0.00026826957952797245, selector__k=150 .

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min


[CV]  clf__gamma=0.00019306977288832496, selector__k=250, score=0.682233502538071, total=  11.2s
[CV] clf__gamma=0.00026826957952797245, selector__k=150 ..............
[CV]  clf__gamma=0.00026826957952797245, selector__k=150, score=0.666497461928934, total=   7.6s
[CV] clf__gamma=0.00026826957952797245, selector__k=150 ..............
[CV]  clf__gamma=0.00019306977288832496, selector__k=250, score=0.6646341463414634, total=  11.6s
[CV] clf__gamma=0.00026826957952797245, selector__k=200 ..............
[CV]  clf__gamma=0.00019306977288832496, selector__k=250, score=0.6763211382113821, total=  11.3s
[CV] clf__gamma=0.00026826957952797245, selector__k=200 ..............
[CV]  clf__gamma=0.00026826957952797245, selector__k=150, score=0.6539634146341463, total=   7.4s
[CV] clf__gamma=0.00026826957952797245, selector__k=200 ..............
[CV]  clf__gamma=0.00026826957952797245, selector__k=150, score=0.6686991869918699, total=   7.4s
[CV] clf__gamma=0.00026826957952797245, selector__k=250 ...

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.2min


[CV]  clf__gamma=0.00026826957952797245, selector__k=250, score=0.6847715736040609, total=  10.9s
[CV] clf__gamma=0.0003727593720314938, selector__k=150 ...............
[CV]  clf__gamma=0.00026826957952797245, selector__k=250, score=0.6788617886178862, total=  10.8s
[CV] clf__gamma=0.0003727593720314938, selector__k=150 ...............
[CV]  clf__gamma=0.0003727593720314938, selector__k=150, score=0.6710659898477157, total=   7.0s
[CV] clf__gamma=0.0003727593720314938, selector__k=200 ...............
[CV]  clf__gamma=0.00026826957952797245, selector__k=250, score=0.6758130081300813, total=  10.8s
[CV] clf__gamma=0.0003727593720314938, selector__k=200 ...............
[CV]  clf__gamma=0.0003727593720314938, selector__k=150, score=0.6559959349593496, total=   7.6s
[CV] clf__gamma=0.0003727593720314938, selector__k=200 ...............
[CV]  clf__gamma=0.0003727593720314938, selector__k=150, score=0.6692073170731707, total=   8.8s
[CV] clf__gamma=0.0003727593720314938, selector__k=250 .....

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.7min


[CV]  clf__gamma=0.0003727593720314938, selector__k=250, score=0.6954314720812182, total=  12.4s
[CV] clf__gamma=0.0005179474679231213, selector__k=150 ...............
[CV]  clf__gamma=0.0005179474679231213, selector__k=150, score=0.6761421319796954, total=   7.0s
[CV] clf__gamma=0.0005179474679231213, selector__k=150 ...............
[CV]  clf__gamma=0.0003727593720314938, selector__k=250, score=0.6747967479674797, total=  11.1s
[CV] clf__gamma=0.0005179474679231213, selector__k=200 ...............
[CV]  clf__gamma=0.0003727593720314938, selector__k=250, score=0.6702235772357723, total=  11.0s
[CV] clf__gamma=0.0005179474679231213, selector__k=200 ...............
[CV]  clf__gamma=0.0005179474679231213, selector__k=150, score=0.654979674796748, total=   7.6s
[CV] clf__gamma=0.0005179474679231213, selector__k=200 ...............
[CV]  clf__gamma=0.0005179474679231213, selector__k=150, score=0.6681910569105691, total=   7.2s
[CV] clf__gamma=0.0005179474679231213, selector__k=250 .........

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  3.5min


[CV]  clf__gamma=0.0007196856730011522, selector__k=150, score=0.6802030456852792, total=   6.6s
[CV] clf__gamma=0.0007196856730011522, selector__k=200 ...............
[CV]  clf__gamma=0.0005179474679231213, selector__k=250, score=0.676829268292683, total=  10.2s
[CV] clf__gamma=0.0007196856730011522, selector__k=200 ...............
[CV]  clf__gamma=0.0007196856730011522, selector__k=150, score=0.663109756097561, total=   6.6s
[CV] clf__gamma=0.0007196856730011522, selector__k=200 ...............
[CV]  clf__gamma=0.0007196856730011522, selector__k=150, score=0.6681910569105691, total=   6.8s
[CV] clf__gamma=0.0007196856730011522, selector__k=250 ...............
[CV]  clf__gamma=0.0007196856730011522, selector__k=200, score=0.6903553299492385, total=   8.3s
[CV] clf__gamma=0.0007196856730011522, selector__k=250 ...............
[CV]  clf__gamma=0.0007196856730011522, selector__k=200, score=0.6732723577235772, total=   8.1s
[CV] clf__gamma=0.0007196856730011522, selector__k=250 ..........

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  4.0min


[CV]  clf__gamma=0.001, selector__k=150, score=0.6676829268292683, total=   6.6s
[CV] clf__gamma=0.001, selector__k=200 ...............................
[CV]  clf__gamma=0.001, selector__k=150, score=0.6707317073170732, total=   7.0s
[CV] clf__gamma=0.001, selector__k=250 ...............................
[CV]  clf__gamma=0.001, selector__k=200, score=0.6954314720812182, total=   8.5s
[CV] clf__gamma=0.001, selector__k=250 ...............................
[CV]  clf__gamma=0.001, selector__k=200, score=0.6763211382113821, total=   8.4s
[CV] clf__gamma=0.001, selector__k=250 ...............................
[CV]  clf__gamma=0.001, selector__k=200, score=0.6783536585365854, total=   8.1s
[CV] clf__gamma=0.0013894954943731374, selector__k=150 ...............
[CV]  clf__gamma=0.001, selector__k=250, score=0.7076142131979696, total=  10.4s
[CV] clf__gamma=0.0013894954943731374, selector__k=150 ...............
[CV]  clf__gamma=0.001, selector__k=250, score=0.6758130081300813, total=  10.3s
[CV] cl

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  4.7min


[CV]  clf__gamma=0.0013894954943731374, selector__k=200, score=0.6808943089430894, total=   9.4s
[CV] clf__gamma=0.0019306977288832496, selector__k=150 ...............
[CV]  clf__gamma=0.0013894954943731374, selector__k=250, score=0.7157360406091371, total=  10.7s
[CV] clf__gamma=0.0019306977288832496, selector__k=150 ...............
[CV]  clf__gamma=0.0013894954943731374, selector__k=250, score=0.676829268292683, total=  11.0s
[CV] clf__gamma=0.0019306977288832496, selector__k=150 ...............
[CV]  clf__gamma=0.0013894954943731374, selector__k=250, score=0.6875, total=  11.3s
[CV] clf__gamma=0.0019306977288832496, selector__k=200 ...............
[CV]  clf__gamma=0.0019306977288832496, selector__k=150, score=0.6868020304568528, total=   7.6s
[CV] clf__gamma=0.0019306977288832496, selector__k=200 ...............
[CV]  clf__gamma=0.0019306977288832496, selector__k=150, score=0.6737804878048781, total=   6.7s
[CV] clf__gamma=0.0019306977288832496, selector__k=200 ...............
[CV] 

[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  5.5min


[CV]  clf__gamma=0.0019306977288832496, selector__k=250, score=0.693089430894309, total=  10.1s
[CV] clf__gamma=0.0026826957952797246, selector__k=200 ...............
[CV]  clf__gamma=0.0026826957952797246, selector__k=150, score=0.6803861788617886, total=   6.4s
[CV] clf__gamma=0.0026826957952797246, selector__k=200 ...............
[CV]  clf__gamma=0.0026826957952797246, selector__k=150, score=0.6788617886178862, total=   6.5s
[CV] clf__gamma=0.0026826957952797246, selector__k=250 ...............
[CV]  clf__gamma=0.0026826957952797246, selector__k=200, score=0.7020304568527919, total=   8.1s
[CV] clf__gamma=0.0026826957952797246, selector__k=250 ...............
[CV]  clf__gamma=0.0026826957952797246, selector__k=200, score=0.695630081300813, total=   8.0s
[CV] clf__gamma=0.0026826957952797246, selector__k=250 ...............
[CV]  clf__gamma=0.0026826957952797246, selector__k=200, score=0.6905487804878049, total=   8.4s
[CV] clf__gamma=0.003727593720314938, selector__k=150 ...........

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  6.4min


[CV]  clf__gamma=0.003727593720314938, selector__k=250, score=0.7045685279187818, total=  10.7s
[CV] clf__gamma=0.005179474679231213, selector__k=150 ................
[CV]  clf__gamma=0.003727593720314938, selector__k=250, score=0.6742886178861789, total=  10.6s
[CV] clf__gamma=0.005179474679231213, selector__k=150 ................
[CV]  clf__gamma=0.003727593720314938, selector__k=250, score=0.6966463414634146, total=  10.7s
[CV] clf__gamma=0.005179474679231213, selector__k=200 ................
[CV]  clf__gamma=0.005179474679231213, selector__k=150, score=0.6837563451776649, total=   7.0s
[CV] clf__gamma=0.005179474679231213, selector__k=200 ................
[CV]  clf__gamma=0.005179474679231213, selector__k=150, score=0.6798780487804879, total=   6.8s
[CV] clf__gamma=0.005179474679231213, selector__k=200 ................
[CV]  clf__gamma=0.005179474679231213, selector__k=150, score=0.6915650406504065, total=   6.8s
[CV] clf__gamma=0.005179474679231213, selector__k=250 ...............

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  7.2min


[CV]  clf__gamma=0.007196856730011514, selector__k=200, score=0.6949238578680204, total=   9.4s
[CV] clf__gamma=0.007196856730011514, selector__k=250 ................
[CV]  clf__gamma=0.007196856730011514, selector__k=200, score=0.6895325203252033, total=   9.3s
[CV] clf__gamma=0.007196856730011514, selector__k=250 ................
[CV]  clf__gamma=0.007196856730011514, selector__k=200, score=0.6996951219512195, total=   9.3s
[CV] clf__gamma=0.01, selector__k=150 ................................
[CV]  clf__gamma=0.007196856730011514, selector__k=250, score=0.6934010152284263, total=  12.4s
[CV] clf__gamma=0.01, selector__k=150 ................................
[CV]  clf__gamma=0.01, selector__k=150, score=0.6862944162436548, total=   7.4s
[CV] clf__gamma=0.01, selector__k=150 ................................
[CV]  clf__gamma=0.007196856730011514, selector__k=250, score=0.6737804878048781, total=  12.6s
[CV] clf__gamma=0.01, selector__k=200 ................................
[CV]  clf__gam

[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:  8.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('var_filter', VarianceThreshold(threshold=0.0)), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x11387ae18>)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'selector__k': (150, 200, 250), 'clf__gamma': array([0.0001 , 0.00014, 0.00019, 0.00027, 0.00037, 0.00052, 0.00072,
       0.001  , 0.00139, 0.00193, 0.00268, 0.00373, 0.00518, 0.0072 ,
       0.01   ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [8]:
grid_search.best_score_

0.6994581781239417

In [9]:
grid_search.best_params_

{'clf__gamma': 0.003727593720314938, 'selector__k': 200}

In [10]:
df_test = pd.read_csv('../data/validation_chunks_features_absdiff.csv')

In [11]:
X_test = df_test[features]
y_test = list(map(lambda y: int(y), df_test['different_author']))

In [12]:
grid_search.score(X_test, y_test)

0.7289879931389366