In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
# Training and validation tables; the first CSV column becomes the index.
# The file names say "scaled_standard" -- presumably the features were already
# standardised upstream (TODO confirm).
df_train = pd.read_csv(
    '../data/500_ngrams/train_features_scaled_standard.csv',
    index_col=0,
)
df_val = pd.read_csv(
    '../data/500_ngrams/validation_features_scaled_standard.csv',
    index_col=0,
)

In [3]:
# Feature-name lookup table; the first CSV column is used as the index.
df_feature_names = pd.read_csv(
    '../data/500_ngrams/feature_names.csv',
    index_col=0,
)

In [4]:
# Targets as plain Python ints, taken from the 'different_author' column.
y_train = [int(label) for label in df_train['different_author']]
y_val = [int(label) for label in df_val['different_author']]

In [5]:
def split_pair_features(df, n_features):
    """Split a paired-feature table into two positionally labelled halves.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column is excluded from both halves (presumably the
        'different_author' label -- TODO confirm it really is the last column).
    n_features : int
        Number of leading columns that form the first half.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df.iloc[:, :n_features]`` and ``df.iloc[:, n_features:-1]``, each
        with its columns renamed to 0, 1, 2, ... so that label-aligned
        operations between the two halves pair up column i with column i.
    """
    def relabel_by_position(half):
        # Map every existing column label to its position within the half.
        return half.rename(columns=dict(zip(half.columns, range(len(half.columns)))))

    first_half = df.iloc[:, :n_features]
    second_half = df.iloc[:, n_features:-1]
    return relabel_by_position(first_half), relabel_by_position(second_half)


# Every column except the last one is split evenly between the two halves;
# integer division avoids the float round-trip of int(x / 2).
N = (len(df_train.columns) - 1) // 2

df_train_a, df_train_b = split_pair_features(df_train, N)

# The validation table is split with the N derived from the training table.
df_val_a, df_val_b = split_pair_features(df_val, N)

In [6]:
# Element-wise distance features between the two halves of every row.
# The signed difference is computed once per split and reused for both the
# squared and the absolute variant instead of being recomputed for each.
df_train_diff = df_train_a.subtract(df_train_b)
df_train_sqdiff = df_train_diff ** 2
df_train_absdiff = df_train_diff.abs()

df_val_diff = df_val_a.subtract(df_val_b)
df_val_sqdiff = df_val_diff ** 2
df_val_absdiff = df_val_diff.abs()

# The signed differences are intermediates only; drop them so they do not
# linger in kernel memory.
del df_train_diff, df_val_diff

In [7]:
# Drop the raw tables and their per-half slices; only the derived
# squared/absolute difference frames are used from here on.
del (
    df_train,
    df_val,
    df_train_a,
    df_train_b,
    df_val_a,
    df_val_b,
)

In [8]:
# Three-stage estimator: variance filter -> univariate top-k selection -> SVM.
# All steps use their constructor defaults here; the grid search overrides
# the selector's k.
pipeline = Pipeline(
    steps=[
        # ('scaler', StandardScaler()),  # disabled
        ('var_filter', VarianceThreshold()),
        ('selector', SelectKBest()),
        ('clf', SVC()),
    ],
)

In [9]:
# Search space: number of features kept by the 'selector' step, 11..19.
parameters = dict(
    selector__k=list(range(11, 20)),
    # clf__gamma=np.logspace(-4, -2, 15),
)

In [10]:
# 5-fold grid search over `parameters` for the squared-difference features,
# using every available core and per-fit progress logging.
grid_search_sqdiff = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_search_sqdiff

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('var_filter', VarianceThreshold(threshold=0.0)), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x109648e18>)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'selector__k': [11, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [11]:
# Run the cross-validated search on the squared-difference training features.
grid_search_sqdiff.fit(df_train_sqdiff, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] selector__k=11 ..................................................
[CV] selector__k=11 ..................................................
[CV] selector__k=11 ..................................................
[CV] selector__k=11 ..................................................
[CV] ......... selector__k=11, score=0.6926333615580017, total=   2.4s
[CV] selector__k=11 ..................................................
[CV] ......... selector__k=11, score=0.7021996615905245, total=   2.6s
[CV] ......... selector__k=11, score=0.6740050804403048, total=   2.4s
[CV] ......... selector__k=11, score=0.6731583403895004, total=   2.5s
[CV] selector__k=12 ..................................................
[CV] selector__k=12 ..................................................
[CV] selector__k=12 ..................................................
[CV] ......... selector__k=11, score=0.6900931414055885, total=   2.2s
[CV] selector__k=

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.6s


[CV] selector__k=12 ..................................................
[CV] ......... selector__k=12, score=0.6968670618120237, total=   2.2s
[CV] selector__k=13 ..................................................
[CV] ......... selector__k=12, score=0.6824724809483489, total=   2.2s
[CV] selector__k=13 ..................................................
[CV] ......... selector__k=12, score=0.6867061812023709, total=   2.1s
[CV] ......... selector__k=12, score=0.6900931414055885, total=   2.3s
[CV] selector__k=13 ..................................................
[CV] selector__k=13 ..................................................
[CV] ......... selector__k=13, score=0.7064297800338409, total=   2.0s
[CV] ......... selector__k=13, score=0.7087214225232854, total=   1.9s
[CV] selector__k=13 ..................................................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    9.7s


[CV] selector__k=14 ..................................................
[CV] ......... selector__k=13, score=0.6850127011007621, total=   2.0s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=13, score=0.6994072819644369, total=   1.9s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=13, score=0.6968670618120237, total=   2.2s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=14, score=0.7021996615905245, total=   2.0s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=14, score=0.6985605419136325, total=   1.9s
[CV] selector__k=15 ..................................................


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.1s


[CV] ......... selector__k=14, score=0.6816257408975445, total=   2.0s
[CV] selector__k=15 ..................................................
[CV] ......... selector__k=14, score=0.7044877222692634, total=   1.9s
[CV] selector__k=15 ..................................................
[CV] ......... selector__k=14, score=0.6926333615580017, total=   2.2s
[CV] selector__k=15 ..................................................
[CV] ......... selector__k=15, score=0.7106598984771574, total=   2.0s
[CV] selector__k=15 ..................................................
[CV] ......... selector__k=15, score=0.7002540220152413, total=   2.0s
[CV] selector__k=16 ..................................................
[CV] .......... selector__k=15, score=0.676545300592718, total=   2.2s
[CV] selector__k=16 ..................................................
[CV] ......... selector__k=15, score=0.6740050804403048, total=   2.4s
[CV] selector__k=16 ..................................................


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   19.3s


[CV] ......... selector__k=15, score=0.7027942421676545, total=   2.0s
[CV] selector__k=16 ..................................................
[CV] ......... selector__k=16, score=0.7055837563451777, total=   2.1s
[CV] selector__k=16 ..................................................
[CV] ......... selector__k=16, score=0.6977138018628282, total=   2.1s
[CV] selector__k=17 ..................................................
[CV] .......... selector__k=16, score=0.676545300592718, total=   2.1s
[CV] selector__k=17 ..................................................
[CV] ......... selector__k=16, score=0.6875529212531752, total=   2.4s
[CV] selector__k=17 ..................................................
[CV] ......... selector__k=16, score=0.6960203217612193, total=   2.1s
[CV] selector__k=17 ..................................................
[CV] ......... selector__k=17, score=0.7038917089678511, total=   2.2s
[CV] selector__k=17 ..................................................
[CV] .

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   28.0s


[CV] ......... selector__k=17, score=0.6646909398814564, total=   3.0s
[CV] ......... selector__k=17, score=0.6926333615580017, total=   2.5s
[CV] selector__k=18 ..................................................
[CV] selector__k=18 ..................................................
[CV] ......... selector__k=18, score=0.6895093062605753, total=   2.4s
[CV] selector__k=18 ..................................................
[CV] ......... selector__k=18, score=0.6867061812023709, total=   2.8s
[CV] selector__k=19 ..................................................
[CV] ......... selector__k=18, score=0.6629974597798476, total=   3.0s
[CV] selector__k=19 ..................................................
[CV] ......... selector__k=18, score=0.6621507197290432, total=   3.0s
[CV] selector__k=19 ..................................................
[CV] ......... selector__k=18, score=0.6892464013547841, total=   2.5s
[CV] selector__k=19 ..................................................
[CV] .

[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:   36.4s remaining:    1.7s


[CV] ......... selector__k=19, score=0.6672311600338696, total=   2.6s
[CV] ......... selector__k=19, score=0.6900931414055885, total=   1.8s


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   37.8s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('var_filter', VarianceThreshold(threshold=0.0)), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x109648e18>)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'selector__k': [11, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [12]:
# Mean cross-validated score of the best parameter setting.
grid_search_sqdiff.best_score_

0.6992888587876736

In [13]:
# Parameter setting that produced the best cross-validated score.
grid_search_sqdiff.best_params_

{'selector__k': 13}

In [14]:
# Score the fitted search (its best estimator) on the validation split.
grid_search_sqdiff.score(df_val_sqdiff, y_val)

0.7015437392795884

In [15]:
# Pull the fitted SelectKBest step out of the winning pipeline.
best_pipeline_sqdiff = grid_search_sqdiff.best_estimator_
selector = best_pipeline_sqdiff.named_steps['selector']

In [16]:
# The selector runs after the 'var_filter' step, so its support indices are
# positions among the columns that survived the variance filter, not among
# the original feature columns. Map them back through the filter's own
# support so they can be used to index the original feature list. When the
# filter dropped nothing this is the identity mapping.
idxs_var_kept = grid_search_sqdiff.best_estimator_.named_steps.var_filter.get_support(indices=True)
idxs_selected = idxs_var_kept[selector.get_support(indices=True)]
idxs_selected

array([  1,   5,  21,  25,  26,  28,  59,  60,  67,  68,  69,  99, 115])

In [17]:
# Positional lookup of the selected features' names.
# NOTE(review): assumes row i of df_feature_names describes feature column i -- confirm.
df_feature_names.iloc[idxs_selected]

Unnamed: 0,0
1,Ari Index
5,Coleman Liau Index
21,DT_NN_VBZ
25,Double Hyphens
26,E
28,F
59,JJ_NN_VBZ
60,JJ_NN___END__
67,MD_RB_VB
68,MD_VB_DT


---

In [18]:
# Fresh pipeline for the absolute-difference run:
# variance filter -> univariate top-k selection -> SVM, all at defaults.
pipeline = Pipeline(
    steps=[
        # ('scaler', StandardScaler()),  # disabled
        ('var_filter', VarianceThreshold()),
        ('selector', SelectKBest()),
        ('clf', SVC()),
    ],
)

In [19]:
# Search space for the absolute-difference run: selector k from 11 to 19.
parameters = dict(
    selector__k=list(range(11, 20)),
    # clf__gamma=np.logspace(-4, -2, 15),
)

In [20]:
# 5-fold grid search over `parameters` for the absolute-difference features,
# using every available core and per-fit progress logging.
grid_search_absdiff = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid_search_absdiff

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('var_filter', VarianceThreshold(threshold=0.0)), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x109648e18>)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'selector__k': [11, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [21]:
# Run the cross-validated search on the absolute-difference training features.
grid_search_absdiff.fit(df_train_absdiff, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] selector__k=11 ..................................................
[CV] selector__k=11 ..................................................
[CV] selector__k=11 ..................................................
[CV] selector__k=11 ..................................................
[CV] ......... selector__k=11, score=0.7256562235393734, total=   1.7s
[CV] ......... selector__k=11, score=0.7174280879864636, total=   1.7s
[CV] selector__k=11 ..................................................
[CV] selector__k=12 ..................................................
[CV] ......... selector__k=11, score=0.6824724809483489, total=   1.7s
[CV] selector__k=12 ..................................................
[CV] .......... selector__k=11, score=0.720575783234547, total=   1.6s
[CV] selector__k=12 ..................................................
[CV] ......... selector__k=11, score=0.7146486028789162, total=   1.7s
[CV] selector__k=

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.1s


[CV] selector__k=12 ..................................................
[CV] ......... selector__k=12, score=0.7231160033869602, total=   1.7s
[CV] selector__k=13 ..................................................
[CV] ......... selector__k=12, score=0.6790855207451313, total=   1.6s
[CV] selector__k=13 ..................................................
[CV] .......... selector__k=12, score=0.720575783234547, total=   1.5s
[CV] selector__k=13 ..................................................
[CV] ......... selector__k=12, score=0.7138018628281118, total=   1.6s
[CV] selector__k=13 ..................................................
[CV] ......... selector__k=13, score=0.7250423011844331, total=   1.6s


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.7s


[CV] selector__k=13 ..................................................
[CV] ......... selector__k=13, score=0.7265029635901779, total=   1.5s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=13, score=0.6816257408975445, total=   2.0s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=13, score=0.7188823031329382, total=   1.8s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=13, score=0.7061812023708721, total=   1.7s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=14, score=0.7267343485617598, total=   1.7s
[CV] selector__k=14 ..................................................
[CV] ......... selector__k=14, score=0.7281964436917866, total=   2.1s
[CV] selector__k=15 ..................................................
[CV] ......... selector__k=14, score=0.6773920406435224, total=   2.0s
[CV] s

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.9s


[CV] ......... selector__k=14, score=0.7129551227773073, total=   2.0s
[CV] selector__k=15 ..................................................
[CV] ......... selector__k=14, score=0.7019475021168501, total=   1.8s
[CV] selector__k=15 ..................................................
[CV] ......... selector__k=15, score=0.7233502538071066, total=   1.9s
[CV] selector__k=15 ..................................................
[CV] .......... selector__k=15, score=0.720575783234547, total=   2.0s
[CV] selector__k=16 ..................................................
[CV] ......... selector__k=15, score=0.6748518204911093, total=   2.0s
[CV] selector__k=16 ..................................................
[CV] ......... selector__k=15, score=0.7171888230313294, total=   2.0s
[CV] selector__k=16 ..................................................


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   16.6s


[CV] ......... selector__k=15, score=0.7011007620660458, total=   1.7s
[CV] selector__k=16 ..................................................
[CV] ......... selector__k=16, score=0.7258883248730964, total=   1.7s
[CV] selector__k=16 ..................................................
[CV] ......... selector__k=16, score=0.7222692633361558, total=   1.6s
[CV] selector__k=17 ..................................................
[CV] ......... selector__k=16, score=0.6748518204911093, total=   1.6s
[CV] selector__k=17 ..................................................
[CV] ......... selector__k=16, score=0.7112616426756986, total=   2.0s
[CV] selector__k=17 ..................................................
[CV] ......... selector__k=16, score=0.7036409822184589, total=   2.3s
[CV] selector__k=17 ..................................................
[CV] ......... selector__k=17, score=0.7267343485617598, total=   2.2s
[CV] selector__k=17 ..................................................
[CV] .

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   24.0s


[CV] ......... selector__k=17, score=0.7154953429297206, total=   1.7s
[CV] selector__k=18 ..................................................
[CV] ......... selector__k=17, score=0.7104149026248942, total=   1.7s
[CV] selector__k=18 ..................................................
[CV] ......... selector__k=18, score=0.7131979695431472, total=   1.7s
[CV] selector__k=18 ..................................................
[CV] ......... selector__k=18, score=0.7129551227773073, total=   1.7s
[CV] selector__k=19 ..................................................
[CV] ......... selector__k=18, score=0.6621507197290432, total=   1.7s
[CV] selector__k=19 ..................................................
[CV] ......... selector__k=18, score=0.7070279424216765, total=   1.7s
[CV] selector__k=19 ..................................................
[CV] ......... selector__k=18, score=0.7112616426756986, total=   1.7s
[CV] selector__k=19 ..................................................
[CV] .

[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:   30.1s remaining:    1.4s


[CV] ......... selector__k=19, score=0.7070279424216765, total=   1.3s


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   31.4s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('var_filter', VarianceThreshold(threshold=0.0)), ('selector', SelectKBest(k=10, score_func=<function f_classif at 0x109648e18>)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'selector__k': [11, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [22]:
# Mean cross-validated score of the best parameter setting.
grid_search_absdiff.best_score_

0.7124957670165933

In [23]:
# Parameter setting that produced the best cross-validated score.
grid_search_absdiff.best_params_

{'selector__k': 12}

In [24]:
# Score the fitted search (its best estimator) on the validation split.
grid_search_absdiff.score(df_val_absdiff, y_val)

0.7320754716981132

In [25]:
# Pull the fitted SelectKBest step out of the winning pipeline.
best_pipeline_absdiff = grid_search_absdiff.best_estimator_
selector = best_pipeline_absdiff.named_steps['selector']

In [26]:
# The selector runs after the 'var_filter' step, so its support indices are
# positions among the columns that survived the variance filter, not among
# the original feature columns. Map them back through the filter's own
# support so they can be used to index the original feature list. When the
# filter dropped nothing this is the identity mapping.
idxs_var_kept = grid_search_absdiff.best_estimator_.named_steps.var_filter.get_support(indices=True)
idxs_selected = idxs_var_kept[selector.get_support(indices=True)]
idxs_selected

array([  1,  21,  22,  25,  26,  28,  59,  60,  66,  68,  99, 115])

In [27]:
# Positional lookup of the selected features' names.
# NOTE(review): assumes row i of df_feature_names describes feature column i -- confirm.
df_feature_names.iloc[idxs_selected]

Unnamed: 0,0
1,Ari Index
21,DT_NN_VBZ
22,DT_NN___END__
25,Double Hyphens
26,E
28,F
59,JJ_NN_VBZ
60,JJ_NN___END__
66,M
68,MD_VB_DT
