In [2]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.feature_extraction.text
import sklearn.svm
import matplotlib
import dateutil
import gc
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
courses = pd.read_csv('course.csv')
courses

Unnamed: 0,date,change_rate,delta_addictive,delta_mul
0,11.01.2007,26.4898,0.0433,1.001637
1,12.01.2007,26.5320,0.0422,1.001593
2,13.01.2007,26.5770,0.0450,1.001696
3,16.01.2007,26.5645,-0.0125,0.999530
4,17.01.2007,26.5481,-0.0164,0.999383
5,18.01.2007,26.5646,0.0165,1.000622
6,19.01.2007,26.5343,-0.0303,0.998859
7,20.01.2007,26.5075,-0.0268,0.998990
8,23.01.2007,26.5214,0.0139,1.000524
9,24.01.2007,26.5240,0.0026,1.000098


In [4]:
abs_deltas = courses.set_index('date').to_dict()['delta_addictive']

In [5]:
with open('lemmas/counted_days_lemmas.txt') as fl:
    dates = [dateutil.parser.parse(line[:line.find(' ')]).strftime('%d.%m.%Y') for line in fl]
y_all = np.array([1 if abs_deltas[x] > 0 else -1 for x in dates])

# Bag of Words

In [10]:
X_all = np.load('bow.npy')
X_svd = sklearn.decomposition.PCA(0.99, copy=False).fit_transform(sklearn.preprocessing.scale(X_all))
gc.collect()
X_svd = sklearn.preprocessing.scale(X_svd, copy=False)
X_svd.shape

(1989, 1545)

In [8]:
def get_days_cv(X_len, start_part=0.5, prediction_days=1, days_step=3):
    start, end = int(X_len * start_part), X_len - prediction_days
    indexes = np.arange(X_len)
    for test_start in range (start, end, days_step):
        yield indexes[:test_start], indexes[test_start:test_start + prediction_days]

def get_cv(cls, params, X, y, prediction_days=3, days_step=3):
    grid_cv = sklearn.model_selection.GridSearchCV(cls, params, n_jobs=-1,
                                                   cv=get_days_cv(len(X), days_step=days_step,
                                                                  prediction_days=prediction_days),
                                                   return_train_score=False, verbose=1)
    grid_cv.fit(X, y)
    return grid_cv

def show_cv(cv_results, params):
    return pd.DataFrame(cv_results).sort_values('mean_test_score', ascending=False)[params]

exp_range = [10**p for p in range(-5, 6)]
svm_params = [
    {
        'kernel': ['linear'],
        'C': exp_range
    },
    {
        'kernel': ['rbf'],
        'C': exp_range,
        'gamma': exp_range
    }
]
range_01 = [n / 10 for n in range(0, 11)]
sgd_params = {
    'loss': ['hinge', 'log'],
    'penalty': ['elasticnet'],
    'alpha': exp_range,
    'l1_ratio': range_01,
    'tol': [1e-4],
    'random_state': [42],
}

def show_svm(cv_results, extra=None):
    return show_cv(cv_results, ['mean_test_score', 'param_C', 'param_gamma'] + extra if extra else [])

def show_sgd(cv_results):
    return show_cv(cv_results, ['mean_test_score'] + ['param_' + x for x in sgd_params.keys()])


print('CV balance of classes:', sum(1 for y in y_all[len(y_all) // 2:] if y > 0) / (len(y_all) // 2))

CV balance of classes: 0.5241448692152918


In [20]:
svm_grid_cv = get_cv(sklearn.svm.SVC(), svm_params, X_svd, y_all)

Fitting 91 folds for each of 132 candidates, totalling 12012 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 36.1min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 46.7min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 58.8min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 72.5min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 88.0min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 105.2min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 123.8min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 143.8min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 165.3min
[Parallel(n_jobs=-1)]: Done 12012 out of 12012 | elaps

In [26]:
show_cv(svm_grid_cv.cv_results_, ['mean_test_score', 'param_kernel', 'param_C', 'param_gamma'])

Unnamed: 0,mean_test_score,param_kernel,param_C,param_gamma
89,0.516484,rbf,100,0.0001
3,0.516484,linear,0.01,
91,0.512821,rbf,100,0.01
102,0.512821,rbf,1000,0.01
80,0.512821,rbf,10,0.01
124,0.512821,rbf,100000,0.01
113,0.512821,rbf,10000,0.01
99,0.505495,rbf,1000,1e-05
88,0.498168,rbf,100,1e-05
69,0.494505,rbf,1,0.01


In [21]:
sgd_class_grid_cv = get_cv(sklearn.linear_model.SGDClassifier(), sgd_params, X_svd, y_all)

Fitting 331 folds for each of 242 candidates, totalling 80102 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 12784 tasks      | elapsed: 21





[Parallel(n_jobs=-1)]: Done 14434 tasks      | elapsed: 37.7min






[Parallel(n_jobs=-1)]: Done 16184 tasks      | elapsed: 45.2min
[Parallel(n_jobs=-1)]: Done 18034 tasks      | elapsed: 50.0min
[Parallel(n_jobs=-1)]: Done 19984 tasks      | elapsed: 69.4min




















































[Parallel(n_jobs=-1)]: Done 22034 tasks      | elapsed: 157.6min
[Parallel(n_jobs=-1)]: Done 24184 tasks      | elapsed: 159.9min
[Parallel(n_jobs=-1)]: Done 26434 tasks      | elapsed: 162.7min


[Parallel(n_jobs=-1)]: Done 28784 tasks      | elapsed: 188.9min














[Parallel(n_jobs=-1)]: Done 31234 tasks      | elapsed: 217.0min
[Parallel(n_jobs=-1)]: Done 33784 tasks      | elapsed: 219.6min


[Parallel(n_jobs=-1)]: Done 36434 tasks      | elapsed: 224.9min
[Parallel(n_jobs=-1)]: Done 39184 tasks      | elapsed: 228.5min
[Parallel(n_jobs=-1)]: Done 42034 tasks      | elapsed: 231.7min
[Parallel(n_jobs=-1)]: Done 44984 tasks      | elapsed: 235.0min
[Parallel(n_jobs=-1)]: Done 48034 tasks      | elapsed: 237.5min
[Parallel(n_jobs=-1)]: Done 51184 tasks      | elapsed: 240.1min
[Parallel(n_jobs=-1)]: Done 54434 tasks      | elapsed: 242.7min
[Parallel(n_jobs=-1)]: Done 57784 tasks      | elapsed: 245.3min
[Parallel(n_jobs=-1)]: Done 61234 tasks      | elapsed: 248.0min
[Parallel(n_jobs=-1)]: Done 64784 tasks      | elapsed: 250.8min
[Parallel(n_jobs=-1)]: Done 68434 tasks      | elapsed: 253.6min
[Parallel(n_jobs=-1)]: Done 72184 tasks      | elapsed: 256.6min
[Parallel(n_jobs=-1)]: Done 76034 tasks      | elapsed: 259.6min
[Parallel(n_jobs=-1)]: Done 79984 tasks      | elapsed: 262.6min
[Parallel(n_jobs=-1)]: Done 80102 out of 80102 | elapsed: 262.7min finished


In [18]:
show_cv(sgd_class_grid_cv.cv_results_, ['param_' + x for x in sgd_params.keys()])

NameError: name 'sgd_class_grid_cv' is not defined

In [25]:
pca_100 = sklearn.decomposition.PCA(100, copy=False)
X_svd_100 = sklearn.preprocessing.scale(
    pca_100.fit_transform(
        sklearn.preprocessing.scale(np.load('bow.npy'), copy=False)
    ), copy=False
)
gc.collect()
print(np.sum(pca_100.explained_variance_ratio_))

0.8845042026357965


In [27]:
svm_100_grid_cv = get_cv(sklearn.svm.SVC(), {'C': exp_range, 'gamma': exp_range}, X_svd_100, y_all)

Fitting 331 folds for each of 121 candidates, totalling 40051 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 307 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 557 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 907 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done 1357 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1907 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2557 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3307 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 4157 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 5107 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 6157 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 7307 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 8557 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 9907 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 11357 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 12907 tasks      | elapsed: 11

In [28]:
show_cv(svm_100_grid_cv.cv_results_, ['mean_test_score', 'param_C', 'param_gamma'])

Unnamed: 0,mean_test_score,param_C,param_gamma
113,0.526687,100000,0.01
102,0.526687,10000,0.01
77,0.518630,100,1e-05
67,0.516616,10,0.0001
91,0.515609,1000,0.01
58,0.515609,1,0.01
111,0.514602,100000,0.0001
112,0.513595,100000,0.001
57,0.513595,1,0.001
47,0.513595,0.1,0.01


In [29]:
sgd_class_100_grid_cv = get_cv(sklearn.linear_model.SGDClassifier(), sgd_params, X_svd_100, y_all)

Fitting 331 folds for each of 242 candidates, totalling 80102 fits


[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 6576 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 12176 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 19376 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 25040 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 29560 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 35560 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 41508 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 51604 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 68404 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 80102 out of 80102 | elapsed:  4.3min finished


In [30]:
show_cv(sgd_class_100_grid_cv.cv_results_, ['mean_test_score'] + ['param_' + x for x in sgd_params.keys()])

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol
209,0.530715,log,elasticnet,10000,0.5,0.0001
186,0.529708,hinge,elasticnet,1000,0.5,0.0001
128,0.529708,hinge,elasticnet,1,0.9,0.0001
14,0.529708,hinge,elasticnet,1e-05,0.7,0.0001
237,0.527694,log,elasticnet,100000,0.8,0.0001
40,0.527694,hinge,elasticnet,0.0001,0.9,0.0001
18,0.526687,hinge,elasticnet,1e-05,0.9,0.0001
141,0.526687,log,elasticnet,10,0.4,0.0001
149,0.526687,log,elasticnet,10,0.8,0.0001
7,0.525680,log,elasticnet,1e-05,0.3,0.0001


In [10]:
tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
X_tfidf_svd = sklearn.preprocessing.scale(sklearn.decomposition.PCA(0.99, copy=False).fit_transform(
    sklearn.preprocessing.scale(tfidf_transformer.fit_transform(np.load('bow.npy')).toarray(), copy=False)
              ), copy=False)
gc.collect()
X_tfidf_svd.shape

(1989, 158318)

In [16]:
svm_tfidf_grid_cv = get_cv(sklearn.svm.SVC(), {'C':exp_range, 'gamma': exp_range}, X_tfidf_svd, y_all,
                          days_step=11)
gc.collect()

Fitting 91 folds for each of 121 candidates, totalling 11011 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 36.3min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 49.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 65.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 83.4min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 103.1min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 125.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 149.6min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 175.5min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 203.8min
[Parallel(n_jobs=-1)]: Done 11011 out of 11011 | elapsed: 229.6min finished


66

In [18]:
show_cv(svm_tfidf_grid_cv.cv_results_, ['mean_test_score', 'param_C', 'param_gamma'])

Unnamed: 0,mean_test_score,param_C,param_gamma
57,0.523810,1,0.001
110,0.509158,100000,1e-05
79,0.501832,100,0.001
89,0.501832,1000,0.0001
90,0.501832,1000,0.001
68,0.501832,10,0.001
112,0.501832,100000,0.001
101,0.501832,10000,0.001
67,0.501832,10,0.0001
111,0.498168,100000,0.0001


In [17]:
sgd_tfidf_grid_cv = get_cv(sklearn.linear_model.SGDClassifier(), sgd_params, X_tfidf_svd, y_all)
gc.collect()

Fitting 331 folds for each of 242 candidates, totalling 80102 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 12784 tasks      | elapsed: 24



[Parallel(n_jobs=-1)]: Done 16184 tasks      | elapsed: 45.1min
[Parallel(n_jobs=-1)]: Done 18034 tasks      | elapsed: 49.8min
[Parallel(n_jobs=-1)]: Done 19984 tasks      | elapsed: 71.7min




















































[Parallel(n_jobs=-1)]: Done 22034 tasks      | elapsed: 179.0min
[Parallel(n_jobs=-1)]: Done 24184 tasks      | elapsed: 182.0min
[Parallel(n_jobs=-1)]: Done 26434 tasks      | elapsed: 185.1min
[Parallel(n_jobs=-1)]: Done 28784 tasks      | elapsed: 190.6min
[Parallel(n_jobs=-1)]: Done 31234 tasks      | elapsed: 197.7min
[Parallel(n_jobs=-1)]: Done 33784 tasks      | elapsed: 201.1min


[Parallel(n_jobs=-1)]: Done 36434 tasks      | elapsed: 212.4min
[Parallel(n_jobs=-1)]: Done 39184 tasks      | elapsed: 217.4min
[Parallel(n_jobs=-1)]: Done 42034 tasks      | elapsed: 221.5min
[Parallel(n_jobs=-1)]: Done 44984 tasks      | elapsed: 226.7min
[Parallel(n_jobs=-1)]: Done 48034 tasks      | elapsed: 229.9min
[Parallel(n_jobs=-1)]: Done 51184 tasks      | elapsed: 233.4min
[Parallel(n_jobs=-1)]: Done 54434 tasks      | elapsed: 236.6min
[Parallel(n_jobs=-1)]: Done 57784 tasks      | elapsed: 239.9min
[Parallel(n_jobs=-1)]: Done 61234 tasks      | elapsed: 243.3min
[Parallel(n_jobs=-1)]: Done 64784 tasks      | elapsed: 246.7min
[Parallel(n_jobs=-1)]: Done 68434 tasks      | elapsed: 250.3min
[Parallel(n_jobs=-1)]: Done 72184 tasks      | elapsed: 253.9min
[Parallel(n_jobs=-1)]: Done 76034 tasks      | elapsed: 257.6min
[Parallel(n_jobs=-1)]: Done 79984 tasks      | elapsed: 261.4min
[Parallel(n_jobs=-1)]: Done 80102 out of 80102 | elapsed: 261.5min finished


0

In [19]:
show_cv(sgd_tfidf_grid_cv.cv_results_, ['mean_test_score'] + ['param_' + x for x in sgd_params.keys()])

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol
223,0.543807,log,elasticnet,100000,0.1,0.0001
226,0.542800,hinge,elasticnet,100000,0.3,0.0001
55,0.540785,log,elasticnet,0.001,0.5,0.0001
12,0.538771,hinge,elasticnet,1e-05,0.6,0.0001
180,0.536757,hinge,elasticnet,1000,0.2,0.0001
167,0.535750,log,elasticnet,100,0.6,0.0001
60,0.531722,hinge,elasticnet,0.001,0.8,0.0001
203,0.529708,log,elasticnet,10000,0.2,0.0001
234,0.528701,hinge,elasticnet,100000,0.7,0.0001
35,0.527694,log,elasticnet,0.0001,0.6,0.0001


# Lemmas

In [5]:
X_svd = sklearn.decomposition.PCA(0.99, copy=False).fit_transform(
    sklearn.preprocessing.scale(np.load('lemmas/bow_lemmas.npy'))
)
gc.collect()
X_svd = sklearn.preprocessing.scale(X_svd, copy=False)
X_svd.shape

(1989, 1549)

In [6]:
X_svd = sklearn.decomposition.PCA(0.99, copy=False).fit_transform(
    sklearn.preprocessing.scale(np.load('lemmas/bow_avg_lemmas.npy'))
)
gc.collect()
X_svd = sklearn.preprocessing.scale(X_svd, copy=False)
X_svd.shape

(1989, 1870)

In [7]:
X_svd = sklearn.decomposition.TruncatedSVD(700).fit_transform(
    np.load('lemmas/bow_avg_lemmas.npy')
)
X_svd = sklearn.preprocessing.scale(X_svd, copy=False)

In [20]:
svm_lemmas_grid_cv = get_cv(sklearn.svm.SVC(), {'C': exp_range, 'gamma': exp_range},
                            X_svd, y_all, days_step=1, prediction_days=1)
gc.collect()

Fitting 994 folds for each of 121 candidates, totalling 120274 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 22.6min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 34.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 41.1min


KeyboardInterrupt: 

In [12]:
# old bad lemmans
show_svm(svm_lemmas_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_C,param_gamma
110,0.520147,100000,1e-05
101,0.512821,10000,0.001
102,0.512821,10000,0.01
90,0.512821,1000,0.001
91,0.512821,1000,0.01
69,0.512821,10,0.01
79,0.512821,100,0.001
113,0.512821,100000,0.01
112,0.512821,100000,0.001
80,0.512821,100,0.01


In [12]:
show_svm(svm_lemmas_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_C,param_gamma
91,0.512821,1000,0.01
69,0.512821,10,0.01
80,0.512821,100,0.01
99,0.512821,10000,1e-05
102,0.512821,10000,0.01
110,0.512821,100000,1e-05
113,0.512821,100000,0.01
111,0.509158,100000,0.0001
100,0.509158,10000,0.0001
58,0.494505,1,0.01


In [19]:
# with avg counts
show_svm(svm_lemmas_grid_cv.cv_results_, ['param_kernel'])

Unnamed: 0,mean_test_score,param_C,param_gamma,param_kernel
79,0.512821,10,0.001,rbf
90,0.505495,100,0.001,rbf
101,0.505495,1000,0.001,rbf
112,0.505495,10000,0.001,rbf
123,0.505495,100000,0.001,rbf
8,0.505495,1000,,linear
10,0.505495,100000,,linear
9,0.505495,10000,,linear
7,0.505495,100,,linear
6,0.505495,10,,linear


In [10]:
sgd_lemmas_grid_cv = get_cv(sklearn.linear_model.SGDClassifier(), sgd_params, X_svd, y_all, prediction_days=1, days_step=1)
gc.collect()

Fitting 994 folds for each of 242 candidates, totalling 240548 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 852 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 2452 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3552 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 4852 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 6352 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 8052 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 9952 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 12052 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 14352 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 16852 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 19552 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 22452 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 25552 tasks      | elapse







































[Parallel(n_jobs=-1)]: Done 64096 tasks      | elapsed: 80.3min


















































[Parallel(n_jobs=-1)]: Done 67346 tasks      | elapsed: 114.0min
[Parallel(n_jobs=-1)]: Done 70696 tasks      | elapsed: 117.3min
[Parallel(n_jobs=-1)]: Done 74146 tasks      | elapsed: 120.3min
[Parallel(n_jobs=-1)]: Done 77696 tasks      | elapsed: 122.8min
[Parallel(n_jobs=-1)]: Done 81346 tasks      | elapsed: 125.0min
[Parallel(n_jobs=-1)]: Done 85096 tasks      | elapsed: 133.2min










































































[Parallel(n_jobs=-1)]: Done 88946 tasks      | elapsed: 179.9min
[Parallel(n_jobs=-1)]: Done 92896 tasks      | elapsed: 183.3min
[Parallel(n_jobs=-1)]: Done 96946 tasks      | elapsed: 185.2min
[Parallel(n_jobs=-1)]: Done 101096 tasks      | elapsed: 186.7min
[Parallel(n_jobs=-1)]: Done 105346 tasks      | elapsed: 188.4min
[Parallel(n_jobs=-1)]: Done 109696 tasks      | elapsed: 190.6min
[Parallel(n_jobs=-1)]: Done 114146 tasks      | elapsed: 192.9min
[Parallel(n_jobs=-1)]: Done 118696 tasks      | elapsed: 194.8min
[Parallel(n_jobs=-1)]: Done 123346 tasks      | elapsed: 196.7min
[Parallel(n_jobs=-1)]: Done 128096 tasks      | elapsed: 198.6min
[Parallel(n_jobs=-1)]: Done 132946 tasks      | elapsed: 200.5min
[Parallel(n_jobs=-1)]: Done 137896 tasks      | elapsed: 202.3min
[Parallel(n_jobs=-1)]: Done 142946 tasks      | elapsed: 204.1min
[Parallel(n_jobs=-1)]: Done 148096 tasks      | elapsed: 205.9min
[Parallel(n_jobs=-1)]: Done 153346 tasks      | elapsed: 207.8min
[Parallel(n_j

[Parallel(n_jobs=-1)]: Done 217696 tasks      | elapsed: 230.2min
[Parallel(n_jobs=-1)]: Done 224146 tasks      | elapsed: 232.4min
[Parallel(n_jobs=-1)]: Done 230696 tasks      | elapsed: 234.7min
[Parallel(n_jobs=-1)]: Done 237346 tasks      | elapsed: 237.0min
[Parallel(n_jobs=-1)]: Done 240548 out of 240548 | elapsed: 238.1min finished


0

In [15]:
show_cv(sgd_lemmas_grid_cv.cv_results_, ['mean_test_score'] + ['param_' + x for x in sgd_params.keys()])

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol
225,0.542800,log,elasticnet,100000,0.2,0.0001
220,0.542800,hinge,elasticnet,100000,0,0.0001
229,0.536757,log,elasticnet,100000,0.4,0.0001
130,0.535750,hinge,elasticnet,1,1,0.0001
161,0.533736,log,elasticnet,100,0.3,0.0001
128,0.531722,hinge,elasticnet,1,0.9,0.0001
168,0.529708,hinge,elasticnet,100,0.7,0.0001
29,0.527694,log,elasticnet,0.0001,0.3,0.0001
12,0.523666,hinge,elasticnet,1e-05,0.6,0.0001
40,0.521652,hinge,elasticnet,0.0001,0.9,0.0001


In [13]:
show_sgd(sgd_lemmas_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol
147,0.540785,log,elasticnet,10,0.7,0.0001
132,0.538771,hinge,elasticnet,10,0,0.0001
231,0.535750,log,elasticnet,100000,0.5,0.0001
152,0.532729,hinge,elasticnet,10,1,0.0001
168,0.532729,hinge,elasticnet,100,0.7,0.0001
229,0.529708,log,elasticnet,100000,0.4,0.0001
186,0.529708,hinge,elasticnet,1000,0.5,0.0001
195,0.529708,log,elasticnet,1000,0.9,0.0001
183,0.528701,log,elasticnet,1000,0.3,0.0001
80,0.528701,hinge,elasticnet,0.01,0.7,0.0001


In [8]:
# with avg counts
show_sgd(sgd_lemmas_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol
155,0.546278,log,elasticnet,100,0,0.0001
115,0.532193,log,elasticnet,1,0.2,0.0001
232,0.526157,hinge,elasticnet,100000,0.6,0.0001
58,0.526157,hinge,elasticnet,0.001,0.7,0.0001
92,0.526157,hinge,elasticnet,0.1,0.2,0.0001
26,0.524145,hinge,elasticnet,0.0001,0.2,0.0001
207,0.523139,log,elasticnet,10000,0.4,0.0001
137,0.523139,log,elasticnet,10,0.2,0.0001
61,0.523139,log,elasticnet,0.001,0.8,0.0001
42,0.522133,hinge,elasticnet,0.0001,1,0.0001


In [11]:
# svd
show_sgd(sgd_lemmas_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol,param_random_state
27,0.547284,log,elasticnet,0.0001,0.2,0.0001,42
4,0.545272,hinge,elasticnet,1e-05,0.2,0.0001,42
66,0.544266,hinge,elasticnet,0.01,0,0.0001,42
7,0.542254,log,elasticnet,1e-05,0.3,0.0001,42
50,0.541247,hinge,elasticnet,0.001,0.3,0.0001,42
46,0.539235,hinge,elasticnet,0.001,0.1,0.0001,42
12,0.539235,hinge,elasticnet,1e-05,0.6,0.0001,42
3,0.538229,log,elasticnet,1e-05,0.1,0.0001,42
26,0.537223,hinge,elasticnet,0.0001,0.2,0.0001,42
73,0.537223,log,elasticnet,0.01,0.3,0.0001,42


In [14]:
tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
X_tfidf_svd = sklearn.preprocessing.scale(sklearn.decomposition.PCA(0.99, copy=False).fit_transform(
    sklearn.preprocessing.scale(tfidf_transformer.fit_transform(np.load('lemmas/bow_lemmas.npy')).toarray(), copy=False)
              ), copy=False)
gc.collect()
X_tfidf_svd.shape

(1989, 1911)

In [38]:
tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
X_tfidf_svd = tfidf_transformer.fit_transform(np.load('lemmas/bow_lemmas.npy')).toarray()
X_tfidf_svd = sklearn.preprocessing.scale(sklearn.decomposition.TruncatedSVD(1550).fit_transform(X_tfidf_svd), copy=False)
gc.collect()

28515

In [19]:
svm_lemmas_tfidf_grid_cv = get_cv(sklearn.svm.SVC(), svm_params, X_tfidf_svd, y_all,
                          days_step=11)
gc.collect()

Fitting 91 folds for each of 132 candidates, totalling 12012 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 23.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 53.9min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 72.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 94.8min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 118.6min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 144.6min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 173.0min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 204.9min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 238.7min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 274.9min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 315.5min
[Parallel(n_jobs=-1)]: Done 12012 out of 12012 | el

66

In [22]:
# old naive lemmas
show_svm(svm_lemmas_tfidf_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_C,param_gamma
88,0.531136,1000,1e-05
67,0.527473,10,0.0001
101,0.527473,10000,0.001
112,0.527473,100000,0.001
78,0.520147,100,0.0001
77,0.520147,100,1e-05
57,0.520147,1,0.001
90,0.516484,1000,0.001
69,0.509158,10,0.01
110,0.505495,100000,1e-05


In [21]:
show_svm(svm_lemmas_tfidf_grid_cv.cv_results_, ['param_kernel'])

Unnamed: 0,mean_test_score,param_C,param_gamma,param_kernel
3,0.542125,0.01,,linear
99,0.534799,1000,1e-05,rbf
89,0.527473,100,0.0001,rbf
68,0.523810,1,0.001,rbf
110,0.509158,10000,1e-05,rbf
121,0.509158,100000,1e-05,rbf
4,0.505495,0.1,,linear
88,0.501832,100,1e-05,rbf
5,0.494505,1,,linear
6,0.494505,10,,linear


In [39]:
sgd_lemmas_tfidf_grid_cv = get_cv(sklearn.linear_model.SGDClassifier(), sgd_params, X_tfidf_svd, y_all)
gc.collect()

Fitting 331 folds for each of 242 candidates, totalling 80102 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 11234 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 12784 tasks      | elapsed: 11





[Parallel(n_jobs=-1)]: Done 22034 tasks      | elapsed: 47.6min
[Parallel(n_jobs=-1)]: Done 24184 tasks      | elapsed: 49.4min
[Parallel(n_jobs=-1)]: Done 26434 tasks      | elapsed: 51.4min






[Parallel(n_jobs=-1)]: Done 28784 tasks      | elapsed: 72.4min














[Parallel(n_jobs=-1)]: Done 31234 tasks      | elapsed: 100.2min
[Parallel(n_jobs=-1)]: Done 33784 tasks      | elapsed: 102.5min
[Parallel(n_jobs=-1)]: Done 36434 tasks      | elapsed: 105.4min
[Parallel(n_jobs=-1)]: Done 39184 tasks      | elapsed: 108.5min
[Parallel(n_jobs=-1)]: Done 42034 tasks      | elapsed: 110.9min
[Parallel(n_jobs=-1)]: Done 44984 tasks      | elapsed: 113.7min
[Parallel(n_jobs=-1)]: Done 48034 tasks      | elapsed: 116.0min
[Parallel(n_jobs=-1)]: Done 51184 tasks      | elapsed: 118.4min
[Parallel(n_jobs=-1)]: Done 54434 tasks      | elapsed: 120.9min
[Parallel(n_jobs=-1)]: Done 57784 tasks      | elapsed: 123.4min
[Parallel(n_jobs=-1)]: Done 61234 tasks      | elapsed: 126.0min
[Parallel(n_jobs=-1)]: Done 64784 tasks      | elapsed: 128.7min
[Parallel(n_jobs=-1)]: Done 68434 tasks      | elapsed: 131.5min
[Parallel(n_jobs=-1)]: Done 72184 tasks      | elapsed: 134.4min
[Parallel(n_jobs=-1)]: Done 76034 tasks      | elapsed: 137.3min
[Parallel(n_jobs=-1)]: Do

0

In [24]:
# old naive lemmas
show_sgd(sgd_lemmas_tfidf_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol
232,0.544814,hinge,elasticnet,100000,0.6,0.0001
2,0.543807,hinge,elasticnet,1e-05,0.1,0.0001
34,0.538771,hinge,elasticnet,0.0001,0.6,0.0001
61,0.536757,log,elasticnet,0.001,0.8,0.0001
155,0.536757,log,elasticnet,100,0,0.0001
36,0.535750,hinge,elasticnet,0.0001,0.7,0.0001
5,0.535750,log,elasticnet,1e-05,0.2,0.0001
40,0.534743,hinge,elasticnet,0.0001,0.9,0.0001
30,0.532729,hinge,elasticnet,0.0001,0.4,0.0001
45,0.532729,log,elasticnet,0.001,0,0.0001


In [40]:
show_sgd(sgd_lemmas_tfidf_grid_cv.cv_results_)

Unnamed: 0,mean_test_score,param_loss,param_penalty,param_alpha,param_l1_ratio,param_tol
131,0.538771,log,elasticnet,1,1,0.0001
145,0.535750,log,elasticnet,10,0.6,0.0001
178,0.534743,hinge,elasticnet,1000,0.1,0.0001
146,0.530715,hinge,elasticnet,10,0.7,0.0001
6,0.530715,hinge,elasticnet,1e-05,0.3,0.0001
45,0.528701,log,elasticnet,0.001,0,0.0001
176,0.527694,hinge,elasticnet,1000,0,0.0001
41,0.525680,log,elasticnet,0.0001,0.9,0.0001
188,0.525680,hinge,elasticnet,1000,0.6,0.0001
111,0.524673,log,elasticnet,1,0,0.0001
