In [1]:
import pandas as pd
import numpy as np
import twosigmafunc
from collections import defaultdict

In [2]:
import matplotlib.pyplot as plt
import seaborn
# Notebook-wide plotting defaults: figure size and nearest-neighbour image interpolation.
plt.rcParams.update({
    'figure.figsize': (14.0, 8.0),
    'image.interpolation': 'nearest',
})
# plt.rcParams['image.cmap'] = 'gray'

In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [4]:
# Load the training frame from the HDF5 file 'train.h5' (path is relative to the working directory).
train = pd.read_hdf('train.h5')

In [21]:
# Marker style codes; presumably consumed by plotting cells outside this view — TODO confirm.
# NOTE(review): '*' and ',' each appear twice, so only 11 of the 13 entries are distinct.
markers = [ 'o', 'v', 's','*', ',', '+', 'x', '2', '3', 'D', '|', '*', ',']

In [5]:
# Column bookkeeping.
#   excl           - identifier / time / target columns that are never model inputs
#   cols_origin    - every loaded column except the target
#   feature_origin - every loaded column except those in `excl`
#   feature_diff   - names of the first-difference columns derived from feature_origin
excl = ['id', 'timestamp', 'y']
cols_origin = [name for name in train.columns if name != 'y']
feature_origin = [name for name in train.columns if name not in excl]
feature_diff = [name + '_diff' for name in feature_origin]


# Add NaN features

In [6]:
# Project helper applied to the original feature columns. Judging by the message it prints, it adds
# NaN-related feature columns to `train` — TODO confirm against twosigmafunc.add_nans.
twosigmafunc.add_nans(train, feature_origin)

sucessfully add 108 nan features


# Add differences

In [7]:
# Time-series differencing.
# 1. Fill missing values with each column's median and keep the medians in `d_mean`
#    (despite the name, these are medians, not means).
train.fillna(train.median(), inplace=True)
d_mean = train.median()

# 2. Order rows by id, then timestamp, so that diff() compares consecutive rows of the same id.
train.sort_values(by=['id', 'timestamp'], inplace=True)

# 3. First differences of the id and of every original feature; the fill value recorded in
#    `d_mean` for each new column is 0.0. Columns are created in feature_origin order.
train['id_diff'] = train['id'].diff()
d_mean['id_diff'] = 0.0
for diff_name, source_name in zip(feature_diff, feature_origin):
    train[diff_name] = train[source_name].diff()
    d_mean[diff_name] = 0.0

# 4. A row whose id differs from the previous row (or the very first row, where id_diff is NaN)
#    has no valid predecessor, so its feature differences are zeroed.
train.loc[train['id_diff'].ne(0), feature_diff] = 0



In [8]:
# Extra engineered columns, created in this exact order so downstream column order is unchanged.
# y_past: the target of the previous row (rows are already ordered by id, timestamp).
train['y_past'] = train['y'].shift(1)
# Combinations of technical_20, technical_30 and technical_13.
train['tec20-30'] = train['technical_20'] - train['technical_30']
train['tec123'] = train['tec20-30'] + train['technical_13']
# First differences of the two combinations.
for base_name in ('tec123', 'tec20-30'):
    train[base_name + '_diff'] = train[base_name].diff()
# Rows where the id changes have no valid previous row: zero their lagged / differenced values.
train.loc[train['id_diff'].ne(0), ['tec123_diff', 'tec20-30_diff', 'y_past']] = 0

In [9]:
from sklearn.linear_model import LinearRegression, Ridge
# Baseline linear estimators: ordinary least squares (n_jobs=-1) and Ridge with default settings.
# NOTE(review): neither `lr` nor `ridge` is referenced again in the visible cells — confirm they are still needed.
lr = LinearRegression(n_jobs=-1)
ridge = Ridge()

In [10]:
# Target clipping bounds and boolean row masks relative to them.
low_y_cut, high_y_cut = -0.075, 0.075
y_is_below_cut = train['y'] < low_y_cut
y_is_above_cut = train['y'] > high_y_cut
# Within the cut = neither strictly above nor strictly below (bounds themselves are included).
y_is_within_cut = ~(y_is_above_cut | y_is_below_cut)

In [11]:
# Two Ridge regressors fitted only on rows whose target lies within [low_y_cut, high_y_cut]:
#   model_1 - single feature technical_20_diff (passed as an (n, 1) array)
#   model_2 - features tec20-30 and technical_20_diff (passed as a DataFrame)
model_1, model_2 = Ridge(), Ridge()
y_within_cut = train.loc[y_is_within_cut, 'y']
model_1.fit(train.loc[y_is_within_cut, 'technical_20_diff'].values.reshape(-1, 1), y_within_cut)
model_2.fit(train.loc[y_is_within_cut, ['tec20-30', 'technical_20_diff']], y_within_cut)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

# Predicting Rewards 
Although the reward could be used as a flag indicating which Ridge model to use, in the submitted model we cannot obtain the reward before we hand in our predictions, so let's predict the reward instead.

In [43]:
# Per-timestamp reward of each Ridge model.
#
# For every timestamp, both models predict on that timestamp's rows; predictions and the true
# target are clipped to [low_y_cut, high_y_cut] and scored with twosigmafunc.R_score.
# reward1[i] / reward2[i] belong to the i-th timestamp in ascending order.
#
# Iterating over groupby('timestamp') slices the frame once per timestamp instead of rebuilding
# the same boolean mask over the whole frame three times per iteration. It also yields exactly
# one entry per timestamp that is present in `train`, so the lists always have the same length
# and order as any other groupby('timestamp') aggregate, even if some timestamp values are absent.
reward1 = []
reward2 = []
for timestamp, rows in train.groupby('timestamp'):
    # model_1 was fitted on an (n, 1) array, model_2 on a two-column DataFrame.
    x_1 = rows['technical_20_diff'].values.reshape(-1, 1)
    x_2 = rows[['tec20-30', 'technical_20_diff']]
    y_1 = model_1.predict(x_1).clip(low_y_cut, high_y_cut)
    y_2 = model_2.predict(x_2).clip(low_y_cut, high_y_cut)
    y_t = rows['y'].clip(low_y_cut, high_y_cut)
    reward1.append(twosigmafunc.R_score(y_1, y_t))
    reward2.append(twosigmafunc.R_score(y_2, y_t))
    # Lightweight progress indicator.
    if timestamp % 100 == 0:
        print(timestamp)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800


In [13]:
# Group all rows by timestamp; reused below for the per-timestamp aggregates.
time_gp = train.groupby('timestamp')

In [14]:
# Every current column of `train` (original and engineered) except id / timestamp / y,
# kept in the frame's column order.
cols = train.columns[~train.columns.isin(excl)].tolist()

In [15]:
# One row per timestamp: the mean of every column in `cols` across that timestamp's rows.
gp = time_gp.mean().loc[:, cols]
gp.head()

Unnamed: 0_level_0,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,fundamental_3,fundamental_5,...,technical_40_diff,technical_41_diff,technical_42_diff,technical_43_diff,technical_44_diff,y_past,tec20-30,tec123,tec123_diff,tec20-30_diff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.036649,0.097312,-0.06161,-0.081894,0.063217,0.020847,0.053706,-0.187253,0.044835,0.26412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000386,0.001536,0.0,0.0
1,0.035497,0.202283,0.579504,-0.033354,0.063989,0.021344,0.053442,-0.22403,0.043261,0.262934,...,0.003051,0.0,0.0,0.00088,0.0,-0.00187,0.000472,0.001325,-0.000211,8.6e-05
2,0.035163,0.203478,0.586814,-0.022597,0.064276,0.021575,0.053336,-0.2385,0.043072,0.26233,...,0.000952,0.0,0.0,0.0,0.0,0.000501,0.000511,0.001283,-4.2e-05,3.9e-05
3,0.034876,0.20451,0.593121,-0.013317,0.064524,0.021774,0.053246,-0.250984,0.042909,0.26181,...,0.000821,0.0,0.0,0.0,0.0,-0.005787,-3.2e-05,0.000658,-0.000625,-0.000543
4,0.034412,0.206172,0.60328,0.001631,0.064923,0.022094,0.053099,-0.271093,0.042646,0.260971,...,0.001323,0.0,0.0,0.0,0.0,0.001633,-0.000307,0.00066,2e-06,-0.000275


In [16]:
# Copy the timestamp index into a regular column.
gp['timestamp'] = gp.index

In [47]:
# Binary target per timestamp: True where model_2's reward is strictly greater than model_1's.
# Relies on reward1 / reward2 having one entry per row of `gp`, in the same order.
gp['bi'] = np.array(reward1) < np.array(reward2)

In [48]:
# Sanity check: list the distinct label values.
gp.bi.unique()

array([True, False], dtype=object)

In [18]:
# Per-timestamp dispersion features for the two inputs of the Ridge models.
# The column is selected *before* aggregating, so each statistic is computed for that one
# column only rather than for every column of `train`; the resulting values are the same.
# Columns are added in the same order as before so the downstream feature order is unchanged.
tec20_diff_by_time = time_gp['technical_20_diff']
tec20_30_by_time = time_gp['tec20-30']

gp['tec20_std'] = tec20_diff_by_time.std()
gp['tec20_30_std'] = tec20_30_by_time.std()
gp['tec20_max'] = tec20_diff_by_time.max()
gp['tec20_min'] = tec20_diff_by_time.min()
gp['tec20_30_max'] = tec20_30_by_time.max()
gp['tec20_30_min'] = tec20_30_by_time.min()

In [19]:
# Classifier inputs: every column of `gp` except the timestamp copy and the label itself.
features_for_bi = [name for name in gp.columns if name not in ('timestamp', 'bi')]


In [42]:
# Label per timestamp: True where model_2's reward is strictly greater than model_1's.
# This cell previously assigned the opposite comparison (reward1 > reward2) through attribute
# access, so a top-to-bottom run silently inverted the label defined earlier in the notebook.
# The execution counts show that the `<` definition was the one executed last, so it is used here,
# and the column is set with item assignment so it is always created/overwritten as a column.
gp['bi'] = np.array(reward1) < np.array(reward2)

1812

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# Base random forest for the grid search: 256 trees, all cores, fixed seed for reproducibility.
rfc = RandomForestClassifier(
    n_estimators=256,
    random_state=11,
    n_jobs=-1,
)

In [21]:
# Preview the per-timestamp frame, now including the label and the dispersion columns.
gp.head()

Unnamed: 0_level_0,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,fundamental_3,fundamental_5,...,tec123_diff,tec20-30_diff,timestamp,bi,tec20_std,tec20_30_std,tec20_max,tec20_min,tec20_30_max,tec20_30_min
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.036649,0.097312,-0.06161,-0.081894,0.063217,0.020847,0.053706,-0.187253,0.044835,0.26412,...,0.0,0.0,0,False,0.0,0.007681,0.0,0.0,0.033168,-0.036647
1,0.035497,0.202283,0.579504,-0.033354,0.063989,0.021344,0.053442,-0.22403,0.043261,0.262934,...,-0.000211,8.6e-05,1,False,0.003028,0.006507,0.013369,-0.012672,0.02658,-0.029471
2,0.035163,0.203478,0.586814,-0.022597,0.064276,0.021575,0.053336,-0.2385,0.043072,0.26233,...,-4.2e-05,3.9e-05,2,False,0.002239,0.006825,0.016507,-0.009636,0.022925,-0.037371
3,0.034876,0.20451,0.593121,-0.013317,0.064524,0.021774,0.053246,-0.250984,0.042909,0.26181,...,-0.000625,-0.000543,3,False,0.001778,0.007721,0.010776,-0.00812,0.019447,-0.037181
4,0.034412,0.206172,0.60328,0.001631,0.064923,0.022094,0.053099,-0.271093,0.042646,0.260971,...,2e-06,-0.000275,4,False,0.002525,0.008156,0.009547,-0.013342,0.016363,-0.039013


In [53]:
# Split the per-timestamp frame into train / test parts using the project helper.
# NOTE(review): the split rule (ratio, chronological vs. random) lives in twosigmafunc.split_data
# and is not visible here — confirm it does not leak future timestamps into training.
g_train, y_train, g_test, y_test = twosigmafunc.split_data(gp, features_for_bi, 'bi')

In [54]:
# Grid search over tree depth and the number of features considered per split,
# scored with 5-fold stratified cross-validation on the training part.
parameter_grid = dict(
    max_depth=[3, 5, None],
    max_features=['sqrt', 0.5, None],
)

cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=parameter_grid,
    cv=cross_validation,
    n_jobs=-1,
)

# Fit on all candidate features; the fitted search object is the cell's displayed value.
grid_search.fit(g_train[features_for_bi], y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=256, n_jobs=-1, oob_score=False, random_state=11,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': ['sqrt', 0.5, None], 'max_depth': [3, 5, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [55]:
# Hold-out accuracy of the refit best estimator using all candidate features.
holdout_pred = grid_search.predict(g_test[features_for_bi])
np.mean(y_test == holdout_pred)

0.4696802646085998

In [56]:
# Feature importances of the best estimator, most important first.
imp = pd.DataFrame({
    'feature': features_for_bi,
    'weight': grid_search.best_estimator_.feature_importances_,
})
imp = imp.sort_values(by='weight', ascending=False)

In [57]:
# Display the importance table (sorted by weight, descending).
imp

Unnamed: 0,feature,weight
312,technical_31_diff,0.021300
314,technical_33_diff,0.020260
309,technical_28_diff,0.019490
303,technical_20_diff,0.018734
306,technical_24_diff,0.018489
333,tec20_max,0.017731
287,technical_1_diff,0.015535
322,technical_41_diff,0.015182
334,tec20_min,0.015041
307,technical_25_diff,0.014750


In [58]:
# Hyper-parameters selected by the grid search fitted on all candidate features.
grid_search.best_params_

{'max_depth': None, 'max_features': None}

In [64]:
np.mean(y_train == rfc.predict(g_train[imp.feature[:30]]))

0.8741721854304636

In [61]:
rfc = RandomForestClassifier(n_estimators=256, max_depth=5, max_features='sqrt', random_state=11)
rfc.fit(g_train[imp.feature[:30]], y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=256, n_jobs=1, oob_score=False, random_state=11,
            verbose=0, warm_start=False)

In [65]:
# Hold-out accuracy of the forest restricted to the 30 most important features.
top30_features = imp.feature[:30]
np.mean(y_test == rfc.predict(g_test[top30_features]))

0.53693495038588757

In [67]:
# Keep the features whose importance is strictly above the mean importance.
above_mean_weight = imp['weight'].gt(imp['weight'].mean())
features = imp.loc[above_mean_weight, 'feature']

In [68]:
# Number of features that passed the above-mean-importance filter.
features.shape

(115,)

In [69]:
# Re-run the same grid search on the reduced feature set.
# This refits the existing `grid_search` object, so results read from it after this cell
# refer to the reduced-feature search, not the earlier all-feature search.
grid_search.fit(g_train[features], y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=256, n_jobs=-1, oob_score=False, random_state=11,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': ['sqrt', 0.5, None], 'max_depth': [3, 5, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [71]:
# Hold-out accuracy of the grid search refit on the above-mean-importance features.
selected_test_pred = grid_search.predict(g_test[features])
np.mean(y_test == selected_test_pred)

0.52701212789415652

In [72]:
# Training accuracy of the same model, for comparison with the hold-out figure.
selected_train_pred = grid_search.predict(g_train[features])
np.mean(y_train == selected_train_pred)

0.7516556291390728

In [73]:
# Share of timestamps labelled True; the base rate to compare the accuracies above against.
gp.bi.mean()

0.4688361831218974