In [121]:
import numpy as np
import pandas as pd
import twosigmafunc
import json

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
train, origin_features = twosigmafunc.preprocess()

In [5]:
# Load the named feature lists used by the imputation cells below.
# A missing or unreadable file should stop the notebook, so the exception is
# simply allowed to propagate (catching it only to re-raise added nothing).
with open('features.json', 'r') as js:
    features = json.load(js)

In [6]:
# NOTE(review): this repeats the preprocess() call made in an earlier cell and
# nothing in between modifies `train` or `origin_features`, so the second call
# looks redundant — confirm before removing it.
train, origin_features = twosigmafunc.preprocess()

In [7]:
twosigmafunc.add_nans(train, origin_features)

sucessfully add 108 nan features


In [10]:
# Fill the gaps in each listed column with the median of the rows that share
# the same technical_22 value.
for feature_name in features['t22_features']:
    group_median = train.groupby('technical_22')[feature_name].transform("median")
    train[feature_name] = train[feature_name].fillna(group_median)

In [12]:
# Fill the gaps in each listed column with the median of the rows that share
# the same technical_34 value.
for feature_name in features['t34_features']:
    group_median = train.groupby('technical_34')[feature_name].transform("median")
    train[feature_name] = train[feature_name].fillna(group_median)

In [14]:
twosigmafunc.add_diffs(train, origin_features)

sucessfully add 108 diff features


# Add dummy features 

In [15]:
# One-hot encode technical_22 and technical_34, then append both sets of
# indicator columns to `train` (t22_* first, t34_* second).
t22_dummy = pd.get_dummies(train['technical_22'], prefix='t22')
t34_dummy = pd.get_dummies(train['technical_34'], prefix='t34')
train = pd.concat([train, t22_dummy, t34_dummy], axis=1)

In [16]:
train.drop(['technical_22', 'technical_34'], axis=1, inplace=True)

In [17]:
train.head()

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,...,technical_41_diff,technical_42_diff,technical_43_diff,technical_44_diff,t22_-0.5,t22_0.0,t22_0.5,t34_-0.5,t34_0.0,t34_0.5
131062,0,167,0.002392,-0.024803,0.019819,-0.005069,0.016409,-0.056218,0.003928,0.062897,...,0.0,0.0,0.0,0.0,1,0,0,0,1,0
131895,0,168,0.002392,-0.024803,0.019819,-0.005069,0.016409,-0.056218,0.003928,0.062897,...,0.0,0.0,0.0,0.0,1,0,0,0,1,0
132728,0,169,0.002392,-0.024803,0.019819,-0.005069,0.016409,-0.056218,0.003928,0.062897,...,0.0,0.0,0.0,0.0,1,0,0,0,1,0
133561,0,170,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,0.003928,0.212425,...,0.0,0.0,0.027205,0.0,1,0,0,0,1,0
134393,0,171,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,0.003928,0.212425,...,0.0,0.0,0.0,0.0,1,0,0,0,1,0


# Special features 

In [20]:
# Hand-built combinations of technical_20, technical_30 and technical_13.
train['tec20-30'] = train.technical_20 - train.technical_30
train['tec123'] = train['tec20-30'] + train.technical_13
# Lag features: the previous row's target and the previous row's tec123.
# NOTE(review): shift() follows row order, so this assumes the frame is sorted
# by id and then timestamp — confirm.
train['y_past'] = train.y.shift()
train['tec123_past'] = train.tec123.shift()
# Zero the lags wherever id_diff is non-zero — presumably the first row of each
# id, where the shifted value would belong to a different id; verify how
# id_diff is built.
train.loc[train.id_diff != 0, ['y_past', 'tec123_past']] = 0
# Replace every NaN still left anywhere in the frame with 0, in place.
train.fillna(0, inplace=True)

In [22]:
etr_features = twosigmafunc.origin_features(train)

In [23]:
etr_features.remove('id_diff')

In [25]:
len(etr_features)

333

In [26]:
X_train, y_train, X_test, y_test = twosigmafunc.split_data(train, etr_features)

In [28]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 806298 entries, 131062 to 806297
Columns: 333 entries, derived_0 to tec123_past
dtypes: float32(218), int64(109), uint8(6)
memory usage: 1.3 GB


In [29]:
del train

# Train ETR

In [30]:
from sklearn.ensemble import ExtraTreesRegressor
etr = ExtraTreesRegressor(n_jobs=-1,n_estimators=80, max_depth=6, max_features='sqrt', min_samples_leaf=30,
                          random_state=17, verbose=0)

In [31]:
etr.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=30,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=80, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [34]:
pred = etr.predict(X_train)
twosigmafunc.R_score(pred, y_train)

0.042337441721562362

In [35]:
pred = etr.predict(X_test)
twosigmafunc.R_score(pred, y_test)

0.016598126624915416

In [37]:
imp = pd.DataFrame({'feature':etr_features, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [41]:
features1 = list(imp.loc[imp.important > 0].feature)

In [42]:
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=30,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=80, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [43]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.044104738713101548

In [44]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.017932906038324991

In [45]:
imp = pd.DataFrame({'feature':features1, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [47]:
features1 = list(imp.loc[imp.important > 0].feature)
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=30,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=80, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [49]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.043356456778671178

In [50]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.016418749782478134

In [51]:
imp = pd.DataFrame({'feature':features1, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [53]:
features1 = list(imp.loc[imp.important > 0].feature)
etr = ExtraTreesRegressor(n_jobs=-1,n_estimators=100, max_depth=10, max_features='sqrt', min_samples_leaf=20,
                          random_state=17, verbose=0)
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=10,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=20,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [54]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.079430841062204285

In [55]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.0209048938469957

The deeper trees overfit (train R ≈ 0.079 vs. test R ≈ 0.021), so retrain with a shallower `max_depth`.

In [56]:
etr = ExtraTreesRegressor(n_jobs=-1,n_estimators=100, max_depth=6, max_features='sqrt', min_samples_leaf=20,
                          random_state=17, verbose=0)
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=20,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [57]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.044592482252342129

In [58]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.017250228288929519

In [59]:
imp = pd.DataFrame({'feature':features1, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [63]:
features1 = list(imp.loc[imp.important > 0.0005].feature)
len(features1)

237

In [64]:
etr = ExtraTreesRegressor(n_jobs=-1,n_estimators=100, max_depth=5, max_features='sqrt', min_samples_leaf=30,
                          random_state=17, verbose=0)
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=5,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=30,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [65]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.037582525809880911

In [66]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.017641225492253196

In [67]:
imp = pd.DataFrame({'feature':features1, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [72]:
features1 = list(imp.loc[imp.important > 0.0005].feature)
len(features1)

203

In [73]:
etr = ExtraTreesRegressor(n_jobs=-1,n_estimators=100, max_depth=5, max_features='sqrt', min_samples_leaf=10,
                          random_state=17, verbose=0)
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=5,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=10,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [74]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.039001450701881948

In [75]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.017531833530225232

In [76]:
imp = pd.DataFrame({'feature':features1, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [79]:
features1 = list(imp.loc[imp.important > 0.001].feature)
len(features1)

174

In [80]:
etr = ExtraTreesRegressor(n_jobs=-1,n_estimators=100, max_depth=5, max_features=0.5, min_samples_leaf=10,
                          random_state=17, verbose=0)
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=5,
          max_features=0.5, max_leaf_nodes=None, min_impurity_split=1e-07,
          min_samples_leaf=10, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
          oob_score=False, random_state=17, verbose=0, warm_start=False)

In [81]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.057029250729028459

In [82]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.025862180690397269

In [83]:
imp = pd.DataFrame({'feature':features1, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [87]:
features1 = list(imp.loc[imp.important > 0.001].feature)
len(features1)

119

In [88]:
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=5,
          max_features=0.5, max_leaf_nodes=None, min_impurity_split=1e-07,
          min_samples_leaf=10, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
          oob_score=False, random_state=17, verbose=0, warm_start=False)

In [89]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.057149051908043001

In [90]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.026110956456432254

In [91]:
imp = pd.DataFrame({'feature':features1, 'important':etr.feature_importances_}).sort_values('important', ascending=False)

In [92]:
imp

Unnamed: 0,feature,important
0,y_past,0.056752
2,tec123,0.049693
1,tec20-30,0.045011
3,technical_30,0.044728
4,technical_11,0.043691
6,technical_43,0.041693
7,fundamental_11,0.035513
5,tec123_past,0.034407
8,technical_20,0.031330
9,technical_11_diff,0.025702


In [94]:
for col in list(t22_dummy.columns) + list(t34_dummy.columns):
    print(imp[imp.feature==col])

Empty DataFrame
Columns: [feature, important]
Index: []
Empty DataFrame
Columns: [feature, important]
Index: []
    feature  important
47  t22_0.5   0.009822
     feature  important
51  t34_-0.5   0.007906
Empty DataFrame
Columns: [feature, important]
Index: []
     feature  important
107  t34_0.5   0.001431


In [100]:
features1 = list(imp.loc[imp.important > 0.002].feature)
len(features1)

90

In [101]:
etr.fit(X_train[features1], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=5,
          max_features=0.5, max_leaf_nodes=None, min_impurity_split=1e-07,
          min_samples_leaf=10, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
          oob_score=False, random_state=17, verbose=0, warm_start=False)

In [102]:
pred = etr.predict(X_train[features1])
twosigmafunc.R_score(pred, y_train)

0.057408487244495196

In [103]:
pred = etr.predict(X_test[features1])
twosigmafunc.R_score(pred, y_test)

0.026549861209870653

In [108]:
# Sweep min_impurity_split with every other hyper-parameter held fixed.
min_impurities = [1e-7, 1e-6, 1e-5]
for min_imp in min_impurities:
    print("\nmin_imp:", min_imp)
    etr = ExtraTreesRegressor(
        n_estimators=100,
        max_depth=5,
        max_features=0.5,
        min_samples_leaf=10,
        min_impurity_split=min_imp,
        random_state=17,
        n_jobs=-1,
        verbose=0,
    )
    etr.fit(X_train[features1], y_train)
    # Score the training rows first, then X_test.
    for part, X_part, y_part in (("train", X_train, y_train), ("test", X_test, y_test)):
        pred = etr.predict(X_part[features1])
        print(part + " r_score:", twosigmafunc.R_score(pred, y_part))


min_imp: 1e-07
train r_score: 0.0574084872445
test r_score: 0.0265498612099

min_imp: 1e-06
train r_score: 0.0574084872445
test r_score: 0.0265498612099

min_imp: 1e-05
train r_score: 0.0574084872445
test r_score: 0.0265498612099


In [110]:
# Sweep max_depth with every other hyper-parameter held fixed.
depths = [3, 6, 10]
for tree_depth in depths:
    print("\nmax_depth:", tree_depth)
    etr = ExtraTreesRegressor(
        n_estimators=100,
        max_depth=tree_depth,
        max_features=0.5,
        min_samples_leaf=10,
        min_impurity_split=1e-6,
        random_state=17,
        n_jobs=-1,
        verbose=0,
    )
    etr.fit(X_train[features1], y_train)
    # Score the training rows first, then X_test.
    for part, X_part, y_part in (("train", X_train, y_train), ("test", X_test, y_test)):
        pred = etr.predict(X_part[features1])
        print(part + " r_score:", twosigmafunc.R_score(pred, y_part))


max_depth: 3
train r_score: 0.0350788669252
test r_score: 0.0204991760254

max_depth: 6
train r_score: 0.0691055516104
test r_score: 0.029090089532

max_depth: 10
train r_score: 0.126810120496
test r_score: 0.0335086324267


In [112]:
# Sweep min_samples_leaf with every other hyper-parameter held fixed.
samples = [10, 30, 50]
for sample in samples:
    print("\nmin_samples:", sample)
    etr = ExtraTreesRegressor(
        n_estimators=100,
        max_depth=6,
        max_features=0.5,
        min_samples_leaf=sample,
        min_impurity_split=1e-6,
        random_state=17,
        n_jobs=-1,
        verbose=0,
    )
    etr.fit(X_train[features1], y_train)
    # Score the training rows first, then X_test.
    for part, X_part, y_part in (("train", X_train, y_train), ("test", X_test, y_test)):
        pred = etr.predict(X_part[features1])
        print(part + " r_score:", twosigmafunc.R_score(pred, y_part))


min_samples: 10
train r_score: 0.0691055516104
test r_score: 0.029090089532

min_samples: 30
train r_score: 0.0678485822473
test r_score: 0.0303077151944

min_samples: 50
train r_score: 0.0650103716902
test r_score: 0.0304655350213


In [115]:
# Sweep max_features with every other hyper-parameter held fixed.
#
# Two hidden-state problems are fixed here:
#   * the grid used to be bound to `features`, overwriting the dict loaded
#     from features.json; it now has its own name;
#   * min_samples_leaf used to read `sample`, a loop variable left over from
#     the min_samples_leaf sweep (its last value there was 50). The value is
#     now written out explicitly so this cell no longer depends on that cell.
max_features_grid = [0.6, 0.75, 1.0]
for max_feat in max_features_grid:
    print("\nmax_features:", max_feat)
    etr = ExtraTreesRegressor(n_jobs=-1, n_estimators=100, max_depth=6, max_features=max_feat,
                              min_samples_leaf=50, random_state=17, min_impurity_split=1e-6,
                              verbose=0)
    etr.fit(X_train[features1], y_train)
    pred = etr.predict(X_train[features1])
    print("train r_score:", twosigmafunc.R_score(pred, y_train))
    pred = etr.predict(X_test[features1])
    print("test r_score:", twosigmafunc.R_score(pred, y_test))


max_features: 0.6
train r_score: 0.0670824348793
test r_score: 0.0297397428562

max_features: 0.75
train r_score: 0.0682134865686
test r_score: 0.0310089972195

max_features: 1.0
train r_score: 0.0695938583222
test r_score: 0.0311505577685


In [119]:
# Exhaustive search over max_depth x min_samples_leaf x max_features, keeping
# the combination with the highest R score on X_test.
depths = [5, 7, None]
samples = [55, 70]
# Named separately so the grid does not overwrite the `features` dict loaded
# from features.json.
max_features_grid = [0.8]
best_param = {}
best_score = -99  # sentinel below any score we expect to see
for depth in depths:
    for sample in samples:
        for feature in max_features_grid:
            print('max_depth:', depth, 'min_samples_leaf:', sample, 'max_features:', feature)
            etr = ExtraTreesRegressor(n_jobs=-1, n_estimators=100, max_depth=depth,
                                      max_features=feature, min_samples_leaf=sample,
                                      random_state=17, min_impurity_split=1e-6, verbose=0)
            etr.fit(X_train[features1], y_train)
            pred = etr.predict(X_train[features1])
            print("train r_score:", twosigmafunc.R_score(pred, y_train))
            pred = etr.predict(X_test[features1])
            r = twosigmafunc.R_score(pred, y_test)
            print("test r_score:", r)
            if r > best_score:
                # Record the new best score too; without this update every
                # combination beat the sentinel and the last one always won.
                best_score = r
                best_param = {'samples':sample, 'depth':depth, 'features':feature}
print(best_param)

max_depth: 5 min_samples_leaf: 55 max_features: 0.8
train r_score: 0.0571148335089
test r_score: 0.0284461617142
max_depth: 5 min_samples_leaf: 70 max_features: 0.8
train r_score: 0.0566318710383
test r_score: 0.0284634151628
max_depth: 7 min_samples_leaf: 55 max_features: 0.8
train r_score: 0.0790944866718
test r_score: 0.0334907548645
max_depth: 7 min_samples_leaf: 70 max_features: 0.8
train r_score: 0.0774035865749
test r_score: 0.0328106626575
max_depth: None min_samples_leaf: 55 max_features: 0.8


KeyboardInterrupt: 

In [120]:
features1

['y_past',
 'tec123',
 'tec20-30',
 'technical_30',
 'technical_11',
 'technical_43',
 'fundamental_11',
 'tec123_past',
 'technical_20',
 'technical_11_diff',
 'technical_2',
 'technical_43_diff',
 'technical_21_diff',
 'technical_6_diff',
 'technical_2_diff',
 'technical_30_diff',
 'technical_17',
 'technical_6',
 'technical_44_nan',
 'technical_7',
 'fundamental_8',
 'technical_21',
 'fundamental_25_nan',
 'technical_0_nan',
 'fundamental_27_nan',
 'technical_24_nan',
 'fundamental_53',
 'technical_40',
 't22_0.5',
 'technical_9_nan',
 'technical_14',
 'technical_17_diff',
 'technical_20_diff',
 'technical_19',
 'technical_28_nan',
 'technical_37_nan',
 'technical_12_nan',
 'technical_32_nan',
 't34_-0.5',
 'technical_25_nan',
 'fundamental_21',
 'technical_14_diff',
 'fundamental_33_nan',
 'fundamental_18',
 'technical_38_nan',
 'technical_35',
 'derived_1_nan',
 'fundamental_33',
 'fundamental_48',
 'technical_31_nan',
 'technical_19_diff',
 'fundamental_50',
 'technical_27',
 'nu

# Hard-coding: serialize the fitted model into a compressed byte string

In [122]:
import zlib
import pickle
def zdumps(obj):
    """Pickle ``obj`` with the highest protocol and zlib-compress it at level 9."""
    pickled = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
    return zlib.compress(pickled, 9)

def zloads(zstr):
    """Reverse ``zdumps``: zlib-decompress ``zstr`` and unpickle the result.

    Unpickling can execute arbitrary code, so only feed this bytes that you
    produced yourself.
    """
    raw = zlib.decompress(zstr)
    return pickle.loads(raw)

In [139]:
# Final model: refit on the training and test rows together.
# The fit inputs are built right here from X_train/X_test/y_train/y_test so
# the cell runs top-to-bottom on a fresh kernel; it previously read `train`
# and `target`, which are only defined in cells further down (and `train`
# was deleted before the first fit).
X_full = pd.concat([X_train[features1], X_test[features1]], axis=0)
y_full = pd.concat([y_train, y_test], axis=0)
etr = ExtraTreesRegressor(n_jobs=-1, n_estimators=128, max_depth=5, max_features=0.8,
                          min_samples_leaf=50, random_state=17, verbose=0)
etr.fit(X_full, y_full)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=5,
          max_features=0.8, max_leaf_nodes=None, min_impurity_split=1e-07,
          min_samples_leaf=50, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
          oob_score=False, random_state=17, verbose=0, warm_start=False)

In [123]:
train = pd.concat([X_train, X_test], axis=0)

In [124]:
target = pd.concat([y_train, y_test], axis=0)

In [126]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1710756 entries, 131062 to 1710755
Columns: 333 entries, derived_0 to tec123_past
dtypes: float32(218), int64(109), uint8(6)
memory usage: 2.8 GB


In [140]:
etr_hard = zdumps(etr)

In [135]:
# Persist the compressed, pickled model bytes to disk.
# Any I/O failure should surface as-is, so the exception is left to propagate
# (catching it only to re-raise added nothing).
with open('etr_hard.hard', 'wb') as hard_f:
    hard_f.write(etr_hard)

In [141]:
etr_hard

b'x\xda\xdc\xbdy<\x95\xdd\xf77\xae\tE\xa5Y\xa8P!\x19#3\xeb\x08!2\xcf\x999\x1c\xf3<$D\x08%ITJR\xa4AD\t\x95lE\xa5\xa4"\xa59\x8a\xd2$)M\xa4G\xf7\xc7\xd5\xd9\xc7u\xdf\xcf\xf5\xb9\x7f\xcf\xeb\xf7\xc7\xf7{\xfe\xe8:b]{Z{\xed\xb5\xde\xfb\xbd\xd7\x8e\x9b\x98\xcd\x984\x8e\xed\xf7\'m^\x88\x8f/\xdd%\xd8_\x8a\xee\x1fB\xf7s\xf5\xa5Ky\x04\x04\xd3CB\xb3\xd2f\xe9D\x86\x06\xbbX\x04\xd3\xe9!ft\xcf\x91\xff\x0b\t\x08\xce\xda\x99\xb5,&+3+&K,\x8d\xd7\xcf\xcb\xdf)\x82\xee\xe5\xc9\x08u\xf2\x08vq\x0b\xf5\n\xf0w\x1ay\x97G\x96.\xdb\xe8\'\x8d\xdd\xdf\xc9;\xc05$k\xcd\xaf\x91O\x1a\xb7\x9fK\xa4\x93\x07\xdd%4l\xe4mY\xba\xf0:\xe7\xf7go\xda4W\x97\x10\xba\xd3H\xa1^~.\xa1#\x85\xa4\xcd *\x15:R\xfa_\xffd\xa5\xcd\xfcS\x9d\xbf\xad\xcd\x14\x7f\'7_\x97\x90\x10z\x88S\x96\x11\x83\x9dY\x05\xce\x90@_\xaf\xd0P\xfa\xc8k\xd9\x83]\xfc\xdd\x03\xfc\xb2\x18\x9ci\x13]\xc2B\x03F\n\xfa\xdd\x86\x10\x17\xbf@\xdf\x11\xb9\xbf\xfe0\xcb`|\x1a\xf7_o\x1am[\x96Q\x1a\'\xf3\xcdi\xd3~\xb7\xe1w+\x9d\xfc\x03\xdcGZa\xf4\xbb\xe4\x80\xb0\xd0\xc0\xb0\xd0\xbf~