In [3]:
import pandas as pd
import numpy as np
import twosigmafunc

In [4]:
# Load the DataFrame stored in 'train.h5' (path is relative to the working directory).
train = pd.read_hdf('train.h5')

In [5]:
# Columns that get a boolean `<name>_nan` missing-value indicator in the
# "add nan tags" cell. The indicator columns are created in this order.
null_labels = [
 'technical_21',
 'technical_19',
 'technical_27',
 'technical_36',
 'technical_35',
 'technical_17',
 'technical_43',
 'technical_13',
 'fundamental_33',
 'technical_14',
 'technical_33',
 'fundamental_18',
 'fundamental_48',
 'fundamental_59',
 'technical_9',
 'technical_16',
 'technical_42',
 'technical_18',
 'fundamental_42',
 'fundamental_0',
 'fundamental_7',
 'fundamental_41',
 'technical_41',
 'fundamental_21',
 'fundamental_19',
 'technical_29',
 'technical_24',
 'derived_0',
 'derived_1',
 'fundamental_17',
 'technical_3',
 'fundamental_20',
 'fundamental_32',
 'fundamental_62',
 'fundamental_25',
 'technical_1',
 'fundamental_58',
 'derived_3',
 'technical_5',
 'fundamental_52',
 'technical_10',
 'technical_31',
 'technical_25',
 'technical_44',
 'technical_28',
 'fundamental_40',
 'fundamental_27',
 'fundamental_29',
 'fundamental_43',
 'fundamental_15',
 'fundamental_30',
 'fundamental_60',
 'fundamental_16',
 'fundamental_50',
 'fundamental_44',
 'fundamental_37',
 'fundamental_14',
 'fundamental_23',
 'fundamental_55',
 'fundamental_8',
 'fundamental_63',
 'fundamental_39',
 'fundamental_54',
 'derived_2',
 'derived_4',
 'fundamental_35',
 'fundamental_34',
 'fundamental_47',
 'fundamental_51',
 'fundamental_31',
 'fundamental_49',
 'fundamental_22',
 'fundamental_9',
 'fundamental_24',
 'fundamental_57',
 'fundamental_28',
 'fundamental_61',
 'fundamental_1',
 'fundamental_6',
 'fundamental_38',
 'fundamental_5']

In [6]:
# Columns that are never used as model inputs.
excl = ['id', 'timestamp', 'y']
# Every column of `train` except the target.
cols_origin = [name for name in train.columns if name != 'y']
# Original feature columns: everything outside `excl`, in frame order.
feature_origin = list(filter(lambda name: name not in excl, train.columns))
# Name of the first-difference companion column for each original feature.
feature_diff = ['{}_diff'.format(name) for name in feature_origin]


# Add diffs (row-to-row differences of each feature within an id)

In [7]:
# add time series diff
# Per-column medians of the original columns; used later as fill values for NaNs.
d_mean = train.median()
# Order rows by instrument, then by time, so diff() compares consecutive timestamps.
train.sort_values(by=['id', 'timestamp'], inplace=True)
train['id_diff'] = train['id'].diff()
d_mean['id_diff'] = 0.0
for src_col, diff_col in zip(feature_origin, feature_diff):
    train[diff_col] = train[src_col].diff()
    # Missing diffs are filled with 0.0 (no change) rather than a median.
    d_mean[diff_col] = 0.0

# Rows where the id changed (including the very first row, whose id_diff is NaN)
# would otherwise hold a difference taken across two different ids.
id_changed = train['id_diff'] != 0
train.loc[id_changed, feature_diff] = 0


In [6]:
train.head()

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,...,technical_35_diff,technical_36_diff,technical_37_diff,technical_38_diff,technical_39_diff,technical_40_diff,technical_41_diff,technical_42_diff,technical_43_diff,technical_44_diff
131062,0,167,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131895,0,168,,,,,,,,,...,,,,,,,,,,
132728,0,169,,,,,,,,,...,,,,,,,,,,
133561,0,170,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,,0.212425,...,,,,,,,,,,
134393,0,171,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,,0.212425,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,


# add `nan` tags 

In [8]:
# Count missing values per row before anything is filled.
n = train.isnull().sum(axis=1)
for label in null_labels:
    flag_col = label + '_nan'
    # Boolean indicator: True where the source column is missing.
    train[flag_col] = train[label].isnull()
    d_mean[flag_col] = 0
# Fill remaining NaNs column by column with the values collected in d_mean.
train = train.fillna(d_mean)
train['nullcounts'] = n

In [7]:
train.head()

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,...,fundamental_9_nan,fundamental_24_nan,fundamental_57_nan,fundamental_28_nan,fundamental_61_nan,fundamental_1_nan,fundamental_6_nan,fundamental_38_nan,fundamental_5_nan,nullcounts
131062,0,167,-0.000837,0.005523,0.021095,0.002476,0.011752,-0.040645,-0.007395,-0.030291,...,True,True,True,True,True,True,True,True,True,107
131895,0,168,-0.000837,0.005523,0.021095,0.002476,0.011752,-0.040645,-0.007395,-0.030291,...,True,True,True,True,True,True,True,True,True,212
132728,0,169,-0.000837,0.005523,0.021095,0.002476,0.011752,-0.040645,-0.007395,-0.030291,...,True,True,True,True,True,True,True,True,True,212
133561,0,170,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,-0.007395,0.212425,...,False,False,False,False,True,True,False,False,False,125
134393,0,171,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,-0.007395,0.212425,...,False,False,False,False,True,True,False,False,False,38


In [14]:
# Model inputs: every column currently in `train` that is not listed in `excl`.
feature_cols = [col for col in train.columns if col not in excl]
# split_data comes from the project-local twosigmafunc module; how it splits the
# rows (and which columns the returned frames keep) is not visible here.
X_train, y_train, X_test, y_test = twosigmafunc.split_data(train, feature_cols)

# extract feature importance on 190 features  
* Extra Trees (`ExtraTreesRegressor`; labelled "Random Forest" in the blend printouts further down)

In [7]:
from sklearn.ensemble import ExtraTreesRegressor
# Shallow extra-trees ensemble with a fixed random_state.
# NOTE(review): the fit outputs recorded later in this notebook show max_depth=6 and
# max_features='sqrt', so those cells ran against an `etr` configured differently
# from the one built here.
etr = ExtraTreesRegressor(n_estimators=100,max_depth=4, n_jobs=-1, random_state=17, verbose=0)

In [56]:
etr.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=4,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [59]:
# Pair every input column with its importance in the fitted ensemble, most important first.
imp = (
    pd.DataFrame({'feature': feature_cols, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)

In [60]:
imp

Unnamed: 0,feature,important
93,technical_30,0.120668
85,technical_20,0.084096
15,fundamental_11,0.063413
231,technical_9_nan,0.062162
77,technical_11,0.052754
195,technical_21_diff,0.050056
186,technical_11_diff,0.043054
215,technical_43_diff,0.039284
70,technical_2,0.030954
106,technical_43,0.027248


* linear model

In [56]:
from sklearn.linear_model import LinearRegression, Ridge
lr = LinearRegression(n_jobs=-2)

In [37]:
# Bounds used both to filter training targets and to clip predictions.
low_y_cut = -0.075
high_y_cut = 0.075
y_is_above_cut = y_train > high_y_cut
y_is_below_cut = y_train < low_y_cut
# Within the band = neither above the upper bound nor below the lower bound.
y_is_within_cut = ~(y_is_above_cut | y_is_below_cut)

In [17]:
linear_data = X_train[['technical_20', 'technical_20_diff']]

In [21]:
lr.fit(linear_data.loc[y_is_within_cut], y_train.loc[y_is_within_cut])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-2, normalize=False)

Linear regression with `normalize=True`

In [22]:
# Same two-feature regression as `lr`, but with the estimator's built-in normalisation on.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2; on a newer install this needs an explicit scaling step instead.
lr_n = LinearRegression(n_jobs=-1, normalize=True)
# Fit only on rows whose target lies inside the [low_y_cut, high_y_cut] band.
lr_n.fit(linear_data.loc[y_is_within_cut], y_train.loc[y_is_within_cut])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)

**Score**

In [24]:
y_lr = lr.predict(X_test[['technical_20', 'technical_20_diff']]).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_lr, y_test)

0.026258437858586125

In [25]:
y_lr_n = lr_n.predict(X_test[['technical_20', 'technical_20_diff']]).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_lr_n, y_test)

0.026253916130410282

**train on test** (fit and score on the same held-out rows: an optimistic reference point, not a validation score)

In [29]:
# Same band filter as for the training targets, applied to the held-out targets.
y_above_cut = y_test > high_y_cut
y_below_cut = y_test < low_y_cut
y_within_cut_t = ~(y_above_cut | y_below_cut)

In [38]:
# Refit `lr` on the within-band rows of the held-out set and score it on that same set,
# so the number below is in-sample. This also overwrites the coefficients `lr` held
# from the fit on the training rows.
lr.fit(X_test.loc[y_within_cut_t, ['technical_20', 'technical_20_diff']], y_test.loc[y_within_cut_t])
y_lr_t = lr.predict(X_test[['technical_20', 'technical_20_diff']]).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_lr_t, y_test)

0.02872537889954821

### 4 features 

In [50]:
lr_features = ['technical_20', 'technical_20_diff', 'technical_30', 'technical_30_diff']

In [39]:
lr.fit(X_train.loc[y_is_within_cut, lr_features], y_train.loc[y_is_within_cut])
y_lr_4 = lr.predict(X_test[lr_features]).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_lr_4, y_test)

0.02517587722068941

on test

In [40]:
lr.fit(X_test.loc[y_within_cut_t, lr_features], y_test.loc[y_within_cut_t])
y_lr_4t = lr.predict(X_test[lr_features]).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_lr_4t, y_test)

0.02542409547017583

In [41]:
lr.intercept_

0.00019000316200126489

In [46]:
lr.coef_

array([-0.11967673, -0.28443399, -0.06038046,  0.14130551])

2 diff features

In [47]:
lr_features = ['technical_20_diff', 'technical_30_diff']

In [48]:
lr.fit(X_train.loc[y_is_within_cut, lr_features], y_train.loc[y_is_within_cut])
y_lr_2diff = lr.predict(X_test[lr_features]).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_lr_2diff, y_test)

0.025547632665956835

**Poly features**

In [49]:
from sklearn.preprocessing import PolynomialFeatures

In [52]:
# Degree-2 polynomial expansion (bias, linear, square and pairwise terms) of lr_features.
# NOTE(review): the recorded shape has 15 columns, which is what degree 2 produces for
# 4 inputs (1 + 4 + 10); the 2-item lr_features assigned just above would give 6.
# The cell appears to have been run while lr_features held the 4-feature list.
poly = PolynomialFeatures(2)
poly_data = poly.fit_transform(X_train[lr_features])
poly_data.shape

(806298, 15)

In [54]:
lr.fit(poly_data[y_is_within_cut,:], y_train.loc[y_is_within_cut])
y_lr_d2 = lr.predict(poly.transform(X_test[lr_features])).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_lr_d2, y_test)



0.027660929025422259

In [55]:
lr.coef_

array([  0.00000000e+00,  -8.84715340e-02,  -1.72526378e-01,
         1.60155888e-03,   1.98629066e-01,  -7.77571378e+00,
        -3.27377858e+00,   1.65334503e-14,  -1.80358043e+01,
         2.63114299e+01,   4.10690260e+01,   2.30332217e+01,
        -2.86928507e-01,   1.76027971e+00,   4.20058233e+00])

In [58]:
ridge = Ridge()
ridge.fit(poly_data[y_is_within_cut,:], y_train.loc[y_is_within_cut])
y_rid_d2 = ridge.predict(poly.transform(X_test[lr_features])).clip(low_y_cut, high_y_cut)
twosigmafunc.R_score(y_rid_d2, y_test)

0.024408742142398687

* Ridge

In [40]:
# Ridge is already imported next to LinearRegression earlier in the notebook; only Lasso is new here.
from sklearn.linear_model import Ridge, Lasso
# Rebinds `ridge` to a fresh default estimator, discarding the polynomial-feature fit above.
ridge = Ridge()

In [53]:
# Default-alpha ridge on the technical_20 / technical_30 level and diff columns,
# fitted on all training rows (no y-band filter here).
# NOTE(review): the target is read as `X_train.y`, whereas other cells pass `y_train`;
# this only works if split_data leaves a `y` column in X_train. Confirm.
ridge.fit(X_train[['technical_30', 'technical_30_diff', 'technical_20', 'technical_20_diff']], X_train.y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

* Lasso

In [63]:
# Lasso with library defaults (alpha=1.0 per the repr printed below) on the same four columns.
# NOTE(review): the slightly negative validation score recorded for y_lasso further down
# suggests this penalty is too strong for the target's scale. Consider tuning alpha.
lasso = Lasso()
lasso.fit(X_train[['technical_30', 'technical_30_diff', 'technical_20', 'technical_20_diff']], X_train.y)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

* Xgboost

In [50]:
from xgboost import XGBRegressor
xgb = XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', nthread=-1)

In [52]:
# NOTE(review): `xgb_features` is only assigned two cells further down, from the f-scores
# of an already fitted `xgb`. On a fresh kernel this line raises NameError; an initial
# fit on an explicitly defined column list has to come first.
xgb.fit(X_train[xgb_features], y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [30]:
# Order the booster's per-feature f-scores with the project helper sort_dict, keyed on the
# score (second element of each pair). The resulting order is defined in twosigmafunc, not here.
# NOTE(review): this rebinds `imp`, which other cells use as the extra-trees importance
# DataFrame (`imp.important`); the next cell instead unpacks it as (name, score) pairs.
imp = twosigmafunc.sort_dict(xgb.booster().get_fscore(), key=lambda x : x[1])

In [48]:
# Keep the features whose f-score is above 5.
xgb_features = [name for name, fscore in imp if fscore > 5]

In [49]:
xgb_features

['technical_30',
 'technical_20',
 'technical_30_diff',
 'fundamental_53',
 'technical_20_diff',
 'technical_21_diff',
 'technical_14_diff',
 'technical_43_diff',
 'technical_40',
 'technical_11_diff',
 'technical_35_diff',
 'technical_36',
 'fundamental_11',
 'technical_19',
 'technical_19_diff',
 'technical_2_diff',
 'technical_27',
 'fundamental_53_diff',
 'technical_21',
 'technical_36_diff',
 'technical_9_nan',
 'fundamental_60_diff',
 'technical_17',
 'fundamental_18_diff',
 'technical_17_diff',
 'technical_13_diff',
 'fundamental_44',
 'technical_33']

### Create validation-set predictions

In [57]:
y_etr = etr.predict(X_test).clip(low_y_cut, high_y_cut)

In [37]:
# NOTE(review): no cell visible in this notebook creates or fits `svr`; this line depends
# on kernel state from a cell that is not shown. Unlike the other predictions in this
# section, the result is not clipped here (clipping happens where it is scored).
y_svr = svr.predict(X_test[feature_cols])

In [56]:
y_ridge = ridge.predict(X_test[['technical_30', 'technical_30_diff','technical_20', 'technical_20_diff']]).clip(low_y_cut, high_y_cut)

In [64]:
y_lasso = lasso.predict(X_test[['technical_30', 'technical_30_diff','technical_20', 'technical_20_diff']]).clip(low_y_cut, high_y_cut)

In [53]:
y_xgb = xgb.predict(X_test[xgb_features]).clip(low_y_cut, high_y_cut)

In [58]:
from twosigmafunc import R_score
R_score(y_etr, y_test)

0.019170860958290212

In [57]:
R_score(y_ridge, X_test.y)

0.02514904325342103

In [65]:
R_score(y_lasso, X_test.y)

-0.002363115148336778

In [41]:
R_score(y_svr.clip(low_y_cut, high_y_cut), X_test.y)

-0.15549906463860921

In [61]:
R_score(y_xgb,  X_test.y)

0.050979812927681063

### Clip values -0.086093 / 0.093497 (wider than the ±0.075 used above)

In [85]:
# Wider, asymmetric clipping band; replaces the +/-0.075 bounds set earlier.
low_y_cut = -0.086093
high_y_cut = 0.093497
y_is_above_cut = X_train['y'] > high_y_cut
y_is_below_cut = X_train['y'] < low_y_cut
# Within the band = neither above the upper bound nor below the lower bound.
y_is_within_cut = ~(y_is_above_cut | y_is_below_cut)

In [86]:
# Predict with the same column subset the booster was fitted on (`xgb_features`, as in
# the xgb.fit cell and the earlier y_xgb cell). Passing the full `feature_cols` frame
# hands the model a different set of columns than it was trained with.
# The result is clipped to the wider bounds set in the previous cell.
y_xgb = xgb.predict(X_test[xgb_features]).clip(low_y_cut, high_y_cut)

In [87]:
R_score(y_xgb, X_test.y)

0.050979812927681063

In [34]:
# Rebinds `train` to the integer 0, dropping this name's reference to the DataFrame
# (presumably to release memory).
# NOTE(review): the later `train.groupby(["id"])` cell cannot run after this without reloading.
train = 0

In [27]:
# Candidate blend weights 0.00, 0.05, ..., 0.95 (20 values; 1.0 is not included).
model_split = list(map(lambda step: step * 0.05, range(20)))

In [28]:
# Sweep the blend weight: `a` goes to the tree model, the remainder to the linear model.
for a in model_split:
    lr_weight = 1 - a
    y = y_etr * a + y_lr * lr_weight
    print('Random Forest:', a, 'Score:', R_score(y, X_test.y))

Random Forest: 0.0 Score: 0.021760649232
Random Forest: 0.05 Score: 0.0232846698717
Random Forest: 0.1 Score: 0.0246520369963
Random Forest: 0.15000000000000002 Score: 0.0258875855594
Random Forest: 0.2 Score: 0.0270094117878
Random Forest: 0.25 Score: 0.0280311727395
Random Forest: 0.30000000000000004 Score: 0.0289634605351
Random Forest: 0.35000000000000003 Score: 0.0298146696811
Random Forest: 0.4 Score: 0.0305915689563
Random Forest: 0.45 Score: 0.0312996922544
Random Forest: 0.5 Score: 0.0319436137629
Random Forest: 0.55 Score: 0.0325271466046
Random Forest: 0.6000000000000001 Score: 0.0330534892749
Random Forest: 0.65 Score: 0.0335253355128
Random Forest: 0.7000000000000001 Score: 0.0339449579503
Random Forest: 0.75 Score: 0.0343142725443
Random Forest: 0.8 Score: 0.0346348886419
Random Forest: 0.8500000000000001 Score: 0.0349081480923
Random Forest: 0.9 Score: 0.0351351558445
Random Forest: 0.9500000000000001 Score: 0.0353168037937


In [29]:
# Finer sweep: map the 0..0.95 grid onto tree-model weights between 0.7 and 0.8425.
for step in model_split:
    a = 0.7 + (0.15 / 1) * step
    lr_weight = 1 - a
    y = y_etr * a + y_lr * lr_weight
    print('Random Forest:', a, 'Score:', R_score(y, X_test.y))

Random Forest: 0.7 Score: 0.0339449579503
Random Forest: 0.7074999999999999 Score: 0.034003519683
Random Forest: 0.715 Score: 0.0340609558841
Random Forest: 0.7224999999999999 Score: 0.0341172722382
Random Forest: 0.73 Score: 0.0341724742816
Random Forest: 0.7374999999999999 Score: 0.0342265674061
Random Forest: 0.745 Score: 0.0342795568612
Random Forest: 0.7525 Score: 0.0343314477574
Random Forest: 0.76 Score: 0.0343822450687
Random Forest: 0.7675 Score: 0.0344319536351
Random Forest: 0.7749999999999999 Score: 0.0344805781654
Random Forest: 0.7825 Score: 0.0345281232393
Random Forest: 0.7899999999999999 Score: 0.0345745933101
Random Forest: 0.7975 Score: 0.0346199927068
Random Forest: 0.8049999999999999 Score: 0.0346643256359
Random Forest: 0.8125 Score: 0.0347075961843
Random Forest: 0.82 Score: 0.0347498083206
Random Forest: 0.8274999999999999 Score: 0.0347909658972
Random Forest: 0.835 Score: 0.0348310726526
Random Forest: 0.8425 Score: 0.0348701322125


Per-id median of `y` (`ymean_dict`)

In [32]:
# Map each id to the median of its y values (despite the "mean" in the name).
# NOTE(review): an earlier cell sets `train = 0`, so the DataFrame has to be reloaded
# before this runs. If `train` still contains the rows held out as X_test, these
# medians include the very targets being scored. Confirm.
ymean_dict = dict(train.groupby(["id"])["y"].median())

In [33]:
# Sweep the weight `b` given to the per-id median of y when it is mixed into the
# fixed 0.77 / 0.23 blend of the tree and linear predictions.
median_split = [0.005 * x for x in range(20)]

# Loop-invariant pieces, computed once rather than on every pass
# (and vectorised rather than going through a row-by-row DataFrame.apply).
blend = 0.77 * y_etr + 0.23 * y_lr              # base prediction, one value per X_test row
id_median = X_test.id.map(ymean_dict)           # NaN for ids that have no entry in ymean_dict
has_median = X_test.id.isin(list(ymean_dict))   # same membership test as `r['id'] in ymean_dict`

for b in median_split:
    # Rows whose id has a stored median are pulled towards it; all other rows keep the blend.
    y = ((1 - b) * blend + b * id_median).where(has_median, blend)
    print('median split:',b, 'score', R_score(y, X_test.y))

median split: 0.0 score 0.0344482820315
median split: 0.005 score 0.034560629489
median split: 0.01 score 0.0346706639744
median split: 0.015 score 0.0347784074414
median split: 0.02 score 0.0348838811183
median split: 0.025 score 0.0349871055331
median split: 0.03 score 0.0350881005369
median split: 0.035 score 0.0351868853264
median split: 0.04 score 0.0352834784658
median split: 0.045 score 0.0353778979069
median split: 0.05 score 0.0354701610084
median split: 0.055 score 0.0355602845545
median split: 0.06 score 0.0356482847726
median split: 0.065 score 0.0357341773495
median split: 0.07 score 0.0358179774478
median split: 0.075 score 0.0358996997208
median split: 0.08 score 0.035979358327
median split: 0.085 score 0.0360569669439
median split: 0.09 score 0.0361325387808
median split: 0.095 score 0.0362060865918


# here we remove some unimportant features 

In [61]:
# Keep only features whose importance exceeds one tenth of the mean importance.
feature_imp = imp.loc[imp['important'].gt(imp['important'].mean() / 10), 'feature']

In [62]:
feature_imp.shape

(169,)

We removed 43 features; let's see how the remaining features perform!

# Train on the 169 features 

In [63]:
etr.fit(X_train[feature_imp], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=4,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [64]:
# Re-rank the retained features by their importance in the model just refitted.
imp = (
    pd.DataFrame({'feature': feature_imp, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)

In [65]:
imp

Unnamed: 0,feature,important
93,technical_30,0.120480
85,technical_20,0.079670
15,fundamental_11,0.070782
231,technical_9_nan,0.059384
195,technical_21_diff,0.054813
186,technical_11_diff,0.044502
215,technical_43_diff,0.043677
106,technical_43,0.035264
77,technical_11,0.034311
179,technical_2_diff,0.033536


In [66]:
y_p = etr.predict(X_test[feature_imp])

In [68]:
R_score(y_p, y_test)

0.01836709369368009

that's ok

In [69]:
# Keep only features whose importance exceeds one fifth of the mean importance.
feature_imp2 = imp.loc[imp['important'].gt(imp['important'].mean() / 5), 'feature']

In [70]:
feature_imp2.shape

(91,)

# train on the 91 features 

In [71]:
etr.fit(X_train[feature_imp2], y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=4,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [72]:
# Re-rank the retained features by their importance in the model just refitted.
imp = (
    pd.DataFrame({'feature': feature_imp2, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)

In [73]:
y_p = etr.predict(X_test[feature_imp2])
R_score(y_p, y_test)

0.018817558384923039

Still fine: slightly better than the 169-feature run.

In [74]:
imp

Unnamed: 0,feature,important
93,technical_30,0.102649
85,technical_20,0.086099
231,technical_9_nan,0.066540
195,technical_21_diff,0.054683
15,fundamental_11,0.049471
186,technical_11_diff,0.046366
77,technical_11,0.043903
215,technical_43_diff,0.042044
106,technical_43,0.035027
70,technical_2,0.033336


In [76]:
list(imp.feature)

['technical_30',
 'technical_20',
 'technical_9_nan',
 'technical_21_diff',
 'fundamental_11',
 'technical_11_diff',
 'technical_11',
 'technical_43_diff',
 'technical_43',
 'technical_2',
 'technical_2_diff',
 'technical_6',
 'technical_6_diff',
 'technical_17_diff',
 'technical_14_diff',
 'fundamental_25_nan',
 'technical_17',
 'technical_7',
 'technical_30_diff',
 'technical_20_diff',
 'technical_40',
 'fundamental_8',
 'technical_14',
 'fundamental_60_diff',
 'fundamental_18',
 'technical_21',
 'fundamental_51',
 'fundamental_53',
 'technical_36_diff',
 'technical_29_diff',
 'technical_19_diff',
 'fundamental_56_diff',
 'technical_44_nan',
 'technical_25_nan',
 'technical_35_diff',
 'fundamental_27_nan',
 'fundamental_48',
 'technical_27',
 'fundamental_2_diff',
 'fundamental_51_diff',
 'technical_36',
 'fundamental_58',
 'technical_40_diff',
 'fundamental_23',
 'technical_24_nan',
 'derived_1_nan',
 'fundamental_15_diff',
 'fundamental_45_diff',
 'fundamental_33_nan',
 'fundamenta

That's all for today; will try more later...

In [35]:
# Keep only features whose importance exceeds one third of the mean importance.
feature_imp3 = imp.loc[imp['important'].gt(imp['important'].mean() / 3), 'feature']

In [36]:
feature_imp3

85            technical_20
93            technical_30
202      technical_30_diff
194      technical_20_diff
92            technical_29
228     fundamental_18_nan
225     fundamental_33_nan
229     fundamental_48_nan
170    fundamental_57_diff
284     fundamental_47_nan
90            technical_27
116     fundamental_2_diff
41          fundamental_37
166    fundamental_53_diff
22          fundamental_18
131    fundamental_18_diff
106           technical_43
87            technical_22
77            technical_11
18          fundamental_14
80            technical_14
57          fundamental_53
238     fundamental_41_nan
103           technical_40
98            technical_35
193      technical_19_diff
212      technical_40_diff
143    fundamental_30_diff
86            technical_21
84            technical_19
              ...         
254          derived_3_nan
113         derived_4_diff
187      technical_12_diff
58          fundamental_54
264     fundamental_29_nan
296     fundamental_38_nan
2

In [37]:
feature_imp3.shape

(217,)

### training on 217 features 

In [38]:
etr.fit(X_train[list(feature_imp3)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [40]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp3, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp3])
R_score(y_p, X_test.y)

0.017785342460594933

In [41]:
imp

Unnamed: 0,feature,important
85,technical_20,0.047134
93,technical_30,0.027867
202,technical_30_diff,0.023659
194,technical_20_diff,0.019830
131,fundamental_18_diff,0.016366
228,fundamental_18_nan,0.014560
229,fundamental_48_nan,0.013488
225,fundamental_33_nan,0.012820
90,technical_27,0.011937
41,fundamental_37,0.011489


In [42]:
# Keep only features whose importance exceeds one fifth of the mean importance.
feature_imp4 = imp.loc[imp['important'].gt(imp['important'].mean() / 5), 'feature']

In [44]:
feature_imp4.shape

(209,)

### training on 209 features 

In [45]:
etr.fit(X_train[list(feature_imp4)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [46]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp4, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp4])
R_score(y_p, X_test.y)

0.019290564330312857

become solid again!!!

In [51]:
# Keep only features whose importance exceeds one third of the mean importance.
feature_imp5 = imp.loc[imp['important'].gt(imp['important'].mean() / 3), 'feature']

In [52]:
feature_imp5.shape

(189,)

### training on 189 features 

In [53]:
etr.fit(X_train[list(feature_imp5)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [54]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp5, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp5])
R_score(y_p, X_test.y)

0.016423012775028726

In [55]:
# Keep only features whose importance exceeds one fifth of the mean importance.
feature_imp6 = imp.loc[imp['important'].gt(imp['important'].mean() / 5), 'feature']
feature_imp6.shape

(184,)

### training on 184 features 

In [56]:
etr.fit(X_train[list(feature_imp6)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [57]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp6, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp6])
R_score(y_p, X_test.y)

0.019946981056484202

solid again!

In [58]:
# Keep only features whose importance exceeds one fifth of the mean importance.
feature_imp7 = imp.loc[imp['important'].gt(imp['important'].mean() / 5), 'feature']
feature_imp7.shape

(177,)

### training on 177 features! 

In [60]:
etr.fit(X_train[list(feature_imp7)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [61]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp7, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp7])
R_score(y_p, X_test.y)

0.018857205191046052

In [62]:
imp

Unnamed: 0,feature,important
85,technical_20,0.050796
93,technical_30,0.024535
194,technical_20_diff,0.023917
202,technical_30_diff,0.023150
229,fundamental_48_nan,0.016550
92,technical_29,0.015930
84,technical_19,0.014103
225,fundamental_33_nan,0.013412
228,fundamental_18_nan,0.013359
90,technical_27,0.012461


In [65]:
# Keep only features whose importance exceeds one quarter of the mean importance.
feature_imp8 = imp.loc[imp['important'].gt(imp['important'].mean() / 4), 'feature']
feature_imp8.shape

(173,)

### training on 173 features 

In [66]:
etr.fit(X_train[list(feature_imp8)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [68]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp8, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp8])
R_score(y_p, X_test.y)

0.019853806360231506

In [70]:
# Keep only features whose importance exceeds one quarter of the mean importance.
feature_imp9 = imp.loc[imp['important'].gt(imp['important'].mean() / 4), 'feature']
feature_imp9.shape

(165,)

### training on 165 features 

In [71]:
etr.fit(X_train[list(feature_imp9)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [73]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp9, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp9])
R_score(y_p, X_test.y)

0.020116523903936128

good, compress again!

In [74]:
# Keep only features whose importance exceeds one quarter of the mean importance.
feature_imp10 = imp.loc[imp['important'].gt(imp['important'].mean() / 4), 'feature']
feature_imp10.shape

(157,)

### training on 157 features 

In [75]:
etr.fit(X_train[list(feature_imp10)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [76]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp10, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp10])
R_score(y_p, X_test.y)

0.017990627537795775

In [78]:
# Keep only features whose importance exceeds one third of the mean importance.
feature_imp11 = imp.loc[imp['important'].gt(imp['important'].mean() / 3), 'feature']
feature_imp11.shape

(147,)

### training on 147 features 

In [79]:
etr.fit(X_train[list(feature_imp11)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [80]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp11, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp11])
R_score(y_p, X_test.y)

0.019052930948383926

In [82]:
# Keep only features whose importance exceeds one third of the mean importance.
feature_imp12 = imp.loc[imp['important'].gt(imp['important'].mean() / 3), 'feature']
feature_imp12.shape

(142,)

### training on 142 features 

In [83]:
etr.fit(X_train[list(feature_imp12)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [84]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp12, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp12])
R_score(y_p, X_test.y)

0.019622147486808905

In [89]:
# Keep only features whose importance exceeds half of the mean importance.
feature_imp13 = imp.loc[imp['important'].gt(imp['important'].mean() / 2), 'feature']
feature_imp13.shape

(123,)

### training on 123 features

In [90]:
etr.fit(X_train[list(feature_imp13)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [91]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp13, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp13])
R_score(y_p, X_test.y)

0.019872222753794959

In [92]:
# Keep only features whose importance exceeds half of the mean importance.
feature_imp14 = imp.loc[imp['important'].gt(imp['important'].mean() / 2), 'feature']
feature_imp14.shape

(109,)

### training on 109 features

In [93]:
etr.fit(X_train[list(feature_imp14)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [95]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp14, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp14])
R_score(y_p, X_test.y)

0.019484597524361814

In [96]:
# Keep only features whose importance exceeds half of the mean importance.
feature_imp15 = imp.loc[imp['important'].gt(imp['important'].mean() / 2), 'feature']
feature_imp15.shape

(94,)

### training on 94 features

In [97]:
etr.fit(X_train[list(feature_imp15)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [98]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp15, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp15])
R_score(y_p, X_test.y)

0.02009225543425288

In [99]:
# Keep only features whose importance exceeds half of the mean importance.
feature_imp16 = imp.loc[imp['important'].gt(imp['important'].mean() / 2), 'feature']
feature_imp16.shape

(82,)

### training on 82 features

In [100]:
etr.fit(X_train[list(feature_imp16)], X_train.y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=6,
          max_features='sqrt', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=-1, oob_score=False, random_state=17,
          verbose=0, warm_start=False)

In [101]:
# Re-rank the retained features by the refitted model's importances, then score that model.
imp = (
    pd.DataFrame({'feature': feature_imp16, 'important': etr.feature_importances_})
    .sort_values(by='important', ascending=False)
)
y_p = etr.predict(X_test[feature_imp16])
R_score(y_p, X_test.y)

0.019395972706415677

Stop here; let's go with the 94 features in `feature_imp15`.

In [104]:
feature_imp15

85            technical_20
93            technical_30
202      technical_30_diff
194      technical_20_diff
225     fundamental_33_nan
41          fundamental_37
228     fundamental_18_nan
92            technical_29
166    fundamental_53_diff
183       technical_7_diff
106           technical_43
80            technical_14
18          fundamental_14
229     fundamental_48_nan
90            technical_27
74             technical_7
57          fundamental_53
170    fundamental_57_diff
131    fundamental_18_diff
284     fundamental_47_nan
116     fundamental_2_diff
289      fundamental_9_nan
212      technical_40_diff
86            technical_21
22          fundamental_18
89            technical_25
87            technical_22
216      technical_44_diff
79            technical_13
73             technical_6
              ...         
244          derived_0_nan
169    fundamental_56_diff
158    fundamental_45_diff
239       technical_41_nan
186      technical_11_diff
11           fundamental_7
5

In [67]:
def feature_type(feature):
    """Classify a column name by the suffix convention used in this notebook.

    Parameters
    ----------
    feature : str
        Column name to classify.

    Returns
    -------
    int
        0 if the name ends with ``'_nan'``, 1 if it ends with ``'_diff'``,
        and 2 for every other name.
    """
    # Match the full suffix, underscore included: a bare 'nan'/'diff' check
    # would also catch any column whose name merely happens to end in those
    # letters and misclassify it.
    if feature.endswith('_nan'):
        return 0
    if feature.endswith('_diff'):
        return 1
    return 2

In [106]:
# Check which container type feature_imp15 is (the output shows a pandas Series).
type(feature_imp15)

pandas.core.series.Series

In [76]:
# Names that feature_type() puts in class 0, in the row order of `imp`.
nan_features = [
    feature
    for feature in imp['feature']
    if feature_type(feature) == 0
]

In [77]:
# Names that feature_type() puts in class 1, in the row order of `imp`.
diff_features = [
    feature
    for feature in imp['feature']
    if feature_type(feature) == 1
]

In [78]:
# Names that feature_type() puts in class 2 (everything else), in the row order of `imp`.
normal_features = [
    feature
    for feature in imp['feature']
    if feature_type(feature) == 2
]

In [79]:
# Display the feature names for which feature_type() returned 0.
nan_features

['fundamental_18_nan',
 'fundamental_44_nan',
 'fundamental_14_nan',
 'fundamental_31_nan',
 'fundamental_8_nan',
 'fundamental_41_nan',
 'fundamental_0_nan',
 'fundamental_7_nan',
 'fundamental_33_nan',
 'fundamental_62_nan',
 'fundamental_34_nan',
 'derived_0_nan',
 'technical_5_nan',
 'fundamental_24_nan',
 'fundamental_35_nan',
 'technical_3_nan',
 'technical_24_nan',
 'derived_1_nan',
 'fundamental_25_nan',
 'technical_31_nan',
 'fundamental_39_nan',
 'technical_18_nan',
 'fundamental_21_nan',
 'fundamental_9_nan',
 'fundamental_47_nan',
 'fundamental_50_nan',
 'technical_9_nan',
 'fundamental_52_nan',
 'fundamental_16_nan',
 'derived_3_nan',
 'fundamental_23_nan',
 'technical_16_nan',
 'fundamental_48_nan',
 'fundamental_28_nan',
 'fundamental_17_nan',
 'fundamental_40_nan',
 'technical_1_nan',
 'fundamental_54_nan',
 'fundamental_32_nan',
 'fundamental_57_nan',
 'fundamental_49_nan',
 'fundamental_5_nan',
 'fundamental_55_nan',
 'fundamental_37_nan',
 'fundamental_60_nan',
 'fun

In [80]:
# Display the feature names for which feature_type() returned 1.
diff_features

['technical_20_diff',
 'technical_30_diff',
 'technical_2_diff',
 'technical_14_diff',
 'technical_11_diff',
 'technical_17_diff',
 'fundamental_57_diff',
 'technical_6_diff',
 'technical_1_diff',
 'technical_44_diff',
 'technical_40_diff',
 'fundamental_53_diff',
 'technical_41_diff',
 'technical_43_diff',
 'fundamental_49_diff',
 'fundamental_41_diff',
 'technical_27_diff',
 'technical_21_diff',
 'technical_7_diff',
 'technical_36_diff',
 'fundamental_18_diff',
 'technical_28_diff',
 'technical_10_diff',
 'fundamental_36_diff',
 'technical_31_diff',
 'fundamental_45_diff',
 'technical_24_diff',
 'fundamental_7_diff',
 'fundamental_52_diff',
 'technical_35_diff',
 'technical_19_diff',
 'technical_33_diff',
 'fundamental_42_diff',
 'fundamental_20_diff',
 'fundamental_30_diff',
 'fundamental_26_diff',
 'derived_2_diff',
 'fundamental_15_diff',
 'technical_13_diff',
 'fundamental_8_diff',
 'derived_4_diff',
 'technical_42_diff',
 'fundamental_13_diff',
 'fundamental_48_diff',
 'fundamen

In [81]:
# Display the feature names for which feature_type() returned 2.
normal_features

['technical_20',
 'technical_30',
 'technical_27',
 'technical_12',
 'technical_6',
 'technical_7',
 'technical_17',
 'technical_11',
 'technical_43',
 'technical_36',
 'technical_19',
 'technical_35',
 'fundamental_18',
 'fundamental_56',
 'fundamental_59',
 'fundamental_0',
 'technical_21',
 'technical_33',
 'technical_2',
 'fundamental_45',
 'technical_13',
 'technical_14',
 'technical_16',
 'fundamental_8',
 'technical_28',
 'fundamental_7',
 'technical_31',
 'technical_3',
 'technical_25',
 'fundamental_60',
 'fundamental_20',
 'derived_0',
 'fundamental_2',
 'technical_34',
 'fundamental_9',
 'fundamental_53',
 'nullcounts',
 'technical_24',
 'fundamental_58',
 'technical_29',
 'fundamental_52',
 'technical_22',
 'fundamental_61',
 'derived_1',
 'technical_5',
 'fundamental_46',
 'fundamental_14',
 'fundamental_42',
 'fundamental_62',
 'fundamental_57',
 'fundamental_40',
 'technical_42',
 'fundamental_50',
 'technical_10',
 'fundamental_16',
 'fundamental_27',
 'technical_40',
 