In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

In [29]:
def R_score(y_pred, y):
    u = np.mean(y)
    R2 = 1 - np.sum(np.square(y - y_pred)) / np.sum(np.square(y - u))
    R = np.sign(R2) * np.sqrt(np.abs(R2))
    return R

In [3]:
excl = ['id', 'y', 'timestamp']
train = pd.read_hdf('train.h5')
feature_cols = [c for c in train.columns if c not in excl]
y = train.y
X_train = train[feature_cols]
d_mean = X_train.median(axis=0)

In [5]:
X_train.head()

Unnamed: 0,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,fundamental_3,fundamental_5,...,technical_35,technical_36,technical_37,technical_38,technical_39,technical_40,technical_41,technical_42,technical_43,technical_44
0,0.370326,-0.006316,0.222831,-0.21303,0.729277,-0.335633,0.113292,1.621238,-0.179404,,...,0.93788,0.775208,,,,-0.414776,,,-2.0,
1,0.014765,-0.038064,-0.017425,0.320652,-0.034134,0.004413,0.114285,-0.210185,0.216281,0.09675,...,0.232154,0.02559,,,,-0.273607,,,-2.0,
2,-0.010622,-0.050577,3.379575,-0.157525,-0.06855,-0.155937,1.219439,-0.764516,,,...,0.372688,0.151881,,,,-0.17571,,,-2.0,
3,,,,,,0.178495,,-0.007262,-0.097903,,...,0.751021,1.035936,,,,-0.211506,,,-2.0,
4,0.176693,-0.025284,-0.05768,0.0151,0.180894,0.139445,-0.125687,-0.018707,0.196391,,...,0.595206,0.630232,,,,-0.001957,,,0.0,


# Fill NaN 

In [8]:
n = X_train.isnull().sum(axis=1)
for c in X_train.columns:
    X_train[c + '_nan_'] = X_train[c].isnull()
    X_train[c + '_nan_'] = X_train[c + '_nan_'].map(lambda x : 1 if x else 0)
    d_mean[c + '_nan_'] = 0
X_train = X_train.fillna(d_mean)
X_train['znull'] = n

In [9]:
X_train.head()

Unnamed: 0,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,fundamental_3,fundamental_5,...,technical_36_nan__nan_,technical_37_nan__nan_,technical_38_nan__nan_,technical_39_nan__nan_,technical_40_nan__nan_,technical_41_nan__nan_,technical_42_nan__nan_,technical_43_nan__nan_,technical_44_nan__nan_,znull_nan_
0,0.370326,-0.006316,0.222831,-0.21303,0.729277,-0.335633,0.113292,1.621238,-0.179404,0.033375,...,0,0,0,0,0,0,0,0,0,0
1,0.014765,-0.038064,-0.017425,0.320652,-0.034134,0.004413,0.114285,-0.210185,0.216281,0.09675,...,0,0,0,0,0,0,0,0,0,0
2,-0.010622,-0.050577,3.379575,-0.157525,-0.06855,-0.155937,1.219439,-0.764516,-0.040183,0.033375,...,0,0,0,0,0,0,0,0,0,0
3,-0.000837,0.005523,0.021095,0.002476,0.011752,0.178495,-0.007395,-0.007262,-0.097903,0.033375,...,0,0,0,0,0,0,0,0,0,0
4,0.176693,-0.025284,-0.05768,0.0151,0.180894,0.139445,-0.125687,-0.018707,0.196391,0.033375,...,0,0,0,0,0,0,0,0,0,0


# Cut y 

In [11]:
low_y_cut = -0.075
high_y_cut = 0.075
y_is_above_cut = (y > high_y_cut)
y_is_below_cut = (y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)

# Extra Trees model

In [10]:
rfr = ExtraTreesRegressor(n_estimators=100, max_depth=4, n_jobs=-1, random_state=17, verbose=0)
clf_tree = rfr.fit(X_train, y)

# linear model 

In [12]:
clf_linear = LinearRegression(n_jobs=-1)

In [13]:
X_linear = np.array(train[feature_cols].fillna(d_mean).loc[y_is_within_cut,'technical_20'].values).reshape(-1, 1)

In [15]:
y_linear = train.loc[y_is_within_cut, 'y']

In [16]:
clf_linear.fit(X_linear, y_linear)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

# Predict 

In [17]:
ymean_dict = dict(train.groupby(["id"])["y"].median())

In [18]:
ymean_dict

{0: 0.00014715458,
 6: -0.00010041468,
 7: -8.5601396e-06,
 10: -0.0021154177,
 11: -0.0002619196,
 12: -0.00012671162,
 13: -0.00072409183,
 14: -0.00014462898,
 15: -0.00017424813,
 16: 0.0005480476,
 17: 0.00030971391,
 18: -0.0010425585,
 19: -0.0006848062,
 20: -2.0579713e-05,
 22: -0.0002498877,
 23: -0.0011390932,
 24: -0.00011076839,
 25: -4.3393651e-05,
 26: 0.00057791552,
 27: 0.00034231629,
 30: -0.00069737807,
 31: -0.00034755486,
 32: -0.0014639522,
 33: -0.0002883288,
 38: 7.7341705e-05,
 39: 7.030111e-05,
 40: 0.00029423155,
 41: -0.00044703536,
 43: -0.0014288621,
 44: -4.7546026e-05,
 46: 0.00052034878,
 47: 0.00036031249,
 48: 0.00032337959,
 49: -0.0018101166,
 52: 0.00010535077,
 54: -0.00015889705,
 55: 0.00026677683,
 56: -0.00028469961,
 58: -0.0015310098,
 59: -0.00063046254,
 60: -0.00062268006,
 62: 0.0017435532,
 63: 0.00060161331,
 64: 0.00040197227,
 65: -0.00085036032,
 66: -0.00061704742,
 68: -0.0010960643,
 69: -6.4337466e-05,
 70: 0.00024851682,
 72: 0

In [19]:
y_tree = clf_tree.predict(X_train).clip(low_y_cut, high_y_cut) 

In [25]:
y_linear = clf_linear.predict(np.array(X_train['technical_20']).reshape(-1,1)).clip(low_y_cut, high_y_cut) 

In [26]:
def combine_models(y_tree, y_linear, model_weights, dict_weights):
    pred = y_tree * model_weights + (1 - model_weights) * y_linear
    train['y_pred'] = pred
    pred = train.apply(lambda r: 0.95 * r['y_pred'] + 0.05 * ymean_dict[r['id']]
                           if r['id'] in ymean_dict else r['y_pred'], axis=1)
    pred = [float(format(x, '.6f')) for x in train['y_pred']]
    return pred


In [30]:
y0 = combine_models(y_tree, y_linear, 0.65, 0.95)

In [31]:
y0

[0.000279,
 0.000254,
 -0.00031,
 -0.000323,
 -0.00029,
 -0.000717,
 -0.000262,
 -0.000424,
 0.000254,
 0.00024,
 0.000272,
 -0.00032,
 0.000254,
 0.001348,
 0.000261,
 0.000586,
 -0.000437,
 0.000262,
 0.000232,
 0.000243,
 0.000204,
 -0.000514,
 0.000258,
 0.000205,
 0.000241,
 -0.00054,
 0.000272,
 -0.001488,
 0.002028,
 -0.000165,
 0.000272,
 -0.006109,
 0.000222,
 0.000258,
 0.000715,
 0.000261,
 -0.000169,
 0.000221,
 -0.000358,
 0.000253,
 -0.001645,
 0.000199,
 0.000247,
 -0.000233,
 -0.001448,
 0.000599,
 0.000632,
 0.000254,
 0.00024,
 -0.00043,
 0.000265,
 -0.000337,
 -0.000232,
 0.000273,
 0.000206,
 0.00027,
 -0.000534,
 -0.00023,
 -0.000961,
 0.000233,
 0.0002,
 0.000238,
 0.000262,
 0.000258,
 0.001562,
 0.000278,
 -0.000391,
 0.000634,
 0.000279,
 0.000262,
 -0.000474,
 0.000275,
 0.000926,
 0.000212,
 0.000633,
 0.000258,
 0.000193,
 -0.000142,
 -0.000362,
 0.000221,
 0.000378,
 0.000223,
 0.000281,
 0.000254,
 0.000204,
 -0.000159,
 0.000943,
 0.000199,
 -0.001992,
 -

In [32]:
R_score(y0, y)

0.029620990131762143

In [35]:
def grid_search(model_weights, dict_weights):
    best_est = {'model_weight' : 0, 'dict_weight': 0, 'score': -9999}
    for model_weight in model_weights:
        for dict_weight in dict_weights:
            y_pred = combine_models(y_tree, y_linear, model_weight, dict_weight)
            score = R_score(y_pred, y)
            if score > best_est['score']:
                best_est['score'] = score
                best_est['model_weight'] = model_weight
                best_est['dict_weight'] = dict_weight
    return best_est
            


In [36]:
model_weights = [0.5, 0.6, 0,7, 0.8, 0,9]
dict_weights = [0.1, 0.05, 0.02]
print(grid_search(model_weights, dict_weights))

{'dict_weight': 0.1, 'model_weight': 0.8, 'score': 0.031515694425364386}


In [37]:
model_weights = [0.75, 0.8, 0.85]
dict_weights = [0.1, 0.08, 0.12]
print(grid_search(model_weights, dict_weights))

{'dict_weight': 0.1, 'model_weight': 0.85, 'score': 0.032058865104607086}


In [38]:
R_score(y_linear, y)

0.012728074988517333

In [40]:
R_score(y_tree, y)

0.033450507507379816

In [41]:
print(R_score(combine_models(y_tree, y_linear, 0.65, 1), y))

0.0296209901318


In [48]:
train.loc[train['y']>0.08][['derived_0', 'derived_1']]

Unnamed: 0,derived_0,derived_1
185,0.004057,0.311185
378,0.059387,0.002478
688,0.428701,0.074175
778,0.045536,0.053580
895,-0.028834,0.352537
962,,
974,0.106547,0.031361
1027,0.542401,0.984190
1131,-0.464875,0.110491
1255,0.446576,0.839643
