In [1]:
import pandas as pd
import numpy as np

# Data Loading 

In [2]:
train = pd.read_hdf('train.h5')

# Feature Columns 

In [3]:
excl = ['id', 'y', 'timestamp']
origin_features = [feature for feature in train.columns if feature not in excl]
acc_features = [feature + '_acc' for feature in origin_features]

# Add Accumulation Features 

In [4]:
train.fillna(train.mean(), inplace=True)
train.sort_values(['id', 'timestamp'], inplace=True)
train['id_diff'] = train.id.diff()

In [5]:
train.head()

Unnamed: 0,id,timestamp,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,...,technical_37,technical_38,technical_39,technical_40,technical_41,technical_42,technical_43,technical_44,y,id_diff
131062,0,167,-4.536046,772943600000.0,-0.332033,-0.504601,18.016613,-0.020409,-570375360.0,-0.162295,...,-0.091034,-0.081567,-0.07287,0.049083,0.005236,-0.017,-0.97353,0.000388,-0.007108,
131895,0,168,-4.536046,772943600000.0,-0.332033,-0.504601,18.016613,-0.020409,-570375360.0,-0.162295,...,-0.091034,-0.081567,-0.07287,0.049083,0.005236,-0.017,-0.97353,0.000388,0.00195,0.0
132728,0,169,-4.536046,772943600000.0,-0.332033,-0.504601,18.016613,-0.020409,-570375360.0,-0.162295,...,-0.091034,-0.081567,-0.07287,0.049083,0.005236,-0.017,-0.97353,0.000388,0.017724,0.0
133561,0,170,-0.230583,0.4880956,0.93592,0.028222,-0.083071,-0.240929,-570375360.0,0.212425,...,0.0,0.0,0.0,-0.160478,0.005236,0.0,0.0,0.000388,0.012934,0.0
134393,0,171,-0.230583,0.4880956,0.93592,0.028222,-0.083071,-0.240929,-570375360.0,0.212425,...,0.0,0.0,0.0,-0.160478,0.005236,0.0,0.0,0.000388,-0.025229,0.0


In [5]:
for feature in origin_features:
    train[feature + '_acc'] = train[feature].expanding().mean()
train.loc[train.id_diff!=0, acc_features]=0

In [6]:
X_train = train.loc[train.timestamp <=905, origin_features + acc_features]
y_train = train.loc[train.timestamp <=905, 'y']
X_test = train.loc[train.timestamp > 905, origin_features + acc_features]
y_test = train.loc[train.timestamp > 905, 'y']

In [7]:
del train

In [8]:
importance = {}
for feature in acc_features:
    importance[feature] = np.corrcoef(X_train[feature], y_train)[0][1]
importance = sorted(importance.items(), key=lambda x : np.abs(x[1]), reverse=True)

In [9]:
importance[:10]

[('technical_11_acc', -0.0030668451382165562),
 ('fundamental_15_acc', 0.0028671613340361486),
 ('technical_30_acc', 0.0027132133481807216),
 ('fundamental_35_acc', 0.002594418159358743),
 ('technical_2_acc', -0.0025280015280443853),
 ('fundamental_30_acc', 0.0024511623726762785),
 ('technical_21_acc', -0.0021763898804496519),
 ('technical_19_acc', -0.0021714845433681864),
 ('technical_6_acc', -0.0021502191687992841),
 ('technical_20_acc', 0.0020860887517654336)]

# Using Linear Models 

In [10]:
from sklearn.linear_model import Ridge

In [11]:
from twosigmafunc import R_score
acc_score = {}
for acc_feature, _ in importance:
    ridge = Ridge()
    ridge.fit(X_train[acc_feature].reshape(-1,1), y_train)
    y_pred = ridge.predict(X_test[acc_feature].reshape(-1,1))
    score = R_score(y_pred, y_test)
    acc_score[acc_feature] = score



In [12]:
acc_score = sorted(acc_score.items(), key=lambda x : x[1], reverse=True)

In [13]:
acc_score

[('technical_11_acc', 0.0017381372752961657),
 ('technical_2_acc', 0.0010909971910032824),
 ('technical_17_acc', -0.0014296309046830328),
 ('technical_6_acc', -0.0014679596099107957),
 ('fundamental_10_acc', -0.001885814428149407),
 ('technical_0_acc', -0.002037484000162338),
 ('technical_38_acc', -0.0020778637368478242),
 ('fundamental_55_acc', -0.0023383783405406993),
 ('technical_29_acc', -0.002395605210330273),
 ('technical_37_acc', -0.0024045826815668027),
 ('fundamental_11_acc', -0.0024214211991485076),
 ('fundamental_63_acc', -0.0024398789378452571),
 ('technical_14_acc', -0.0024643839068755934),
 ('fundamental_25_acc', -0.0024846351146125332),
 ('technical_41_acc', -0.0024907556172098907),
 ('technical_19_acc', -0.0024913393125952899),
 ('derived_1_acc', -0.0025066955010761521),
 ('fundamental_30_acc', -0.0025081952222639902),
 ('technical_10_acc', -0.0025239359011927742),
 ('technical_32_acc', -0.0025288036373703154),
 ('fundamental_6_acc', -0.0025507954055162345),
 ('fundamen