In [1]:
import pandas as pd
import numpy as np
import twosigmafunc

import matplotlib.pyplot as plt
import seaborn
# Notebook-wide matplotlib defaults: larger figures, and no smoothing
# between pixels when images are rendered.
plt.rcParams.update({
    'figure.figsize': (14.0, 8.0),
    'image.interpolation': 'nearest',
})

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and set it to mode 2, so edits to imported
# modules (e.g. the local `twosigmafunc` helper) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load data 

In [3]:
# Load the training frame together with the list of original feature names.
# NOTE(review): what `preprocess()` reads and which cleaning it applies is
# defined in the local `twosigmafunc` module and is not visible here.
train, origin_features = twosigmafunc.preprocess()

In [4]:
# Impute missing values with each column's median.
# Note: the medians are computed over the whole frame, i.e. before the fold
# split further down, so both folds share the same imputation statistics.
train = train.fillna(train.median())

In [5]:
# Add the `*_diff` columns for the original features.
# The return value is not captured, so `train` is presumably modified in
# place (later cells read `technical_20_diff` from it) — TODO confirm in
# `twosigmafunc.add_diffs`.
twosigmafunc.add_diffs(train, origin_features)

sucessfully add 108 diff features


In [6]:
# Hand-crafted feature: spread between technical_20 and technical_30.
train['tec20-30'] = train['technical_20'] - train['technical_30']

# Split into 2 folds 

In [7]:
# Build the model feature list, leaving out identifiers, the timestamp, the
# target `y` and the `id_diff` column.
# Note: `twosigmafunc.origin_features` (a function) is distinct from the
# `origin_features` variable returned by `preprocess()` earlier.
features = twosigmafunc.origin_features(train, excl=['id', 'timestamp', 'y', 'id_diff'])
# Split into two folds: (X_1, y_1) and (X_2, y_2).
# NOTE(review): the split criterion lives in `twosigmafunc.split_data` and is
# not visible here.
X_1, y_1, X_2, y_2 = twosigmafunc.split_data(train, features)

In [8]:
# Preview the first rows of the fold-1 feature matrix.
X_1.head()

Unnamed: 0,derived_0,derived_1,derived_2,derived_3,derived_4,fundamental_0,fundamental_1,fundamental_2,fundamental_3,fundamental_5,...,technical_36_diff,technical_37_diff,technical_38_diff,technical_39_diff,technical_40_diff,technical_41_diff,technical_42_diff,technical_43_diff,technical_44_diff,tec20-30
131062,-0.000837,0.005523,0.021095,0.002476,0.011752,-0.040645,-0.007395,-0.030291,-0.040183,0.033375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131895,-0.000837,0.005523,0.021095,0.002476,0.011752,-0.040645,-0.007395,-0.030291,-0.040183,0.033375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132728,-0.000837,0.005523,0.021095,0.002476,0.011752,-0.040645,-0.007395,-0.030291,-0.040183,0.033375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133561,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,-0.007395,0.212425,-0.178111,-0.126889,...,0.81268,3.951567e-12,1.418487e-13,1.591224e-16,-0.14571,0.0,0.0,0.659754,0.0,0.0
134393,-0.230583,0.488096,0.93592,0.028222,-0.083071,-0.240929,-0.007395,0.212425,-0.178111,-0.126889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train linear models on the two folds

In [9]:
from sklearn.linear_model import Ridge, LinearRegression

In [10]:
# Symmetric bounds on the target: used below both to select the training
# rows and to clip the predictions.
low_y_cut, high_y_cut = -0.08, 0.08

In [11]:
# Boolean masks selecting the rows whose target lies strictly inside the
# (low_y_cut, high_y_cut) interval, one mask per fold.
y1_in = (low_y_cut < y_1) & (y_1 < high_y_cut)
y2_in = (low_y_cut < y_2) & (y_2 < high_y_cut)

In [12]:
# Two independent ordinary-least-squares models: `lr_1` is used for the
# one-feature fits and `lr_2` for the two-feature fits below.
#
# `normalize=True` has been dropped: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises a TypeError.
# For unregularised least squares the fitted predictions do not depend on
# feature scaling, so the scores below are unchanged up to floating-point
# rounding.
lr_1 = LinearRegression(n_jobs=-1)
lr_2 = LinearRegression(n_jobs=-1)

In [13]:
# One-feature model: fit on the fold-1 rows whose target is inside the cut
# interval, then predict every fold-2 row and clip to the same interval.
# Selecting with a one-element column list yields an (n, 1) array directly.
lr_1.fit(
    X_1.loc[y1_in, ['technical_20_diff']].values,
    y_1[y1_in],
)
y_pred_2 = np.clip(
    lr_1.predict(X_2[['technical_20_diff']].values),
    low_y_cut,
    high_y_cut,
)

In [14]:
# Out-of-fold score of the fold-1 model on fold 2.
# NOTE(review): `R_score` is defined in `twosigmafunc`; its exact formula is
# not visible here.
twosigmafunc.R_score(y_pred_2, y_2)

0.026099078154087435

In [15]:
# Reverse direction: refit the same `lr_1` estimator on the in-range fold-2
# rows (this overwrites the fold-1 fit), predict fold 1, clip and score.
lr_1.fit(
    X_2.loc[y2_in, ['technical_20_diff']].values,
    y_2[y2_in],
)
y_pred_1 = np.clip(
    lr_1.predict(X_1[['technical_20_diff']].values),
    low_y_cut,
    high_y_cut,
)
twosigmafunc.R_score(y_pred_1, y_1)

-0.015022089905190586

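The model trained on fold 2 performs badly on fold 1: its R score there is negative (-0.015), whereas the model trained on fold 1 reaches 0.026 on fold 2.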

### two dimensional linear model 

In [16]:
# Two-feature model (technical_20_diff + tec20-30): fit on the in-range
# fold-1 rows, predict all of fold 2, clip to the cut interval and score.
lr_2.fit(
    X_1.loc[y1_in, ['technical_20_diff', 'tec20-30']],
    y_1[y1_in],
)
y_pred_2_ = np.clip(
    lr_2.predict(X_2[['technical_20_diff', 'tec20-30']]),
    low_y_cut,
    high_y_cut,
)
twosigmafunc.R_score(y_pred_2_, y_2)

0.02978515625

In [17]:
# Reverse direction for the two-feature model: refit `lr_2` on the in-range
# fold-2 rows (overwriting the fold-1 fit), predict fold 1, clip and score.
lr_2.fit(
    X_2.loc[y2_in, ['technical_20_diff', 'tec20-30']],
    y_2[y2_in],
)
y_pred_1_ = np.clip(
    lr_2.predict(X_1[['technical_20_diff', 'tec20-30']]),
    low_y_cut,
    high_y_cut,
)
twosigmafunc.R_score(y_pred_1_, y_1)

0.0037664181202493926

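Same pattern as the one-dimensional case: the model trained on fold 2 generalises much worse to fold 1 (0.0038) than the model trained on fold 1 does to fold 2 (0.0298), although adding `tec20-30` raises both scores slightly.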

### combine two models' result 

In [18]:
# Equal-weight blend of the one-feature and two-feature predictions for
# fold 1, followed by its score.
y_1_pred = (y_pred_1 + y_pred_1_) / 2.0
twosigmafunc.R_score(y_1_pred, y_1)

-0.0094555257475767013

In [19]:
# Equal-weight blend of the one-feature and two-feature predictions for
# fold 2, followed by its score.
y_2_pred = (y_pred_2 + y_pred_2_) / 2.0
twosigmafunc.R_score(y_2_pred, y_2)

0.029231698334171417

In [20]:
# Stack the blended predictions as a single column vector: fold-1 rows
# first, fold-2 rows after them (np.concatenate joins along axis 0 by
# default).
y_pred = np.concatenate((
    y_1_pred.reshape(-1, 1),
    y_2_pred.reshape(-1, 1),
))

In [22]:
# Sanity check: shape of the stacked predictions (a 2-D column vector).
y_pred.shape

(1710756, 1)

In [21]:
# Sanity check: shape of the full target column (1-D), for comparison with
# the stacked predictions.
train.y.shape

(1710756,)

In [None]:
# Overall out-of-fold score of the blended predictions.
#
# Two fixes relative to scoring `y_pred` against `train.y` directly:
#   * `y_pred` is an (N, 1) column while every other R_score call in this
#     notebook passes 1-D predictions, so it is flattened first. Otherwise an
#     elementwise comparison with a 1-D target could broadcast or fail.
#   * `y_pred` is ordered fold 1 first, then fold 2 (see the np.concatenate
#     that builds it). `train.y` is in the frame's own row order, which need
#     not match, so the target is stacked from `y_1` and `y_2` in the same
#     order as the predictions.
# NOTE(review): assumes `R_score` accepts a plain ndarray target — confirm
# against `twosigmafunc.R_score`.
y_true = np.concatenate([np.asarray(y_1).ravel(), np.asarray(y_2).ravel()])
twosigmafunc.R_score(y_pred.ravel(), y_true)