In [None]:
from kaggle.competitions import twosigmanews
# Create the competition environment. make_env() can only be called once per
# kernel session, so keep the returned `env` object around for every later
# cell (training data, prediction days, submission file).
env = twosigmanews.make_env()
print('Done!')

In [None]:
# Pull both training frames from the competition environment; only the market
# frame is used by the cells below.
market_train_df, news_train_df = env.get_training_data()

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def get_xy(market_train_df, le=None):
    """Build the feature frame and the clipped regression target.

    Args:
        market_train_df: market frame containing the feature columns and the
            target column 'returnsOpenNextMktres10'.
        le: optional ``(le_assetCode, le_assetName)`` tuple of previously
            fitted label encoders. When ``None`` the encoders are fitted on
            ``market_train_df``.

    Returns:
        ``(x, y, le)`` where ``x`` is the output of ``get_x``, ``y`` is the
        target clipped to [-1, 1], and ``le`` is the encoder tuple that was
        actually used.
    """
    # Forward the caller's encoders; previously they were dropped and the
    # encoders were always re-fitted.
    x, le = get_x(market_train_df, le)
    y = market_train_df['returnsOpenNextMktres10'].clip(-1, 1)
    return x, y, le

def label_encode(series, min_count):
    """Map each sufficiently frequent value of ``series`` to an integer code.

    Values are ranked in ``value_counts()`` order; only values occurring at
    least ``min_count`` times receive a code (0, 1, 2, ...). Rarer values are
    absent from the returned dict.
    """
    counts = series.value_counts()
    frequent = counts.index[counts >= min_count]
    return dict(zip(frequent, range(len(frequent))))

def get_x(market_train_df, le=None):
    """Turn a market frame into the numeric feature frame used by the model.

    Args:
        market_train_df: market frame with at least 'assetCode', 'assetName'
            and a datetime 'time' column. The columns
            'returnsOpenNextMktres10' and 'universe' are removed when present.
        le: optional ``(le_assetCode, le_assetName)`` tuple of dicts mapping
            raw values to integer codes. When ``None`` the encoders are
            fitted on ``market_train_df`` via ``label_encode``.

    Returns:
        ``(x, (le_assetCode, le_assetName))``. ``x`` is a new frame (the
        input is not modified) with integer-encoded asset columns (unseen or
        rare values become -1), 'dayofweek' and 'month' derived from 'time',
        'time' itself removed, and remaining missing values set to -1000.
    """
    if le is None:
        le_assetCode = label_encode(market_train_df['assetCode'], min_count=10)
        le_assetName = label_encode(market_train_df['assetName'], min_count=5)
    else:
        le_assetCode, le_assetName = le

    x = market_train_df.copy()

    # Values missing from the encoder map to NaN, which is then coded as -1.
    x['assetCode'] = x['assetCode'].map(le_assetCode).fillna(-1).astype(int)
    x['assetName'] = x['assetName'].map(le_assetName).fillna(-1).astype(int)

    # These columns are absent at prediction time; errors='ignore' tolerates
    # that without a bare `except` that would hide unrelated failures.
    x = x.drop(columns=['returnsOpenNextMktres10', 'universe'], errors='ignore')

    x['dayofweek'], x['month'] = x['time'].dt.dayofweek, x['time'].dt.month
    x = x.drop(columns='time')
    x = x.fillna(-1000)
    return x, (le_assetCode, le_assetName)

In [None]:
# Look at the extreme tails of the target: the eps and 1-eps quantiles show
# how far the most extreme target values reach.
eps = 1e-4
market_train_df['returnsOpenNextMktres10'].quantile([eps, 1-eps])

In [None]:
# Empirical CDF of the target on [-0.3, 0.3], overlaid with a scaled tanh curve.
# `density=True` replaces the removed matplotlib `normed=True` keyword.
market_train_df['returnsOpenNextMktres10'].hist(bins=200, range=(-0.3, 0.3), cumulative=True, density=True)

# Dedicated names for the curve: the globals `x`/`y` are avoided because `y`
# is the training target in later cells and must not be clobbered on re-run.
curve_x = np.linspace(-0.3, 0.3)
curve_y = np.tanh(curve_x / 0.065) * 0.5 + 0.5
plt.plot(curve_x, curve_y)
plt.xlim(-0.3, 0.3)

In [None]:
# Build the feature frame X, the clipped target y and the fitted label
# encoders le (reused later when encoding the prediction-day frames).
X, y, le = get_xy(market_train_df)

In [None]:
# Peek at the last rows of the encoded feature frame.
X.tail()

In [None]:
# Positional 80/20 hold-out: the first 80% of rows train, the rest validate.
# NOTE(review): presumably the rows are time-ordered, making this a
# chronological split — confirm.
n_train = int(len(market_train_df) * 0.8)
head_rows, tail_rows = slice(None, n_train), slice(n_train, None)

X_train, X_valid = X.iloc[head_rows], X.iloc[tail_rows]
y_train, y_valid = y.iloc[head_rows], y.iloc[tail_rows]

# Raw validation rows; sigma_score reads their 'universe' and 'time' columns.
df_valid = market_train_df.iloc[tail_rows]

In [None]:
# (rows, feature columns) of the full feature frame.
X.shape

In [None]:
train_cols = list(X.columns)
categorical_cols = ['assetCode', 'assetName']

# Both datasets share the same feature/categorical declaration and keep their
# raw data (free_raw_data=False).
dataset_kwargs = dict(
    feature_name=train_cols,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)

dtrain = lgb.Dataset(X_train, y_train, **dataset_kwargs)
dvalid = lgb.Dataset(X_valid, y_valid, **dataset_kwargs)

In [None]:
# LightGBM hyper-parameters shared by the validation run and the final fit.
lgb_params = {
    'objective': 'regression_l1',
    'learning_rate': 0.1,
    'num_leaves': 15,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 1e-3,
    'bagging_fraction': 0.5,
    'bagging_freq': 2,
    'feature_fraction': 1.0,
    'lambda_l1': 0,
    'lambda_l2': 0,
}

def sigma_score(preds, train_data):
    """Custom LightGBM eval metric computed on the validation split.

    For every validation row the product ``pred * label * universe`` is
    formed, the products are summed per 'time' value, and the score is the
    mean of those per-time sums divided by their standard deviation.

    Args:
        preds: model predictions for the rows of ``train_data``.
        train_data: the ``lgb.Dataset`` being evaluated; must hold exactly the
            rows of the global ``df_valid`` (same length, same order).

    Returns:
        ``('sigma_score', score, True)`` — metric name, value, and the
        higher-is-better flag expected from a LightGBM ``feval``.
    """
    labels = train_data.get_label()

    assert len(labels) == len(df_valid), \
        'sigma_score only supports the validation split backed by df_valid'

    # The product is a Series indexed like df_valid, so it can be grouped
    # directly; no to_frame()/[0] lookup that depends on the Series name.
    x_t = preds * labels * df_valid['universe']
    x_t_sum = x_t.groupby(df_valid['time']).sum()
    score = x_t_sum.mean() / x_t_sum.std()

    return 'sigma_score', score, True


# Validation run: train on the 80% split, track the built-in metric and
# sigma_score on the hold-out, and record the per-round history.
evals_result = {}
m = lgb.train(
    lgb_params,
    dtrain,
    num_boost_round=400,
    # Explicit lists: `(dvalid)` / `('valid')` are just parenthesised
    # scalars, not one-element tuples.
    valid_sets=[dvalid],
    valid_names=['valid'],
    verbose_eval=25,
    early_stopping_rounds=50,
    feval=sigma_score,
    evals_result=evals_result,
)

# One column per recorded metric, one row per boosting round.
df_result = pd.DataFrame(evals_result['valid'])

In [None]:
# Two-panel diagnostic of the validation history.
# Left: sigma_score per round, with the first df_result column on a twin axis.
# Right: sigma_score plotted against that first column.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# First recorded metric column — presumably the built-in objective metric;
# verify against df_result.columns.
metric = df_result.columns[0]

# Secondary y-axis sharing the x-axis (boosting round) with the left panel.
axt = ax[0].twinx()

# sigma_score curve, with its maximum marked by a red '+'.
df_result['sigma_score'].plot(ax=ax[0])
ax[0].scatter(df_result['sigma_score'].idxmax(), df_result['sigma_score'].max(), marker='+', color='red')

# First-metric curve on the twin axis, with its minimum marked by a red '+'.
df_result[metric].plot(ax=axt, color='orange')
axt.scatter(df_result[metric].idxmin(), df_result[metric].min(), marker='+', color='red')

# Right panel: trajectory (line) and individual rounds (scatter), with both
# axes limited to the 1%-99% quantile range of the plotted values.
df_result.plot(x=metric, y='sigma_score', ax=ax[1], alpha=0.5)
df_result.plot(kind='scatter', x=metric, y='sigma_score', ax=ax[1])
ax[1].set_xlim(*df_result[metric].quantile((0.01, 0.99)))
ax[1].set_ylim(*df_result['sigma_score'].quantile((0.01, 0.99)))

fig.legend(loc='center right')
fig.tight_layout()

In [None]:
# Feature importance of the validation-run booster `m`; the trailing `;`
# suppresses the returned object's repr in the cell output.
lgb.plot_importance(m);

# Train full model
Now we train the full model, using the `num_boost_round` found during validation.

In [None]:
# Pick the round with the highest validation sigma_score. The history index
# is 0-based, so the number of boosting rounds is the index plus one.
sigma_history = df_result['sigma_score']
num_boost_round = sigma_history.idxmax() + 1
valid_score = sigma_history.max()

print(f'Best score was {valid_score:.5f} on round {num_boost_round}')

In [None]:
# Final fit on ALL training rows, using the round count chosen on the
# validation split.
dtrain_full = lgb.Dataset(X, y, feature_name=train_cols, categorical_feature=categorical_cols)

# Train on dtrain_full; the 80% `dtrain` split was passed here before, so the
# "full" model never saw the validation rows.
model = lgb.train(lgb_params, dtrain_full, num_boost_round=num_boost_round)

In [None]:
def make_predictions(predictions_template_df, market_obs_df, le):
    """Fill ``predictions_template_df['confidenceValue']`` in place.

    Args:
        predictions_template_df: frame whose 'confidenceValue' column is
            overwritten. NOTE(review): assumes its rows are in the same order
            as ``market_obs_df`` — confirm against the competition API.
        market_obs_df: market observations for one prediction day.
        le: ``(le_assetCode, le_assetName)`` encoders fitted on training data.

    Returns:
        None. Predictions from the global ``model`` are clipped to [-1, 1]
        and written into the template frame.
    """
    x, _ = get_x(market_obs_df, le)
    # Bracket assignment always targets the column; attribute assignment would
    # silently create a plain Python attribute if the column were missing.
    predictions_template_df['confidenceValue'] = np.clip(model.predict(x), -1, 1)

In [None]:
# Iterate over the prediction days supplied by the environment: fill in the
# template for each day and hand it back before moving on to the next one.
days = env.get_prediction_days()

for day_frames in days:
    market_obs_df, news_obs_df, predictions_template_df = day_frames
    make_predictions(predictions_template_df, market_obs_df, le)
    env.predict(predictions_template_df)

print('Done!')

In [None]:
# Ask the competition environment to write the submission file from the
# predictions passed to env.predict() above.
env.write_submission_file()