In [7]:
!pip install halo pyarrow numerapi lightgbm

import sys
import os
import pickle
from halo import Halo
import json
import gc
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import numerapi
import lightgbm
import matplotlib
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
sys.path.insert(0, './utils')

from utils import (
    save_model,
    load_model,
    neutralize,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)
from metrics import evaluate
from cross_validation import PurgedTimeSeriesSplitGroups

from numerapi import NumerAPI
napi = NumerAPI()

current_round = napi.get_current_round(tournament=8)  # tournament 8 is the primary Numerai Tournament

train_pq_path = "./numeraidata/numerai_training_data.parquet"

# read in just those features along with era and target columns
with open("./numeraidata/features.json") as f:
    feature_metadata = json.load(f)


features_small = set(feature_metadata["feature_sets"]["small"])
features_medium = set(feature_metadata["feature_sets"]["medium"])

in_medium_not_in_small = features_medium - features_small
features = list(features_small) + list(in_medium_not_in_small)

read_columns = features + [DATA_TYPE_COL, ERA_COL, TARGET_COL]
df_train = pd.read_parquet(train_pq_path, columns=read_columns)

eras = df_train.era.astype(int)
df_train["era"] = eras

gc.collect()



477



In [8]:
def scoring_func(df, pred_col="prediction"):

  def _getcorrs(sub_df):
    ranked_preds = sub_df[pred_col].rank(pct=True, method="first")
    return np.corrcoef(ranked_preds, sub_df[TARGET_COL])[0, 1]
  
  corrs = df.groupby("era").apply(_getcorrs)
  mean_corr = corrs.mean()
  sharpe_ratio = mean_corr / corrs.std()

  return mean_corr, sharpe_ratio



In [12]:
base_params = {
    'n_estimators': 2000, 
    'learning_rate': 0.01, 
    'num_leaves': 30, 
    'max_depth': 5, 
    'colsample_bytree': 0.1,
    }
    

n_splits = 3
purge = 5
split = 0

cv=PurgedTimeSeriesSplitGroups(n_splits, purge)

corrs_s = []
sharpes_s = []
corrs_m = []
sharpes_m = []

# get out of sample training preds via embargoed time series cross validation
print(f"entering time series cross validation loop")
for train_index, test_index in (cv.split(df_train[features], df_train[TARGET_COL], eras)):
    gc.collect()
    print(f"doing split {split+1} out of {n_splits}")
    
    train_eras = df_train.iloc[train_index, :].era.unique()
    test_eras = df_train.iloc[test_index, :].era.unique()
    train_split_index = df_train[ERA_COL].isin(train_eras)
    test_split_index = df_train[ERA_COL].isin(test_eras)

            
    print(f'training models')
    model_small = lightgbm.LGBMRegressor(**base_params)
    model_small.fit(df_train.loc[train_split_index, features_small],
              df_train.loc[train_split_index, [TARGET_COL]])
    
    model_medium = lightgbm.LGBMRegressor(**base_params)
    model_medium.fit(df_train.loc[train_split_index, features_medium],
              df_train.loc[train_split_index, [TARGET_COL]])
          
    print("predicting models")
    df_train.loc[test_split_index, "prediction_s"] = model_small.predict(df_train.loc[test_split_index, features_small])
    df_train.loc[test_split_index, "prediction_m"] = model_medium.predict(df_train.loc[test_split_index, features_medium])

    print(f"evaluating models")
    corr_s, sharpe_s = scoring_func(df_train.loc[test_split_index], pred_col="prediction_s")
    print(f'scores for model trained on small:  corr: {corr_s}, sharpe: {sharpe_s}')
    
    corr_m, sharpe_m = scoring_func(df_train.loc[test_split_index], pred_col="prediction_m")
    print(f'scores for model trained on medium:  corr: {corr_m}, sharpe: {sharpe_m}')
    
    corrs_s.append(corr_s)
    corrs_m.append(corr_m)
          
    sharpes_s.append(sharpe_s)
    sharpes_m.append(sharpe_m)
    
    split += 1

print(f"final score for small model: corr: {np.mean(corrs_s)}, sharpe: {np.mean(sharpes_s)}")
print(f"final score for medium model: corr: {np.mean(corrs_m)}, sharpe: {np.mean(sharpes_m)}")

entering time series cross validation loop
doing split 1 out of 3
training models
predicting models
evaluating models
scores for model trained on small:  corr: 0.05203828829452122, sharpe: 1.7218079348899469
scores for model trained on medium:  corr: 0.05009236402949101, sharpe: 1.5819559632983888
doing split 2 out of 3
training models
predicting models
evaluating models
scores for model trained on small:  corr: 0.05081116989184061, sharpe: 1.5833631106401396
scores for model trained on medium:  corr: 0.054754276753451944, sharpe: 1.4379924079715303
doing split 3 out of 3
training models
predicting models
evaluating models
scores for model trained on small:  corr: 0.0423433221748213, sharpe: 1.799960584928587
scores for model trained on medium:  corr: 0.04859222074436574, sharpe: 2.0458342283134963
final score for small model: corr: 0.04839759345372771, sharpe: 1.7017105434862245
final score for medium model: corr: 0.05114628717576956, sharpe: 1.6885941998611385


In [13]:
print("doing a full train of the models")
model_small = lightgbm.LGBMRegressor(**base_params)
model_small.fit(df_train.loc[:, features_small],
          df_train.loc[:, [TARGET_COL]])

model_medium = lightgbm.LGBMRegressor(**base_params)
model_medium.fit(df_train.loc[:, features_medium],
          df_train.loc[:, [TARGET_COL]])

# getting validation data
print('Reading minimal features of validation data...')
validation_data = pd.read_parquet('numeraidata/numerai_validation_data.parquet',
                                  columns=read_columns)
    
print("predicting models on validation data")
validation_data.loc[:, f"prediction_s"] = model_small.predict(validation_data.loc[:, features_small])
validation_data.loc[:, f"prediction_m"] = model_medium.predict(validation_data.loc[:, features_medium])

Reading minimal features of validation data...
predicting models on validation data


In [14]:
with open("models/lgb2000_0.01_small.pkl", "wb") as f:
    pickle.dump(model_small, f)
with open("models/lgb2000_0.01_med.pkl", "wb") as g:
    pickle.dump(model_medium, g)



In [17]:
print("compute feature correlations with target")
all_feature_corrs = df_train.groupby('era').apply(lambda d: d[features].corrwith(d[TARGET_COL]))
# compute the volatility of the feature correlations
feature_corr_volatility = all_feature_corrs.std()

print("compute feature exposures")
# calculate the feature exposures of the predictions
feature_exposure_list = []
for feature in features:
    feature_exposure_list.append(np.corrcoef(df_train[feature], df_train["prediction_m"])[0,1])
feature_exposure_list = pd.Series(feature_exposure_list, index=features)

# neutralize our predictions to the riskiest features
riskiest_features = (feature_exposure_list.abs()*feature_corr_volatility).sort_values()[-100:].index.tolist()

print("neutralizing")
validation_data[f"prediction_s_neutral_riskiest_100_0.5"] = neutralize(
    df=validation_data,
    columns=[f"prediction_s"],
    neutralizers=riskiest_features,
    proportion=0.5,
    normalize=True,
    era_col=ERA_COL
)

validation_data[f"prediction_m_neutral_riskiest_100_0.5"] = neutralize(
    df=validation_data,
    columns=[f"prediction_m"],
    neutralizers=riskiest_features,
    proportion=0.5,
    normalize=True,
    era_col=ERA_COL
)

validation_data[f"prediction_s_neutral_riskiest_100_1.0"] = neutralize(
    df=validation_data,
    columns=[f"prediction_s"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

validation_data[f"prediction_m_neutral_riskiest_100_1.0"] = neutralize(
    df=validation_data,
    columns=[f"prediction_m"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

compute feature correlations with target
compute feature exposures
neutralizing


In [19]:
validation_preds = pd.read_parquet('numeraidata/example_validation_predictions.parquet')
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]

validation_stats = validation_metrics(validation_data, ["prediction_s", "prediction_m", "prediction_s_neutral_riskiest_100_0.5", "prediction_m_neutral_riskiest_100_0.5", "prediction_s_neutral_riskiest_100_1.0", "prediction_m_neutral_riskiest_100_0.1.0"], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
validation_stats

Unnamed: 0,mean,std,sharpe,max_drawdown,apy,mmc_mean,corr_plus_mmc_sharpe,corr_with_example_preds
prediction_s,0.022195,0.031914,0.695462,-0.139106,186.253624,0.007552,0.609771,0.485103
prediction_m,0.020781,0.0315,0.659712,-0.196412,167.663651,0.002604,0.530506,0.68422
prediction_s_neutral_riskiest_100_0.5,0.02363,0.027827,0.849166,-0.108358,208.402721,0.0088,0.770073,0.478299
prediction_m_neutral_riskiest_100_0.5,0.023231,0.027464,0.845879,-0.106577,202.746749,0.003962,0.742243,0.711463
prediction_s_neutral_riskiest_100_1.0,0.022794,0.022483,1.013797,-0.055818,198.179289,0.009332,0.916884,0.420382
prediction_m_neutral_riskiest_100_0.1.0,0.023177,0.020041,1.156473,-0.067451,204.457188,0.00538,1.065684,0.639433


