In [None]:
!pip install halo pyarrow numerapi lightgbm tabulate optuna

import os
from halo import Halo
import json
import gc
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import numerapi
import lightgbm
from sklearn.model_selection import KFold
from scipy.stats import spearmanr
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    get_time_series_cross_val_splits,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)

from metrics import evaluate

from numerapi import NumerAPI
napi = NumerAPI()

current_round = napi.get_current_round(tournament=8)  # tournament 8 is the primary Numerai Tournament

# Tournament data changes every week so we specify the round in their name. Training
# and validation data only change periodically; the numerapi log of the recorded run
# shows files that already exist locally being skipped ("target file already exists").
print('Downloading dataset files...')
napi.download_dataset("numerai_training_data_int8.parquet", "training_data_int8.parquet")
# napi.download_dataset("numerai_tournament_data_int8.parquet", f"tournament_data_int8_{current_round}.parquet")
# The validation file is read by a later cell, so it has to be fetched here
# for a fresh environment to be able to run the notebook top to bottom.
napi.download_dataset("numerai_validation_data_int8.parquet", "validation_data_int8.parquet")
# napi.download_dataset("example_validation_predictions.parquet", "example_validation_predictions.parquet")
napi.download_dataset("features.json", "features.json")

with open("features.json", "r") as f:
    feature_metadata = json.load(f)

feature_sets = feature_metadata["feature_sets"]
features_small = set(feature_sets["small"])
features_medium = set(feature_sets["medium"])

in_medium_not_in_small = features_medium - features_small
# Same feature set as "small" followed by "medium minus small", but taken in
# the order listed in features.json. Iterating a set of strings gives an order
# that can differ from one interpreter run to the next, which would change the
# column order fed to the model.
features = list(dict.fromkeys([*feature_sets["small"], *feature_sets["medium"]]))

train_pq_path = "training_data_int8.parquet"

# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
df_train = pd.read_parquet(train_pq_path, columns=read_columns)

# store the era labels as integers
eras = df_train.era.astype(int)
df_train["era"] = eras

# training_data is another name for the same frame, not a copy
training_data = df_train
gc.collect()

Collecting halo
  Downloading halo-0.0.31.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pyarrow
  Downloading pyarrow-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.6 MB)
     |████████████████████████████████| 25.6 MB 1.6 MB/s             
[?25hCollecting numerapi
  Downloading numerapi-2.9.4-py3-none-any.whl (26 kB)
Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
     |████████████████████████████████| 2.0 MB 7.1 MB/s            
[?25hCollecting tabulate
  Downloading tabulate-0.8.9-py3-none-any.whl (25 kB)
Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
     |████████████████████████████████| 308 kB 26.4 MB/s            
[?25hCollecting log_symbols>=0.0.14
  Downloading log_symbols-0.0.14-py3-none-any.whl (3.1 kB)
Collecting spinners>=0.0.24
  Downloading spinners-0.0.24-py3-none-any.whl (5.5 kB)
Collecting termcolor>=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.

2022-01-16 20:49:23,234 INFO numerapi.utils: target file already exists
2022-01-16 20:49:23,235 INFO numerapi.utils: download complete
2022-01-16 20:49:23,778 INFO numerapi.utils: target file already exists
2022-01-16 20:49:23,779 INFO numerapi.utils: download complete


0



In [None]:
def era_boosting_train(model, X, y, era_col, proportion, ne, ni):
    """Fit ``model``, then repeatedly continue training on its worst-scoring eras.

    The model is first fitted on all rows. Each of the following ``ni - 1``
    rounds scores the current model per era (Spearman correlation between
    prediction and target), selects the eras whose score is at or below the
    ``proportion`` quantile of all era scores, and calls ``model.fit`` on the
    rows of those eras with ``init_model=model``. Before every re-fit a
    one-line summary (mean era correlation and mean/std of era correlations)
    is printed.

    Parameters
    ----------
    model : estimator
        Must provide ``fit(X, y, init_model=...)``, ``predict(X)`` and an
        ``n_estimators`` attribute (only read for the progress line).
    X : pandas.DataFrame
        Feature matrix. It is not modified and not copied.
    y : array-like or pandas.Series
        Target value for every row of ``X``.
    era_col : array-like or pandas.Series
        Era label for every row of ``X``.
    proportion : float
        Quantile passed to ``Series.quantile``; eras scoring at or below it
        are used for the next fit.
    ne : int
        Not used by the body; kept so existing callers keep working. The
        value shown in the progress line is read from ``model.n_estimators``.
    ni : int
        Total number of fits: the initial one plus ``ni - 1`` re-fits.

    Returns
    -------
    The same ``model`` object after the last fit.
    """
    model.fit(X, y)

    # Only the bookkeeping columns are held next to X; copying the whole
    # feature matrix (and re-copying it for every era mask) is unnecessary.
    scored = pd.DataFrame({"era": era_col, TARGET_COL: y}, index=X.index)

    for i in range(ni - 1):
        scored["pred"] = model.predict(X)

        # One score per era, in order of first appearance. The Series is built
        # in a single constructor call (float32, as before) rather than by
        # assigning items into a pre-allocated float32 Series.
        era_scores = pd.Series(
            {
                era: spearmanr(era_rows["pred"], era_rows[TARGET_COL])[0]
                for era, era_rows in scored.groupby("era", sort=False, observed=True)
            },
            dtype="float32",
        )

        worst_eras = era_scores[era_scores <= era_scores.quantile(proportion)].index
        in_worst_era = scored["era"].isin(worst_eras).to_numpy()

        # Ordered by era label so the summary is accumulated in a fixed order.
        era_scores = era_scores.sort_index()

        print(f"ne: {model.n_estimators}, iter: {i+1}, mean corr: {np.mean(era_scores)}, sharpe: {np.mean(era_scores)/np.std(era_scores)}")

        # Continue training from the current model using only the worst eras.
        model.fit(X.loc[in_worst_era], scored.loc[in_worst_era, TARGET_COL], init_model=model)
    return model



In [None]:
# Era boosting runs ni fits of ne estimators each (one initial fit followed by
# ni - 1 continuation rounds), so ne * ni estimators are built in total.

ni = 9
ne = 250
proportion = 0.5
param_grid = dict(
    n_estimators=ne,
    learning_rate=0.003,
    num_leaves=25,
    max_depth=6,
    colsample_bytree=0.1,
    max_bin=50,
)

# The regressor is created here and trained inside era_boosting_train,
# which hands back the same object.
temp_model = lightgbm.LGBMRegressor(**param_grid)
model = era_boosting_train(
    model=temp_model,
    X=training_data[features],
    y=training_data[TARGET_COL],
    era_col=training_data['era'],
    proportion=proportion,
    ne=ne,
    ni=ni,
)

ne: 250, iter: 1, mean corr: 0.06882654875516891, sharpe: 2.3689191621029577
ne: 250, iter: 2, mean corr: 0.07703043520450592, sharpe: 4.084805086898683
ne: 250, iter: 3, mean corr: 0.0848427340388298, sharpe: 5.144846619523415
ne: 250, iter: 4, mean corr: 0.08879073709249496, sharpe: 6.47585416343457
ne: 250, iter: 5, mean corr: 0.09365268051624298, sharpe: 6.301480959693916
ne: 250, iter: 6, mean corr: 0.09600390493869781, sharpe: 7.923205554730569
ne: 250, iter: 7, mean corr: 0.10016566514968872, sharpe: 6.990891605299559
ne: 250, iter: 8, mean corr: 0.10243947803974152, sharpe: 9.374425154250295


In [None]:
# Load the validation rows with the same column subset that was read for
# training, then store the model's output in a "prediction" column.
print('Reading minimal features of validation and tournament data...')
validation_data = pd.read_parquet(
    'validation_data_int8.parquet',
    columns=read_columns,
)

print("predicting on validation data")
validation_data["prediction"] = model.predict(validation_data[features])


Reading minimal features of validation and tournament data...
predicting on validation data


In [None]:
# Correlation of every feature with the target, computed separately per era
# on the training data.
all_feature_corrs = training_data.groupby('era').apply(
    lambda era_df: era_df[features].corrwith(era_df[TARGET_COL])
)
# Standard deviation of each feature's correlation across eras.
feature_corr_volatility = all_feature_corrs.std()

# Feature exposure: correlation of each feature with the validation predictions.
feature_exposure_list_val = pd.Series(
    [
        np.corrcoef(validation_data[name], validation_data["prediction"])[0, 1]
        for name in features
    ],
    index=features,
)

# Rank features by |exposure| * volatility and keep the last 100 entries of
# the ascending sort.
riskiest_features = (
    (feature_exposure_list_val.abs() * feature_corr_volatility)
    .sort_values()
    .index[-100:]
    .tolist()
)



In [None]:
# Largest absolute correlation between any single feature and the validation predictions.
feature_exposure_list_val.abs().max()

0.5022857243144736



In [None]:
# Neutralize the validation predictions against the riskiest features and
# keep the result in its own column next to the raw predictions.
# NOTE(review): the column suffix says 50, while riskiest_features is taken
# from a [-100:] slice — confirm which number is intended.
validation_data["preds_neutral_riskiest_50"] = neutralize(
    df=validation_data,
    columns=["prediction"],
    neutralizers=riskiest_features,
    era_col=ERA_COL,
    proportion=0.75,
    normalize=True,
)




In [None]:
# Report metrics for the raw and the neutralized validation predictions; the
# recorded output lists Spearman correlation, Sharpe ratio and max feature
# exposure for each column.
evaluate(validation_data, ["prediction", "preds_neutral_riskiest_50"])

Metrics for prediction
Spearman Correlation: 0.021
Sharpe Ratio: 0.664
Max Feature Exposure:  0.513
Metrics for preds_neutral_riskiest_50
Spearman Correlation: 0.0224
Sharpe Ratio: 0.942
Max Feature Exposure:  0.1959


Unnamed: 0,prediction,preds_neutral_riskiest_50
spearman,0.021,0.0224
sharpe,0.664,0.942
max_feature_exposure,0.513,0.1959




In [None]:
# Download this round's tournament data and score it with the trained model.
# The model was fitted on the int8 training file and checked on the int8
# validation file, so the tournament features must come from the matching
# int8 file to reach the model in the same encoding.
print('Reading minimal features of tournament data...')
# "int8" is part of the local name so that a non-int8 file saved earlier under
# the old name is not picked up in place of a fresh download.
tournament_pq_path = f"tournament_data_int8_{current_round}.parquet"
napi.download_dataset("numerai_tournament_data_int8.parquet", tournament_pq_path)

tournament_data = pd.read_parquet(tournament_pq_path, columns=read_columns)

print("predicting on tournament data")
tournament_data.loc[:, "prediction"] = model.predict(
    tournament_data.loc[:, features])

predicting on tournament data


In [None]:
# Feature exposure: correlation of each feature with the tournament predictions.
feature_exposure_list_tourny = pd.Series(
    [
        np.corrcoef(tournament_data[name], tournament_data["prediction"])[0, 1]
        for name in features
    ],
    index=features,
)

# Re-select the riskiest features from the tournament exposures: rank by
# |exposure| * volatility and keep the last 100 entries of the ascending sort.
riskiest_features = (
    (feature_exposure_list_tourny.abs() * feature_corr_volatility)
    .sort_values()
    .index[-100:]
    .tolist()
)

# Neutralize the tournament predictions against those features.
tournament_data["preds_neutral_riskiest_100"] = neutralize(
    df=tournament_data,
    columns=["prediction"],
    neutralizers=riskiest_features,
    era_col=ERA_COL,
    proportion=0.75,
    normalize=True,
)




In [None]:
# Replace the raw predictions with the percentile rank of the neutralized
# ones, then write that column (together with the frame's index) to this
# round's CSV file.
tournament_data["prediction"] = (
    tournament_data["preds_neutral_riskiest_100"].rank(pct=True)
)
tournament_data["prediction"].to_csv(
    f"tournament_predictions_{current_round}.csv"
)

