Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


# Evaluating Financial Machine Learning Models on Numerai

In [None]:
import tensorflow as tf

# Report the default GPU device TensorFlow sees, or ask for a GPU build when
# tf.test.gpu_device_name() returns an empty (falsy) name.
print(
    'Default GPU Device:{}'.format(tf.test.gpu_device_name())
    if tf.test.gpu_device_name()
    else "Please install GPU version of TF"
)


Please install GPU version of TF


In [None]:
# Capture the output lines of the `nvidia-smi` shell command (IPython `!` syntax)
# and join them into a single string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in that output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print(
      'To enable a high-RAM runtime, select the Runtime > "Change runtime type"',
      'menu, and then select High-RAM in the Runtime shape dropdown. Then, ',
      're-execute this cell.',
      sep='\n',
  )

Your runtime has 38.0 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
!pip install numerapi

Collecting numerapi
  Downloading https://files.pythonhosted.org/packages/81/9d/c583893e96721821560e48aea92dd22aef9fc727151f1efae8f8dc885555/numerapi-2.3.9-py3-none-any.whl
Installing collected packages: numerapi
Successfully installed numerapi-2.3.9


In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################


In [None]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from keras.optimizers import SGD,Adam
from keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import matplotlib as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
'''
##################### CLASSES AND FUNCTIONS ##################################
'''
def my_loss_fn(y_true, y_pred):
    """Mean squared error between targets and predictions.

    The squared residuals are averaged over the last axis only, so one loss
    value is returned per leading index.
    """
    residual = y_true - y_pred
    return tf.reduce_mean(tf.square(residual), axis=-1)
    
def Cluster_Kmeans(df,X,clusters):
    """Fit K-means on ``X`` and return ``df`` with the cluster labels attached.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the labels; it must have one row per row of ``X``.
    X : array-like
        Data passed to ``KMeans.fit``.
    clusters : int
        Number of clusters to fit.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra ``'k-means'`` column holding the label
        of each row. The caller's ``df`` is left untouched.
    """
    print('Call algorithm K-means')
    # `rand` is the notebook-level random state defined in the seed cell.
    kmeans = KMeans(n_clusters=clusters, random_state=rand, init = 'random')
    print('Fitting algorithm K-means')
    kmeans.fit(X)
    print('Finished and Fitted')
    # Copy first: a plain assignment would only alias `df`, so adding the
    # column would silently modify the caller's frame as well.
    X_clusters_kmeans = df.copy()
    X_clusters_kmeans['k-means'] = kmeans.labels_
    return X_clusters_kmeans

def KMeans_Clustering_XGBRegressor(self,df,group,features):
    """Fit an XGBoost regressor on one cluster's rows.

    Parameters
    ----------
    self, df :
        Unused; kept so existing call sites keep working.
    group : pandas.DataFrame
        Rows of a single cluster; must contain the ``features`` columns and a
        ``'target'`` column.
    features : list of str
        Names of the feature columns to train on.

    Returns
    -------
    tuple
        ``(model, x_train, x_test, y_train, y_test)`` where the split is a
        random 80/20 partition of ``group``.
    """
    # Select X and Y straight from `group` instead of writing a 'target'
    # column onto a slice (which triggers SettingWithCopyWarning) and then
    # recovering X/Y by column position.
    feature_list = [name for name in features if name != 'target']
    X = group[feature_list]
    Y = group['target']
    x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size=0.20,random_state=rand)

    model = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, verbosity=1, nthread=6)
    model.fit(x_train, y_train)

    return model, x_train,x_test,y_train,y_test

In [None]:
##############################################################################
########################## DOWNLOAD DATA #####################################
##############################################################################

In [None]:
seed = 3
# np.random.seed() seeds the global NumPy generator and returns None, so the
# integer itself is kept in `rand`; that is what the later `random_state=rand`
# and `seed=rand` arguments need in order to be reproducible.
np.random.seed(seed)
rand = seed

In [None]:
# Client created without credentials; used here to fetch the current round's data.
napi = numerapi.NumerAPI(verbosity="info")
napi.download_current_dataset(unzip=True)

# Current round number, and the dataset directory name derived from it
# (used below as the folder that holds the CSV files).
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

./numerai_dataset_242.zip: 383MB [00:07, 49.8MB/s]                           
2020-12-13 08:25:32,908 INFO numerapi.base_api: unzipping file...


In [None]:
#https://forum.numer.ai/t/model-diagnostics-risk-metrics/900

# Tournament identifier and the column names used throughout the notebook.
TOURNAMENT_NAME = "nomi"
TARGET_NAME = "target"
PREDICTION_NAME = "prediction"

# Parameters of payout(): scores are shifted by BENCHMARK and divided by BAND.
BENCHMARK = 0
BAND = 0.2

#-----------------------------------------------------

# Submissions are scored by spearman correlation
def score(df):
    """Correlation between the target and the percentile-ranked prediction.

    Reads the ``TARGET_NAME`` and ``PREDICTION_NAME`` columns of ``df``. Only
    the prediction is ranked; ``method="first"`` breaks ties by row order.
    """
    ranked_predictions = df[PREDICTION_NAME].rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(df[TARGET_NAME], ranked_predictions)
    return correlation_matrix[0, 1]

# The payout function
def payout(scores):
    """Shift scores by ``BENCHMARK``, scale by ``BAND`` and clip to [-1, 1].

    ``scores`` must support ``.clip(lower=, upper=)`` (e.g. a pandas Series).
    """
    scaled = (scores - BENCHMARK) / BAND
    return scaled.clip(lower=-1, upper=1)


# Read the csv file into a pandas Dataframe
def read_csv(file_path):
    """Load a CSV, storing feature/target columns as float16.

    The header row is read first so that every column whose name starts with
    ``feature`` or ``target`` can be given the ``np.float16`` dtype; all other
    columns keep the dtype pandas infers.
    """
    with open(file_path, 'r') as handle:
        header = next(csv.reader(handle))

    half_precision_prefixes = ('feature', 'target')
    dtypes = {}
    for column in header:
        if column.startswith(half_precision_prefixes):
            dtypes[column] = np.float16

    return pd.read_csv(file_path, dtype=dtypes)

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [None]:
%%time
print("# Loading data...")

# Both files live in the round directory and are indexed by their "id" column.
training_data = read_csv(
    os.path.join(latest_round, "numerai_training_data.csv")
).set_index("id")
tournament_data = read_csv(
    os.path.join(latest_round, "numerai_tournament_data.csv")
).set_index("id")

# Validation rows are the tournament rows whose data_type is "validation".
validation_data = tournament_data[tournament_data["data_type"] == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 1min 3s, sys: 2.5 s, total: 1min 6s
Wall time: 1min 11s


In [None]:
# Feature columns are the training columns whose name starts with "feature".
feature_names = list(
    filter(lambda column: column.startswith("feature"), training_data.columns)
)
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
##############################################################################
########################  DIVIDE DATA  in X and Y  ###########################
##############################################################################

In [None]:
# Cluster the training rows into 10 groups using the feature columns.
# The original cell fitted on an `X` that no visible cell defines; the feature
# matrix is the only input consistent with the later
# `kmeans.predict(tournament_data[...])` call.
print('Call algorithm K-means')
kmeans = KMeans(n_clusters=10, random_state=rand, init = 'random')
print('Fitting algorithm K-means')
kmeans.fit(training_data[feature_names])
print('Finished and Fitted')
# NOTE: this is an alias, not a copy, so the 'k-means' column is also added
# to `training_data` itself.
X_clusters_kmeans = training_data
X_clusters_kmeans['k-means'] = kmeans.labels_

In [None]:
# Positional row counter (0..n-1) for each frame.
training_data["id_seq"] = np.arange(len(training_data), dtype=np.int64)
tournament_data["id_seq"] = np.arange(len(tournament_data), dtype=np.int64)

In [None]:
# One frame per K-means label: X_group_<k+1> holds the rows labelled k.
(
    X_group_1, X_group_2, X_group_3, X_group_4, X_group_5,
    X_group_6, X_group_7, X_group_8, X_group_9, X_group_10,
) = (
    X_clusters_kmeans.loc[X_clusters_kmeans['k-means'] == cluster_id]
    for cluster_id in range(10)
)

In [None]:
def Grouping_Models(df_train, fit=False):
    """Build a compiled dense regression network for one cluster of rows.

    The network has three Dense layers (n, n // 2 and n // 4 units, where n is
    the number of feature columns) with LeakyReLU(alpha=0.1) activations and a
    single sigmoid output. It is compiled with Adam and ``my_loss_fn``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Rows of one cluster; must contain the ``feature_names`` columns and a
        ``'target'`` column.
    fit : bool, default False
        When False (the previous behaviour, where the ``model.fit`` call was
        commented out) the model is returned compiled but UNTRAINED, so its
        predictions come from randomly initialised weights. When True the
        model is trained for 50 epochs on a random 80/20 split of ``df_train``.

    Returns
    -------
    tf.keras.Model
    """
    # The original body referenced `feature_cols`, which no visible cell
    # defines; `feature_names` is the notebook's list of feature columns.
    n_features = len(feature_names)

    input_data = tf.keras.Input(shape=(n_features,))
    # Other activations that were considered:
    # tf.keras.layers.PReLU(alpha_initializer="zeros", alpha_regularizer=None, alpha_constraint=None, shared_axes=None)
    # tf.keras.layers.ELU(alpha=1.0)
    # tf.keras.layers.ThresholdedReLU(theta=1.0)
    # tf.keras.layers.LeakyReLU(alpha=0.3)
    layer = tf.keras.layers.LeakyReLU(alpha=0.1)
    x = tf.keras.layers.Dense(n_features, activation=layer)(input_data)
    x = tf.keras.layers.Dense(n_features // 2, activation=layer)(x)
    x = tf.keras.layers.Dense(n_features // 4, activation=layer)(x)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    optimizer = tf.keras.optimizers.Adam( learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name="Adam")
    model = tf.keras.Model(input_data, output)
    # model.compile(optimizer=optimizer, loss=pearson_cumsom_loss, metrics=['mae', 'mse'])
    model.compile(optimizer=optimizer, loss=my_loss_fn, metrics=['mae', 'mse'])

    if fit:
        epochs = 50
        # Never let the batch size reach 0 for groups smaller than 1000 rows.
        batch_size = max(1, len(df_train) // 1000 * 3)
        # Select X and Y directly instead of adding a target column to a slice.
        X = df_train[feature_names]
        Y = df_train['target']
        x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size=0.20,random_state=rand)
        model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test, y_test))
    return model

In [None]:
# Compiled network for the training rows K-means labelled 0.
model_group_1 = Grouping_Models(X_group_1)

In [None]:
# Compiled network for the training rows K-means labelled 1.
model_group_2 = Grouping_Models(X_group_2)

In [None]:
# One network per remaining cluster frame, built in cluster order.
(
    model_group_2, model_group_3, model_group_4,
    model_group_5, model_group_6, model_group_7,
    model_group_8, model_group_9, model_group_10,
) = map(
    Grouping_Models,
    (
        X_group_2, X_group_3, X_group_4,
        X_group_5, X_group_6, X_group_7,
        X_group_8, X_group_9, X_group_10,
    ),
)

In [None]:
########################## PREDICTIONS ON LABELS ##############################

In [None]:
# Label every tournament row with the cluster fitted on the training data.
# This has to happen BEFORE the split below, which reads the 'k-means' column;
# the two cells were previously in the opposite order and used `feature_cols`,
# which no visible cell defines.
labels_tournament = kmeans.predict(tournament_data[feature_names])
tournament_data['k-means'] = labels_tournament

# X_group_<k+1>_td holds the tournament rows labelled k.
(
    X_group_1_td, X_group_2_td, X_group_3_td, X_group_4_td, X_group_5_td,
    X_group_6_td, X_group_7_td, X_group_8_td, X_group_9_td, X_group_10_td,
) = (
    tournament_data.loc[tournament_data['k-means'] == cluster_id]
    for cluster_id in range(10)
)

In [None]:
def _predict_group(model, group_df):
    """Return the per-row predictions of ``model`` for one cluster frame.

    The result is indexed like ``group_df`` (the frames in this notebook are
    indexed by "id", so ``group_df['id']`` does not exist as a column) and has
    two columns: ``prediction`` and ``original_target``.
    """
    frame = pd.DataFrame(index=group_df.index)
    if len(group_df) == 0:
        # Nothing to predict for an empty cluster.
        frame["prediction"] = np.array([], dtype=np.float32)
    else:
        # `feature_names` replaces the undefined `feature_cols`; ravel() turns
        # the (n, 1) network output into a flat column.
        frame["prediction"] = np.asarray(model.predict(group_df[feature_names])).ravel()
    frame["original_target"] = group_df["target"]
    return frame


prediction_group_1 = _predict_group(model_group_1, X_group_1_td)
prediction_group_2 = _predict_group(model_group_2, X_group_2_td)
prediction_group_3 = _predict_group(model_group_3, X_group_3_td)
prediction_group_4 = _predict_group(model_group_4, X_group_4_td)
prediction_group_5 = _predict_group(model_group_5, X_group_5_td)
prediction_group_6 = _predict_group(model_group_6, X_group_6_td)
prediction_group_7 = _predict_group(model_group_7, X_group_7_td)
prediction_group_8 = _predict_group(model_group_8, X_group_8_td)
prediction_group_9 = _predict_group(model_group_9, X_group_9_td)
prediction_group_10 = _predict_group(model_group_10, X_group_10_td)

In [None]:
########################## UNION OF PREDICTIONS ##############################

In [None]:
# Stack the predictions of the first two clusters.
vertical_stack = pd.concat([prediction_group_1, prediction_group_2], axis=0)
# DataFrame.sort_values requires `by`; the bare call raised a TypeError.
# The sorted view is only displayed; `vertical_stack` itself stays unsorted.
vertical_stack.sort_values(by="prediction")

In [None]:
# Show the prediction frame of the cluster labelled 1 as this cell's output.
prediction_group_2

In [None]:
# Show the stacked predictions as this cell's output.
vertical_stack

In [None]:
####################################################################################################################################################################
####################################################################################################################################################################
####################################################################################################################################################################
####################################################################################################################################################################
####################################################################################################################################################################

In [None]:
##################################################################
#####################   MORE METRICS   ###########################
##################################################################

In [None]:
# Prefixes prepended to metric names to tell training metrics from validation metrics.
TRAIN_EVAL_PREFIX = "train"
VAL_EVAL_PREFIX = "val"

#Some evaluation metrics
def ar1(x):
    """Lag-1 autocorrelation: correlation of ``x`` with itself shifted by one step."""
    earlier, later = x[:-1], x[1:]
    return np.corrcoef(earlier, later)[0,1]

def autocorr_penalty(x):
    """Penalty factor derived from the lag-1 autocorrelation of ``x``.

    Computes ``sqrt(1 + 2 * sum_{i=1}^{n-1} ((n - i) / n) * p**i)`` where
    ``n = len(x)`` and ``p = ar1(x)``.
    """
    length = len(x)
    rho = ar1(x)
    weighted_terms = [((length - lag)/length)*rho**lag for lag in range(1, length)]
    return np.sqrt(1 + 2*np.sum(weighted_terms))

def smart_sharpe(x):
    """Mean of ``x`` divided by its sample std (ddof=1) times ``autocorr_penalty(x)``."""
    penalised_volatility = np.std(x, ddof=1) * autocorr_penalty(x)
    return np.mean(x) / penalised_volatility

def numerai_sharpe(x):
    """Sharpe ratio of ``x`` after subtracting 0.010415154 from the mean, scaled by sqrt(12).

    Uses the population standard deviation (``np.std`` default).
    """
    excess_mean = np.mean(x) - 0.010415154
    return (excess_mean / np.std(x)) * np.sqrt(12)

def spearmanr(target, pred):
    """Correlation between ``target`` and the percentile rank of ``pred``.

    Only ``pred`` is ranked (it must provide ``.rank``, e.g. a pandas Series);
    ``method="first"`` breaks ties by position. ``target`` is used as given.
    """
    pred_pct_rank = pred.rank(pct=True, method="first")
    return np.corrcoef(target, pred_pct_rank)[0, 1]

#-----------------------------------------------------
def get_baisc_per_era_metrics(df:pd.DataFrame, 
                        isVal=None, 
                        fig_name="per_era_scores.png") -> pd.Series:
    """Compute per-era correlation metrics and plot the per-era scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``era``, ``TARGET_NAME`` and ``PREDICTION_NAME`` columns
        (and ``data_type`` when ``isVal`` is truthy).
    isVal : bool, optional
        Truthy: keep only rows with ``data_type == "validation"`` and prefix
        metric names with ``VAL_EVAL_PREFIX``. Falsy: use all rows and
        ``TRAIN_EVAL_PREFIX``.
    fig_name : str
        Suffix of the saved bar chart; the file is ``f"{prefix}_{fig_name}"``.

    Returns
    -------
    pandas.Series
        Prediction summary statistics followed by per-era score statistics.
    """
    def _era_order(label):
        # Sort "era2" before "era10": compare the number embedded in the
        # label; labels without digits go last, in alphabetical order.
        digits = "".join(ch for ch in str(label) if ch.isdigit())
        return (0, int(digits), str(label)) if digits else (1, 0, str(label))

    scores = pd.Series(dtype=float)

    # NOTE(review): taken BEFORE the validation filter below, so for isVal the
    # mean/std/less_than_* statistics cover every row of `df`, not only the
    # validation rows — confirm this is intended.
    preds_ = df[PREDICTION_NAME]
    #Some checks for deciding between training and tournament data
    if isVal:
        #scores["tournament_corr_example_preds"] = spearmanr(preds_, example_preds[PREDICTION_NAME])
        df = df[df.data_type == "validation"]
        prefix=VAL_EVAL_PREFIX
        print("predicting on validation...")
    else:
        prefix=TRAIN_EVAL_PREFIX
        print("predicting on train...")

    #-----------------------------------------------------

    #Metric Calculations
    print("getting per era scores")
    era_scores = df.groupby("era").apply(
        lambda x: spearmanr(x[TARGET_NAME], x[PREDICTION_NAME]))

    # groupby orders string labels lexicographically (era1, era10, era100, ...).
    # The autocorrelation and smart-sharpe metrics below depend on the order
    # of the scores, so put the eras in numeric order first.
    era_scores = era_scores.loc[sorted(era_scores.index, key=_era_order)]
    era_scores.plot(kind="bar")
    print("performance over time")
    plt.pyplot.savefig(f"{prefix}_{fig_name}")
    plt.pyplot.show()

    #-----------------------------------------------------
    
    scores[f"{prefix}_mean"] = preds_.mean()
    scores[f"{prefix}_std_dev"] = preds_.std()
    scores[f"{prefix}_less_than_half"] = (preds_<0.5).mean()
    scores[f"{prefix}_less_than_mean"] = (preds_<preds_.mean()).mean()

    scores[f"{prefix}_autocorrelation"] = ar1(era_scores)
    scores[f"{prefix}_mean correlation"] = np.mean(era_scores)
    scores[f"{prefix}_Median Correlation"] = np.median(era_scores)
    scores[f"{prefix}_Variance"] = np.var(era_scores)
    scores[f"{prefix}_Std. Dev."] = np.std(era_scores)
    scores[f"{prefix}_sharpe"] = np.mean(era_scores)/np.std(era_scores)
    scores[f"{prefix}_smart sharpe"] = smart_sharpe(era_scores)
    scores[f"{prefix}_Numerai sharpe"] = numerai_sharpe(era_scores)

    print(scores)
    del era_scores
    del preds_
    gc.collect()
    return scores


In [None]:
def neutralize(df, columns, by, proportion=1.0):
    """Remove the part of ``df[columns]`` that is linear in ``df[by]``.

    A constant column (filled with the mean of the scores) is appended to the
    exposures, ``proportion`` times the least-squares projection of the scores
    onto the exposures is subtracted, and the result is divided by its
    standard deviation.
    """
    scores = df[columns]
    feature_matrix = df[by].values

    # constant column to make sure the series is completely neutral to exposures
    constant_column = np.full((len(feature_matrix), 1), np.mean(scores))
    exposures = np.hstack((feature_matrix, constant_column))
    gc.collect()

    projection = exposures.dot(np.linalg.pinv(exposures).dot(scores))
    scores = scores - proportion * projection
    gc.collect()
    return scores / scores.std()


def calculate_feature_exposure(df, feature_names) -> list:
    """Summarise how strongly the predictions correlate with each feature.

    For every name in ``feature_names`` the correlation between that column
    and the ranked ``PREDICTION_NAME`` column is computed with ``spearmanr``.

    Returns
    -------
    list
        ``[std of the exposures, max absolute exposure, sum of squared exposures]``.
    """
    exposures = [
        spearmanr(df[feature_name], df[PREDICTION_NAME])
        for feature_name in feature_names
    ]

    feature_exposure = np.std(exposures)
    max_feat_exposure = np.max(np.abs(exposures))
    square_sum_feature_exposure = np.sum([value**2 for value in exposures])

    return [feature_exposure, max_feat_exposure, square_sum_feature_exposure]


def get_more_metrics(df, feature_names, isVal=None) -> pd.Series:
    """Compute feature-neutral mean, feature exposure and max drawdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``era``, ``TARGET_NAME``, ``PREDICTION_NAME`` and the
        ``feature_names`` columns (and ``data_type`` when validation rows are
        to be selected). The frame is not modified.
    feature_names : list of str
        Feature columns used for neutralisation and exposure.
    isVal : bool, optional
        None: treated as True when ``df["data_type"]`` contains "validation".
        Truthy: only validation rows are evaluated and metrics are prefixed
        with ``VAL_EVAL_PREFIX``; otherwise ``TRAIN_EVAL_PREFIX``.

    Returns
    -------
    pandas.Series
    """
    def _era_order(label):
        # Sort "era2" before "era10": compare the number embedded in the
        # label; labels without digits go last, in alphabetical order.
        digits = "".join(ch for ch in str(label) if ch.isdigit())
        return (0, int(digits), str(label)) if digits else (1, 0, str(label))

    more_metrics = pd.Series(dtype=float)
    assert PREDICTION_NAME in df.columns

    if isVal is None:
        isVal = "validation" in df["data_type"].unique() #max CPU times: user 65.1 ms

    print(isVal)
    if isVal:
        df = df[df["data_type"]=="validation"]
        metric_prefix = VAL_EVAL_PREFIX
    else:
        metric_prefix = TRAIN_EVAL_PREFIX

    #-----------------------------------------------------

    #per-era scores

    print("predicting per-era scores...")
    # spearmanr(target, pred) ranks its SECOND argument, so the prediction
    # must be passed second; the previous call ranked the target instead.
    scores_per_era = df.groupby("era").apply(
        lambda era_df: spearmanr(era_df[TARGET_NAME], era_df[PREDICTION_NAME]))
    # Chronological (numeric) era order; the drawdown below is order-dependent
    # and groupby orders string labels lexicographically.
    scores_per_era = scores_per_era.loc[sorted(scores_per_era.index, key=_era_order)]

    # NOTE: the key says "var" but the stored value is the standard deviation;
    # the key is kept so existing consumers of the metric table keep working.
    more_metrics[f"{metric_prefix}_var"] = scores_per_era.std()

    #-----------------------------------------------------

    #Neutralize
    #This takes a significant amount of memory for calculation
    print(df.shape)
    print("Neutralizing...")
    neutral_name = f"neutral_{PREDICTION_NAME}"
    # Keep the neutralised predictions in a small side frame rather than
    # writing a new column onto `df`, which is either a filtered slice
    # (SettingWithCopyWarning) or the caller's own frame.
    neutral_frame = pd.DataFrame({
        "era": df["era"],
        TARGET_NAME: df[TARGET_NAME],
        neutral_name: neutralize(df, PREDICTION_NAME, feature_names),
    })
    feature_neutral_mean = neutral_frame.groupby("era").apply(
        lambda era_df: spearmanr(era_df[TARGET_NAME], era_df[neutral_name])).mean()

    more_metrics[f"{metric_prefix}_feature_neutral_mean"] = feature_neutral_mean
    del neutral_frame
    gc.collect()

    #-----------------------------------------------------
    print("Calculating Feature Exposure...")
    feature_exposure, max_feat_exposure, square_sum_feature_exposure = calculate_feature_exposure(df, feature_names)

    more_metrics[f"{metric_prefix}_feat_exposure"] = feature_exposure
    more_metrics[f"{metric_prefix}_max_feat_exposure"] = max_feat_exposure
    more_metrics[f"{metric_prefix}_square_sum_feature_exposure"] = square_sum_feature_exposure


    #-----------------------------------------------------
    print("Drawdown...")
    # Largest gap between the running maximum (100-era window) of the
    # compounded per-era scores and their current value.
    daily_value = (scores_per_era+1).cumprod()
    rolling_max = daily_value.rolling(window=100, min_periods=1).max()
    max_drawdown = (rolling_max - daily_value).max()

    more_metrics[f"{metric_prefix}_max_drawdown"] = max_drawdown

    return more_metrics

In [None]:
def get_all_metrics(model, 
                    feature_names:list=feature_names, 
                    fig_name="per_era_scores")->pd.Series:
    """Predict with ``model`` and gather every training and validation metric.

    Side effect: the predictions are written to the ``PREDICTION_NAME`` column
    of the notebook-level ``training_data`` and ``tournament_data`` frames.

    Returns
    -------
    pandas.Series
        Basic training metrics, basic validation metrics, extra training
        metrics and extra validation metrics, concatenated in that order.
    """
    # Store the model output on both global frames before evaluating.
    training_data[PREDICTION_NAME] = np.array(
        model.predict(training_data[feature_names].values)
    ).reshape(-1,1)
    tournament_data[PREDICTION_NAME] = np.array(
        model.predict(tournament_data[feature_names].values)
    ).reshape(-1,1)

    print("evaluating on training data...")
    train_basic = get_baisc_per_era_metrics(training_data, isVal=False, fig_name=fig_name)
    train_extra = get_more_metrics(training_data, feature_names ,isVal=False)
    gc.collect()

    print("evaluating on validation data...")
    val_basic = get_baisc_per_era_metrics(tournament_data, isVal=True, fig_name=fig_name)
    val_extra = get_more_metrics(tournament_data, feature_names ,isVal=True)
    gc.collect()

    ordered_parts = [train_basic, val_basic, train_extra, val_extra]
    return pd.concat(ordered_parts)
    

In [None]:
#############################################################################
###########################  CREATING SOME MODELS  ##########################
#############################################################################

In [None]:
# Registry of candidate models, keyed by name. Keras models are recognised
# later by the substring "keras" in their key.
models = dict()

#Neural Net
# The input width must match the number of feature columns the model is
# fitted on; it was hard-coded to 125 while the dataset has
# len(feature_names) features.
nn_model = tf.keras.models.Sequential([
                                       tf.keras.layers.Input(shape=(len(feature_names),)),
                                       tf.keras.layers.Dense(64, activation="relu"),
                                       tf.keras.layers.Dense(1, activation="sigmoid")
])
nn_model.compile(loss="mse", optimizer="adam", metrics = [tf.keras.metrics.RootMeanSquaredError()])

models["keras_mlp_simple"] = nn_model

model_XGBRegressor_1 = XGBRegressor(max_depth=10, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, nthread=15)
models["model_XGB_1"] = model_XGBRegressor_1


model_XGBRegressor_2 = XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=50, colsample_bytree=0.5, nthread=15)
models["model_XGB_2"] = model_XGBRegressor_2

model_XGBRegressor_3 = XGBRegressor(colsample_bytree=0.4,
            gamma=0,
            learning_rate=0.07,
            max_depth=3,
            min_child_weight=1.5,
            n_estimators=1500,
            reg_alpha=0.75,
            reg_lambda=0.45,
            subsample=0.6,
            seed=64)

models["model_XGB_3"] = model_XGBRegressor_3

# Same hyper-parameters as model 3 except for half the trees and the
# notebook-level random state.
model_XGBRegressor_4 = XGBRegressor(colsample_bytree=0.4,
            gamma=0,
            learning_rate=0.07,
            max_depth=3,
            min_child_weight=1.5,
            n_estimators=750,
            reg_alpha=0.75,
            reg_lambda=0.45,
            subsample=0.6,
            seed=rand)

models["model_XGB_4"] = model_XGBRegressor_4

# The models stay reachable through `models`; only the extra names are dropped.
del nn_model
del model_XGBRegressor_1
del model_XGBRegressor_2
del model_XGBRegressor_3
del model_XGBRegressor_4

gc.collect()

140

In [None]:
# Show the dictionary of configured models as this cell's output.
models

In [None]:
#############################################################################
####################   TRAINING MORE MODELS   ###############################
#############################################################################

In [None]:
%%time
# Fit every registered model on the full training set. Keras models (key
# contains "keras") also get batch/epoch settings and the validation rows.
for model_name, estimator in models.items():
    print(f"Fitting {model_name}...")

    fit_options = {}
    if "keras" in model_name:
        fit_options = dict(
            batch_size=512,
            epochs=40,
            validation_data=(validation_data[feature_names].values, validation_data[TARGET_NAME].values),
        )

    estimator.fit(
        training_data[feature_names].values,
        training_data[TARGET_NAME].values,
        **fit_options
    )

    gc.collect()

In [None]:
##############################################################################
########################    EVALUATING MODELS    #############################
##############################################################################

In [None]:
%%time
# Metric Series of every fitted model, keyed by model name.
all_model_metrics = {}
for model_name, fitted_model in models.items():

    print(f"\n----{model_name}----")
    model_metrics = get_all_metrics(fitted_model, feature_names, fig_name = f"{model_name}.png")
    all_model_metrics[model_name] = model_metrics

    gc.collect()
    

In [None]:
# One column per model, one row per metric.
metric_df = pd.DataFrame(all_model_metrics)
metric_df

In [None]:
# One row per model; flag the models whose validation Numerai sharpe exceeds 1.
metric_trans = metric_df.transpose()
metric_trans["val_Numerai sharpe"].gt(1)

keras_mlp_simple     True
model_XGB_1          True
model_XGB_2          True
model_XGB_3          True
model_XGB_4         False
Name: val_Numerai sharpe, dtype: bool

In [None]:
##############################################################################
######################## MAKE PREDICTIONS ####################################
##############################################################################

In [None]:
# Model used for the submission, trained on the full training set.
model_XGBRegressor_2 = XGBRegressor(colsample_bytree=0.4,
            gamma=0,
            learning_rate=0.07,
            max_depth=3,
            min_child_weight=1.5,
            n_estimators=750,
            reg_alpha=0.75,
            reg_lambda=0.45,
            subsample=0.6,
            seed=rand)
# The target is read from the training frame; the `Y_train` used here before
# is not defined in any visible cell.
model_XGBRegressor_2.fit(training_data[feature_names], training_data[TARGET_NAME])



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0,
             importance_type='gain', learning_rate=0.07, max_delta_step=0,
             max_depth=3, min_child_weight=1.5, missing=None, n_estimators=750,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.6, verbosity=1)

In [None]:
%%time
# The triple-quoted block below is disabled code kept as a string literal;
# it is never executed.
'''
print("Generating predictions on training data...")
training_preds = model_XGBRegressor_2.predict(training_data[feature_names])
training_data[PREDICTION_NAME] = training_preds
'''
print("Generating predictions on tournament data...")
# Predict for every tournament row using the feature columns only.
tournament_preds = model_XGBRegressor_2.predict(tournament_data[feature_names])
#tournament_data[PREDICTION_NAME] = tournament_preds

Generating predictions on tournament data...
CPU times: user 52.9 s, sys: 12.5 s, total: 1min 5s
Wall time: 1min 5s


In [None]:
# Show the raw prediction array as this cell's output.
tournament_preds

array([0.48645765, 0.48823375, 0.53461075, ..., 0.51955205, 0.51429075,
       0.48515308], dtype=float32)

In [None]:
# Store the model_XGBRegressor_2 predictions on the tournament frame.
tournament_data[PREDICTION_NAME] = tournament_preds

In [None]:
import numerapi
import torch
from torch.nn import Linear
from torch.nn import Sequential
from torch.functional import F

In [None]:
def exposures(x, y):
    """Column-wise correlation between ``x`` and ``y``.

    Both tensors are centred and scaled to unit norm along dim 0, then
    ``x.T @ y`` is returned.
    """
    def _unit_columns(tensor):
        # Centre each column, then scale it to unit Euclidean norm.
        centred = tensor - tensor.mean(dim=0)
        return centred / centred.norm(dim=0)

    return torch.matmul(_unit_columns(x).T, _unit_columns(y))

def reduce_exposure(prediction, features, max_exp, max_iter=100000):
    """Partially neutralise ``prediction`` so no feature exposure exceeds ``max_exp``.

    A bias-free linear model of the features (weights start at zero) is
    subtracted from the prediction and trained with Adamax until every feature
    exposure is within its target.

    Parameters
    ----------
    prediction : array-like, shape (n,)
        Scores to neutralise.
    features : array-like, shape (n, k)
        Feature values; 0.5 is subtracted from them before use.
    max_exp : float
        Maximum absolute exposure allowed per feature.
    max_iter : int, default 100000
        Maximum number of optimisation steps.

    Returns
    -------
    (torch.Tensor, list of numpy.ndarray)
        The neutralised predictions, shape (n, 1), and the parameters of the
        linear neutraliser. If the tolerance is not reached within
        ``max_iter`` steps the current state is returned; previously that case
        raised UnboundLocalError because the results were only assigned inside
        the convergence branch.
    """
    # linear model of features that will be used to partially neutralize predictions
    lin = Linear(features.shape[1],  1, bias=False)
    lin.weight.data.fill_(0.)
    model = Sequential(lin)
    optimizer = torch.optim.Adamax(model.parameters(), lr=1e-4)
    feats = torch.tensor(np.float32(features)-.5)
    pred = torch.tensor(np.float32(prediction))
    start_exp = exposures(feats, pred[:,None])
    # set target exposure for each feature to be <= current exposure
    # if current exposure is less than max_exp, or <= max_exp if  
    # current exposure is > max_exp
    targ_exp = torch.clamp(start_exp, -max_exp, max_exp)

    for _ in range(max_iter):
        optimizer.zero_grad()
        # calculate feature exposures of current linear neutralization
        exps = exposures(feats, pred[:,None]-model(feats))
        # loss is positive when any exposures exceed their target
        loss = (F.relu(F.relu(exps)-F.relu(targ_exp)) + F.relu(F.relu(-exps)-F.relu(-targ_exp))).sum()
        print(f'       loss: {loss:0.7f}', end='\r')
        if loss < 1e-7:
            break
        loss.backward()
        optimizer.step()

    # Built after the loop so a result exists whether or not the tolerance was
    # reached; on convergence the model is unchanged since the last check.
    neutralizer = [p.detach().numpy() for p in model.parameters()]
    neutralized_pred = pred[:,None]-model(feats)
    return neutralized_pred, neutralizer

def reduce_all_exposures(df, column, neutralizers=[],
                                     normalize=True,
                                     gaussianize=True,
                                     era_col="era",
                                     max_exp=0.1):
    """Apply ``reduce_exposure`` era by era and return the adjusted scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``era_col``, the ``column`` columns and the
        ``neutralizers`` columns.
    column : list of str
        Score column(s); with ``normalize=True`` only the first one is used.
    neutralizers : list of str
        Feature columns whose exposure is capped (the default list is never
        modified).
    normalize : bool
        Replace the scores by their within-era ordinal rank, scaled into (0, 1).
    gaussianize : bool
        With ``normalize``, map the ranks through the normal quantile function.
    era_col : str
        Name of the era column.
    max_exp : float
        Maximum absolute exposure per feature, passed to ``reduce_exposure``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with columns ``column``; each era's scores are
        divided by their standard deviation.
    """
    era_labels = df[era_col]
    result = None
    for u in era_labels.unique():
        print(u, '\r')
        era_mask = (era_labels == u).to_numpy()
        df_era = df[era_mask]
        scores = df_era[column].values
        exposure_values = df_era[neutralizers].values
        
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                if gaussianize:
                    x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2)[0]

        scores, neut = reduce_exposure(scores, exposure_values, max_exp)

        scores /= scores.std()

        era_values = scores.detach().numpy()
        if result is None:
            result = np.full((len(df), era_values.shape[1]), np.nan, dtype=era_values.dtype)
        # Write each era's result back into the rows it came from. Concatenating
        # the eras and re-labelling with df.index is only correct when every
        # era's rows are contiguous and in first-appearance order.
        result[era_mask] = era_values

    if result is None:
        # No eras at all: return an empty frame with the expected layout.
        return pd.DataFrame(columns=column, index=df.index)
    return pd.DataFrame(result, columns=column, index=df.index)

In [None]:
# Show the prediction column before exposure reduction.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.486458
n000920ed083903f    0.488234
n0038e640522c4a6    0.534611
n004ac94a87dc54b    0.499468
n0052fe97ea0c05f    0.503206
                      ...   
nffb2a2201b9ff7d    0.522439
nffd4c35b633f37c    0.520846
nffd5f5a45f2765e    0.519552
nffe5151d105645b    0.514291
nfff9d68164c79ca    0.485153
Name: prediction, Length: 1601973, dtype: float32

In [None]:
# Cap the exposure of the predictions to every feature at 0.10, era by era
# (reduce_all_exposures prints each era label as it is processed).
data_rfe_10 = reduce_all_exposures(tournament_data,
                                   [PREDICTION_NAME],
                                   neutralizers=feature_names,
                                   era_col="era",
                                   max_exp=0.10)

era121 
era122 
era123 
era124 
era125 
era126 
era127 
era128 
era129 
era130 
era131 
era132 
era575 
era576 
era577 
era578 
era579 
era580 
era581 
era582 
era583 
era584 
era585 
era586 
era587 
era588 
era589 
era590 
era591 
era592 
era593 
era594 
era595 
era596 
era597 
era598 
era599 
era600 
era601 
era602 
era603 
era604 
era605 
era606 
era607 
era608 
era609 
era610 
era611 
era612 
era613 
era614 
era615 
era616 
era617 
era618 
era619 
era620 
era621 
era622 
era623 
era624 
era625 
era626 
era627 
era628 
era629 
era630 
era631 
era632 
era633 
era634 
era635 
era636 
era637 
era638 
era639 
era640 
era641 
era642 
era643 
era644 
era645 
era646 
era647 
era648 
era649 
era650 
era651 
era652 
era653 
era654 
era655 
era656 
era657 
era658 
era659 
era660 
era661 
era662 
era663 
era664 
era665 
era666 
era667 
era668 
era669 
era670 
era671 
era672 
era673 
era674 
era675 
era676 
era677 
era678 
era679 
era680 
era681 
era682 
era683 
era684 
era685 
era686 
era687 


In [None]:
# replace prediction with reduced feature exposure prediction and rescale to [0,1]
tournament_data[PREDICTION_NAME] = data_rfe_10[PREDICTION_NAME]
# Min-max scaling over the whole column: shift so the minimum is 0 ...
tournament_data[PREDICTION_NAME] = (
    tournament_data[PREDICTION_NAME] - tournament_data[PREDICTION_NAME].min()
)
# ... then divide by the new maximum so the largest value is 1.
tournament_data[PREDICTION_NAME] = (
    tournament_data[PREDICTION_NAME] / tournament_data[PREDICTION_NAME].max()
)

2020-12-13 12:30:47,852 INFO numexpr.utils: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-12-13 12:30:47,853 INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [None]:
# Show the prediction column after exposure reduction and rescaling.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.441554
n000920ed083903f    0.436820
n0038e640522c4a6    0.640175
n004ac94a87dc54b    0.510690
n0052fe97ea0c05f    0.529954
                      ...   
nffb2a2201b9ff7d    0.610909
nffd4c35b633f37c    0.702733
nffd5f5a45f2765e    0.546415
nffe5151d105645b    0.554896
nfff9d68164c79ca    0.407693
Name: prediction, Length: 1601973, dtype: float64

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
# Write the id-indexed prediction column to this round's submission CSV.
tournament_data[PREDICTION_NAME].to_csv(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv")

In [None]:
# Numerai API credentials for the FISAGOL model.
# The keys are read from the environment so they are never stored in the
# notebook. The key pair that was previously hard-coded here has been exposed
# and should be revoked and regenerated in the Numerai account settings.
public_id = os.environ["NUMERAI_FISAGOL_PUBLIC_ID"]
secret_key = os.environ["NUMERAI_FISAGOL_SECRET_KEY"]
model_id = "d49c26a4-aa5b-4490-9d58-300c5e05d996"
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
print("Uploading DataFrame in Numerai...")
# Upload the CSV written above for the model identified by `model_id`.
submission_id = napi.upload_predictions(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv", model_id=model_id)
print("DataFrame Uploaded...")

2020-12-06 12:23:40,372 INFO numerapi.base_api: uploading predictions...


Uploading DataFrame in Numerai...
DataFrame Uploaded...


In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
# Write the id-indexed prediction column to this round's submission CSV (same file name as above).
tournament_data[PREDICTION_NAME].to_csv(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv")

In [None]:
# Numerai API credentials for the FISAGOL_1 model.
# The keys are read from the environment so they are never stored in the
# notebook. The key pair that was previously hard-coded here has been exposed
# and should be revoked and regenerated in the Numerai account settings.
public_id = os.environ["NUMERAI_FISAGOL_1_PUBLIC_ID"]
secret_key = os.environ["NUMERAI_FISAGOL_1_SECRET_KEY"]
model_id = "0bb6b078-8525-4521-9ed2-5e7d13cc88ca"
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
print("Uploading DataFrame in Numerai...")
# Upload the CSV written above for the model identified by `model_id`.
submission_id = napi.upload_predictions(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv", model_id=model_id)
print("DataFrame Uploaded...")

2020-12-13 12:31:11,921 INFO numerapi.base_api: uploading predictions...


Uploading DataFrame in Numerai...
DataFrame Uploaded...


In [None]:
##############################################################################
########################  SAVE AND LOAD THE MODEL  ###########################
##############################################################################

In [None]:
# SAVE MODEL
# NOTE(review): no visible cell assigns a notebook-level `model`; presumably
# this refers to one of the Keras models built above — confirm which one.
model.save_weights('numeraiThe_Model.h5', overwrite=True, save_format=None, options=None)

In [None]:
# LOAD MODEL
# NOTE(review): `model` must already exist with a matching architecture before
# its weights can be loaded; no visible cell defines it — confirm the source.
model.load_weights('numeraiThe_Model.h5', by_name=False, skip_mismatch=False, options=None)

In [None]:
import numpy as np 
import pandas as pd 
from xgboost import XGBRegressor 
import torch
from torch.autograd import grad


# Load the combined training/validation frame and keep only the training rows.
trainval = pd.read_parquet("numerai_training_validation_target_nomi.parquet")
train = trainval.loc[trainval["data_type"] == "train"]

target = "target_nomi"
feature_columns = [col for col in trainval.columns if col.startswith("feature")]

# fit an initial model
model_init = XGBRegressor(
    max_depth=5,
    learning_rate=0.01,
    n_estimators=2000,
    colsample_bytree=0.1,
    nthread=6,
)
model_init.fit(train[feature_columns], train[target])

# get prediction from initial model as starting point to improve upon
base_margin = model_init.predict(train[feature_columns])

# get indexes for each era: one array of positional row indices per unique
# era, in order of first appearance
era_idx = [
    np.where(train["era"] == era_label)[0]
    for era_label in train["era"].unique()
]


# define adjusted sharpe in terms of cost adjusted numerai sharpe
def numerai_sharpe(x):
    """Return the cost-adjusted Sharpe ratio of the scores in ``x``.

    A fixed constant (0.010415154, presumably the per-era cost; confirm its
    origin) is subtracted from the mean of ``x`` and the result is divided by
    ``x.std()``.

    Args:
        x: array-like exposing ``.mean()`` and ``.std()``; in this notebook a
            1-D torch tensor of per-era correlations.

    Returns:
        The ratio, of the same scalar type that ``x.mean()`` produces.
    """
    excess_mean = x.mean() - 0.010415154
    return excess_mean / x.std()

def skew(x):
    """Return the skewness of ``x``: third central moment / variance ** 1.5.

    Both moments are plain means of the centred values (divide by N).

    Args:
        x: array-like supporting arithmetic and ``.mean()``; in this notebook
            a 1-D torch tensor.
    """
    deviations = x - x.mean()
    second_moment = (deviations ** 2).mean()
    third_moment = (deviations ** 3).mean()
    return third_moment / (second_moment ** 1.5)

def kurtosis(x):
    """Return the excess kurtosis of ``x``: fourth central moment / variance ** 2, minus 3.

    Both moments are plain means of the centred values (divide by N).

    Args:
        x: array-like supporting arithmetic and ``.mean()``; in this notebook
            a 1-D torch tensor.
    """
    deviations = x - x.mean()
    fourth_moment = (deviations ** 4).mean()
    second_moment = (deviations ** 2).mean()
    return (fourth_moment / (second_moment ** 2)) - 3

def adj_sharpe(x):
    """Return the Sharpe ratio of ``x`` adjusted for skewness and excess kurtosis.

    Computes ``S * (1 + (skew / 6) * S - (kurtosis / 24) * S ** 2)`` where
    ``S = numerai_sharpe(x)`` and ``skew`` / ``kurtosis`` are the sibling
    helpers applied to the same ``x``.

    Args:
        x: 1-D collection of per-era scores (a torch tensor in this notebook).
    """
    # Evaluate the Sharpe ratio once and reuse it: the original expression
    # called numerai_sharpe(x) three times, tripling the work (and the autograd
    # graph when x is a tensor that tracks gradients).
    sharpe = numerai_sharpe(x)
    skew_term = (skew(x) / 6) * sharpe
    kurtosis_term = (kurtosis(x) / 24) * (sharpe ** 2)
    return sharpe * (1 + skew_term - kurtosis_term)

# use correlation as the measure of fit
def corr(pred, target):
    """Return the Pearson correlation between ``pred`` and ``target`` along dim 0.

    Both inputs are centred and scaled to unit L2 norm along dim 0, then
    multiplied together. For 1-D inputs the result is a 0-dim tensor; for 2-D
    inputs of shape (n, k) and (n, m) it is the (k, m) matrix of column-wise
    correlations. The computation is differentiable with respect to ``pred``.

    NOTE: an input that is constant along dim 0 has zero norm, so the
    division yields NaN; callers must avoid constant predictions.

    Args:
        pred: torch tensor of predictions.
        target: torch tensor of targets with the same length along dim 0.
    """
    pred_n = pred - pred.mean(dim=0)
    pred_n = pred_n / pred_n.norm(dim=0)

    target_n = target - target.mean(dim=0)
    target_n = target_n / target_n.norm(dim=0)

    # `.T` on a tensor that is not 2-D is deprecated in PyTorch and slated to
    # become an error. A 1-D vector needs no transpose for matmul (it is a dot
    # product), so skip it there and keep `.T` for every other rank.
    lhs = pred_n if pred_n.dim() == 1 else pred_n.T
    return torch.matmul(lhs, target_n)

# define a custom objective for XGBoost
def adj_sharpe_obj(ytrue, ypred):
    """Custom XGBoost objective: negative adjusted Sharpe of per-era correlations.

    The correlation between predictions and targets is computed for every era
    (row positions come from the module-level ``era_idx``), the adjusted
    Sharpe ratio of those correlations is negated to form the loss, and
    PyTorch autograd supplies the gradient with respect to the predictions.

    Args:
        ytrue: array of target values, aligned with the rows used to build
            ``era_idx``.
        ypred: array of current predictions, same length as ``ytrue``.

    Returns:
        Tuple ``(grad, hess)`` of numpy arrays shaped like ``ypred``. ``hess``
        is all ones, a stand-in for the true Hessian diagonal.
    """
    # convert to pytorch tensors; only the predictions need gradients
    ypred_th = torch.tensor(ypred, requires_grad=True)
    ytrue_th = torch.tensor(ytrue)

    # get correlations in each era
    all_corrs = torch.stack(
        [corr(ypred_th[ee], ytrue_th[ee]) for ee in era_idx]
    )

    # calculate adjusted sharpe using correlations
    loss = -adj_sharpe(all_corrs)
    print(f'Current loss:{loss}')

    # calculate gradient and convert to numpy. Only first-order gradients are
    # needed (the Hessian is replaced by ones below), so no second-order graph
    # is built; the original passed create_graph=True, which only cost memory.
    loss_grads = grad(loss, ypred_th)[0]
    loss_grads = loss_grads.detach().numpy()

    # return gradient and ones instead of Hessian diagonal
    return loss_grads, np.ones(loss_grads.shape)


# Boost on top of the initial model's predictions (passed as base_margin),
# optimising the adjusted-Sharpe objective defined above. The original cell
# never passed `objective=adj_sharpe_obj`, so the custom loss was silently
# ignored and the model trained with XGBoost's default regression objective.
model_adj_sharpe = XGBRegressor(
    max_depth=5,
    learning_rate=0.01,
    n_estimators=200,
    nthread=6,
    colsample_bytree=0.1,
    objective=adj_sharpe_obj,
)
model_adj_sharpe.fit(train[feature_columns], train[target], base_margin=base_margin)