Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


# Evaluating Financial Machine Learning Models on Numerai

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 38.0 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# Install the two packages this notebook imports that are not installed here by
# default. The recorded output of this cell shows numerapi 2.3.9 and
# catboost 0.24.3 being installed.
# NOTE(review): versions are not pinned, so a fresh run may resolve to newer releases.
!pip install numerapi
!pip install catboost;

Collecting numerapi
  Downloading https://files.pythonhosted.org/packages/81/9d/c583893e96721821560e48aea92dd22aef9fc727151f1efae8f8dc885555/numerapi-2.3.9-py3-none-any.whl
Installing collected packages: numerapi
Successfully installed numerapi-2.3.9
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 101kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################

In [None]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd

import tensorflow as tf
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from keras.optimizers import SGD,Adam
from keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.layers.normalization import BatchNormalization
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import tensorflow as tf
from keras.utils import np_utils 
from sklearn import preprocessing
from keras.wrappers.scikit_learn import KerasClassifier
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import xgboost as xgb
import matplotlib as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC as svc
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
'''
##################### CLASSES AND FUNCTIONS ##################################
'''
def my_loss_fn(y_true, y_pred):
    """Mean squared error between targets and predictions.

    The squared residuals are averaged over the last axis only.
    """
    residual = y_true - y_pred
    return tf.reduce_mean(tf.square(residual), axis=-1)


In [None]:
##############################################################################
########################## DOWNLOAD DATA #####################################
##############################################################################

In [None]:
# Single seed shared by the stochastic steps of this notebook.
seed = 3

# Seed the global NumPy generator and Python's `random` module.
np.random.seed(seed)
random.seed(seed)

# `np.random.seed` returns None, so the previous `rand = np.random.seed(seed)`
# left `rand` as None and every `random_state=rand` was effectively unseeded.
# Keep the integer so estimators that receive `rand` get a fixed seed.
rand = seed

In [None]:
# Client created without credentials; used here only to download the dataset
# and to look up the current round number.
napi = numerapi.NumerAPI(verbosity="info")

napi.download_current_dataset(unzip=True)

# Directory name built from the current round number; later cells read the
# training and tournament CSVs from it.
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

./numerai_dataset_241.zip: 382MB [00:07, 48.9MB/s]                           
2020-12-06 08:57:26,159 INFO numerapi.base_api: unzipping file...


In [None]:
#https://forum.numer.ai/t/model-diagnostics-risk-metrics/900

# Tournament identifier; used below as the prefix of the submission file name.
TOURNAMENT_NAME = "nomi"
# Column names for the label and for the model output.
TARGET_NAME = "target"
PREDICTION_NAME = "prediction"

# Parameters of `payout`: scores are shifted by BENCHMARK and divided by BAND.
BENCHMARK, BAND = 0, 0.2

#-----------------------------------------------------

# Submissions are scored by spearman correlation
def score(df):
    """Correlation between the target and the percentile-ranked predictions.

    Predictions are converted to percentile ranks before correlating;
    method="first" breaks ties by order of appearance.
    """
    target = df[TARGET_NAME]
    ranked_predictions = df[PREDICTION_NAME].rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(target, ranked_predictions)
    return correlation_matrix[0, 1]

# The payout function
def payout(scores):
    """Scale scores relative to BENCHMARK by BAND and clip the result to [-1, 1]."""
    scaled = (scores - BENCHMARK) / BAND
    return scaled.clip(lower=-1, upper=1)


# Read the csv file into a pandas Dataframe
def read_csv(file_path):
    """Load a CSV into a DataFrame, storing feature/target columns as float16.

    Only the header row is read first, to find the columns whose names start
    with 'feature' or 'target'; those are parsed as np.float16. All other
    columns keep the dtype pandas infers.
    """
    with open(file_path, 'r') as handle:
        header = next(csv.reader(handle))
    half_precision_columns = [
        name for name in header if name.startswith(('feature', 'target'))
    ]
    return pd.read_csv(file_path, dtype=dict.fromkeys(half_precision_columns, np.float16))

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [None]:
%%time
print("# Loading data...")

# Both CSVs live in the round directory and are indexed by their "id" column.
training_data, tournament_data = (
    read_csv(os.path.join(latest_round, file_name)).set_index("id")
    for file_name in ("numerai_training_data.csv", "numerai_tournament_data.csv")
)
# Rows of the tournament file whose data_type is "validation".
validation_data = tournament_data.loc[tournament_data["data_type"] == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 1min 8s, sys: 3.6 s, total: 1min 12s
Wall time: 1min 12s


In [None]:
# Every training column whose name starts with "feature", in file order.
feature_names = training_data.columns[training_data.columns.str.startswith("feature")].tolist()
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
# Prefixes prepended to metric names so train and validation metrics can
# share one Series without key collisions.
TRAIN_EVAL_PREFIX, VAL_EVAL_PREFIX = "train", "val"

In [None]:
##############################################################################
########################  DIVIDE DATA  in X and Y  ###########################
##############################################################################

In [None]:
# NORMAL (UNSCALED) DATA
#
# Columns are selected by name rather than by position. Because "id" is the
# index, the old positional slice `columns[3:-1]` skipped the first feature
# column, and `columns[-1]` stops being the target as soon as a prediction
# column is appended to the frame (which later cells do).
X_train = training_data[feature_names]
Y_train = training_data[TARGET_NAME]

X_tournament = tournament_data[feature_names]
Y_tournament = tournament_data[TARGET_NAME]

X_validation = validation_data[feature_names]
Y_validation = validation_data[TARGET_NAME]

# Number of principal components kept by both PCA variants below.
N_PCA_COMPONENTS = 125
# NOTE: the component columns reuse the first N feature names as labels (kept
# for compatibility); they are principal components, not the original
# features. A flat list is passed so the frame gets a plain Index instead of
# the one-level MultiIndex produced by `columns=[list]`.
pca_column_names = list(feature_names[0:N_PCA_COMPONENTS])

# PCA WITHOUT SCALER
# `seed` (an int) is passed directly: `rand` was assigned the return value of
# np.random.seed(), which is None.
pca2 = PCA(n_components=N_PCA_COMPONENTS, random_state=seed)
pca_2 = pca2.fit_transform(training_data[feature_names])
df_zero = pd.DataFrame(pca_2, columns=pca_column_names)
X_zero = df_zero

# SCALED DATA
# Build a new frame from the scaled values instead of assigning into a slice
# of `training_data` (which triggers SettingWithCopyWarning).
ss = StandardScaler()
df_x = pd.DataFrame(
    ss.fit_transform(training_data[feature_names]),
    index=training_data.index,
    columns=feature_names,
)
X_x = df_x

# PCA WITH SCALED DATA
pca2 = PCA(n_components=N_PCA_COMPONENTS, random_state=seed)
pca_2 = pca2.fit_transform(df_x)
df_z = pd.DataFrame(pca_2, columns=pca_column_names)
X_z = df_z

In [None]:
##################################################################
#####################   MORE METRICS   ###########################
##################################################################

In [None]:
# Metric-name prefixes read by the metric functions in this cell
# (same values as assigned earlier in the notebook).
TRAIN_EVAL_PREFIX, VAL_EVAL_PREFIX = "train", "val"

#Some evaluation metrics
def ar1(x):
    """Lag-1 autocorrelation: correlation of x[:-1] with x[1:]."""
    earlier, later = x[:-1], x[1:]
    correlation_matrix = np.corrcoef(earlier, later)
    return correlation_matrix[0, 1]

def autocorr_penalty(x):
    """Volatility inflation factor derived from the lag-1 autocorrelation of x.

    Computes sqrt(1 + 2 * sum_{i=1}^{n-1} ((n - i) / n) * p**i), where p is
    ar1(x) and n is len(x).
    """
    n = len(x)
    rho = ar1(x)
    weighted_terms = []
    for lag in range(1, n):
        weighted_terms.append(((n - lag)/n)*rho**lag)
    return np.sqrt(1 + 2*np.sum(weighted_terms))

def smart_sharpe(x):
    """Mean of x divided by its sample std (ddof=1) scaled by autocorr_penalty(x)."""
    average = np.mean(x)
    penalized_volatility = np.std(x, ddof=1)*autocorr_penalty(x)
    return average/penalized_volatility

def numerai_sharpe(x):
    """Sharpe-style ratio of x after subtracting a fixed offset.

    Subtracts 0.010415154 from the mean, divides by the population standard
    deviation (np.std default, ddof=0) and multiplies by sqrt(12).
    """
    excess_mean = np.mean(x) - 0.010415154
    return (excess_mean / np.std(x)) * np.sqrt(12)

def spearmanr(target, pred):
    """Correlate `target` with the percentile ranks of `pred`.

    Only `pred` is ranked (ties broken by order of appearance), so `pred`
    must provide a pandas-style `.rank`; `target` is used as given.
    """
    ranked_pred = pred.rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(target, ranked_pred)
    return correlation_matrix[0, 1]

#-----------------------------------------------------
def get_baisc_per_era_metrics(df:pd.DataFrame, 
                        isVal=None, 
                        fig_name="per_era_scores.png") -> pd.Series:
    """Compute per-era correlation metrics and prediction statistics.

    Parameters
    ----------
    df : DataFrame with "era", TARGET_NAME and PREDICTION_NAME columns (and a
        "data_type" column when ``isVal`` is truthy).
    isVal : when truthy, only rows with data_type == "validation" are scored
        and metric names get VAL_EVAL_PREFIX; otherwise all rows are scored
        with TRAIN_EVAL_PREFIX.
    fig_name : suffix of the saved bar chart; the file is written to
        f"{prefix}_{fig_name}".

    Returns
    -------
    pd.Series of metrics keyed by f"{prefix}_<metric name>".

    Side effects: prints progress, saves and shows a per-era bar chart.
    """
    # `plt` in this file is the top-level matplotlib package; import pyplot
    # explicitly instead of relying on `plt.pyplot` having been loaded elsewhere.
    import matplotlib.pyplot as pyplot

    scores = pd.Series(dtype=float)

    if isVal:
        df = df[df.data_type == "validation"]
        prefix = VAL_EVAL_PREFIX
        print("predicting on validation...")
    else:
        prefix = TRAIN_EVAL_PREFIX
        print("predicting on train...")

    # Taken AFTER the filter so the "val_*" prediction statistics describe the
    # validation rows only, not the whole tournament frame.
    preds_ = df[PREDICTION_NAME]

    #-----------------------------------------------------

    #Metric Calculations
    print("getting per era scores")
    era_scores = df.groupby("era").apply(
        lambda x: spearmanr(x[TARGET_NAME], x[PREDICTION_NAME]))

    # Order eras naturally ("era2" before "era10"): shorter labels first, then
    # lexicographic. A plain sort_index() puts "era10" before "era2", which
    # distorts the order-dependent metrics (autocorrelation, smart sharpe).
    era_order = sorted(era_scores.index, key=lambda era: (len(str(era)), str(era)))
    era_scores = era_scores.loc[era_order]

    fig, ax = pyplot.subplots()
    era_scores.plot(kind="bar", ax=ax)
    print("performance over time")
    fig.savefig(f"{prefix}_{fig_name}")
    pyplot.show()
    # Release the figure; this function is called once per model and dataset.
    pyplot.close(fig)

    #-----------------------------------------------------

    # Distribution of the raw predictions.
    scores[f"{prefix}_mean"] = preds_.mean()
    scores[f"{prefix}_std_dev"] = preds_.std()
    scores[f"{prefix}_less_than_half"] = (preds_<0.5).mean()
    scores[f"{prefix}_less_than_mean"] = (preds_<preds_.mean()).mean()

    # Statistics of the per-era correlation series.
    scores[f"{prefix}_autocorrelation"] = ar1(era_scores)
    scores[f"{prefix}_mean correlation"] = np.mean(era_scores)
    scores[f"{prefix}_Median Correlation"] = np.median(era_scores)
    scores[f"{prefix}_Variance"] = np.var(era_scores)
    scores[f"{prefix}_Std. Dev."] = np.std(era_scores)
    scores[f"{prefix}_sharpe"] = np.mean(era_scores)/np.std(era_scores)
    scores[f"{prefix}_smart sharpe"] = smart_sharpe(era_scores)
    scores[f"{prefix}_Numerai sharpe"] = numerai_sharpe(era_scores)

    print(scores)
    del era_scores
    del preds_
    gc.collect()
    return scores


In [None]:
def neutralize(df, columns, by, proportion=1.0):
    """Remove the linear component of df[columns] explained by df[by].

    The `by` columns, plus one constant column filled with the mean of the
    scores, form a design matrix. Its least-squares fit (via pseudo-inverse)
    is subtracted from the scores, scaled by `proportion`. The result is
    divided by its own standard deviation before being returned.
    """
    raw_scores = df[columns]
    feature_matrix = df[by].values

    # constant column to make sure the series is completely neutral to exposures
    constant_column = np.array([np.mean(raw_scores)] * len(feature_matrix)).reshape(-1, 1)
    design = np.hstack((feature_matrix, constant_column))
    gc.collect()
    coefficients = np.linalg.pinv(design).dot(raw_scores)
    residual = raw_scores - proportion * design.dot(coefficients)
    gc.collect()
    return residual / residual.std()


def calculate_feature_exposure(df, feature_names) -> list:
    """Summarise how strongly the predictions correlate with each feature.

    For every feature, `spearmanr(feature, prediction)` is computed. Returns
    [std of the exposures, max absolute exposure, sum of squared exposures].
    """
    exposures = [
        spearmanr(df[feature_name], df[PREDICTION_NAME])
        for feature_name in feature_names
    ]

    feature_exposure = np.std(exposures)
    max_feat_exposure = np.max(np.abs(exposures))
    square_sum_feature_exposure = np.sum(np.square(exposures))

    return [feature_exposure, max_feat_exposure, square_sum_feature_exposure]


def get_more_metrics(df, feature_names, isVal=None) -> pd.Series:
    """Compute feature-neutral, feature-exposure and drawdown metrics.

    Parameters
    ----------
    df : DataFrame with "era", "data_type", TARGET_NAME, PREDICTION_NAME and
        the `feature_names` columns.
    feature_names : columns the predictions are neutralized against and whose
        exposure is measured.
    isVal : truthy -> score only data_type == "validation" rows with
        VAL_EVAL_PREFIX; falsy -> score all rows with TRAIN_EVAL_PREFIX;
        None -> inferred from whether "validation" occurs in df["data_type"].

    Returns
    -------
    pd.Series of metrics keyed by f"{prefix}_<metric name>".

    The input frame is not modified.
    """
    more_metrics = pd.Series(dtype=float)
    assert PREDICTION_NAME in df.columns

    if isVal is None:
        isVal = "validation" in df["data_type"].unique() #max CPU times: user 65.1 ms

    print(isVal)
    if isVal:
        df = df[df["data_type"]=="validation"]
        metric_prefix = VAL_EVAL_PREFIX
    else:
        metric_prefix = TRAIN_EVAL_PREFIX

    #-----------------------------------------------------

    #per-era scores

    print("predicting per-era scores...")
    # spearmanr(target, pred) ranks its SECOND argument, so the prediction
    # must come second; passing it first ranked the target instead.
    scores_per_era = df.groupby("era").apply(
        lambda era_df: spearmanr(era_df[TARGET_NAME], era_df[PREDICTION_NAME]))

    # Natural era order ("era2" before "era10") so the cumulative product used
    # for the drawdown follows the eras in sequence.
    era_order = sorted(scores_per_era.index, key=lambda era: (len(str(era)), str(era)))
    scores_per_era = scores_per_era.loc[era_order]

    # NOTE: the key says "_var" but the value is the standard deviation of the
    # per-era scores; both are kept as-is for compatibility.
    more_metrics[f"{metric_prefix}_var"] = scores_per_era.std()

    #-----------------------------------------------------

    #Neutralize
    #This takes a significant amount of memory for calculation
    print(df.shape)
    print("Neutralizing...")
    neutral_column = f"neutral_{PREDICTION_NAME}"
    # Use a small side frame instead of writing a column into `df`, which is
    # either the caller's frame or a filtered slice of it.
    neutral_frame = pd.DataFrame({
        "era": df["era"],
        TARGET_NAME: df[TARGET_NAME],
        neutral_column: neutralize(df, PREDICTION_NAME, feature_names),
    })
    feature_neutral_mean = neutral_frame.groupby("era").apply(
        lambda x: spearmanr(x[TARGET_NAME], x[neutral_column])).mean()

    more_metrics[f"{metric_prefix}_feature_neutral_mean"] = feature_neutral_mean
    del neutral_frame
    gc.collect()

    #-----------------------------------------------------
    print("Calculating Feature Exposure...")
    feature_exposure, max_feat_exposure, square_sum_feature_exposure = calculate_feature_exposure(df, feature_names)

    more_metrics[f"{metric_prefix}_feat_exposure"] = feature_exposure
    more_metrics[f"{metric_prefix}_max_feat_exposure"] = max_feat_exposure
    more_metrics[f"{metric_prefix}_square_sum_feature_exposure"] = square_sum_feature_exposure

    #-----------------------------------------------------
    print("Drawdown...")
    # Largest gap between the rolling maximum (window of 100 eras) of the
    # compounded per-era scores and their current value.
    daily_value = (scores_per_era+1).cumprod()
    rolling_max = daily_value.rolling(window=100, min_periods=1).max()
    max_drawdown = (rolling_max - daily_value).max()

    more_metrics[f"{metric_prefix}_max_drawdown"] = max_drawdown

    return more_metrics

In [None]:
def get_all_metrics(model, 
                    feature_names:list=feature_names, 
                    fig_name="per_era_scores")->pd.Series:
    """Score a fitted model on the training and validation data.

    Writes the model's predictions into the PREDICTION_NAME column of the
    module-level `training_data` and `tournament_data` frames. It then returns
    the basic and extended metrics for both, concatenated in the order
    train-basic, val-basic, train-extended, val-extended.
    """
    # Predict on the training frame first, then on the tournament frame.
    for frame in (training_data, tournament_data):
        frame_preds = model.predict(frame[feature_names].values)
        frame[PREDICTION_NAME] = np.array(frame_preds).reshape(-1,1)
        del frame_preds

    print("evaluating on training data...")
    train_basic = get_baisc_per_era_metrics(training_data, isVal=False, fig_name=fig_name)
    train_extended = get_more_metrics(training_data, feature_names ,isVal=False)
    gc.collect()

    print("evaluating on validation data...")
    val_basic = get_baisc_per_era_metrics(tournament_data, isVal=True, fig_name=fig_name)
    val_extended = get_more_metrics(tournament_data, feature_names ,isVal=True)
    gc.collect()

    all_parts = [train_basic, val_basic, train_extended, val_extended]
    return pd.concat(all_parts)
    

In [None]:
#############################################################################
###########################  CREATING SOME MODELS  ##########################
#############################################################################

In [None]:
# Estimator classes that are only needed by this cell.
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV

# Registry of (not yet fitted) estimators keyed by a display name. Names
# containing "keras" are fitted with Keras-specific arguments by the training
# loop; every other entry is fitted with a plain `fit(X, y)`.
models = dict()


#Linear model
lin_reg = LinearRegression()
models["lin_reg"] = lin_reg

#Neural Net
# Input width follows the number of loaded features instead of a hard-coded 310.
nn_model = tf.keras.models.Sequential([
                                       tf.keras.layers.Input(shape=(len(feature_names),)),
                                       tf.keras.layers.Dense(64, activation="relu"),
                                       tf.keras.layers.Dense(1, activation="sigmoid")
])
nn_model.compile(loss="mse", optimizer="adam", metrics = [tf.keras.metrics.RootMeanSquaredError()])

models["keras_mlp_simple"] = nn_model

#Neural Net Complex

input_data = tf.keras.Input(shape=(len(feature_names),))
# Alternative activations that can be swapped in for `layer`:
# tf.keras.layers.PReLU(alpha_initializer="zeros", alpha_regularizer=None, alpha_constraint=None, shared_axes=None)
# tf.keras.layers.ELU(alpha=1.0)
# tf.keras.layers.ThresholdedReLU(theta=1.0)
# tf.keras.layers.LeakyReLU(alpha=0.3)
layer = tf.keras.layers.LeakyReLU(alpha=0.1)
# NOTE: `final_layer` is not used by the network below; the output layer is a sigmoid.
final_layer = tf.keras.layers.Softmax(axis=-1)
# Three hidden layers that halve in width: n_features, n_features/2, n_features/4.
x = tf.keras.layers.Dense(len(feature_names), activation=layer)(input_data)
x = tf.keras.layers.Dense(len(feature_names) // 2, activation=layer)(x)
x = tf.keras.layers.Dense(len(feature_names) // 4, activation=layer)(x)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
optimizer = tf.keras.optimizers.Adam( learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name="Adam")
nn_model_complex = tf.keras.Model(input_data, output)
# model.compile(optimizer=optimizer, loss=pearson_cumsom_loss, metrics=['mae', 'mse'])
nn_model_complex.compile(optimizer=optimizer, loss=my_loss_fn, metrics=['mae', 'mse'])


models["keras_mlp_complex"] = nn_model_complex


#CatBoost Regressor
cat_regressor = CatBoostRegressor()
models["cat_reg"] = cat_regressor

#XGBoost: deep, slow-learning variant
model_XGBRegressor_1 = XGBRegressor(max_depth=10, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, nthread=15)
models["model_XGB_1"] = model_XGBRegressor_1


#XGBoost: shallow, fast variant
model_XGBRegressor_2 = XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=50, colsample_bytree=0.5, nthread=15)
models["model_XGB_2"] = model_XGBRegressor_2


model_tree = DecisionTreeRegressor()
models["model_decisiontree"] = model_tree

model_RandomForest = RandomForestRegressor()
models["model_RandomForest"] = model_RandomForest

model_lasso = LassoCV()
models["model_lasso"] = model_lasso

# `Ridge` was never imported (only `RidgeCV` is), so `Ridge()` raised a
# NameError on a fresh kernel. Use the imported cross-validated estimator,
# matching the LassoCV / ElasticNetCV siblings.
model_Ridge = RidgeCV()
models["model_Ridge"] = model_Ridge

model_elasticNet = ElasticNetCV()
models["model_elasticNet"] = model_elasticNet

# Drop the standalone names; the estimators stay reachable through `models`.
del lin_reg
del nn_model
del nn_model_complex
del cat_regressor
del model_XGBRegressor_1
del model_XGBRegressor_2
del model_tree
del model_RandomForest
del model_lasso
del model_Ridge
del model_elasticNet


gc.collect()

597

In [None]:
# Display the registry to check which estimators were registered.
models

{'cat_reg': <catboost.core.CatBoostRegressor at 0x7f127b4942e8>,
 'keras_mlp_complex': <tensorflow.python.keras.engine.functional.Functional at 0x7f127b494978>,
 'keras_mlp_simple': <tensorflow.python.keras.engine.sequential.Sequential at 0x7f127dc53dd8>,
 'lin_reg': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
 'model_XGB_1': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.1, gamma=0,
              importance_type='gain', learning_rate=0.01, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=None, n_estimators=2000,
              n_jobs=1, nthread=15, objective='reg:linear', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1),
 'model_XGB_2': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gam

In [None]:
#############################################################################
####################   TRAINING MORE MODELS   ###############################
#############################################################################

In [None]:
# Fit every registered estimator on the full training set.
for model_name, estimator in models.items():
    print(f"Fitting {model_name}...")

    # Keras models also get batch/epoch settings and the validation split;
    # every other estimator is fitted with X and y only.
    fit_kwargs = {}
    if "keras" in model_name:
        fit_kwargs = dict(
            batch_size=512,
            epochs=40,
            validation_data=(validation_data[feature_names].values, validation_data[TARGET_NAME].values),
        )

    estimator.fit(training_data[feature_names].values, training_data[TARGET_NAME].values, **fit_kwargs)

    gc.collect()

In [None]:
##############################################################################
########################    EVALUATING MODELS    #############################
##############################################################################

In [None]:
%%time
# Metrics Series for every fitted model, keyed by model name.
all_model_metrics = dict()
for model_name, fitted_estimator in models.items():

    print(f"\n----{model_name}----")
    # Each model's per-era chart is saved under a file name derived from its name.
    model_metrics = get_all_metrics(fitted_estimator, feature_names, fig_name = f"{model_name}.png")
    all_model_metrics[model_name] = model_metrics

    gc.collect()
    

In [None]:
# One column per model, one row per metric.
metric_df = pd.DataFrame(all_model_metrics)
metric_df

In [None]:
##############################################################################
######################## MAKE PREDICTIONS ####################################
##############################################################################

In [None]:
# Model used for the submission: same hyper-parameters as the "model_XGB_2"
# entry of the registry, refitted here on the training frame.
xgb_submission_params = dict(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=50,
    colsample_bytree=0.5,
    nthread=15,
)
model_XGBRegressor_2 = XGBRegressor(**xgb_submission_params)
model_XGBRegressor_2.fit(training_data[feature_names], training_data[TARGET_NAME])




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=50,
             n_jobs=1, nthread=15, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [None]:
%%time
# Store the submission model's predictions in the PREDICTION_NAME column of
# both frames (overwriting whatever the evaluation loop left there).
print("Generating predictions on training data...")
training_data[PREDICTION_NAME] = training_preds = model_XGBRegressor_2.predict(training_data[feature_names])

print("Generating predictions on tournament data...")
tournament_data[PREDICTION_NAME] = tournament_preds = model_XGBRegressor_2.predict(tournament_data[feature_names])

Generating predictions on training data...
Generating predictions on tournament data...
CPU times: user 22.1 s, sys: 3.98 s, total: 26 s
Wall time: 16.6 s


In [None]:
import numerapi
import torch
from torch.nn import Linear
from torch.nn import Sequential
from torch.functional import F

In [None]:
def exposures(x, y):
    """Correlation of every column of `x` with every column of `y`.

    Both inputs are centred and scaled to unit norm column-wise (dim 0), so
    the matrix product x.T @ y holds the pairwise column correlations.
    """
    def _unit_columns(t):
        # Centre each column, then divide by its L2 norm.
        centred = t - t.mean(dim=0)
        return centred / centred.norm(dim=0)

    x_unit = _unit_columns(x)
    y_unit = _unit_columns(y)
    return torch.matmul(x_unit.T, y_unit)

def reduce_exposure(prediction, features, max_exp, max_iter=100000, tol=1e-7, lr=1e-4):
    """Subtract a linear combination of features until exposures are capped.

    A bias-free linear model of the features (weights start at zero) is
    trained with Adamax so that, after subtracting its output from the
    prediction, the exposure to each feature is no larger in magnitude than
    `max_exp`, and no larger than it was originally if that was already
    smaller.

    Parameters
    ----------
    prediction : 1-D array-like of predictions.
    features : 2-D array-like (rows x features); 0.5 is subtracted from every
        value before use.
    max_exp : exposure cap applied symmetrically (clamp to [-max_exp, max_exp]).
    max_iter : maximum number of optimisation steps.
    tol : the loop stops as soon as the loss drops below this value.
    lr : Adamax learning rate.

    Returns
    -------
    (neutralized_pred, neutralizer): a column tensor holding the prediction
    minus the linear model's output, and a list with the model's weight array.
    If `max_iter` is exhausted before the loss drops below `tol`, the state
    reached so far is returned.
    """
    # linear model of features that will be used to partially neutralize predictions
    # Referenced through `torch.nn` so the function does not depend on the bare
    # name `Sequential`, which this file also imports from Keras.
    model = torch.nn.Linear(features.shape[1], 1, bias=False)
    model.weight.data.fill_(0.)
    relu = torch.nn.functional.relu
    optimizer = torch.optim.Adamax(model.parameters(), lr=lr)
    feats = torch.tensor(np.float32(features)-.5)
    pred = torch.tensor(np.float32(prediction))
    start_exp = exposures(feats, pred[:,None])
    # set target exposure for each feature to be <= current exposure
    # if current exposure is less than max_exp, or <= max_exp if  
    # current exposure is > max_exp
    targ_exp = torch.clamp(start_exp, -max_exp, max_exp)

    for _ in range(max_iter):
        optimizer.zero_grad()
        # calculate feature exposures of current linear neutralization
        exps = exposures(feats, pred[:,None]-model(feats))
        # loss is positive when any exposures exceed their target
        loss = (relu(relu(exps)-relu(targ_exp)) + relu(relu(-exps)-relu(-targ_exp))).sum()
        print(f'       loss: {loss:0.7f}', end='\r')
        if loss < tol:
            break
        loss.backward()
        optimizer.step()

    # Computed after the loop so both names are always bound; previously they
    # were only assigned inside the convergence branch, so running out of
    # iterations ended in an UnboundLocalError.
    neutralizer = [p.detach().numpy() for p in model.parameters()]
    neutralized_pred = pred[:,None]-model(feats)
    return neutralized_pred, neutralizer

def reduce_all_exposures(df, column, neutralizers=None,
                                     normalize=True,
                                     gaussianize=True,
                                     era_col="era",
                                     max_exp=0.1):
    """Apply `reduce_exposure` era by era to one prediction column.

    Parameters
    ----------
    df : DataFrame holding `era_col`, the prediction column and the
        neutralizer columns.
    column : name of the prediction column, as a string or a one-element list.
    neutralizers : feature columns whose exposure is capped (default: none).
    normalize : replace each era's scores by their ordinal ranks scaled into (0, 1).
    gaussianize : with `normalize`, map the scaled ranks through the normal
        quantile function.
    era_col : column that identifies the era of each row.
    max_exp : exposure cap passed to `reduce_exposure`.

    Returns
    -------
    DataFrame with the single prediction column and the same index and row
    order as `df`; each era's values are divided by their standard deviation.

    Raises
    ------
    ValueError : if `column` does not name exactly one column.
    """
    # `import scipy` alone does not guarantee that the `stats` submodule is loaded.
    from scipy import stats

    columns = [column] if isinstance(column, str) else list(column)
    if len(columns) != 1:
        raise ValueError("reduce_all_exposures expects exactly one prediction column")
    neutralizers = [] if neutralizers is None else list(neutralizers)

    # Results are written back by row position, so each value lands on the row
    # it was computed from even when an era's rows are not contiguous in `df`
    # (concatenating per-era results and labelling them with df.index is only
    # correct when the rows are grouped by era).
    reduced = np.full((len(df), 1), np.nan, dtype=np.float32)
    for u in df[era_col].unique():
        print(u, '\r')
        era_mask = (df[era_col] == u).to_numpy()
        df_era = df[era_mask]
        scores = df_era[columns[0]].values
        exposure_values = df_era[neutralizers].values

        if normalize:
            scores = (stats.rankdata(scores, method='ordinal') - .5) / len(scores)
            if gaussianize:
                scores = stats.norm.ppf(scores)

        scores, _ = reduce_exposure(scores, exposure_values, max_exp)

        scores /= scores.std()

        reduced[era_mask] = scores.detach().numpy()

    return pd.DataFrame(reduced, columns=columns, index=df.index)

In [None]:
# Show the tournament predictions as produced by the model, before exposure reduction.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.489426
n000920ed083903f    0.496905
n0038e640522c4a6    0.515518
n004ac94a87dc54b    0.494838
n0052fe97ea0c05f    0.492425
                      ...   
nffaceb12ad39d16    0.500325
nffc221b08dfc0f8    0.498036
nffc2d28cca4e236    0.492774
nffc7b37882d91aa    0.496236
nffcfd64227bd537    0.499964
Name: prediction, Length: 1596723, dtype: float32

In [None]:
# Cap the exposure of the tournament predictions to every feature at 0.10,
# processing one era at a time.
MAX_FEATURE_EXPOSURE = 0.10
data_rfe_10 = reduce_all_exposures(
    tournament_data,
    [PREDICTION_NAME],
    neutralizers=feature_names,
    era_col="era",
    max_exp=MAX_FEATURE_EXPOSURE,
)

In [None]:
# replace prediction with reduced feature exposure prediction and rescale to [0,1]
tournament_data[PREDICTION_NAME] = data_rfe_10[PREDICTION_NAME]
# Min-max scaling: shift so the minimum is 0, then divide by the new maximum.
shifted_predictions = tournament_data[PREDICTION_NAME] - tournament_data[PREDICTION_NAME].min()
tournament_data[PREDICTION_NAME] = shifted_predictions / shifted_predictions.max()

2020-12-06 12:23:01,687 INFO numexpr.utils: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-12-06 12:23:01,688 INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [None]:
# Show the tournament predictions after exposure reduction and rescaling.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.425299
n000920ed083903f    0.493761
n0038e640522c4a6    0.523667
n004ac94a87dc54b    0.456342
n0052fe97ea0c05f    0.404118
                      ...   
nffaceb12ad39d16    0.422876
nffc221b08dfc0f8    0.510030
nffc2d28cca4e236    0.478429
nffc7b37882d91aa    0.584557
nffcfd64227bd537    0.467623
Name: prediction, Length: 1596723, dtype: float64

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
# Write the predictions, indexed by "id", to the submission file.
# `header=True` is explicit because the default header behaviour of
# Series.to_csv has differed between pandas versions; the file must always
# start with the column-name row.
tournament_data[PREDICTION_NAME].to_csv(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv", header=True)

In [None]:
# NameOfYourAI
# API credentials are never stored in the notebook: they are read from the
# environment, with an interactive prompt as a fallback.
# NOTE(review): a key pair was previously committed in this cell and should be
# treated as compromised and revoked.
from getpass import getpass

public_id = os.environ.get("NUMERAI_PUBLIC_ID") or getpass("Numerai public ID: ")
secret_key = os.environ.get("NUMERAI_SECRET_KEY") or getpass("Numerai secret key: ")
# Identifier of the model slot that receives the upload; override with the
# NUMERAI_MODEL_ID environment variable.
model_id = os.environ.get("NUMERAI_MODEL_ID", "d49c26a4-aa5b-4490-9d58-300c5e05d996")
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
# Upload the CSV written by the previous step to the configured model slot.
submission_path = f"{TOURNAMENT_NAME}_{current_ds}_submission.csv"
print("Uploading DataFrame in Numerai...")
submission_id = napi.upload_predictions(submission_path, model_id=model_id)
print("DataFrame Uploaded...")

2020-12-06 12:23:40,372 INFO numerapi.base_api: uploading predictions...


Uploading DataFrame in Numerai...
DataFrame Uploaded...


In [None]:
##############################################################################
########################  SAVE AND LOAD THE MODEL  ###########################
##############################################################################

In [None]:
# SAVE MODEL
# Writes the weights of `model` to 'numeraiThe_Model.h5', overwriting any existing file.
# NOTE(review): no variable named `model` is assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. `save_weights`
# suggests one of the Keras entries of `models` was intended — confirm which
# one and bind it explicitly before this call.
model.save_weights('numeraiThe_Model.h5', overwrite=True, save_format=None, options=None)

In [None]:
# LOAD MODEL
# Restores weights from 'numeraiThe_Model.h5' into `model`.
# NOTE(review): `model` is not assigned anywhere in the visible notebook (see
# the save cell); it presumably needs to be a Keras model with the same
# architecture as the one that was saved — confirm.
model.load_weights('numeraiThe_Model.h5', by_name=False, skip_mismatch=False, options=None)