Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


# Evaluating Financial Machine Learning Models on Numerai

In [None]:
import tensorflow as tf
if tf.test.gpu_device_name():
    print('Default GPU Device:{}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")


Please install GPU version of TF


In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


In [None]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 38.0 gigabytes of available RAM

You are using a high-RAM runtime!


In [1]:
!pip install numerapi

Collecting numerapi
  Downloading https://files.pythonhosted.org/packages/81/9d/c583893e96721821560e48aea92dd22aef9fc727151f1efae8f8dc885555/numerapi-2.3.9-py3-none-any.whl
Installing collected packages: numerapi
Successfully installed numerapi-2.3.9


In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################


In [2]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from keras.optimizers import SGD,Adam
from keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import matplotlib as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
##############################################################################
########################## DOWLOAD DATA ######################################
##############################################################################

In [3]:
seed = 3
rand = np.random.seed(seed)

In [4]:
napi = numerapi.NumerAPI(verbosity="info")

napi.download_current_dataset(unzip=True)

current_ds = napi.get_current_round()
latest_round = os.path.join('numerai_dataset_'+str(current_ds))

./numerai_dataset_243.zip: 100%|█████████▉| 383M/384M [00:28<00:00, 14.8MB/s]2020-12-19 19:48:15,182 INFO numerapi.base_api: unzipping file...
./numerai_dataset_243.zip: 384MB [00:39, 14.8MB/s]                           

In [5]:
#https://forum.numer.ai/t/model-diagnostics-risk-metrics/900

TOURNAMENT_NAME = "nomi"
TARGET_NAME = f"target"
PREDICTION_NAME = f"prediction"

BENCHMARK = 0
BAND = 0.2

#-----------------------------------------------------

# Submissions are scored by spearman correlation
def score(df):
    # method="first" breaks ties based on order in array
    return np.corrcoef(
        df[TARGET_NAME],
        df[PREDICTION_NAME].rank(pct=True, method="first")
    )[0, 1]

# The payout function
def payout(scores):
    return ((scores - BENCHMARK) / BAND).clip(lower=-1, upper=1)


# Read the csv file into a pandas Dataframe
def read_csv(file_path):
    with open(file_path, 'r') as f:
        column_names = next(csv.reader(f))
        dtypes = {x: np.float16 for x in column_names if
                  x.startswith(('feature', 'target'))}
    return pd.read_csv(file_path, dtype=dtypes)

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [6]:
%%time
print("# Loading data...")

training_data = read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
tournament_data = read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")
validation_data = tournament_data[tournament_data.data_type == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 57.2 s, sys: 2.5 s, total: 59.7 s
Wall time: 59.8 s


In [7]:
feature_names = [f for f in training_data.columns if f.startswith("feature")]
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
##############################################################################
########################  DIVIDE DATA  in X and Y  ###########################
##############################################################################

In [9]:
################### ORIGINAL UTIL DATA  ######################

X_train = training_data[training_data.columns[3:-1]]
Y_train = training_data[training_data.columns[-1]]
X_train_eras = training_data[training_data.columns[0:2]]

X_tournament = tournament_data[tournament_data.columns[3:-1]]
Y_tournament = tournament_data[tournament_data.columns[-1]]
X_tournament_eras = tournament_data[tournament_data.columns[0:2]]

X_validation = validation_data[validation_data.columns[3:-1]]
Y_validation = validation_data[validation_data.columns[-1]]
X_validation_eras = validation_data[validation_data.columns[0:2]]


In [None]:
from sklearn.preprocessing import Normalizer

ss = StandardScaler()
x_train = training_data[feature_names]
x_tournament = tournament_data[feature_names]
x_validation = validation_data[feature_names]

################### NORMALIZE TRAINING ######################

x_train[feature_names] = ss.fit_transform(training_data[feature_names])

transformer_train = Normalizer().fit(x_train[feature_names])
x_train = transformer_train.transform(x_train[feature_names])

x_train = pd.DataFrame(x_train, columns=[feature_names])
x_train["target"] = Y_train.values
x_train["era"] = training_data[training_data.columns[0:1]].values
x_train["data_type"] = training_data[training_data.columns[1:2]].values
training_data = x_train

################### NORMALIZE TOURNAMENT ######################

x_tournament[feature_names] = ss.fit_transform(tournament_data[feature_names])

transformer_tournament = Normalizer().fit(x_tournament[feature_names])
x_tournament = transformer_tournament.transform(x_tournament[feature_names])

x_tournament = pd.DataFrame(x_tournament, columns=[feature_names])
x_tournament["target"] = Y_tournament.values
x_tournament["era"] = tournament_data[tournament_data.columns[0:1]].values
x_tournament["data_type"] = tournament_data[tournament_data.columns[1:2]].values
tournament_data = x_tournament

################### NORMALIZE VALIDATION ######################

x_validation[feature_names] = ss.fit_transform(validation_data[feature_names])

transformer_validation = Normalizer().fit(x_validation[feature_names])
x_validation = transformer_validation.transform(x_validation[feature_names])

x_validation = pd.DataFrame(x_validation, columns=[feature_names])
x_validation["target"]= Y_validation.values
x_validation["era"] = validation_data[validation_data.columns[0:1]].values
x_validation["data_type"] = validation_data[validation_data.columns[1:2]].values
validation_data = x_validation

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.iloc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)


In [None]:
##############################################################################
######################## MAKE PREDICTIONS ####################################
##############################################################################

In [None]:
model_XGBRegressor_2 = XGBRegressor(colsample_bytree=0.4,
            gamma=0,
            learning_rate=0.07,
            max_depth=3,
            min_child_weight=1.5,
            n_estimators=750,                                                                 
            reg_alpha=0.75,
            reg_lambda=0.45,
            subsample=0.6,
            seed=rand)
model_XGBRegressor_2.fit(training_data[feature_names], Y_train)

In [None]:
%%time

print("Generating predictions on tournament data...")
tournament_preds = model_XGBRegressor_2.predict(tournament_data[feature_names])
#tournament_data[PREDICTION_NAME] = tournament_preds

In [None]:
tournament_preds

array([0.48645765, 0.48823375, 0.53461075, ..., 0.51955205, 0.51429075,
       0.48515308], dtype=float32)

In [None]:
tournament_data[PREDICTION_NAME] = tournament_preds

In [None]:
import numerapi
import torch
from torch.nn import Linear
from torch.nn import Sequential
from torch.functional import F

In [None]:
def exposures(x, y):
    x = x - x.mean(dim=0)
    x = x / x.norm(dim=0)
    y = y - y.mean(dim=0)
    y = y / y.norm(dim=0)
    return torch.matmul(x.T, y)

def reduce_exposure(prediction, features, max_exp):
    # linear model of features that will be used to partially neutralize predictions
    lin = Linear(features.shape[1],  1, bias=False)
    lin.weight.data.fill_(0.)
    model = Sequential(lin)
    optimizer = torch.optim.Adamax(model.parameters(), lr=1e-4)
    feats = torch.tensor(np.float32(features)-.5)
    pred = torch.tensor(np.float32(prediction))
    start_exp = exposures(feats, pred[:,None])
    # set target exposure for each feature to be <= current exposure
    # if current exposure is less than max_exp, or <= max_exp if  
    # current exposure is > max_exp
    targ_exp = torch.clamp(start_exp, -max_exp, max_exp)

    for i in range(100000):
        optimizer.zero_grad()
        # calculate feature exposures of current linear neutralization
        exps = exposures(feats, pred[:,None]-model(feats))
        # loss is positive when any exposures exceed their target
        loss = (F.relu(F.relu(exps)-F.relu(targ_exp)) + F.relu(F.relu(-exps)-F.relu(-targ_exp))).sum()
        print(f'       loss: {loss:0.7f}', end='\r')
        if loss < 1e-7:
            neutralizer = [p.detach().numpy() for p in model.parameters()]
            neutralized_pred = pred[:,None]-model(feats)
            break
        loss.backward()
        optimizer.step()
    return neutralized_pred, neutralizer

def reduce_all_exposures(df, column, neutralizers=[],
                                     normalize=True,
                                     gaussianize=True,
                                     era_col="era",
                                     max_exp=0.1):
    unique_eras = df[era_col].unique()
    computed = []
    for u in unique_eras:
        print(u, '\r')
        df_era = df[df[era_col] == u]
        scores = df_era[column].values
        exposure_values = df_era[neutralizers].values
        
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                if gaussianize:
                    x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2)[0]

        scores, neut = reduce_exposure(scores, exposure_values, max_exp)

        scores /= scores.std()

        computed.append(scores.detach().numpy())

    return pd.DataFrame(np.concatenate(computed), columns=column, index=df.index)

In [None]:
tournament_data[PREDICTION_NAME]

In [None]:
data_rfe_10 = reduce_all_exposures(tournament_data,
                                   [PREDICTION_NAME],
                                   neutralizers=feature_names,
                                   era_col="era",
                                   max_exp=0.10)

In [None]:
# replace prediction with reduced feature exposure prediction and rescale to [0,1]
tournament_data[PREDICTION_NAME] = data_rfe_10[PREDICTION_NAME]
tournament_data[PREDICTION_NAME] -= tournament_data[PREDICTION_NAME].min()
tournament_data[PREDICTION_NAME] /= tournament_data[PREDICTION_NAME].max()

2020-12-13 12:30:47,852 INFO numexpr.utils: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-12-13 12:30:47,853 INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [None]:
tournament_data[PREDICTION_NAME]

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
tournament_data[PREDICTION_NAME].to_csv(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv")

In [None]:
# NameOfYourAI FISAGOL_1: joanmarc92@hotmail.com
public_id = "3LNFQEKJM2EFGHLRC5WK344GZTE3Y2XK"
secret_key = "CCMP4MU2YDUZQYNO5BUZY6SQHX3NKBPSZDECBBFM6TLNCLY4VISKQ6Z6CBZGJ65Y"
model_id = "0bb6b078-8525-4521-9ed2-5e7d13cc88ca"
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
print("Uploading DataFrame in Numerai...")
submission_id = napi.upload_predictions(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv", model_id=model_id)
print("DataFrame Uploaded...")

2020-12-13 12:31:11,921 INFO numerapi.base_api: uploading predictions...


Uploading DataFrame in Numerai...
DataFrame Uploaded...
