Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


# Evaluating Financial Machine Learning Models on Numerai

In [1]:
!pip install numerapi

Collecting numerapi
  Downloading https://files.pythonhosted.org/packages/81/9d/c583893e96721821560e48aea92dd22aef9fc727151f1efae8f8dc885555/numerapi-2.3.9-py3-none-any.whl
Installing collected packages: numerapi
Successfully installed numerapi-2.3.9


In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################


In [2]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from keras.optimizers import SGD,Adam
from keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import matplotlib as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

import math
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from lightgbm import LGBMRegressor

In [None]:
##############################################################################
########################## DOWNLOAD DATA #####################################
##############################################################################

In [3]:
# Seed every global RNG that is already imported at this point so that any
# stochastic step further down is repeatable on a fresh kernel.
seed = 3
random.seed(seed)
# np.random.seed() returns None, so `rand` is always None; the assignment is kept
# only so that any later reference to the name still resolves.
rand = np.random.seed(seed)

In [4]:
# An unauthenticated client is used for the download.
napi = numerapi.NumerAPI(verbosity="info")
napi.download_current_dataset(unzip=True)

# Name of the folder holding the round's files, built from the current round number.
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

./numerai_dataset_244.zip: 100%|█████████▉| 384M/385M [00:25<00:00, 17.0MB/s]2020-12-27 14:05:49,550 INFO numerapi.base_api: unzipping file...
./numerai_dataset_244.zip: 385MB [00:39, 17.0MB/s]                           

In [5]:
#https://forum.numer.ai/t/model-diagnostics-risk-metrics/900

# Tournament identifier; used as the prefix of the submission file name.
TOURNAMENT_NAME = "nomi"
# Column that holds the training target.
TARGET_NAME = "target"
# Column that model predictions are written to.
PREDICTION_NAME = "prediction"

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [6]:
%%time
print("# Loading data...")

training_data = pd.read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
tournament_data = pd.read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")
validation_data = tournament_data[tournament_data.data_type == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 57.5 s, sys: 6.93 s, total: 1min 4s
Wall time: 1min 4s


In [8]:
# Every training column whose name starts with "feature" is used as a model input.
feature_names = list(filter(lambda col: col.startswith("feature"), training_data.columns))
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
##############################################################################
######################## MAKE PREDICTIONS ####################################
##############################################################################

In [9]:
%%time
model_XGBRegressor_2 = LGBMRegressor()
model_XGBRegressor_2.fit(training_data[feature_names], training_data[TARGET_NAME])

CPU times: user 1min 57s, sys: 335 ms, total: 1min 58s
Wall time: 31.9 s


In [10]:
%%time
tournament_data[PREDICTION_NAME] = model_XGBRegressor_2.predict(tournament_data[feature_names])

CPU times: user 20.2 s, sys: 6.68 s, total: 26.9 s
Wall time: 14.5 s


In [11]:
# Display the prediction column of tournament_data as produced by the model.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.488480
n000920ed083903f    0.487922
n0038e640522c4a6    0.516811
n004ac94a87dc54b    0.499933
n0052fe97ea0c05f    0.502267
                      ...   
nffd1f8ec8ff3b23    0.490741
nffeab1e87d5bbb7    0.497121
nffeb444b05635f6    0.504650
nfffdcd9dba8e5d1    0.477709
nffff1a37ed8064a    0.506131
Name: prediction, Length: 1612518, dtype: float64

In [None]:
import scipy
import numpy
from sklearn.preprocessing import MinMaxScaler

def full_neutralization(df, feature_names=feature_names, pred_name=PREDICTION_NAME):
    """Neutralize predictions against the features era by era, then min-max scale them.

    Args:
        df: frame with an "era" column, the ``feature_names`` columns and ``pred_name``.
            NOTE: ``df[pred_name]`` is overwritten in place with the neutralized
            (not yet rescaled) values.
        feature_names: columns the predictions are neutralized against.
        pred_name: name of the prediction column.

    Returns:
        The output of ``MinMaxScaler().fit_transform`` applied to the neutralized
        prediction column (one column, fitted over the whole frame, not per era).
    """
    # group_keys=False keeps the original row index on the per-era results. Without
    # it, pandas >= 2.0 prepends the era to the index, and the MultiIndexed result
    # no longer lines up with df when it is assigned back to the column.
    df[pred_name] = df.groupby("era", group_keys=False).apply(
        lambda era_df: normalize_and_neutralize(era_df, [pred_name], feature_names)
    )
    scaled_preds = MinMaxScaler().fit_transform(df[[pred_name]])
    return scaled_preds

def _neutralize(df, columns, by, proportion=1.0):
    scores = df[columns]
    exposures = df[by].values
    scores = scores - proportion * exposures.dot(numpy.linalg.pinv(exposures).dot(scores))
    return scores / scores.std()
def _normalize(df):
    X = (df.rank(method="first") - 0.5) / len(df)
    return scipy.stats.norm.ppf(X)

def normalize_and_neutralize(df, columns, by, proportion=1.0):
    """Gaussianize ``df[columns]`` by rank, then neutralize them against ``df[by]``.

    The score columns of ``df`` are overwritten in place at both steps; the final
    values of those columns are returned.
    """
    # Step 1: replace the scores with normal quantiles of their ranks.
    gaussianized = _normalize(df[columns])
    df[columns] = gaussianized
    # Step 2: strip out the part explained linearly by the `by` columns.
    neutralized = _neutralize(df, columns, by, proportion)
    df[columns] = neutralized
    return df[columns]


In [None]:
# Display the current prediction column of tournament_data.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.489182
n000920ed083903f    0.485614
n0038e640522c4a6    0.539690
n004ac94a87dc54b    0.501278
n0052fe97ea0c05f    0.499365
                      ...   
nffd1f8ec8ff3b23    0.493374
nffeab1e87d5bbb7    0.500904
nffeb444b05635f6    0.504746
nfffdcd9dba8e5d1    0.500511
nffff1a37ed8064a    0.516003
Name: prediction, Length: 1612518, dtype: float64

In [None]:
# Neutralize the tournament predictions era by era. full_neutralization also
# overwrites tournament_data[PREDICTION_NAME] in place and returns the
# min-max scaled values, which are kept in x.
x = full_neutralization(tournament_data)

2020-12-27 11:05:46,444 INFO numexpr.utils: NumExpr defaulting to 4 threads.


In [None]:
# Store the scaled values returned by full_neutralization (held in x) as the prediction column.
tournament_data[PREDICTION_NAME] = x

In [None]:
# Display the prediction column of tournament_data again to inspect its current values.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.584231
n000920ed083903f    0.477602
n0038e640522c4a6    0.696773
n004ac94a87dc54b    0.615730
n0052fe97ea0c05f    0.529691
                      ...   
nffd1f8ec8ff3b23    0.549677
nffeab1e87d5bbb7    0.478294
nffeb444b05635f6    0.531310
nfffdcd9dba8e5d1    0.810583
nffff1a37ed8064a    0.759185
Name: prediction, Length: 1612518, dtype: float64

In [12]:
import numerapi
import torch
from torch.nn import Linear
from torch.nn import Sequential
from torch.functional import F

In [13]:
def exposures(x, y):
    """Return the correlation of every column of ``x`` with every column of ``y``.

    Both tensors are centred along dim 0 and scaled to unit L2 norm along dim 0, so
    the product ``x.T @ y`` is the Pearson correlation between their columns.
    """
    def _centred_unit(t):
        centred = t - t.mean(dim=0)
        return centred / centred.norm(dim=0)

    return torch.matmul(_centred_unit(x).T, _centred_unit(y))


def reduce_exposure(prediction, features, max_exp, max_iter=100000, tol=1e-7, verbose=True):
    """Partially neutralize ``prediction`` so no feature exposure exceeds ``max_exp``.

    A bias-free linear model of the features is fitted with Adamax and subtracted
    from the prediction until every feature exposure (see ``exposures``) is within
    its target.

    Args:
        prediction: 1-D array-like of predictions (one value per row).
        features: 2-D array of feature values, one row per prediction; 0.5 is
            subtracted from every value before use.
        max_exp: cap on the absolute exposure to any single feature.
        max_iter: maximum number of optimizer steps.
        tol: the loop stops once the summed excess exposure drops below this value.
        verbose: print the running loss on one self-overwriting line.

    Returns:
        ``(neutralized_pred, neutralizer)``: a column tensor of shape ``(n, 1)`` and
        a list holding the linear model's weight array. If the optimizer does not
        reach ``tol`` within ``max_iter`` steps, the last iterate is returned and a
        ``RuntimeWarning`` is issued. Previously this case raised UnboundLocalError.
    """
    import warnings

    # linear model of features that will be used to partially neutralize predictions;
    # weights start at zero so the first iterate is the raw prediction.
    # torch.nn.Linear / torch.relu are referenced through `torch` so the function does
    # not depend on the bare names `Sequential` / `F`, which other imports can shadow.
    lin = torch.nn.Linear(features.shape[1], 1, bias=False)
    lin.weight.data.fill_(0.)
    optimizer = torch.optim.Adamax(lin.parameters(), lr=1e-4)
    feats = torch.tensor(np.float32(features) - .5)
    pred = torch.tensor(np.float32(prediction))
    start_exp = exposures(feats, pred[:, None])
    # target exposure per feature: the current exposure when it is already within
    # +/- max_exp, otherwise the cap with the same sign
    targ_exp = torch.clamp(start_exp, -max_exp, max_exp)

    converged = False
    for _ in range(max_iter):
        optimizer.zero_grad()
        # feature exposures of the currently neutralized prediction
        exps = exposures(feats, pred[:, None] - lin(feats))
        # loss is positive when any exposure exceeds its target (either sign)
        over_positive = torch.relu(torch.relu(exps) - torch.relu(targ_exp))
        over_negative = torch.relu(torch.relu(-exps) - torch.relu(-targ_exp))
        loss = (over_positive + over_negative).sum()
        if verbose:
            print(f'       loss: {loss:0.7f}', end='\r')
        if loss < tol:
            converged = True
            break
        loss.backward()
        optimizer.step()

    if not converged:
        warnings.warn(
            f"reduce_exposure did not reach tol={tol} within {max_iter} iterations; "
            "returning the last iterate",
            RuntimeWarning,
        )

    neutralizer = [p.detach().numpy() for p in lin.parameters()]
    neutralized_pred = pred[:, None] - lin(feats)
    return neutralized_pred, neutralizer

def reduce_all_exposures(df, column, neutralizers=[],
                                     normalize=True,
                                     gaussianize=True,
                                     era_col="era",
                                     max_exp=0.1):
    """Apply ``reduce_exposure`` to the prediction column(s) of ``df`` era by era.

    Args:
        df: frame containing ``era_col``, the prediction column(s) and the
            ``neutralizers`` columns.
        column: list of prediction column names (a single name is also accepted).
        neutralizers: feature columns whose exposure is capped. The default list is
            never mutated here.
        normalize: replace each era's scores by their ordinal-rank quantiles first.
        gaussianize: with ``normalize``, map the quantiles through the inverse
            normal CDF.
        era_col: name of the era column.
        max_exp: maximum absolute exposure allowed per feature.

    Returns:
        DataFrame with the same index and row order as ``df`` and one column per
        entry of ``column``; each era's values are divided by their standard
        deviation.

    Fixes over the previous version:
        * results are written back by row position, so every row keeps its own
          value even when the rows of an era are not contiguous in ``df``
          (previously per-era chunks were concatenated and relabelled with
          ``df.index``);
        * ``normalize=False`` now passes a 1-D score vector to ``reduce_exposure``;
        * every requested column is processed, not only the first.
    """
    columns = [column] if isinstance(column, str) else list(column)
    era_values = df[era_col].values
    # Rows that belong to no era (e.g. a NaN era label) stay NaN.
    result = np.full((len(df), len(columns)), np.nan, dtype=np.float32)

    for u in df[era_col].unique():
        print(u, '\r')
        era_mask = era_values == u
        if not era_mask.any():
            continue
        df_era = df[era_mask]
        exposure_values = df_era[neutralizers].values

        for col_pos, col_name in enumerate(columns):
            scores = df_era[col_name].values

            if normalize:
                # ordinal ranks mapped strictly inside (0, 1)
                scores = (scipy.stats.rankdata(scores, method='ordinal') - .5) / len(scores)
                if gaussianize:
                    scores = scipy.stats.norm.ppf(scores)

            scores, _ = reduce_exposure(scores, exposure_values, max_exp)
            # torch's std (not numpy's) is used, as before
            scores = scores / scores.std()

            result[era_mask, col_pos] = scores.detach().numpy()[:, 0]

    return pd.DataFrame(result, columns=columns, index=df.index)

In [14]:
# Display the prediction column of tournament_data before feature exposure is reduced.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.488480
n000920ed083903f    0.487922
n0038e640522c4a6    0.516811
n004ac94a87dc54b    0.499933
n0052fe97ea0c05f    0.502267
                      ...   
nffd1f8ec8ff3b23    0.490741
nffeab1e87d5bbb7    0.497121
nffeb444b05635f6    0.504650
nfffdcd9dba8e5d1    0.477709
nffff1a37ed8064a    0.506131
Name: prediction, Length: 1612518, dtype: float64

In [15]:
# Cap the exposure of the predictions to every feature at 0.10, one era at a time.
data_rfe_10 = reduce_all_exposures(
    tournament_data,
    [PREDICTION_NAME],
    neutralizers=feature_names,
    era_col="era",
    max_exp=0.10,
)

era121 
era122 
era123 
era124 
era125 
era126 
era127 
era128 
era129 
era130 
era131 
era132 
era575 
era576 
era577 
era578 
era579 
era580 
era581 
era582 
era583 
era584 
era585 
era586 
era587 
era588 
era589 
era590 
era591 
era592 
era593 
era594 
era595 
era596 
era597 
era598 
era599 
era600 
era601 
era602 
era603 
era604 
era605 
era606 
era607 
era608 
era609 
era610 
era611 
era612 
era613 
era614 
era615 
era616 
era617 
era618 
era619 
era620 
era621 
era622 
era623 
era624 
era625 
era626 
era627 
era628 
era629 
era630 
era631 
era632 
era633 
era634 
era635 
era636 
era637 
era638 
era639 
era640 
era641 
era642 
era643 
era644 
era645 
era646 
era647 
era648 
era649 
era650 
era651 
era652 
era653 
era654 
era655 
era656 
era657 
era658 
era659 
era660 
era661 
era662 
era663 
era664 
era665 
era666 
era667 
era668 
era669 
era670 
era671 
era672 
era673 
era674 
era675 
era676 
era677 
era678 
era679 
era680 
era681 
era682 
era683 
era684 
era685 
era686 
era687 


In [16]:
# Swap in the reduced-feature-exposure predictions, then min-max rescale them to [0, 1].
tournament_data[PREDICTION_NAME] = data_rfe_10[PREDICTION_NAME]
shifted = tournament_data[PREDICTION_NAME] - tournament_data[PREDICTION_NAME].min()
tournament_data[PREDICTION_NAME] = shifted / shifted.max()

2020-12-27 17:00:45,576 INFO numexpr.utils: NumExpr defaulting to 4 threads.


In [17]:
# Display the prediction column of tournament_data after it was rescaled.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.424455
n000920ed083903f    0.400952
n0038e640522c4a6    0.547099
n004ac94a87dc54b    0.536181
n0052fe97ea0c05f    0.554569
                      ...   
nffd1f8ec8ff3b23    0.398292
nffeab1e87d5bbb7    0.407355
nffeb444b05635f6    0.484525
nfffdcd9dba8e5d1    0.429581
nffff1a37ed8064a    0.531462
Name: prediction, Length: 1612518, dtype: float64

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [18]:
# Write the id-indexed prediction column to the submission file for the current round.
submission_csv = f"{TOURNAMENT_NAME}_{current_ds}_submission.csv"
tournament_data[PREDICTION_NAME].to_csv(submission_csv)

In [19]:
# API credentials are read from the environment so they are never stored in the
# notebook. Set NUMERAI_PUBLIC_ID and NUMERAI_SECRET_KEY before running this cell;
# a missing variable raises KeyError instead of attempting an unauthenticated upload.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be revoked and regenerated in the Numerai account settings.
public_id = os.environ["NUMERAI_PUBLIC_ID"]
secret_key = os.environ["NUMERAI_SECRET_KEY"]
# The model id can be overridden through NUMERAI_MODEL_ID; the default is the
# model this notebook has been submitting to.
model_id = os.environ.get("NUMERAI_MODEL_ID", "0bb6b078-8525-4521-9ed2-5e7d13cc88ca")
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [20]:
# Upload the submission file written for the current round to the selected model.
submission_file = f"{TOURNAMENT_NAME}_{current_ds}_submission.csv"
print("Uploading DataFrame in Numerai...")
submission_id = napi.upload_predictions(submission_file, model_id=model_id)
print("DataFrame Uploaded...")

2020-12-27 17:01:02,217 INFO numerapi.base_api: uploading predictions...


Uploading DataFrame in Numerai...
DataFrame Uploaded...
