Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


In [None]:
!pip install numerapi

Collecting numerapi
  Downloading https://files.pythonhosted.org/packages/81/9d/c583893e96721821560e48aea92dd22aef9fc727151f1efae8f8dc885555/numerapi-2.3.9-py3-none-any.whl
Installing collected packages: numerapi
Successfully installed numerapi-2.3.9


In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################


In [None]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from keras.optimizers import SGD,Adam
from keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import matplotlib as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

import math
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from lightgbm import LGBMRegressor


In [None]:
'''
##################### CLASSES AND FUNCTIONS ##################################
'''
def my_loss_fn(y_true, y_pred):
    """Mean squared error reduced over the last axis.

    Squares the element-wise difference between ``y_true`` and ``y_pred``
    and averages the result along ``axis=-1``.
    """
    residual = y_true - y_pred
    return tf.reduce_mean(tf.square(residual), axis=-1)

In [None]:
##############################################################################
######################### DOWNLOAD DATA ######################################
##############################################################################

In [None]:
seed = 3
# np.random.seed() seeds NumPy's legacy global generator in place and returns
# None, so its return value must not be kept as the generator handle.
np.random.seed(seed)
# Python's built-in generator is independent of NumPy's; seed it as well.
random.seed(seed)
# Dedicated, reproducible generator for code that wants an explicit RNG object.
rand = np.random.RandomState(seed)

In [None]:
# An unauthenticated client is used here for downloading the dataset.
napi = numerapi.NumerAPI(verbosity="info")

# Fetch the archive for the current round and unzip it.
napi.download_current_dataset(unzip=True)

# Name of the per-round dataset directory: "numerai_dataset_<round number>".
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

./numerai_dataset_246.zip: 100%|█████████▉| 386M/386M [00:21<00:00, 19.8MB/s]2021-01-10 09:20:17,854 INFO numerapi.base_api: unzipping file...
./numerai_dataset_246.zip: 386MB [00:40, 19.8MB/s]                           

In [None]:
#https://forum.numer.ai/t/model-diagnostics-risk-metrics/900

# Tournament identifier and the column names shared by the cells below.
TOURNAMENT_NAME = "nomi"
TARGET_NAME = "target"
PREDICTION_NAME = "prediction"

# Payout parameters: scores are shifted by BENCHMARK and divided by BAND.
BENCHMARK = 0
BAND = 0.2

#-----------------------------------------------------

# Submissions are scored by spearman correlation
def score(df):
    """Correlate the target with the percentile rank of the predictions.

    The prediction column is converted to percentile ranks (ties are broken
    by row order via ``method="first"``) and the Pearson correlation between
    the raw target column and those ranks is returned.
    """
    ranked_predictions = df[PREDICTION_NAME].rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(df[TARGET_NAME], ranked_predictions)
    return correlation_matrix[0, 1]

# The payout function
def payout(scores):
    """Map scores to payouts: ``(scores - BENCHMARK) / BAND`` clipped to [-1, 1].

    ``scores`` must provide a pandas-style ``clip(lower=..., upper=...)``.
    """
    scaled = (scores - BENCHMARK) / BAND
    return scaled.clip(lower=-1, upper=1)


# Read the csv file into a pandas Dataframe
def read_csv(file_path):
    """Load a CSV file into a DataFrame with compact feature/target columns.

    Every column whose name starts with ``feature`` or ``target`` is read as
    ``np.float16``; all other columns keep the dtype pandas infers.
    """
    # Only the header row is needed to decide which columns get float16.
    with open(file_path, 'r') as handle:
        header = next(csv.reader(handle))
    compact_columns = [name for name in header
                       if name.startswith(('feature', 'target'))]
    return pd.read_csv(file_path, dtype=dict.fromkeys(compact_columns, np.float16))

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [None]:
%%time
print("# Loading data...")

# Both files are read from the current round's directory and keyed by "id".
training_data = read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
tournament_data = read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")

# Tournament rows whose data_type is "validation".
validation_data = tournament_data.loc[tournament_data["data_type"] == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 54.2 s, sys: 2.49 s, total: 56.7 s
Wall time: 56.7 s


In [None]:
# Feature columns are identified purely by their "feature" name prefix.
feature_names = [column for column in training_data.columns
                 if column.startswith("feature")]
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
##################################################################
#####################   MORE METRICS   ###########################
##################################################################

In [None]:
# Prefixes prepended to metric names to tell training and validation results apart.
TRAIN_EVAL_PREFIX, VAL_EVAL_PREFIX = "train", "val"

#Some evaluation metrics
def ar1(x):
    """Lag-1 autocorrelation: Pearson correlation of ``x[:-1]`` with ``x[1:]``."""
    previous, following = x[:-1], x[1:]
    return np.corrcoef(previous, following)[0, 1]

def autocorr_penalty(x):
    """Volatility inflation factor derived from the lag-1 autocorrelation.

    With ``n = len(x)`` and ``p = ar1(x)`` this returns
    ``sqrt(1 + 2 * sum_{i=1}^{n-1} ((n - i) / n) * p**i)``.
    """
    n = len(x)
    p = ar1(x)
    weighted_terms = []
    for lag in range(1, n):
        weighted_terms.append(((n - lag)/n)*p**lag)
    return np.sqrt(1 + 2*np.sum(weighted_terms))

def smart_sharpe(x):
    """Mean of ``x`` over its sample std (``ddof=1``) scaled by ``autocorr_penalty(x)``."""
    penalised_volatility = np.std(x, ddof=1)*autocorr_penalty(x)
    return np.mean(x)/penalised_volatility

def numerai_sharpe(x):
    """Sharpe-style ratio of ``x`` measured against a fixed hurdle.

    Subtracts the constant 0.010415154 from the mean, divides by the
    population standard deviation and multiplies by ``sqrt(12)``.
    """
    excess_mean = np.mean(x) - 0.010415154
    return (excess_mean / np.std(x)) * np.sqrt(12)

def spearmanr(target, pred):
    """Correlate ``target`` with the percentile rank of ``pred``.

    Only ``pred`` is ranked (ties broken by position, ``method="first"``), so
    the function is not symmetric in its arguments. ``pred`` must be a pandas
    object exposing ``rank``; ``target`` may be any array-like.
    """
    pred_percentiles = pred.rank(pct=True, method="first")
    return np.corrcoef(target, pred_percentiles)[0, 1]

#-----------------------------------------------------
def get_baisc_per_era_metrics(df:pd.DataFrame,
                        isVal=None,
                        fig_name="per_era_scores.png") -> pd.Series:
    """Compute per-era correlation statistics and save a bar chart of them.

    Args:
        df: frame with ``era``, the target column and the prediction column
            (plus ``data_type`` when ``isVal`` is truthy).
        isVal: when truthy, only rows with ``data_type == "validation"`` are
            scored and metric names get the validation prefix; otherwise all
            rows are scored and the training prefix is used.
        fig_name: suffix of the saved figure; the file name is
            ``"{prefix}_{fig_name}"``.

    Returns:
        A Series of summary metrics, each name starting with the prefix.
    """
    # Local import: the file only does ``import matplotlib as plt``, which does
    # not by itself guarantee that the pyplot submodule has been loaded.
    import matplotlib.pyplot as pyplot

    def _era_order(era):
        """Sort key placing eras in numeric order ("era2" before "era10")."""
        digits = "".join(ch for ch in str(era) if ch.isdigit())
        # Labels without any digit are placed last, ordered by their text.
        return (0, int(digits), "") if digits else (1, 0, str(era))

    scores = pd.Series(dtype=float)

    # NOTE: taken before the validation filter below, so the prediction
    # distribution statistics describe every row of the frame passed in.
    preds_ = df[PREDICTION_NAME]
    #Some checks for deciding between training and tournament data
    if isVal:
        #scores["tournament_corr_example_preds"] = spearmanr(preds_, example_preds[PREDICTION_NAME])
        df = df[df.data_type == "validation"]
        prefix = VAL_EVAL_PREFIX
        print("predicting on validation...")
    else:
        prefix = TRAIN_EVAL_PREFIX
        print("predicting on train...")

    #-----------------------------------------------------

    #Metric Calculations
    print("getting per era scores")
    era_scores = df.groupby("era").apply(
        lambda x: spearmanr(x[TARGET_NAME], x[PREDICTION_NAME]))

    # groupby orders string labels lexicographically ("era10" < "era2"); the
    # autocorrelation-based metrics below need the eras in chronological order.
    era_scores = era_scores.reindex(sorted(era_scores.index, key=_era_order))
    era_scores.plot(kind="bar")
    print("performance over time")
    pyplot.savefig(f"{prefix}_{fig_name}")
    pyplot.show()

    #-----------------------------------------------------

    # Distribution of the raw predictions.
    scores[f"{prefix}_mean"] = preds_.mean()
    scores[f"{prefix}_std_dev"] = preds_.std()
    scores[f"{prefix}_less_than_half"] = (preds_<0.5).mean()
    scores[f"{prefix}_less_than_mean"] = (preds_<preds_.mean()).mean()

    # Statistics of the per-era correlations.
    scores[f"{prefix}_autocorrelation"] = ar1(era_scores)
    scores[f"{prefix}_mean correlation"] = np.mean(era_scores)
    scores[f"{prefix}_Median Correlation"] = np.median(era_scores)
    scores[f"{prefix}_Variance"] = np.var(era_scores)
    scores[f"{prefix}_Std. Dev."] = np.std(era_scores)
    scores[f"{prefix}_sharpe"] = np.mean(era_scores)/np.std(era_scores)
    scores[f"{prefix}_smart sharpe"] = smart_sharpe(era_scores)
    scores[f"{prefix}_Numerai sharpe"] = numerai_sharpe(era_scores)

    print(scores)
    gc.collect()
    return scores


In [None]:
def neutralize(df, columns, by, proportion=1.0):
    """Remove the linear component of ``df[columns]`` explained by ``df[by]``.

    The scores are regressed (via the pseudo-inverse) on the ``by`` columns
    plus a constant column, ``proportion`` times the fitted part is
    subtracted, and the residual is divided by its standard deviation.
    """
    scores = df[columns]
    feature_matrix = df[by].values

    # constant column to make sure the series is completely neutral to exposures
    constant_column = np.array([np.mean(scores)] * len(feature_matrix)).reshape(-1, 1)
    design = np.hstack((feature_matrix, constant_column))
    gc.collect()
    fitted = design.dot(np.linalg.pinv(design).dot(scores))
    residual = scores - proportion * fitted
    gc.collect()
    return residual / residual.std()


def calculate_feature_exposure(df, feature_names) -> list:
    """Summarise how strongly the predictions correlate with each feature.

    Returns ``[feature_exposure, max_feat_exposure, square_sum_feature_exposure]``:
    the standard deviation, the largest absolute value and the sum of squares
    of the per-feature ``spearmanr(feature, prediction)`` values.
    """
    predictions = df[PREDICTION_NAME]
    per_feature = [spearmanr(df[name], predictions) for name in feature_names]

    feature_exposure = np.std(per_feature)
    max_feat_exposure = np.max(np.abs(per_feature))
    square_sum_feature_exposure = np.sum([value**2 for value in per_feature])

    return [feature_exposure, max_feat_exposure, square_sum_feature_exposure]


def get_more_metrics(df, feature_names, isVal=None) -> pd.Series:
    """Compute era-spread, feature-neutral, exposure and drawdown metrics.

    Args:
        df: frame with ``era``, ``data_type``, the target column, the
            prediction column and every column in ``feature_names``.
        feature_names: feature columns used for neutralisation and exposure.
        isVal: truthy to score only ``data_type == "validation"`` rows with
            the validation prefix, falsy to score all rows with the training
            prefix, ``None`` to pick validation when such rows are present.

    Returns:
        A Series of metrics whose names start with the chosen prefix. The
        input frame is not modified.
    """
    def _era_order(era):
        """Sort key placing eras in numeric order ("era2" before "era10")."""
        digits = "".join(ch for ch in str(era) if ch.isdigit())
        # Labels without any digit are placed last, ordered by their text.
        return (0, int(digits), "") if digits else (1, 0, str(era))

    more_metrics = pd.Series(dtype=float)
    assert PREDICTION_NAME in df.columns

    if isVal is None:
        isVal = "validation" in df["data_type"].unique() #max CPU times: user 65.1 ms

    print(isVal)
    if isVal:
        df = df[df["data_type"]=="validation"]
        metric_prefix = VAL_EVAL_PREFIX
    else:
        metric_prefix = TRAIN_EVAL_PREFIX

    #-----------------------------------------------------

    #per-era scores
    print("predicting per-era scores...")
    # spearmanr(target, pred) ranks only its second argument, so the target
    # must come first: it is the prediction that has to be ranked.
    scores_per_era = df.groupby("era").apply(
        lambda era_df: spearmanr(era_df[TARGET_NAME], era_df[PREDICTION_NAME]))
    # groupby orders string labels lexicographically ("era10" < "era2"); the
    # cumulative drawdown below needs the eras in chronological order.
    scores_per_era = scores_per_era.reindex(
        sorted(scores_per_era.index, key=_era_order))

    # NOTE: the key says "var" but the stored value is the standard deviation;
    # the name is kept so existing consumers of this Series keep working.
    more_metrics[f"{metric_prefix}_var"] = scores_per_era.std()

    #-----------------------------------------------------

    #Neutralize
    #This takes a significant amount of memory for calculation
    print(df.shape)
    print("Neutralizing...")
    # Build a small side frame instead of adding a column to ``df``: ``df`` is
    # either the caller's frame or a filtered slice of it.
    neutral_frame = pd.DataFrame({
        "era": df["era"],
        TARGET_NAME: df[TARGET_NAME],
        PREDICTION_NAME: neutralize(df, PREDICTION_NAME, feature_names),
    })
    feature_neutral_mean = neutral_frame.groupby("era").apply(
        lambda era_df: spearmanr(era_df[TARGET_NAME], era_df[PREDICTION_NAME])).mean()
    del neutral_frame

    more_metrics[f"{metric_prefix}_feature_neutral_mean"] = feature_neutral_mean
    gc.collect()

    #-----------------------------------------------------
    print("Calculating Feature Exposure...")
    feature_exposure, max_feat_exposure, square_sum_feature_exposure = calculate_feature_exposure(df, feature_names)

    more_metrics[f"{metric_prefix}_feat_exposure"] = feature_exposure
    more_metrics[f"{metric_prefix}_max_feat_exposure"] = max_feat_exposure
    more_metrics[f"{metric_prefix}_square_sum_feature_exposure"] = square_sum_feature_exposure

    #-----------------------------------------------------
    print("Drawdown...")
    # Compound the per-era scores and measure the largest drop below the
    # running maximum taken over a window of up to 100 eras.
    cumulative_value = (scores_per_era+1).cumprod()
    rolling_max = cumulative_value.rolling(window=100, min_periods=1).max()
    max_drawdown = (rolling_max - cumulative_value).max()

    more_metrics[f"{metric_prefix}_max_drawdown"] = max_drawdown

    return more_metrics

In [None]:
def get_all_metrics(model, 
                    feature_names:list=feature_names, 
                    fig_name="per_era_scores")->pd.Series:
    """Predict with ``model`` on both data sets and collect every metric.

    Predictions are written into the prediction column of the module-level
    ``training_data`` and ``tournament_data`` frames, after which the basic
    and extended metrics are computed for training and validation rows.
    The result concatenates: basic train, basic validation, extended train,
    extended validation.
    """
    # Training frame first, then tournament frame, as separate predict calls.
    for frame in (training_data, tournament_data):
        frame_preds = model.predict(frame[feature_names].values)
        frame[PREDICTION_NAME] = np.array(frame_preds).reshape(-1,1)
        del frame_preds

    print("evaluating on training data...")
    train_basic = get_baisc_per_era_metrics(training_data, isVal=False, fig_name=fig_name)
    train_extended = get_more_metrics(training_data, feature_names ,isVal=False)
    gc.collect()

    print("evaluating on validation data...")
    val_basic = get_baisc_per_era_metrics(tournament_data, isVal=True, fig_name=fig_name)
    val_extended = get_more_metrics(tournament_data, feature_names ,isVal=True)
    gc.collect()

    ordered_parts = [train_basic, val_basic, train_extended, val_extended]
    return pd.concat(ordered_parts)
    

In [None]:
##############################################################################
######################## MAKE PREDICTIONS ####################################
##############################################################################

In [None]:
%%time
# NOTE: despite the "XGBRegressor" in its name, this is a LightGBM regressor
# built with default hyper-parameters and fitted on every training row.
model_XGBRegressor_2 = LGBMRegressor()
model_XGBRegressor_2.fit(training_data[feature_names], training_data[TARGET_NAME])

CPU times: user 1min 56s, sys: 568 ms, total: 1min 56s
Wall time: 31.9 s


In [None]:
%%time
# Predict on every tournament row and store the raw model output in the
# prediction column of the tournament frame.
tournament_preds = model_XGBRegressor_2.predict(tournament_data[feature_names])
tournament_data[PREDICTION_NAME] = tournament_preds

CPU times: user 21.6 s, sys: 1.43 s, total: 23.1 s
Wall time: 10.8 s


In [None]:
import numerapi
import torch
from torch.nn import Linear
from torch.nn import Sequential
from torch.functional import F

In [None]:
def exposures(x, y):
    """Column-wise correlation between ``x`` and ``y``.

    Each column of both tensors is mean-centred and scaled to unit L2 norm
    along ``dim=0``; the result is ``x.T @ y`` of the standardised tensors.
    """
    def _standardise(t):
        centred = t - t.mean(dim=0)
        return centred / centred.norm(dim=0)

    return torch.matmul(_standardise(x).T, _standardise(y))

def reduce_exposure(prediction, features, max_exp,
                    max_iter=100000, tol=1e-7, lr=1e-4):
    """Subtract a learned linear combination of features to cap exposures.

    A bias-free linear model of the features (weights starting at zero) is
    trained so that, after subtracting its output from ``prediction``, the
    exposure to every feature lies within its target: the starting exposure
    clamped to ``[-max_exp, max_exp]``.

    Args:
        prediction: 1-D array-like of predictions.
        features: 2-D array-like, one column per feature; 0.5 is subtracted
            from every value (presumably to centre features living in
            [0, 1] — confirm against the data).
        max_exp: largest absolute exposure allowed per feature.
        max_iter: maximum number of optimisation steps.
        tol: training stops once the loss falls below this value.
        lr: learning rate of the Adamax optimiser.

    Returns:
        ``(neutralized_pred, neutralizer)``: the adjusted predictions as an
        ``(n, 1)`` tensor and the linear model's parameters as NumPy arrays.
        If the loss does not reach ``tol`` within ``max_iter`` steps a
        ``RuntimeWarning`` is issued and the last iterate is returned.
    """
    import warnings

    # linear model of features that will be used to partially neutralize predictions
    lin = Linear(features.shape[1],  1, bias=False)
    lin.weight.data.fill_(0.)
    model = Sequential(lin)
    optimizer = torch.optim.Adamax(model.parameters(), lr=lr)
    feats = torch.tensor(np.float32(features)-.5)
    pred = torch.tensor(np.float32(prediction))
    start_exp = exposures(feats, pred[:,None])
    # set target exposure for each feature to be <= current exposure
    # if current exposure is less than max_exp, or <= max_exp if
    # current exposure is > max_exp
    targ_exp = torch.clamp(start_exp, -max_exp, max_exp)

    for _ in range(max_iter):
        optimizer.zero_grad()
        # calculate feature exposures of current linear neutralization
        exps = exposures(feats, pred[:,None]-model(feats))
        # loss is positive when any exposures exceed their target
        loss = (F.relu(F.relu(exps)-F.relu(targ_exp)) + F.relu(F.relu(-exps)-F.relu(-targ_exp))).sum()
        print(f'       loss: {loss:0.7f}', end='\r')
        if loss < tol:
            break
        loss.backward()
        optimizer.step()
    else:
        # Without this branch the results below were never assigned and the
        # function failed with UnboundLocalError when training did not converge.
        warnings.warn(
            f"reduce_exposure did not reach loss < {tol} within {max_iter} "
            "iterations; returning the last iterate",
            RuntimeWarning,
        )

    # Computed after the loop so both the converged and the non-converged
    # paths return the state of the model as it stands now.
    neutralizer = [p.detach().numpy() for p in model.parameters()]
    neutralized_pred = pred[:,None]-model(feats)
    return neutralized_pred, neutralizer

def reduce_all_exposures(df, column, neutralizers=None,
                                     normalize=True,
                                     gaussianize=True,
                                     era_col="era",
                                     max_exp=0.1):
    """Apply ``reduce_exposure`` era by era to one prediction column.

    Args:
        df: frame holding the prediction column, the neutraliser columns and
            ``era_col``.
        column: the prediction column, given as a one-element list (a plain
            string is accepted as well).
        neutralizers: feature columns whose exposure is reduced; ``None``
            means no columns.
        normalize: replace each era's predictions by their ordinal rank
            scaled into (0, 1) before reducing exposure.
        gaussianize: with ``normalize``, map those ranks through the normal
            quantile function.
        era_col: column that defines the groups.
        max_exp: per-feature exposure cap passed to ``reduce_exposure``.

    Returns:
        A float32 DataFrame with the same index as ``df`` and the single
        prediction column; within each era the values are scaled to unit
        standard deviation.

    Raises:
        ValueError: if ``column`` does not name exactly one column.
    """
    # ``import scipy`` alone does not guarantee that scipy.stats is loaded.
    from scipy import stats

    if neutralizers is None:
        neutralizers = []
    columns = [column] if isinstance(column, str) else list(column)
    if len(columns) != 1:
        raise ValueError("reduce_all_exposures handles exactly one prediction column")

    era_labels = df[era_col].values
    # Results are written back by row position so every value stays attached
    # to its own row even when ``df`` is not grouped contiguously by era.
    result = np.full((len(df), 1), np.nan, dtype=np.float32)
    for u in df[era_col].unique():
        print(u, '\r')
        in_era = era_labels == u
        df_era = df[in_era]
        # Always a 1-D vector, which is what reduce_exposure expects.
        scores = df_era[columns[0]].values
        exposure_values = df_era[neutralizers].values

        if normalize:
            scores = (stats.rankdata(scores, method='ordinal') - .5) / len(scores)
            if gaussianize:
                scores = stats.norm.ppf(scores)

        scores, _ = reduce_exposure(scores, exposure_values, max_exp)

        scores /= scores.std()

        result[in_era] = scores.detach().numpy()

    return pd.DataFrame(result, columns=columns, index=df.index)

In [None]:
# Display the raw model predictions, before feature-exposure reduction.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.488480
n000920ed083903f    0.487922
n0038e640522c4a6    0.516811
n004ac94a87dc54b    0.499933
n0052fe97ea0c05f    0.502267
                      ...   
nff8ba6894349f25    0.505012
nff970697639058c    0.500032
nffaba00aa8edfc5    0.486423
nffd34c51e1bdda8    0.482756
nfffd13db9c56406    0.501631
Name: prediction, Length: 1623097, dtype: float64

In [None]:
# Reduce the predictions' exposure to every feature, one era at a time, with
# a per-feature exposure target of at most 0.10.
data_rfe_10 = reduce_all_exposures(tournament_data,
                                   [PREDICTION_NAME],
                                   neutralizers=feature_names,
                                   era_col="era",
                                   max_exp=0.10)

In [None]:
# replace prediction with reduced feature exposure prediction and rescale to [0,1]
tournament_data[PREDICTION_NAME] = data_rfe_10[PREDICTION_NAME]
# Min-max scaling: shift so the smallest value is 0, then divide by the
# largest shifted value so the largest becomes 1.
tournament_data[PREDICTION_NAME] = (
    tournament_data[PREDICTION_NAME] - tournament_data[PREDICTION_NAME].min()
)
tournament_data[PREDICTION_NAME] = (
    tournament_data[PREDICTION_NAME] / tournament_data[PREDICTION_NAME].max()
)

2021-01-10 12:06:07,065 INFO numexpr.utils: NumExpr defaulting to 4 threads.


In [None]:
# Display the predictions after exposure reduction and rescaling to [0, 1].
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.424455
n000920ed083903f    0.400952
n0038e640522c4a6    0.547099
n004ac94a87dc54b    0.536181
n0052fe97ea0c05f    0.554569
                      ...   
nff8ba6894349f25    0.523639
nff970697639058c    0.496065
nffaba00aa8edfc5    0.387984
nffd34c51e1bdda8    0.363573
nfffd13db9c56406    0.590939
Name: prediction, Length: 1623097, dtype: float64

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
# Write the prediction column (indexed by "id") to
# "<tournament>_<round>_submission.csv" in the working directory.
tournament_data[PREDICTION_NAME].to_csv(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv")

In [None]:
# Numerai API credentials are read from the environment so that no secret is
# stored in (or shared with) the notebook. Set NUMERAI_PUBLIC_ID,
# NUMERAI_SECRET_KEY and NUMERAI_MODEL_ID before running this cell; a missing
# variable raises KeyError naming it.
# SECURITY: any key that was ever committed in plain text should be revoked
# and regenerated in the Numerai account settings.
public_id = os.environ["NUMERAI_PUBLIC_ID"]
secret_key = os.environ["NUMERAI_SECRET_KEY"]
model_id = os.environ["NUMERAI_MODEL_ID"]
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
print("Uploading DataFrame in Numerai...")
# Upload the submission CSV for the configured model; the call's return value
# is kept in ``submission_id``.
submission_id = napi.upload_predictions(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv", model_id=model_id)
print("DataFrame Uploaded...")

2021-01-10 12:06:25,673 INFO numerapi.base_api: uploading predictions...


Uploading DataFrame in Numerai...
DataFrame Uploaded...
