Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


In [None]:
# Mount Google Drive at the relative path 'drive'; the final cell copies the
# submission CSV into a folder underneath it.
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
!pip install numerapi
!pip install vecstack;



In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################


In [None]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.utils import np_utils 
from sklearn import preprocessing
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import matplotlib as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

import math
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.linear_model import SGDRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor 

import torch.nn as nn
import torch.nn.functional as F
import torch
from vecstack import stacking

from sklearn import metrics

def RMSLE(y, pred):
    """Return the square root of the mean squared error between ``y`` and ``pred``.

    No logarithm is applied to either input, so despite its name this function
    computes the plain root-mean-squared error.
    """
    mse = metrics.mean_squared_error(y, pred)
    return mse ** 0.5


In [None]:
##############################################################################
########################## DOWNLOAD DATA #####################################
##############################################################################

In [None]:
# Tournament identifier plus the column names used for the label and for the
# prediction that gets written out.
TOURNAMENT_NAME = "nomi"
TARGET_NAME = "target"
PREDICTION_NAME = "prediction"

# NOTE(review): BENCHMARK and BAND are not referenced in the visible cells.
BENCHMARK = 0
BAND = 0.2

In [None]:
# Seed NumPy's global random number generator so runs are repeatable.
seed = 1
np.random.seed(seed)
# np.random.seed() returns None, so `rand` holds the integer seed itself and can
# be used directly as a `random_state` value.
rand = seed

In [None]:
# Numerai API client; "info" verbosity makes it log download progress.
napi = numerapi.NumerAPI(verbosity="info")

# Fetch the current round's dataset archive and unzip it.
napi.download_current_dataset(unzip=True)

# Folder name used below when reading the CSV files: "numerai_dataset_<round>".
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

2021-09-05 10:48:21,351 INFO numerapi.utils: target file already exists
2021-09-05 10:48:21,352 INFO numerapi.utils: download complete
2021-09-05 10:48:21,354 INFO numerapi.base_api: unzipping file...


In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [None]:
%%time
print("# Loading data...")

training_data = pd.read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
tournament_data = pd.read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")
validation_data = tournament_data[tournament_data.data_type == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 1min 6s, sys: 9.4 s, total: 1min 16s
Wall time: 1min 15s


In [None]:
# Feature columns are the training columns whose name starts with "feature".
feature_names = [column for column in training_data.columns if column.startswith("feature")]
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
# Model inputs: every feature column of the training frame.
X = training_data.loc[:, feature_names]

# Labels: whichever column comes last in the training frame.
# NOTE(review): presumably this is the TARGET_NAME column — confirm the column order.
label_column = training_data.columns[-1]
Y = training_data[label_column]

# Integer-encoded copy of the labels; Y_enc is not referenced in the visible cells.
le = LabelEncoder()
Y_enc = le.fit_transform(Y)

In [None]:
#############################################################################
###########################  CREATING SOME MODELS  ##########################
#############################################################################

In [None]:
import tensorflow as tf

In [None]:
def spearman(target, pred):
    """Correlate ``target`` with the percentile ranks of ``pred``.

    ``pred`` must offer a pandas-style ``rank`` method. Ties in ``pred`` are
    broken by order of appearance (``method="first"``); ``target`` is used as
    given, without ranking. Returns the off-diagonal entry of the 2x2
    correlation matrix.

    Note: a later cell defines another function with this same name, which
    replaces this one once that cell has run.
    """
    ranked_pred = pred.rank(pct=True, method="first")
    corr_matrix = np.corrcoef(target, ranked_pred)
    return corr_matrix[0, 1]

In [None]:
def pearson(y_true, y_pred):
    """Run SciPy's Pearson correlation on the two inputs.

    The value returned is the complete ``scipy.stats.pearsonr`` result (the
    correlation coefficient together with its p-value), not the coefficient
    alone.
    """
    from scipy import stats
    return stats.pearsonr(y_true, y_pred)

In [None]:
def loss_fn(target, pred):
    """Return a ``(gradient, hessian)`` pair derived from an era-wise Sharpe score.

    Both inputs are converted to torch tensors, one correlation is computed per
    entry of the module-level ``era_idx``, the correlations are reduced by
    ``adj_sharpe``, and the score is differentiated with respect to the
    predictions.

    NOTE(review): ``adj_sharpe`` is not defined in the visible cells, and
    ``torch.stack`` needs ``spearman`` to return torch tensors — confirm both
    before calling this function.
    """
    pred_t = torch.tensor(pred, requires_grad=True)
    target_t = torch.tensor(target)
    per_era = [spearman(target_t[rows], pred_t[rows]) for rows in era_idx]
    corrs = torch.stack(per_era)
    sharpe = adj_sharpe(corrs)
    gradient = torch.autograd.grad(sharpe, pred_t)[0].detach().numpy()
    # An all-ones Hessian stands in for the true second derivative.
    hessian = np.ones_like(gradient)
    return gradient, hessian

In [None]:
def spearman(y_true, y_pred):
    """Return the Spearman rank correlation of ``y_true`` and ``y_pred``.

    Thin wrapper around ``scipy.stats.spearmanr`` that keeps only the
    correlation and discards the p-value. Per the original author's note,
    submissions are scored by Spearman correlation.
    """
    from scipy import stats
    result = stats.spearmanr(y_true, y_pred)
    return result.correlation

In [None]:
def my_logistic_obj(y_hat, dtrain):
    """Custom objective returning per-sample gradient and Hessian arrays.

    :param y_hat: current predictions, used directly as ``p`` (no sigmoid is applied here)
    :param dtrain: dataset object exposing ``get_label()``
    :return: tuple ``(grad, hess)``

    For labels in {0, 1} the gradient equals ``(4*y + 1) * (p - y)``, i.e. a
    logistic-style gradient in which positive samples carry five times the
    weight of negative ones.
    """
    labels = dtrain.get_label()
    probs = y_hat
    gradient = 4 * probs * labels + probs - 5 * labels
    hessian = (4 * labels + 1) * (probs * (1.0 - probs))
    return gradient, hessian

In [None]:
def _get_ranks(arr: np.ndarray) -> np.ndarray:
    """
        Compute zero-based ordinal ranks with a single stable sort.
        Equal values receive distinct ranks in order of appearance
        (no tie averaging).
        :param arr: A 1D NumPy Array
        :return: A 1D integer NumPy Array where entry i is the rank of arr[i]
    """
    order = arr.argsort(kind='stable')
    positions = np.arange(len(arr))
    ranks = np.empty_like(order)
    # Scatter: the element that sorts into position k gets rank k.
    ranks[order] = positions
    return ranks

def spearmans_rho_custom(y_true: np.ndarray, y_pred: np.ndarray) -> np.float32:
    """
        NumPy-only rank correlation: both inputs are converted to ordinal ranks
        with ``_get_ranks`` and the Pearson correlation of those ranks is returned.
        :param y_true: The ground truth labels
        :param y_pred: The predicted labels
        :return: Off-diagonal entry of the 2x2 correlation matrix of the ranks
    """
    rank_true = _get_ranks(y_true)
    rank_pred = _get_ranks(y_pred)
    corr_matrix = np.corrcoef(rank_true, rank_pred)
    return corr_matrix[1][0]

In [None]:
# Hold out 20% of the training rows, stratified on the label values, using a fixed split seed.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.20, stratify=Y, random_state=0)

In [None]:
# One array of positional row indices per distinct era of the training data.
era_labels = training_data.era.unique()
era_idx = [np.where(training_data.era == label)[0] for label in era_labels]

In [None]:
#https://stackoverflow.com/questions/61694081/lightgbm-error-valueerror-for-early-stopping-at-least-one-dataset-and-eval-m
# Hyper-parameter grid for the LightGBM grid search: 3 * 2 * 2 * 2 * 3 = 72 combinations.
# NOTE(review): 'n_estimators' and 'num_boost_round' both look like boosting-iteration
# settings — confirm which of the two LightGBM actually applies.
gridParams = { 
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [16,32, 64], 
    'random_state' : [501],
    'num_boost_round' : [3000],
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4], 
    }

In [None]:
# Base LightGBM regressor handed to the grid search.
# NOTE(review): eval_metric, verbose_eval and early_stopping_rounds are given to the
# constructor here — confirm the installed LightGBM version honours them there
# rather than expecting them in fit().
lgb_estimator = LGBMRegressor(boosting_type = 'gbdt', 
                                   n_estimators=500, 
                                   learning_rate =  0.05, num_leaves =  64,  
                                   eval_metric  = 'logloss',
                                   verbose_eval=20, 
                                   early_stopping_rounds=10)

In [None]:
# 3-fold cross-validated search over gridParams, with n_jobs set to 3.
g_lgbm = GridSearchCV(estimator=lgb_estimator, param_grid=gridParams, n_jobs = 3, cv= 3)

In [None]:
# Run the search on the training split; the held-out split is passed along as eval_set.
# lgb_model is whatever GridSearchCV.fit returns.
lgb_model = g_lgbm.fit(X=train_X,y=train_Y, eval_set=(test_X,test_Y))

In [None]:
%%time
model = LGBMRegressor(random_state=rand,early_stopping_rounds=10,objective=spearmans_rho_custom)

CPU times: user 36 µs, sys: 2 µs, total: 38 µs
Wall time: 42.4 µs


In [None]:
# Fit the standalone regressor with the held-out split as its evaluation set,
# then trigger a garbage-collection pass.
model.fit(X=train_X,y=train_Y, eval_set=(test_X,test_Y))
gc.collect()

In [None]:
# Predict every tournament row from its feature columns using the grid-search
# result (lgb_model, not `model`), then trigger a garbage-collection pass.
y_pred_model = lgb_model.predict(tournament_data[feature_names])
gc.collect()

54

In [None]:
# Store the raw model output as the prediction column of the tournament frame.
tournament_data[PREDICTION_NAME] = y_pred_model

In [None]:
# Display the raw predictions.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.493060
n000920ed083903f    0.489776
n0038e640522c4a6    0.519625
n004ac94a87dc54b    0.501332
n0052fe97ea0c05f    0.501802
                      ...   
nffcc555b6bc06aa    0.499132
nffcfd14874a15c4    0.504296
nffdb0b5a746c4f7    0.503113
nffecef421ac6791    0.491763
nfff39308bdddb48    0.516572
Name: prediction, Length: 1806285, dtype: float64

In [None]:
def neutralize(df, columns, by, proportion=1.0):
    """Remove the part of a score column that is linearly explained by other columns.

    The scores are projected (least squares, via the pseudo-inverse) onto the
    exposure columns plus one constant column; ``proportion`` times that
    projection is subtracted and the remainder is divided by its standard
    deviation.

    :param df: DataFrame holding both the score column and the exposure columns
    :param columns: name of the single score column to neutralize
    :param by: list of column names to neutralize against
    :param proportion: fraction of the fitted linear component to subtract
        (1.0 removes it entirely, 0.0 leaves only the rescaling)
    :return: Series of neutralized scores on the index of ``df``
    """
    scores = df[columns]
    exposures = df[by].values
    # Constant column (filled with the mean score) so the result is also neutral
    # to a constant offset. np.full allocates it directly instead of building a
    # Python list with one entry per row first.
    constant = np.full((len(exposures), 1), np.mean(scores))
    exposures = np.hstack((exposures, constant))
    gc.collect()
    fitted = exposures.dot(np.linalg.pinv(exposures).dot(scores))
    scores = scores - proportion * fitted
    gc.collect()
    return scores / scores.std()


In [None]:
%%time
# Neutralize the predictions against every feature column (default proportion)
# and overwrite the prediction column with the result.
tournament_data[PREDICTION_NAME] = neutralize(tournament_data, PREDICTION_NAME, feature_names)

2021-09-05 10:44:44,341 INFO numexpr.utils: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-09-05 10:44:44,344 INFO numexpr.utils: NumExpr defaulting to 8 threads.


CPU times: user 9min 22s, sys: 9min 29s, total: 18min 51s
Wall time: 50.5 s


In [None]:
def minmax_norm(df):
    """Linearly rescale ``df`` so its minimum maps to 0 and its maximum to 1.

    Works on anything exposing ``min()`` / ``max()`` and arithmetic operators.
    No guard is applied when the maximum equals the minimum.
    """
    lowest = df.min()
    span = df.max() - lowest
    return (df - lowest) / span

In [None]:
# Rescale the prediction column with minmax_norm, overwriting it in place.
tournament_data[PREDICTION_NAME] = minmax_norm(tournament_data[PREDICTION_NAME])

In [None]:
# Display the rescaled predictions.
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.555137
n000920ed083903f    0.490854
n0038e640522c4a6    0.572200
n004ac94a87dc54b    0.569295
n0052fe97ea0c05f    0.570150
                      ...   
nffcc555b6bc06aa    0.529917
nffcfd14874a15c4    0.546924
nffdb0b5a746c4f7    0.524411
nffecef421ac6791    0.533887
nfff39308bdddb48    0.586922
Name: prediction, Length: 1806285, dtype: float64

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
# Write the prediction column (together with its index) to a CSV file.
tournament_data[PREDICTION_NAME].to_csv('BeeChain_X.csv')

# Copy the CSV into a folder under the mounted Google Drive.
!cp BeeChain_X.csv "drive/My Drive/BeeChain Foundation/"