Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


In [None]:
# Mount Google Drive at the relative path 'drive'; the last cell of the notebook
# copies the submission CSV into a folder under this mount point.
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
# Client library for the Numerai API (imported below as `numerapi`).
!pip install numerapi
# NOTE(review): vecstack is installed but never imported in the visible notebook — confirm it is still needed.
!pip install vecstack;



In [None]:
# References for the custom asymmetric loss implemented below:
# https://towardsdatascience.com/custom-loss-functions-for-gradient-boosting-f79c1b40466d
# https://github.com/manifoldai/mf-eng-public/blob/master/notebooks/custom_loss_lightgbm.ipynb

In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################


In [None]:
import numerapi
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
import lightgbm 
from sklearn.datasets import make_friedman2, make_friedman1, make_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import lightgbm
from sklearn.metrics import mean_squared_error
import seaborn as sns; sns.set()
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
def custom_asymmetric_objective(y_true, y_pred):
    """Gradient and hessian of an asymmetric squared-error loss.

    The residual is ``y_true - y_pred``. Where the residual is negative
    (the prediction is above the truth) the squared error is weighted by 10;
    elsewhere it is the plain squared error.

    Returns:
        (grad, hess): first and second derivative of the loss with respect
        to ``y_pred``, one value per sample.
    """
    penalty = 10.0
    err = (y_true - y_pred).astype("float")
    over_predicted = err < 0
    # d/dy_pred of w * (y_true - y_pred)**2 is -2 * w * residual.
    grad = np.where(over_predicted, -2 * penalty * err, -2 * err)
    # The second derivative is the constant 2 * w on each side.
    hess = np.where(over_predicted, 2 * penalty, 2.0)
    return grad, hess

def custom_asymmetric_eval(y_true, y_pred):
    """Evaluation metric matching the asymmetric squared-error objective.

    Squared residuals are multiplied by 10 where ``y_true - y_pred`` is
    negative and left unweighted otherwise; the metric is their mean.

    Returns:
        (name, value, is_higher_better): the metric name, the mean weighted
        squared error, and ``False`` because lower values are better.
    """
    penalty = 10.0
    err = (y_true - y_pred).astype("float")
    squared = err ** 2
    weighted = np.where(err < 0, squared * penalty, squared)
    return "custom_asymmetric_eval", np.mean(weighted), False

In [None]:
##############################################################################
########################## DOWNLOAD DATA #####################################
##############################################################################

In [None]:
# Notebook-wide configuration.
TOURNAMENT_NAME = "nomi"
# Column names used for the label and for the model output.
TARGET_NAME = "target"
PREDICTION_NAME = "prediction"

# NOTE(review): BENCHMARK and BAND are not referenced anywhere else in the visible notebook.
BENCHMARK = 0
BAND = 0.2

In [None]:
seed = 1
# np.random.seed() seeds NumPy's global RNG in place and returns None, so
# assigning its result would leave `rand` as None and every
# `random_state=rand` below unseeded. Keep the integer seed in `rand` instead.
np.random.seed(seed)
rand = seed

In [None]:
napi = numerapi.NumerAPI(verbosity="info")

# Download the current round's dataset archive and unzip it.
napi.download_current_dataset(unzip=True)

# The CSVs are read from a directory named after the current round number.
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

2021-09-05 13:21:08,151 INFO numerapi.utils: starting download
./numerai_dataset_280.zip:  99%|█████████▉| 416M/420M [00:08<00:00, 51.2MB/s]2021-09-05 13:21:17,071 INFO numerapi.base_api: unzipping file...
./numerai_dataset_280.zip: 420MB [00:19, 51.2MB/s]                           

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [None]:
%%time
print("# Loading data...")

training_data = pd.read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
tournament_data = pd.read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")
validation_data = tournament_data[tournament_data.data_type == "validation"]

print("# All Loaded...")

./numerai_dataset_280.zip: 420MB [00:29, 14.0MB/s]

# Loading data...





# All Loaded...
CPU times: user 1min 6s, sys: 11.3 s, total: 1min 17s
Wall time: 1min 18s


In [None]:
# Feature columns are identified by their "feature" name prefix.
feature_names = [col for col in training_data.columns if col.startswith("feature")]
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
# Model inputs: the feature columns only.
X = training_data[feature_names]
# Select the label by name instead of by position (`columns[-1]`), so a change
# in column order cannot silently swap in a different column as the target.
Y = training_data[TARGET_NAME]

In [None]:
# Hold out 20% of the training rows; the held-out part is used as the
# evaluation set when fitting the model.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=rand,
)

In [None]:
#############################################################################
###########################  CREATING SOME MODELS  ##########################
#############################################################################

In [None]:
# Up to 10000 boosting rounds, with early stopping after 10 rounds without
# improvement on the evaluation set.
gbm = LGBMRegressor(
    n_estimators=10000,
    early_stopping_rounds=10,
    random_state=rand,
)

In [None]:
# Train with the custom asymmetric objective and also report MSE and MAE.
gbm.set_params(objective=custom_asymmetric_objective, metrics=["mse", "mae"])

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              early_stopping_rounds=10, importance_type='split',
              learning_rate=0.1, max_depth=-1, metrics=['mse', 'mae'],
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=-1, num_leaves=31,
              objective=<function custom_asymmetric_objective at 0x7f1d638c6c20>,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Fit on the training split; the held-out split is scored every round with the
# custom asymmetric metric (in addition to the metrics set on the estimator).
holdout_sets = [(X_valid, y_valid)]
gbm.fit(
    X_train,
    y_train,
    eval_set=holdout_sets,
    eval_metric=custom_asymmetric_eval,
    verbose=True,
)

2021-09-05 13:23:00,286 INFO numexpr.utils: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-09-05 13:23:00,288 INFO numexpr.utils: NumExpr defaulting to 8 threads.


[1]	valid_0's l1: 0.455254	valid_0's l2: 0.252522	valid_0's custom_asymmetric_eval: 0.253632
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l1: 0.426609	valid_0's l2: 0.225444	valid_0's custom_asymmetric_eval: 0.228035
[3]	valid_0's l1: 0.400827	valid_0's l2: 0.202832	valid_0's custom_asymmetric_eval: 0.207304
[4]	valid_0's l1: 0.377615	valid_0's l2: 0.183899	valid_0's custom_asymmetric_eval: 0.190506
[5]	valid_0's l1: 0.356736	valid_0's l2: 0.168009	valid_0's custom_asymmetric_eval: 0.176901
[6]	valid_0's l1: 0.337915	valid_0's l2: 0.154625	valid_0's custom_asymmetric_eval: 0.16586
[7]	valid_0's l1: 0.320912	valid_0's l2: 0.143303	valid_0's custom_asymmetric_eval: 0.156874
[8]	valid_0's l1: 0.306133	valid_0's l2: 0.133726	valid_0's custom_asymmetric_eval: 0.149613
[9]	valid_0's l1: 0.295635	valid_0's l2: 0.126684	valid_0's custom_asymmetric_eval: 0.144726
[10]	valid_0's l1: 0.288163	valid_0's l2: 0.121437	valid_0's custom_asymmetric_eval: 0.141397
[11]	val

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              early_stopping_rounds=10, importance_type='split',
              learning_rate=0.1, max_depth=-1, metrics=['mse', 'mae'],
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=-1, num_leaves=31,
              objective=<function custom_asymmetric_objective at 0x7f1d638c6c20>,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
%%time
# Predict for every row of the tournament data using the feature columns only.
y_pred_model = gbm.predict(tournament_data[feature_names])
gc.collect()

# Store the raw model output as the prediction column (adds a column to tournament_data in place).
tournament_data[PREDICTION_NAME] = y_pred_model

CPU times: user 25.8 s, sys: 3.35 s, total: 29.2 s
Wall time: 8.3 s


In [None]:
# Display the raw model predictions (pandas prints a truncated view of the Series).
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.315867
n000920ed083903f    0.290415
n0038e640522c4a6    0.243681
n004ac94a87dc54b    0.310756
n0052fe97ea0c05f    0.327616
                      ...   
nffcc555b6bc06aa    0.235904
nffcfd14874a15c4    0.310414
nffdb0b5a746c4f7    0.282280
nffecef421ac6791    0.313469
nfff39308bdddb48    0.302644
Name: prediction, Length: 1806285, dtype: float64

In [None]:
def neutralize(df, columns, by, proportion=1.0):
    """Remove the linear exposure of a score column to a set of columns.

    The scores are regressed on the ``by`` columns plus one constant column
    (least squares via the pseudo-inverse). ``proportion`` times the fitted
    part is subtracted and the remainder is divided by its standard deviation.

    Args:
        df: DataFrame holding both the score column and the ``by`` columns.
        columns: name of the score column to neutralize.
        by: list of column names to neutralize against.
        proportion: fraction of the fitted exposure to remove; 1.0 removes it
            completely, 0.0 only rescales the scores.

    Returns:
        The neutralized scores divided by ``scores.std()``, indexed like ``df``.
    """
    scores = df[columns]
    exposures = df[by].values
    # Constant column (filled with the mean score) so the result is also
    # neutral to a constant offset. np.full allocates it directly instead of
    # building a Python list with one element per row first.
    constant = np.full((len(exposures), 1), np.mean(scores))
    exposures = np.hstack((exposures, constant))
    gc.collect()
    # exposures @ pinv(exposures) @ scores is the least-squares fit of the
    # scores on the exposures; subtract the requested share of it.
    scores = scores - proportion * exposures.dot(np.linalg.pinv(exposures).dot(scores))
    gc.collect()
    return scores / scores.std()


In [None]:
%%time
# Neutralize the predictions against all feature columns, overwriting the prediction column.
# NOTE(review): because the column is overwritten in place, re-running this cell
# neutralizes the already-neutralized values rather than the raw model output.
tournament_data[PREDICTION_NAME] = neutralize(tournament_data, PREDICTION_NAME, feature_names)

CPU times: user 8min 38s, sys: 8min 39s, total: 17min 17s
Wall time: 47.5 s


In [None]:
def minmax_norm(df):
    """Linearly rescale ``df`` so its minimum maps to 0 and its maximum to 1."""
    lowest = df.min()
    highest = df.max()
    return (df - lowest) / (highest - lowest)

In [None]:
# Rescale the prediction column to the [0, 1] range, overwriting it in place.
tournament_data[PREDICTION_NAME] = minmax_norm(tournament_data[PREDICTION_NAME])

In [None]:
# Display the rescaled predictions (pandas prints a truncated view of the Series).
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.546585
n000920ed083903f    0.534937
n0038e640522c4a6    0.424623
n004ac94a87dc54b    0.502969
n0052fe97ea0c05f    0.411100
                      ...   
nffcc555b6bc06aa    0.306967
nffcfd14874a15c4    0.371504
nffdb0b5a746c4f7    0.539881
nffecef421ac6791    0.454630
nfff39308bdddb48    0.490544
Name: prediction, Length: 1806285, dtype: float64

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
tournament_data[PREDICTION_NAME].to_csv('BeeChain_4.csv')

!cp BeeChain_4.csv "drive/My Drive/BeeChain Foundation/"