# Imports and downloads

In [1]:
!pip install --upgrade numerapi
!pip install halo

import os
from halo import Halo
import json
import gc
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import numerapi
import lightgbm as lgbm
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

!git clone https://github.com/numerai/example-scripts.git
%cd example-scripts
from utils import (
    neutralize,
    get_biggest_change_features,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks')
from metrics import evaluate

Collecting numerapi
  Downloading numerapi-2.9.4-py3-none-any.whl (26 kB)
Installing collected packages: numerapi
Successfully installed numerapi-2.9.4
Collecting halo
  Downloading halo-0.0.31.tar.gz (11 kB)
Collecting log_symbols>=0.0.14
  Downloading log_symbols-0.0.14-py3-none-any.whl (3.1 kB)
Collecting spinners>=0.0.24
  Downloading spinners-0.0.24-py3-none-any.whl (5.5 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: halo
  Building wheel for halo (setup.py) ... [?25l[?25hdone
  Created wheel for halo: filename=halo-0.0.31-py3-none-any.whl size=11260 sha256=8648456892c158469c7aa9914f7e5fefa8679185d6c14a717b90a2878d393ff2
  Stored in directory: /root/.cache/pip/wheels/95/ff/20/5d16a0059f20c5e60be2df845201e73af179a5a79a3d566f48
Successfully built halo
Installing collected packages: colorama, spinners, log-symbols, halo
Successfully installed colorama-0.4.4 halo-0.0.31 log-symbols-0.0.14 spinners-0.0.

In [2]:
# Download training and validation data
napi = numerapi.NumerAPI(verbosity="info")
current_round = napi.get_current_round(tournament=8)

train_pq_path = "numerai_training_data_new.parquet"
valid_pq_path = "numerai_validation_data.parquet"

napi.download_dataset("numerai_training_data_int8.parquet", train_pq_path)
napi.download_dataset("numerai_validation_data_int8.parquet", valid_pq_path)

napi.download_dataset("features.json", "features.json")


2022-01-05 07:56:47,754 INFO numerapi.utils: starting download
numerai_training_data_new.parquet: 1.01GB [00:27, 36.2MB/s]                            
2022-01-05 07:57:16,390 INFO numerapi.utils: starting download
numerai_validation_data.parquet: 228MB [00:07, 29.2MB/s]                           
2022-01-05 07:57:25,663 INFO numerapi.utils: starting download
features.json: 441kB [00:00, 1.03MB/s]                          


In [3]:
def get_by_group(df, part: int, splits=4):
  if part == splits:
      part = 0
  return df[df['era'] % splits == part]

In [4]:
with open("features.json", "r") as f:
    feature_metadata = json.load(f)

features = feature_metadata["feature_sets"]["small"]
# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]
df_train = pd.read_parquet(train_pq_path, columns=read_columns)

eras = df_train.era.astype(int)
df_train["era"] = eras

In [88]:
df_train["era"]

id
n003bba8a98662e4      1
n003bee128c2fcfc      1
n0048ac83aff7194      1
n00691bec80d3e02      1
n00b8720a2fdc4f2      1
                   ... 
nffcc1dbdf2212e6    574
nffd71b7f6a128df    574
nffde3b371d67394    574
nfff1a1111b35e84    574
nfff2bd38e397265    574
Name: era, Length: 2412105, dtype: int64

In [5]:
gc.collect()

61

# Optuna Hyperparameter optimization

**Ordering:**


* Tune max depth, num leaves
* Tune feature_fraction etc using lightgbm tuner
* Tune learning rate



In [6]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.10.0-py3-none-any.whl (308 kB)
     |████████████████████████████████| 308 kB 5.1 MB/s 
[?25hCollecting cliff
  Downloading cliff-3.10.0-py3-none-any.whl (80 kB)
     |████████████████████████████████| 80 kB 6.8 MB/s 
Collecting alembic
  Downloading alembic-1.7.5-py3-none-any.whl (209 kB)
     |████████████████████████████████| 209 kB 57.9 MB/s 
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.1.6-py2.py3-none-any.whl (75 kB)
     |████████████████████████████████| 75 kB 3.5 MB/s 
Collecting cmd2>=1.0.0
  Downloading cmd2-2.3.3-py3-none-any.whl (149 kB)
     |████████████████████████████████| 149 kB 63.0 MB/s 
[?25hCollecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.8.0-py2.py3-none-any.whl (112 kB)
     |████████████████████████████████| 112 kB 45.4 MB/s 
Collecting autopage>=0.4.0
  Downloading autopage-

In [90]:
def correlation(y_true, y_pred):
  return "corr", np.corrcoef(y_true, y_pred)[0, 1], True

In [105]:
import optuna
from optuna.integration import LightGBMPruningCallback
from sklearn.model_selection import KFold
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split

def objective(trial, X, y):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        # "objective": trial.suggest_categorical("objective", ['regression', 'rank_xendcg'])
        "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
        "learning_rate": 0.04,
        "num_leaves": trial.suggest_int("num_leaves", 25, 32),
        "max_depth": trial.suggest_int("max_depth", 5, 7),
        # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        # "lambda_l2": trial.suggest_int("lambda_l2", 0),
        # "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "feature_fraction": 0.8,
        "metric": "custom",
        "objective": "regression"
    }

    cv = KFold(n_splits=4, shuffle=True, random_state=10)

    cv_scores = []
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        dtrain = lgbm.Dataset(X_train, label=y_train)
        dtest = lgbm.Dataset(X_test, label=y_test)

        model = lgbm.LGBMRegressor(**param_grid)
        model.fit(
            X_train,
            y_train,
        )

        preds = model.predict(X_test)
        cv_scores.append(spearmanr(y_test, preds))
        print(f"fold {idx} done")

    return np.mean(cv_scores)

In [107]:
sampler = optuna.samplers.TPESampler() 
study = optuna.create_study(direction="maximize", study_name="LGBM Regression", sampler=sampler)
func = lambda trial: objective(trial, df_train[features], df_train[TARGET_COL])
study.optimize(func, n_trials=10, show_progress_bar=True)

[I 2022-01-05 09:38:24,349] A new study created in memory with name: LGBM Regression

Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



  0%|          | 0/10 [00:00<?, ?it/s]

fold 0 done
fold 1 done
fold 2 done
fold 3 done
[I 2022-01-05 09:47:39,210] Trial 0 finished with value: 0.043561687627538474 and parameters: {'n_estimators': 493, 'num_leaves': 29, 'max_depth': 7}. Best is trial 0 with value: 0.043561687627538474.
fold 0 done
fold 1 done
fold 2 done
fold 3 done
[I 2022-01-05 10:03:30,807] Trial 1 finished with value: 0.0487962138352792 and parameters: {'n_estimators': 893, 'num_leaves': 28, 'max_depth': 6}. Best is trial 1 with value: 0.0487962138352792.
fold 0 done
fold 1 done
fold 2 done
fold 3 done
[I 2022-01-05 10:13:30,839] Trial 2 finished with value: 0.04209085968314114 and parameters: {'n_estimators': 535, 'num_leaves': 25, 'max_depth': 6}. Best is trial 1 with value: 0.0487962138352792.
fold 0 done
fold 1 done
fold 2 done
fold 3 done
[I 2022-01-05 10:20:22,610] Trial 3 finished with value: 0.040286980316480904 and parameters: {'n_estimators': 318, 'num_leaves': 32, 'max_depth': 7}. Best is trial 1 with value: 0.0487962138352792.
fold 0 done
f

In [108]:
optuna.visualization.plot_param_importances(study)

In [109]:
optuna.visualization.plot_optimization_history(study)

In [110]:
optuna.visualization.plot_slice(study)