Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


In [None]:
# Mount Google Drive at the relative path 'drive'; later cells copy the
# prediction CSV files into a folder below this mount point.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
# https://www.kaggle.com/kansukehabano/numerai-training-new-data-for-low-ram

In [None]:
!pip install numerapi duckdb halo

Collecting numerapi
  Downloading numerapi-2.9.2-py3-none-any.whl (25 kB)
Collecting duckdb
  Downloading duckdb-0.3.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 6.3 MB/s 
[?25hCollecting halo
  Downloading halo-0.0.31.tar.gz (11 kB)
Collecting log_symbols>=0.0.14
  Downloading log_symbols-0.0.14-py3-none-any.whl (3.1 kB)
Collecting spinners>=0.0.24
  Downloading spinners-0.0.24-py3-none-any.whl (5.5 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: halo
  Building wheel for halo (setup.py) ... [?25l[?25hdone
  Created wheel for halo: filename=halo-0.0.31-py3-none-any.whl size=11260 sha256=7557d3fd5831da49d437aa9fbcd8010fd31241f4368a593fcc036d6a1599e30f
  Stored in directory: /root/.cache/pip/wheels/95/ff/20/5d16a0059f20c5e60be2df845201e73af179a5a79a3d566f48
Successfully built halo
Installing collected packages: colorama, spinners

In [None]:
import os
import glob
import gc
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import numerapi
from lightgbm import LGBMRegressor, Dataset, train
from numerapi import NumerAPI
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm
)
import scipy

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Oct 16 20:20:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    22W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, in units of 1e9 bytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20
      else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
class PandasDriver:
    """In-memory access to a parquet file, split into groups by era.

    The whole file is read into ``self.df`` on construction and its ``era``
    column is cast to ``int``. Rows belong to group ``era % splits``.
    """

    def __init__(self, pq_path: str, splits=4):
        """Read ``pq_path`` into memory.

        Args:
            pq_path: path of the parquet file to load.
            splits: number of era groups (the modulus used by ``get_by_group``).
        """
        self.pq_path = pq_path
        self.splits = splits

        self.df = pd.read_parquet(pq_path)
        self.df['era'] = self.df['era'].astype('int')

    def get_by_group(self, group_id: int, cols=None):
        """Return the rows whose ``era % splits`` equals ``group_id``.

        Args:
            group_id: group number; a value equal to ``splits`` is treated as 0,
                so callers may number groups ``1..splits``.
            cols: optional column selection applied to the result. ``None``
                (the default) returns every column.

        Returns:
            The matching rows of ``self.df``, restricted to ``cols`` if given.
        """
        if group_id == self.splits:
            group_id = 0
        group = self.df[self.df['era'] % self.splits == group_id]
        # `cols` used to be accepted but ignored; honour it when provided.
        if cols is not None:
            group = group[cols]
        return group


import duckdb

class DuckDBDriver:
    """Query a parquet file through an in-memory DuckDB connection.

    Rows are selected either by exact era or by era group (``era % splits``),
    and returned as a pandas DataFrame or a NumPy array.
    """

    def __init__(self, pq_path: str, splits=4):
        """Remember the parquet path and open an in-memory DuckDB connection.

        Args:
            pq_path: path of the parquet file to scan.
            splits: number of era groups (the modulus used by ``get_by_group``).
        """
        self.pq_path = pq_path
        self.splits = splits
        self.conn = duckdb.connect(":memory:")

    def _gen_select_statement(self) -> str:
        """Build the ``SELECT * FROM parquet_scan(...)`` prefix (trailing space included)."""
        # Double any single quote so the path cannot terminate the SQL literal.
        safe_path = str(self.pq_path).replace("'", "''")
        return f"SELECT * FROM parquet_scan('{safe_path}') "

    def _query(self, expression: str, params=None):
        """Execute ``expression``, binding ``params`` to its ``?`` placeholders if given."""
        if params is None:
            return self.conn.execute(expression)
        return self.conn.execute(expression, params)

    def _fetch(self, ret_query, fetch_type, cols):
        """Materialise a query result.

        Args:
            ret_query: object returned by ``_query`` (must offer ``fetchdf()``).
            fetch_type: ``"pandas"`` for a DataFrame, ``"numpy"`` for an array.
            cols: optional column selection; ``None`` keeps every column.

        Raises:
            ValueError: if ``fetch_type`` is neither ``"pandas"`` nor ``"numpy"``.
        """
        if fetch_type not in ("pandas", "numpy"):
            # Previously an unknown type silently produced None.
            raise ValueError(
                f"unknown fetch_type {fetch_type!r}; expected 'pandas' or 'numpy'")
        frame = ret_query.fetchdf()
        if cols is not None:
            frame = frame[cols]
        if fetch_type == "pandas":
            return frame
        return frame.values

    def get_by_era(self, era: str, cols=None, fetch_type="pandas"):
        """Return the rows whose ``era`` column equals ``era``."""
        expression = self._gen_select_statement()
        # Bind the era as a parameter instead of splicing it into the SQL text.
        expression += "WHERE era = ?"
        ret_query = self._query(expression, [era])
        return self._fetch(ret_query, fetch_type, cols)

    def get_by_group(self, group_id: int, cols=None, fetch_type="pandas"):
        """Return the rows whose ``CAST(era AS INT) % splits`` equals ``group_id``.

        A ``group_id`` equal to ``splits`` is treated as 0, so callers may
        number groups ``1..splits``.
        """
        if group_id == self.splits:
            group_id = 0
        expression = self._gen_select_statement()
        # int() guarantees that only numeric text reaches the SQL string.
        expression += f"WHERE CAST(era AS INT) % {int(self.splits)} = {int(group_id)}"
        ret_query = self._query(expression)
        return self._fetch(ret_query, fetch_type, cols)

In [None]:
def minmax_norm(df):
    """Rescale ``df`` linearly as ``(value - min) / (max - min)``.

    ``min`` and ``max`` are whatever ``df.min()`` / ``df.max()`` return, so a
    Series is scaled as a whole and a DataFrame column by column.
    """
    lowest = df.min()
    value_range = df.max() - lowest
    return (df - lowest) / value_range

In [None]:
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Remove the linear exposure of ``columns`` to ``neutralizers``, era by era.

    For every distinct value of ``era_col`` the selected score columns are
    optionally rank-gaussianised, a ``proportion`` of their least-squares
    projection onto the neutralizer columns is subtracted, and the result is
    divided by its standard deviation (taken over all score columns together).

    Args:
        df: frame holding ``columns``, ``neutralizers`` and ``era_col``.
        columns: list of score columns to neutralize.
        neutralizers: list of columns to neutralize against (default: none).
        proportion: fraction of the projection to subtract (1.0 = all of it).
        normalize: if True, replace each score column by the normal quantiles
            of its ordinal ranks before neutralizing.
        era_col: name of the column that defines the eras.

    Returns:
        DataFrame with ``columns`` as columns and the same index and row order
        as ``df``.
    """
    if neutralizers is None:
        neutralizers = []

    era_values = df[era_col].values
    # Results are written back by row position, so every row keeps its place
    # even when the rows of one era are not contiguous in `df`.
    neutralized = np.empty((len(df), len(columns)), dtype=np.float64)

    for u in df[era_col].unique():
        era_mask = era_values == u
        df_era = df[era_mask]
        scores = df_era[columns].values
        if normalize:
            scores2 = []
            for x in scores.T:
                # Ordinal ranks mapped into (0, 1), then to normal quantiles.
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values

        # Out-of-place arithmetic: works for integer-typed scores too and never
        # writes into an array that might be shared with `df`.
        scores = scores - proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))

        scores = scores / scores.std(ddof=0)

        neutralized[era_mask] = scores

    return pd.DataFrame(neutralized,
                        columns=columns,
                        index=df.index)

In [None]:
def neutralize_series(series, by, proportion=1.0):
    """Subtract from ``series`` its least-squares fit on ``by`` plus a constant.

    Args:
        series: pandas Series of scores to neutralize.
        by: pandas Series to neutralize against (same length as ``series``).
        proportion: fraction of the fitted component to subtract.

    Returns:
        Series with the index of ``series``. With ``proportion=1.0`` the result
        is centred and has zero correlation with ``by``.
    """
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # Add a constant column so the result is centred and therefore gets
    # correlation 0 with `by`. A column of ones is used rather than a column of
    # mean(series): the latter collapses to all zeros when the mean is 0, which
    # silently drops the centring.
    exposures = np.hstack((exposures, np.ones((len(exposures), 1))))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized

In [None]:
def get_biggest_change_features(corrs, n):
    """Return the ``n`` columns whose mean shifts most between index halves.

    The index of ``corrs`` is sorted and cut in the middle; for each column the
    mean over the first half is subtracted from the mean over the second half,
    and the columns with the largest absolute difference are returned as a
    list, largest first.
    """
    ordered_eras = corrs.index.sort_values()
    midpoint = len(ordered_eras) // 2
    first_half, second_half = ordered_eras[:midpoint], ordered_eras[midpoint:]

    mean_shift = corrs.loc[second_half, :].mean() - corrs.loc[first_half, :].mean()
    ranked = mean_shift.abs().sort_values(ascending=False)
    return ranked.head(n).index.tolist()

In [None]:
##############################################################################
##############################################################################
##############################################################################

In [None]:
# API client and the round number used later to name the tournament file.
napi = numerapi.NumerAPI(verbosity="info")
current_round = napi.get_current_round(tournament=8)

# Local file names for the downloaded datasets.
train_pq_path = "numerai_training_data.parquet"
tournament_pq_path = "numerai_tournament_data.parquet"
valid_pq_path = "numerai_validation_data.parquet"
valid_preds_pq_path = "example_validation_predictions.parquet"

# (remote dataset name, local destination) pairs, fetched in this order.
downloads = (
    ("numerai_training_data_int8.parquet", train_pq_path),
    ("numerai_tournament_data_int8.parquet", tournament_pq_path),
    ("numerai_validation_data_int8.parquet", valid_pq_path),
    (valid_preds_pq_path, valid_preds_pq_path),
)
for remote_name, local_path in downloads:
    napi.download_dataset(remote_name, local_path)

2021-10-16 20:20:49,612 INFO numerapi.utils: starting download
numerai_training_data.parquet: 1.01GB [00:58, 17.3MB/s]                            
2021-10-16 20:21:48,948 INFO numerapi.utils: starting download
numerai_tournament_data.parquet: 582MB [00:42, 13.7MB/s]                           
2021-10-16 20:22:32,323 INFO numerapi.utils: starting download
numerai_validation_data.parquet: 228MB [00:18, 12.5MB/s]                           
2021-10-16 20:22:51,431 INFO numerapi.utils: starting download
example_validation_predictions.parquet: 13.0MB [00:01, 7.11MB/s]                            


In [None]:
# Column names shared by the cells below.
EXAMPLE_PREDS_COL = "example_preds"  # column that receives the example predictions
TARGET_COL = "target"  # column the models are fitted on
ERA_COL = "era"  # column used to group rows by era

In [None]:
# Open each downloaded dataset as a pyarrow ParquetFile. Later cells read the
# column names from `train_pq` and iterate `tournament_pq` in batches.
train_pq = pq.ParquetFile(train_pq_path)
gc.collect()
tournament_pq = pq.ParquetFile(tournament_pq_path)
gc.collect()
valid_pq = pq.ParquetFile(valid_pq_path)
gc.collect()

0

In [None]:
# Split the training schema into feature and target columns by name; a column
# lands in a list when its name contains the corresponding substring.
col_names = train_pq.schema.names
feature_cols, target_cols = [], []
for name in col_names:
    if "feature" in name:
        feature_cols.append(name)
    if "target" in name:
        target_cols.append(name)

In [None]:
# Load the whole training file into memory; `driver.get_by_group` then hands
# out the rows of one era group at a time.
driver = PandasDriver(train_pq_path)

In [None]:
##############################################################################
#########################    TRAINING TIME   #################################
##############################################################################

In [None]:
# Hyper-parameters shared by every per-group LightGBM model.
params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 2 ** 5,
    "colsample_bytree": 0.1,
}

# Fit one model per era group. The group ids run 1..driver.splits (the driver
# maps the last id to remainder 0), so the number of models follows the
# driver's configuration instead of a hard-coded 4.
models = []
for group_id in tqdm(range(1, driver.splits + 1)):
    df = driver.get_by_group(group_id)
    model = LGBMRegressor(**params)
    model.fit(df[feature_cols].values, df[TARGET_COL].values)
    models.append(model)
    # Release the group's rows before the next group is selected.
    del df
    gc.collect()


  0%|          | 0/4 [00:00<?, ?it/s]

2021-10-16 20:22:59,971 INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [None]:
# Load the validation rows and attach the example predictions as an extra
# column, then drop the temporary predictions frame.
validation_data = pd.read_parquet(valid_pq_path)
validation_preds = pd.read_parquet(valid_preds_pq_path)
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
del validation_preds
gc.collect()

19

In [None]:
!git clone https://github.com/numerai/example-scripts.git
%cd example-scripts

Cloning into 'example-scripts'...
remote: Enumerating objects: 866, done.[K
remote: Counting objects: 100% (596/596), done.[K
remote: Compressing objects: 100% (437/437), done.[K
remote: Total 866 (delta 289), reused 373 (delta 152), pack-reused 270[K
Receiving objects: 100% (866/866), 29.90 MiB | 25.49 MiB/s, done.
Resolving deltas: 100% (417/417), done.
/content/example-scripts


In [None]:
# NOTE: this import rebinds `neutralize` and `neutralize_series`, so from here
# on the notebook uses the versions from the `utils` module rather than the
# functions of the same names defined in earlier cells.
from utils import validation_metrics, neutralize, neutralize_series

In [None]:
# Score the validation rows with every model, one `preds_<k>` column per model
# (k starts at 1), then compute the validation metrics for those columns
# together with the example predictions.
pred_cols = []
for model_number, model in enumerate(models, start=1):
    pred_col = f"preds_{model_number}"
    validation_data.loc[:, pred_col] = model.predict(
        validation_data.loc[:, feature_cols].values)
    pred_cols.append(pred_col)

validation_stats = validation_metrics(
    validation_data,
    pred_cols + [EXAMPLE_PREDS_COL],
    example_col=EXAMPLE_PREDS_COL,
    fast_mode=True,
)

In [None]:
# Show four of the validation metrics per prediction column as a markdown table.
print(validation_stats[["mean", "sharpe", "mmc_mean", "corr_plus_mmc_sharpe"]].to_markdown())

|               |      mean |   sharpe |     mmc_mean |   corr_plus_mmc_sharpe |
|:--------------|----------:|---------:|-------------:|-----------------------:|
| preds_1       | 0.0233212 | 0.700622 |  0.00402185  |               0.582638 |
| preds_2       | 0.0231148 | 0.709725 |  0.00381313  |               0.594807 |
| preds_3       | 0.023349  | 0.707149 |  0.00381045  |               0.589077 |
| preds_4       | 0.0244287 | 0.752996 |  0.00446553  |               0.642987 |
| example_preds | 0.0254531 | 0.957381 | -2.63205e-05 |               0.955276 |


In [None]:
# Download the training dataset under its remote file name. The same remote
# dataset was already downloaded earlier in the notebook, to the local name
# stored in `train_pq_path`.
napi.download_dataset("numerai_training_data_int8.parquet", "numerai_training_data_int8.parquet")

2021-10-16 20:34:07,838 INFO numerapi.utils: starting download
numerai_training_data_int8.parquet: 1.01GB [00:49, 20.4MB/s]                            


In [None]:
# Load the full training set into its own frame (separate from `driver.df`);
# it is used below for the per-era feature correlations.
training_data = pd.read_parquet('numerai_training_data_int8.parquet')

In [None]:
# For each era, the correlation of every feature column with the target:
# the result has one row per era and one column per feature.
all_feature_corrs = training_data.groupby(ERA_COL).apply(lambda d: d[feature_cols].corrwith(d[TARGET_COL]))

In [None]:
# The 500 features whose mean per-era correlation with the target changes most
# between the first and the second half of the eras.
riskiest_features = get_biggest_change_features(all_feature_corrs, 500)

In [None]:
# Neutralize the fourth model's validation predictions against the riskiest
# features, era by era.
validation_data['preds_4_neutralize'] = neutralize(
    df=validation_data,
    columns=["preds_4"],
    neutralizers=riskiest_features,
    proportion=1,
    normalize=True,
    era_col=ERA_COL,
)

In [None]:
# Rescale the neutralized validation predictions to the [0, 1] range.
validation_data['preds_4_neutralize'] = minmax_norm(validation_data['preds_4_neutralize'])

In [None]:
###############################################################################
###############################################################################

In [None]:
# Turn the chosen column into percentile ranks and write them, indexed like
# `validation_data`, to a CSV file in the working directory.
model_to_submit = "preds_4_neutralize"
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
validation_data["prediction"].to_csv("validation_predictions.csv")

In [None]:
# Copy the validation predictions to the mounted Drive folder. Both paths are
# relative, so they are resolved against the current working directory.
!cp validation_predictions.csv "drive/My Drive/BeeChain Foundation/"

In [None]:
###############################################################################

In [None]:
# Disabled cleanup: the statements are wrapped in a string literal, so nothing
# is deleted or collected; the cell only echoes the string as its output.
'''
del validation_data
gc.collect()
'''

'\ndel validation_data\ngc.collect()\n'

In [None]:
###############################################################################
###############################################################################

In [None]:
# Download the tournament dataset to a file name stamped with the current
# round number, then load it fully into memory.
napi.download_dataset("numerai_tournament_data_int8.parquet", f"numerai_tournament_data_int8_{current_round}.parquet")
gc.collect()
tournament_data = pd.read_parquet(f'numerai_tournament_data_int8_{current_round}.parquet')
gc.collect()

2021-10-16 20:38:21,256 INFO numerapi.utils: starting download
numerai_tournament_data_int8_286.parquet: 582MB [00:34, 16.9MB/s]                           


0

In [None]:
batch_size = 2000
# One row per tournament row, one column per model.
tournament_preds = np.zeros((len(tournament_data), len(models)))

# Stream the tournament file in batches, reading only the feature columns.
tournament_batches = tournament_pq.iter_batches(batch_size, columns=feature_cols)

# `batch_size` is only an upper bound on the rows per batch, so the write
# position is tracked with a running offset instead of `i * batch_size`.
row_offset = 0
for batch in tqdm(tournament_batches):
    features = batch.to_pandas()[feature_cols]
    row_stop = row_offset + len(features)
    for j, model in enumerate(models):
        tournament_preds[row_offset:row_stop, j] = model.predict(features)
    row_offset = row_stop
gc.collect()

# The predictions are attached to `tournament_data` by position afterwards, so
# both sources must contain the same number of rows.
if row_offset != len(tournament_data):
    raise ValueError(
        f"predicted {row_offset} rows but tournament_data has {len(tournament_data)} rows")


0it [00:00, ?it/s]

In [None]:
# Attach one `preds_<k>` column per model (k starts at 1). The number of
# columns follows the prediction matrix instead of a hard-coded 4.
tournament_pred_cols = [f"preds_{i+1}" for i in range(tournament_preds.shape[1])]
tournament_data[tournament_pred_cols] = tournament_preds
del tournament_preds

In [None]:
# Preview the first rows only rather than rendering the whole frame.
tournament_data.head()

In [None]:
# Neutralize the fourth model's tournament predictions against the riskiest
# features, era by era.
tournament_data["prediction"] = neutralize(
    df=tournament_data,
    columns=["preds_4"],
    neutralizers=riskiest_features,
    proportion=1,
    normalize=True,
    era_col=ERA_COL,
)

In [None]:
# Rescale the neutralized tournament predictions to the [0, 1] range.
tournament_data["prediction"] = minmax_norm(tournament_data["prediction"])

In [None]:
# Write the tournament predictions, indexed like `tournament_data`, to a CSV
# file in the working directory.
tournament_data["prediction"].to_csv("tournament_predictions.csv")

In [None]:
# Copy the tournament predictions to the mounted Drive folder. Both paths are
# relative, so they are resolved against the current working directory.
!cp tournament_predictions.csv "drive/My Drive/BeeChain Foundation/"