Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


In [45]:
# Mount Google Drive at the relative path 'drive'; the export cells at the end
# of the notebook copy their CSV files into a folder under this mount point.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [2]:
# https://www.kaggle.com/kansukehabano/numerai-training-new-data-for-low-ram

In [3]:
# Install numerapi, duckdb and halo into the runtime.
# NOTE(review): versions are unpinned; the recorded run resolved numerapi 2.9.2,
# duckdb 0.3.0 and halo 0.0.31 — consider pinning for reproducibility.
!pip install numerapi duckdb halo

Collecting numerapi
  Downloading numerapi-2.9.2-py3-none-any.whl (25 kB)
Collecting duckdb
  Downloading duckdb-0.3.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 20.2 MB/s 
[?25hCollecting halo
  Downloading halo-0.0.31.tar.gz (11 kB)
Collecting log_symbols>=0.0.14
  Downloading log_symbols-0.0.14-py3-none-any.whl (3.1 kB)
Collecting spinners>=0.0.24
  Downloading spinners-0.0.24-py3-none-any.whl (5.5 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: halo
  Building wheel for halo (setup.py) ... [?25l[?25hdone
  Created wheel for halo: filename=halo-0.0.31-py3-none-any.whl size=11260 sha256=1f4914685cbf8efadbf84a909c3c28e213590585af4f0add8ba84bd1f770c1fd
  Stored in directory: /root/.cache/pip/wheels/95/ff/20/5d16a0059f20c5e60be2df845201e73af179a5a79a3d566f48
Successfully built halo
Installing collected packages: colorama, spinner

In [4]:
import os
import glob
import gc
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import numerapi
from lightgbm import LGBMRegressor, Dataset, train
from numerapi import NumerAPI
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm
)
import scipy

In [5]:
# Capture the output lines of `nvidia-smi` and join them into a single string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the command output is taken to mean that no GPU is attached.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Oct 16 19:45:33 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
ram_verdict = 'You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime'
print(ram_verdict)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [7]:
class PandasDriver:
    """In-memory driver that serves era-based row subsets of a parquet file.

    The whole file is read once with ``pd.read_parquet`` and its ``era``
    column is cast to ``int`` so rows can be bucketed by ``era % splits``.
    """

    def __init__(self, pq_path: str, splits=4):
        """Load ``pq_path`` into memory.

        Args:
            pq_path: Path of the parquet file to read.
            splits: Number of era buckets used by ``get_by_group``.
        """
        self.pq_path = pq_path
        self.splits = splits

        self.df = pd.read_parquet(pq_path)
        self.df['era'] = self.df['era'].astype('int')

    def get_by_group(self, group_id: int, cols=None):
        """Return the rows whose ``era % splits`` equals ``group_id``.

        Args:
            group_id: Bucket id. ``group_id == splits`` is mapped onto
                remainder ``0`` so ids ``1..splits`` cover every era.
            cols: Optional list of column labels to keep. ``None`` (the
                default) returns every column.

        Returns:
            A DataFrame with the selected rows (and columns, when ``cols``
            is given).
        """
        if group_id == self.splits:
            group_id = 0
        subset = self.df[self.df['era'] % self.splits == group_id]
        # `cols` used to be accepted but silently ignored.
        if cols is not None:
            subset = subset[cols]
        return subset


import duckdb

class DuckDBDriver:
    """Driver that queries a parquet file through an in-memory DuckDB connection.

    The constructor only opens the connection; rows are read from the file
    with ``parquet_scan`` each time a ``get_by_*`` method runs.
    """

    # Result formats understood by `_fetch`.
    _FETCH_TYPES = ("pandas", "numpy")

    def __init__(self, pq_path: str, splits=4):
        """Remember the parquet path / bucket count and open the connection.

        Args:
            pq_path: Path of the parquet file to query.
            splits: Number of era buckets used by ``get_by_group``.
        """
        self.pq_path = pq_path
        self.splits = splits
        self.conn = duckdb.connect(":memory:")

    def _gen_select_statement(self) -> str:
        """Return the ``SELECT ... FROM parquet_scan(...)`` prefix (with a trailing space)."""
        # Double any single quote so a path containing one cannot end the SQL literal.
        safe_path = str(self.pq_path).replace("'", "''")
        return f"SELECT * FROM parquet_scan('{safe_path}') "

    def _query(self, expression: str, params=None):
        """Execute ``expression``, binding ``params`` to its ``?`` placeholders if given."""
        if params is None:
            return self.conn.execute(expression)
        return self.conn.execute(expression, params)

    def _fetch(self, ret_query, fetch_type, cols):
        """Materialise a query result.

        Args:
            ret_query: Object returned by ``_query`` (must offer ``fetchdf()``).
            fetch_type: ``"pandas"`` for a DataFrame, ``"numpy"`` for an ndarray.
            cols: Optional list of columns to keep; ``None`` keeps all of them.

        Raises:
            ValueError: If ``fetch_type`` is not one of the supported values
                (previously this case silently returned ``None``).
        """
        if fetch_type not in self._FETCH_TYPES:
            raise ValueError(
                f"fetch_type must be one of {self._FETCH_TYPES}, got {fetch_type!r}")
        frame = ret_query.fetchdf()
        if cols is not None:
            frame = frame[cols]
        return frame if fetch_type == "pandas" else frame.values

    def get_by_era(self, era: str, cols=None, fetch_type="pandas"):
        """Return the rows whose ``era`` column equals ``era``."""
        expression = self._gen_select_statement()
        # Bind the era as a query parameter instead of splicing it into the SQL text.
        expression += "WHERE era = ?"
        ret_query = self._query(expression, [era])
        return self._fetch(ret_query, fetch_type, cols)

    def get_by_group(self, group_id: int, cols=None, fetch_type="pandas"):
        """Return the rows whose ``CAST(era AS INT) % splits`` equals ``group_id``.

        ``group_id == splits`` is mapped onto remainder ``0``.
        """
        if group_id == self.splits:
            group_id = 0
        expression = self._gen_select_statement()
        # int() guarantees that only integer literals reach the SQL text.
        expression += f"WHERE CAST(era AS INT) % {int(self.splits)} = {int(group_id)}"
        ret_query = self._query(expression)
        return self._fetch(ret_query, fetch_type, cols)

In [8]:
def minmax_norm(df):
    """Rescale ``df`` linearly so its minimum maps to 0 and its maximum to 1.

    Works on anything exposing ``min()`` / ``max()`` and arithmetic (for a
    DataFrame the scaling is applied column by column).
    """
    lowest = df.min()
    value_range = df.max() - lowest
    return (df - lowest) / value_range

In [9]:
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Neutralize prediction columns against exposure columns, era by era.

    For each distinct value of ``era_col`` the rows of that era are taken,
    the ``columns`` are optionally rank-gaussianized, the part explained by
    a least-squares fit on ``neutralizers`` (scaled by ``proportion``) is
    subtracted, and the result is divided by its standard deviation (one
    std over all ``columns`` of the era).

    Args:
        df: Frame holding ``columns``, ``neutralizers`` and ``era_col``.
        columns: List of prediction columns to neutralize.
        neutralizers: List of exposure columns; ``None`` means no exposures.
        proportion: Fraction of the fitted exposure to remove.
        normalize: Rank-transform each column to normal quantiles first.
        era_col: Name of the column that identifies the era of each row.

    Returns:
        DataFrame indexed like ``df`` with one column per entry of
        ``columns``. Row ``k`` of the result always belongs to row ``k`` of
        ``df``, even when the eras are interleaved in ``df``.
    """
    if neutralizers is None:
        neutralizers = []
    # Results are written back by row position. Concatenating the per-era
    # blocks and relabelling them with df.index (the previous approach) put
    # values on the wrong rows whenever an era was not contiguous in df.
    neutralized = np.full((len(df), len(columns)), np.nan)
    for u in df[era_col].unique():
        row_positions = np.flatnonzero((df[era_col] == u).to_numpy())
        if row_positions.size == 0:
            # e.g. a NaN era label never compares equal to itself.
            continue
        df_era = df.iloc[row_positions]
        # Float copy: the in-place arithmetic below must not write into
        # (or be blocked by) the frame's own buffer.
        scores = df_era[columns].values.astype(np.float64)
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values

        # Least-squares projection of the scores onto the exposures,
        # solved in float32 via the pseudo-inverse.
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))

        scores /= scores.std(ddof=0)

        neutralized[row_positions] = scores

    return pd.DataFrame(neutralized,
                        columns=columns,
                        index=df.index)

In [10]:
def neutralize_series(series, by, proportion=1.0):
    """Remove from ``series`` the part linearly explained by ``by``.

    Args:
        series: pandas Series of scores to neutralize.
        by: pandas Series of the same length to neutralize against.
        proportion: Fraction of the fitted component to subtract.

    Returns:
        A Series with the index of ``series``. With ``proportion=1`` the
        result has zero mean and zero correlation with ``by``.
    """
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # Also neutralize against a constant column so the result is centered
    # and is guaranteed to have correlation 0 with the exposures. The column
    # must be non-zero: filling it with mean(series) degenerates to all
    # zeros (i.e. no intercept at all) whenever the series has zero mean.
    exposures = np.hstack((exposures, np.ones((len(exposures), 1))))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized

In [11]:
def get_biggest_change_features(corrs, n):
    all_eras = corrs.index.sort_values()
    h1_eras = all_eras[:len(all_eras) // 2]
    h2_eras = all_eras[len(all_eras) // 2:]

    h1_corr_means = corrs.loc[h1_eras, :].mean()
    h2_corr_means = corrs.loc[h2_eras, :].mean()

    corr_diffs = h2_corr_means - h1_corr_means
    worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
    return worst_n

In [12]:
##############################################################################
##############################################################################
##############################################################################

In [13]:
# API client used for every dataset download in this notebook.
napi = numerapi.NumerAPI(verbosity="info")
# Current round number; a later cell uses it to stamp the tournament filename.
current_round = napi.get_current_round(tournament=8)

# Local filenames the downloaded datasets are stored under.
train_pq_path = "numerai_training_data.parquet"
tournament_pq_path = "numerai_tournament_data.parquet"
valid_pq_path = "numerai_validation_data.parquet"
valid_preds_pq_path = "example_validation_predictions.parquet"

# The int8 variants of the datasets are fetched and saved under the shorter local names above.
napi.download_dataset("numerai_training_data_int8.parquet", train_pq_path)
napi.download_dataset("numerai_tournament_data_int8.parquet", tournament_pq_path)
napi.download_dataset("numerai_validation_data_int8.parquet", valid_pq_path)
# The example validation predictions keep their dataset name as the local filename.
napi.download_dataset(valid_preds_pq_path, valid_preds_pq_path)

2021-10-16 19:45:34,504 INFO numerapi.utils: starting download
numerai_training_data.parquet: 1.01GB [01:03, 15.9MB/s]                            
2021-10-16 19:46:39,286 INFO numerapi.utils: starting download
numerai_tournament_data.parquet: 582MB [00:41, 13.9MB/s]                           
2021-10-16 19:47:21,964 INFO numerapi.utils: starting download
numerai_validation_data.parquet: 228MB [00:13, 16.3MB/s]                           
2021-10-16 19:47:36,816 INFO numerapi.utils: starting download
example_validation_predictions.parquet: 13.0MB [00:01, 7.21MB/s]                            


In [14]:
# Column that will hold the example predictions on the validation frame.
EXAMPLE_PREDS_COL = "example_preds"
# Label column the models are fitted on.
TARGET_COL = "target"
# Column identifying the era of each row (used for grouping and neutralization).
ERA_COL = "era"

In [15]:
# Open ParquetFile handles for the three datasets. Later cells read the schema
# from train_pq and iterate tournament_pq in batches.
train_pq = pq.ParquetFile(train_pq_path)
gc.collect()
tournament_pq = pq.ParquetFile(tournament_pq_path)
gc.collect()
# NOTE(review): valid_pq is not referenced again in the visible cells.
valid_pq = pq.ParquetFile(valid_pq_path)
gc.collect()

0

In [16]:
# Column names are taken from the training file's parquet schema.
col_names = train_pq.schema.names
# Substring match: any column whose name contains "feature" / "target" is kept.
feature_cols = list(filter(lambda name: "feature" in name, col_names))
target_cols = list(filter(lambda name: "target" in name, col_names))

In [17]:
# Load the full training parquet into memory; the driver hands out era-based
# subsets (era % splits) to the training loop.
driver = PandasDriver(train_pq_path)

In [18]:
##############################################################################
#########################    TRAINING TIME   #################################
##############################################################################

In [19]:
# LightGBM hyper-parameters shared by every model.
params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 2 ** 5,
    "colsample_bytree": 0.1,
}
models = []
# One model per era bucket. Bucket ids run 1..driver.splits (the driver maps
# id == splits onto remainder 0), so the loop follows the driver's bucket
# count instead of repeating it as a literal.
for group_id in tqdm(range(1, driver.splits + 1)):
    df = driver.get_by_group(group_id)
    model = LGBMRegressor(**params)
    model.fit(df[feature_cols].values, df[TARGET_COL].values)
    models.append(model)
    # Release the bucket's rows before the next one is materialised.
    del df
    gc.collect()


  0%|          | 0/4 [00:00<?, ?it/s]

2021-10-16 19:47:45,820 INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [20]:
# Load the validation set and the example predictions that ship with it.
validation_data = pd.read_parquet(valid_pq_path)
validation_preds = pd.read_parquet(valid_preds_pq_path)
# Column assignment from a Series: pandas aligns the values on the frame's index.
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]
# The separate predictions frame is no longer needed once the column is copied.
del validation_preds
gc.collect()

19

In [21]:
!git clone https://github.com/numerai/example-scripts.git
%cd example-scripts

Cloning into 'example-scripts'...
remote: Enumerating objects: 866, done.[K
remote: Counting objects: 100% (596/596), done.[K
remote: Compressing objects: 100% (437/437), done.[K
remote: Total 866 (delta 289), reused 373 (delta 152), pack-reused 270[K
Receiving objects: 100% (866/866), 29.90 MiB | 2.77 MiB/s, done.
Resolving deltas: 100% (417/417), done.
/content/example-scripts


In [22]:
# NOTE(review): `neutralize` and `neutralize_series` are also defined earlier in
# this notebook; this import rebinds both names to the versions from `utils`
# (presumably the cloned example-scripts repo), so later calls use those.
from utils import validation_metrics, neutralize, neutralize_series

In [23]:
# Score the validation set with every trained model: one `preds_<k>` column per model.
pred_cols = []
for position, model in enumerate(models, start=1):
    pred_col = f"preds_{position}"
    validation_data.loc[:, pred_col] = model.predict(validation_data.loc[:, feature_cols].values)
    pred_cols.append(pred_col)

# Evaluate each model's column together with the example predictions.
validation_stats = validation_metrics(
    validation_data,
    pred_cols + [EXAMPLE_PREDS_COL],
    example_col=EXAMPLE_PREDS_COL,
    fast_mode=True,
)

In [24]:
# Show the four headline metrics per prediction column as a markdown table.
print(validation_stats[["mean", "sharpe", "mmc_mean", "corr_plus_mmc_sharpe"]].to_markdown())

|               |      mean |   sharpe |     mmc_mean |   corr_plus_mmc_sharpe |
|:--------------|----------:|---------:|-------------:|-----------------------:|
| preds_1       | 0.0233212 | 0.700622 |  0.00402185  |               0.582638 |
| preds_2       | 0.0231148 | 0.709725 |  0.00381313  |               0.594807 |
| preds_3       | 0.023349  | 0.707149 |  0.00381045  |               0.589077 |
| preds_4       | 0.0244287 | 0.752996 |  0.00446553  |               0.642987 |
| example_preds | 0.0254531 | 0.957381 | -2.63205e-05 |               0.955276 |


In [25]:
# Download the int8 training dataset again, saved under its own dataset name
# relative to the current working directory, for the feature-exposure analysis below.
napi.download_dataset("numerai_training_data_int8.parquet", "numerai_training_data_int8.parquet")

2021-10-16 20:00:59,597 INFO numerapi.utils: starting download
numerai_training_data_int8.parquet: 1.01GB [00:55, 18.2MB/s]                            


In [26]:
# Load the full training set as stored in the file (driver.df, by contrast,
# had its era column cast to int).
training_data = pd.read_parquet('numerai_training_data_int8.parquet')

In [27]:
def _feature_target_corrs(era_frame):
    """Correlation of every feature column with the target within one era's rows."""
    return era_frame[feature_cols].corrwith(era_frame[TARGET_COL])


# One row per era, one column per feature.
all_feature_corrs = training_data.groupby(ERA_COL).apply(_feature_target_corrs)

In [28]:
# The 100 features whose mean per-era correlation with the target changed the
# most between the first and the second half of the training eras.
riskiest_features = get_biggest_change_features(all_feature_corrs, 100)

In [29]:
# Fully (proportion=1) neutralize the fourth model's validation predictions
# against the riskiest features, era by era.
validation_data['preds_4_neutralize'] = neutralize(df=validation_data,
                                                            columns=["preds_4"],
                                                            neutralizers=riskiest_features,
                                                            proportion=1,
                                                            normalize=True,
                                                            era_col=ERA_COL)

In [30]:
# Rescale the neutralized predictions so they span the range 0..1.
validation_data['preds_4_neutralize'] = minmax_norm(validation_data['preds_4_neutralize'])

In [31]:
###############################################################################
###############################################################################

In [32]:
# Column whose values are exported as the validation submission.
model_to_submit = "preds_4_neutralize"
# Percentile rank over the whole validation frame.
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
# Plain string literal: the filename has no placeholders, so no f-string is needed.
validation_data["prediction"].to_csv("validation_predictions.csv")

In [46]:
# Copy the exported CSV into the Drive folder; both paths are relative to the
# current working directory, so 'drive' must be mounted there.
!cp validation_predictions.csv "drive/My Drive/BeeChain Foundation/"


In [34]:
###############################################################################

In [35]:
# Disabled cleanup kept as a bare string literal: nothing is deleted here, so
# validation_data stays in memory for the rest of the notebook.
'''
del validation_data
gc.collect()
'''

'\ndel validation_data\ngc.collect()\n'

In [36]:
###############################################################################
###############################################################################

In [37]:
# Download the int8 tournament dataset again, this time under a filename
# stamped with the current round, and load it fully into memory.
napi.download_dataset("numerai_tournament_data_int8.parquet", f"numerai_tournament_data_int8_{current_round}.parquet")
gc.collect()
tournament_data = pd.read_parquet(f'numerai_tournament_data_int8_{current_round}.parquet')
gc.collect()

2021-10-16 20:05:23,618 INFO numerapi.utils: starting download
numerai_tournament_data_int8_286.parquet: 582MB [00:37, 15.4MB/s]                           


0

In [38]:
batch_size = 2000
# One row per tournament row, one column per trained model.
tournament_preds = np.zeros((len(tournament_data), len(models)))

tournament_batches = tournament_pq.iter_batches(batch_size)
gc.collect()
# Track the write position explicitly: a batch may hold fewer than
# `batch_size` rows, so `i * batch_size` is only a valid offset when every
# batch before it was full.
row_offset = 0
for batch in tqdm(tournament_batches):
    features = batch.to_pandas()[feature_cols]
    batch_end = row_offset + len(features)
    for j, model in enumerate(models):
        tournament_preds[row_offset:batch_end, j] = model.predict(features)
    row_offset = batch_end
    # One collection per batch is enough; collecting after every single
    # predict call only slows the loop down.
    gc.collect()

# The predictions are attached to tournament_data by position afterwards, so
# the batched file must have produced exactly one prediction per row.
assert row_offset == len(tournament_data), (
    f"predicted {row_offset} rows but tournament_data has {len(tournament_data)}")


0it [00:00, ?it/s]

In [39]:
# One `preds_<k>` column per trained model; the count follows `models`
# (as in the validation cell) instead of a hard-coded 4, so the column list
# always matches the width of tournament_preds.
tournament_data[[f"preds_{i+1}" for i in range(len(models))]] = tournament_preds
del tournament_preds

In [40]:
# Preview the tournament frame, now carrying the per-model prediction columns
# (pandas truncates the display of large frames).
tournament_data

Unnamed: 0_level_0,era,data_type,feature_dichasial_hammier_spawner,feature_rheumy_epistemic_prancer,feature_pert_performative_hormuz,feature_hillier_unpitied_theobromine,feature_perigean_bewitching_thruster,feature_renegade_undomestic_milord,feature_koranic_rude_corf,feature_demisable_expiring_millepede,feature_unscheduled_malignant_shingling,feature_clawed_unwept_adaptability,feature_rubblier_chlorotic_stogy,feature_untumbled_histologic_inion,feature_piffling_inflamed_jupiter,feature_abstersive_emotional_misinterpreter,feature_unluckiest_mulley_benzyl,feature_escutcheoned_timocratic_kotwal,feature_integrated_extroversive_ambivalence,feature_vedic_mitral_swiz,feature_reclaimed_fallibilist_turpentine,feature_gone_honduran_worshipper,feature_insociable_exultant_tatum,feature_outdated_tapered_speciation,feature_leggiest_slaggiest_inez,feature_chaldean_vixenly_propylite,feature_hysteric_mechanized_recklinghausen,feature_glare_factional_assessment,feature_highland_eocene_berean,feature_seemlier_reorient_monandry,feature_expressed_abhominable_pruning,feature_castrated_presented_quizzer,feature_restricted_aggregately_workmanship,feature_scorbutic_intellectualism_mongoloid,feature_telephonic_shakable_bollock,feature_subglobular_unsalable_patzer,feature_syrian_coital_counterproof,feature_supergene_legible_antarthritic,feature_hypothetic_distressing_endemic,feature_torturesome_estimable_preferrer,...,feature_pronominal_rampant_megaspore,feature_dropsical_suctorial_mnemosyne,feature_corrugated_dotiest_committeewoman,feature_architectonic_godlier_southland,feature_fishiest_simulatory_roadholding,feature_unpruned_pedagoguish_inkblot,feature_forworn_hask_haet,feature_drawable_exhortative_dispersant,feature_metabolic_minded_armorist,feature_investigatory_inerasable_circumvallation,feature_centroclinal_incentive_lancelet,feature_unemotional_quietistic_chirper,feature_behaviorist_microbiological_farina,feature_lofty_acceptable_challenge,feature_coactive_prefatorial_lucy,target,targe
t_nomi_20,target_nomi_60,target_jerome_20,target_jerome_60,target_janet_20,target_janet_60,target_ben_20,target_ben_60,target_alan_20,target_alan_60,target_paul_20,target_paul_60,target_george_20,target_george_60,target_william_20,target_william_60,target_arthur_20,target_arthur_60,target_thomas_20,target_thomas_60,preds_1,preds_2,preds_3,preds_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
n000101811a8a843,0575,test,2,0,4,0,3,0,4,1,0,1,0,1,0,0,1,0,0,4,4,4,1,0,4,4,0,1,2,4,4,0,1,0,2,0,0,0,1,4,...,0,0,4,4,4,4,4,2,2,2,0,1,0,1,3,,,,,,,,,,,,,,,,,,,,,,0.481158,0.494166,0.481880,0.490620
n001e1318d5072ac,0575,test,1,4,2,2,1,3,3,0,3,2,4,2,4,4,3,4,3,2,1,1,4,1,0,1,2,1,0,1,1,3,1,4,3,4,4,4,4,2,...,3,0,2,2,0,0,0,3,3,4,2,3,4,1,4,,,,,,,,,,,,,,,,,,,,,,0.501870,0.500088,0.509231,0.503155
n002a9c5ab785cbb,0575,test,1,2,2,3,1,1,3,0,1,1,2,3,4,4,2,4,2,3,1,2,2,1,2,2,3,1,1,1,1,3,3,4,1,2,3,4,2,1,...,0,0,4,4,0,0,0,0,3,1,1,1,0,3,1,,,,,,,,,,,,,,,,,,,,,,0.497624,0.496649,0.502524,0.498002
n002ccf6d0e8c5ad,0575,test,2,4,2,4,2,4,3,2,2,1,3,1,4,4,4,4,4,0,2,2,0,0,0,0,4,2,2,1,1,3,4,1,0,3,2,3,3,0,...,1,1,2,3,0,0,0,4,4,1,1,0,0,1,1,,,,,,,,,,,,,,,,,,,,,,0.523433,0.525567,0.525722,0.525715
n0051ab821295c29,0575,test,2,0,0,1,0,4,2,1,3,4,1,2,1,3,2,2,2,0,2,4,2,2,1,3,1,1,2,2,2,2,2,2,2,1,1,2,2,2,...,4,2,0,1,2,4,4,3,4,2,1,4,3,0,2,,,,,,,,,,,,,,,,,,,,,,0.487075,0.494763,0.489375,0.489303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffcd0cc3c25e2c0,X,live,4,4,0,4,0,4,3,2,3,3,1,1,4,3,2,4,2,4,3,4,1,2,3,3,0,4,4,3,3,4,4,1,0,1,1,1,1,4,...,2,4,3,3,4,4,0,4,3,4,1,3,3,4,4,,,,,,,,,,,,,,,,,,,,,,0.499605,0.495059,0.494218,0.492529
nffd081d09c25655,X,live,2,4,2,3,4,4,3,1,1,2,1,2,4,4,4,4,4,0,1,0,2,3,2,2,3,1,1,3,3,2,3,1,4,3,3,4,4,1,...,0,2,3,3,2,2,0,2,3,2,4,1,1,4,1,,,,,,,,,,,,,,,,,,,,,,0.501818,0.503945,0.504887,0.503515
nffd15c6801bb79e,X,live,3,3,3,4,4,1,0,0,3,2,4,1,2,2,1,2,1,3,0,0,2,3,0,0,4,3,3,0,0,4,3,4,3,3,4,3,1,0,...,2,4,2,1,0,0,0,0,0,4,0,0,2,0,2,,,,,,,,,,,,,,,,,,,,,,0.492371,0.490597,0.489796,0.487244
nffd58ce10668108,X,live,2,3,1,3,2,4,2,0,4,1,3,0,3,3,4,3,4,2,4,4,3,3,4,4,3,1,1,4,4,2,3,3,3,2,3,3,3,4,...,1,0,3,3,4,4,0,1,3,3,3,0,1,3,1,,,,,,,,,,,,,,,,,,,,,,0.505171,0.505670,0.500270,0.502733


In [41]:
# Fully (proportion=1) neutralize the fourth model's tournament predictions
# against the riskiest features, era by era.
tournament_data["prediction"] = neutralize(df=tournament_data,
                                              columns=["preds_4"],
                                              neutralizers=riskiest_features,
                                              proportion=1,
                                              normalize=True,
                                              era_col=ERA_COL)

In [42]:
# Rescale the neutralized tournament predictions so they span the range 0..1.
tournament_data["prediction"] = minmax_norm(tournament_data["prediction"])

In [43]:
# Write the final predictions (indexed like tournament_data) to CSV.
# Plain string literal: the filename has no placeholders, so no f-string is needed.
tournament_data["prediction"].to_csv("tournament_predictions.csv")

In [47]:
# Copy the exported CSV into the Drive folder; both paths are relative to the
# current working directory, so 'drive' must be mounted there.
!cp tournament_predictions.csv "drive/My Drive/BeeChain Foundation/"