In [None]:
# Mount Google Drive at the relative path 'drive' (Colab-only API); the
# `!cp ... "drive/My Drive/..."` cells later in the notebook rely on this mount point.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
!pip install numerapi duckdb halo



In [None]:
import os
import glob
import gc
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import numerapi
from lightgbm import LGBMRegressor, Dataset, train
from numerapi import NumerAPI
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm
)
import scipy

In [None]:
class PandasDriver:
    """Load a whole parquet file into pandas and serve rows by era group.

    The ``era`` column is cast to ``int`` once at load time so that groups
    can be selected with ``era % splits``.
    """

    def __init__(self, pq_path: str, splits=4):
        """Read ``pq_path`` eagerly into ``self.df``.

        Args:
            pq_path: path of the parquet file to load.
            splits: number of era groups (the modulus used by ``get_by_group``).
        """
        self.pq_path, self.splits = pq_path, splits

        frame = pd.read_parquet(pq_path)
        frame['era'] = frame['era'].astype('int')
        self.df = frame

    def get_by_group(self, group_id: int, cols=None):
        """Return the rows whose ``era % splits`` equals ``group_id``.

        ``group_id == splits`` is treated as group 0. ``cols`` is accepted
        but not used: all columns are always returned.
        """
        residue = 0 if group_id == self.splits else group_id
        in_group = (self.df['era'] % self.splits) == residue
        return self.df[in_group]


import duckdb

class DuckDBDriver:
    """Query a parquet file through an in-memory DuckDB connection.

    Rows are selected either by exact era (``get_by_era``) or by era group,
    i.e. ``CAST(era AS INT) % splits`` (``get_by_group``), and returned as a
    pandas DataFrame or a numpy array.
    """

    def __init__(self, pq_path: str, splits=4):
        """Remember the parquet path and open an in-memory DuckDB connection.

        Args:
            pq_path: path of the parquet file scanned by every query.
            splits: number of era groups (the modulus used by ``get_by_group``).
        """
        self.pq_path = pq_path
        self.splits = splits
        self.conn = duckdb.connect(":memory:")

    @staticmethod
    def _quote(value) -> str:
        """Render ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the value cannot terminate the
        literal early (e.g. a path or era containing an apostrophe).
        """
        return "'" + str(value).replace("'", "''") + "'"

    def _gen_select_statement(self) -> str:
        """Return the ``SELECT * FROM parquet_scan(...)`` prefix (with trailing space)."""
        return f"SELECT * FROM parquet_scan({self._quote(self.pq_path)}) "

    def _query(self, expression: str):
        """Execute ``expression`` on the connection and return its result handle."""
        return self.conn.execute(expression)

    def _fetch(self, ret_query, fetch_type, cols):
        """Materialize a query result.

        Args:
            ret_query: object returned by ``_query`` (must provide ``fetchdf()``).
            fetch_type: ``"pandas"`` for the full DataFrame (``cols`` is ignored),
                or ``"numpy"`` for the values of ``cols``.
            cols: columns to keep in numpy mode; ``None`` keeps every column.

        Raises:
            ValueError: if ``fetch_type`` is not ``"pandas"`` or ``"numpy"``.
        """
        if fetch_type == "pandas":
            return ret_query.fetchdf()
        if fetch_type == "numpy":
            frame = ret_query.fetchdf()
            # cols=None is the public default; indexing with None would raise KeyError.
            selected = frame if cols is None else frame[cols]
            return selected.values
        raise ValueError(
            f"unknown fetch_type {fetch_type!r}; expected 'pandas' or 'numpy'")

    def get_by_era(self, era: str, cols=None, fetch_type="pandas"):
        """Return the rows whose ``era`` equals ``era`` (compared as a string literal)."""
        expression = self._gen_select_statement()
        expression += f"WHERE era = {self._quote(era)}"
        ret_query = self._query(expression)
        return self._fetch(ret_query, fetch_type, cols)

    def get_by_group(self, group_id: int, cols=None, fetch_type="pandas"):
        """Return the rows whose ``CAST(era AS INT) % splits`` equals ``group_id``.

        ``group_id == splits`` is treated as group 0.
        """
        if group_id == self.splits:
            group_id = 0
        expression = self._gen_select_statement()
        expression += f"WHERE CAST(era AS INT) % {self.splits} = {group_id}"
        ret_query = self._query(expression)
        return self._fetch(ret_query, fetch_type, cols)

In [None]:
def minmax_norm(df):
    """Rescale ``df`` linearly so that its minimum maps to 0 and its maximum to 1.

    Uses ``df.min()`` / ``df.max()``, so a DataFrame is scaled column by
    column and a Series as a whole.
    """
    lowest = df.min()
    spread = df.max() - lowest
    return (df - lowest) / spread

def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Remove the linear exposure of ``columns`` to ``neutralizers``, era by era.

    For every distinct value of ``df[era_col]`` the selected columns are
    optionally rank-gaussianized, have ``proportion`` of their least-squares
    projection onto the neutralizer columns subtracted, and are then divided
    by the standard deviation of the whole per-era block (all columns pooled).

    Args:
        df: DataFrame containing ``era_col``, ``columns`` and ``neutralizers``.
        columns: list of column names to neutralize.
        neutralizers: list of column names to neutralize against; ``None`` or
            an empty list skips the projection step.
        proportion: fraction of the projection to subtract (1.0 = all of it).
        normalize: if True, each column is first replaced, within the era, by
            the standard-normal quantiles of its ordinal ranks.
        era_col: name of the grouping column.

    Returns:
        A float DataFrame with ``columns`` as columns and ``df.index`` as
        index. Each output row corresponds to the same row of ``df``, whatever
        the order of the eras in ``df``. Rows whose era is missing are NaN.
        ``df`` itself is never modified.
    """
    # Function-scope import: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule has been loaded.
    from scipy import stats

    if neutralizers is None:
        neutralizers = []

    era_values = df[era_col]
    # Results are written back by row position, so they stay aligned with `df`
    # even when the rows of one era are not contiguous.
    result = np.full((len(df), len(columns)), np.nan, dtype=np.float64)

    for era in era_values.unique():
        in_era = (era_values == era).to_numpy()
        if not in_era.any():
            # A NaN era never equals itself; leave those rows as NaN.
            continue
        df_era = df[in_era]

        # Float copy: the in-place arithmetic below must neither fail on
        # integer columns nor write through to the caller's frame.
        scores = df_era[columns].to_numpy(dtype=np.float64, copy=True)
        if normalize:
            n_rows = len(scores)
            scores = np.array([
                stats.norm.ppf(
                    (stats.rankdata(col, method='ordinal') - .5) / n_rows)
                for col in scores.T
            ]).T

        if len(neutralizers) > 0:
            exposures = df_era[neutralizers].values
            # The pseudo-inverse is computed in float32, as in the original code.
            scores -= proportion * exposures.dot(
                np.linalg.pinv(exposures.astype(np.float32)).dot(
                    scores.astype(np.float32)))

        scores /= scores.std(ddof=0)
        result[in_era] = scores

    return pd.DataFrame(result, columns=columns, index=df.index)

In [None]:
def neutralize_series(series, by, proportion=1.0):
    """Subtract from ``series`` its least-squares fit on ``by`` plus an intercept.

    Args:
        series: pandas Series of scores to neutralize.
        by: pandas Series of the same length to neutralize against.
        proportion: fraction of the fitted component to remove (1.0 = all).

    Returns:
        A Series with the index of ``series``. With ``proportion=1.0`` the
        result has zero mean and zero correlation with ``by``.
    """
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # Add an intercept column so the result is centered and gets correlation 0
    # with `by`. A column of ones is used rather than a column filled with
    # mean(series): the latter degenerates to all zeros when the mean is 0.
    exposures = np.hstack((exposures, np.ones((len(exposures), 1))))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    return pd.Series(corrected_scores.ravel(), index=series.index)


def get_biggest_change_features(corrs, n):
    """Return the ``n`` columns whose mean moved most between the two halves of the index.

    The index of ``corrs`` is sorted and cut in the middle (the second half
    gets the extra row when the length is odd). For each column the mean over
    the first half is subtracted from the mean over the second half, and the
    column names with the largest absolute difference are returned, largest first.

    Args:
        corrs: DataFrame indexed by era with one column per feature
            (presumably per-era feature correlations; verify against caller).
        n: number of column names to return.
    """
    ordered_eras = corrs.index.sort_values()
    midpoint = len(ordered_eras) // 2

    first_half_mean = corrs.loc[ordered_eras[:midpoint], :].mean()
    second_half_mean = corrs.loc[ordered_eras[midpoint:], :].mean()

    shift = (second_half_mean - first_half_mean).abs()
    ranked = shift.sort_values(ascending=False)
    return ranked.head(n).index.tolist()

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [None]:
%%time
# Create the Numerai API client, look up the current round number and download
# the int8 tournament and validation datasets into the working directory.
print("# Loading data...")

napi = numerapi.NumerAPI(verbosity="info")
# tournament=8 is passed as the tournament id (presumably the main Numerai tournament; confirm).
current_round = napi.get_current_round(tournament=8)

# The local file names drop the "_int8" suffix of the remote dataset names.
tournament_pq_path = "numerai_tournament_data.parquet"
napi.download_dataset("numerai_tournament_data_int8.parquet", tournament_pq_path)

validation_pq_path = "numerai_validation_data.parquet"
napi.download_dataset("numerai_validation_data_int8.parquet", validation_pq_path)


print("# All Loaded...")

# Loading data...


2021-10-17 10:35:17,150 INFO numerapi.utils: starting download
numerai_tournament_data.parquet: 582MB [02:39, 3.65MB/s]                           
2021-10-17 10:37:58,012 INFO numerapi.utils: starting download
numerai_validation_data.parquet: 228MB [00:19, 11.6MB/s]                           

# All Loaded...
CPU times: user 17.2 s, sys: 4.57 s, total: 21.7 s
Wall time: 3min 1s





In [None]:
# Column-name constants.
EXAMPLE_PREDS_COL = "example_preds"
TARGET_COL = "target"
ERA_COL = "era"

# Open pyarrow handles on the two downloaded parquet files; the tournament
# handle is used below to read the column names from the schema.
tournament_pq = pq.ParquetFile(tournament_pq_path)
gc.collect()

validation_pq = pq.ParquetFile(validation_pq_path)
# As the cell's last expression, gc.collect()'s return value is what the cell displays.
gc.collect()

0

In [None]:
# Column names are taken from the tournament file's parquet schema and split
# by substring: any name containing "feature" / "target" is selected.
col_names = tournament_pq.schema.names
feature_cols = [name for name in col_names if "feature" in name]
target_cols = [name for name in col_names if "target" in name]

In [None]:
# Download the tournament data to a file stamped with the current round
# number, then load the whole file into memory.
tournament_round_path = f"numerai_tournament_data_int8_{current_round}.parquet"
napi.download_dataset("numerai_tournament_data_int8.parquet", tournament_round_path)
gc.collect()
tournament_data = pd.read_parquet(tournament_round_path)
gc.collect()

2021-10-17 10:38:18,697 INFO numerapi.utils: starting download
numerai_tournament_data_int8_286.parquet: 582MB [00:48, 12.1MB/s]                           


0

In [None]:
# Download the validation data to a file stamped with the current round
# number, then load the whole file into memory.
validation_round_path = f"numerai_validation_data_int8_{current_round}.parquet"
napi.download_dataset("numerai_validation_data_int8.parquet", validation_round_path)
gc.collect()
validation_data = pd.read_parquet(validation_round_path)
gc.collect()

2021-10-17 10:39:11,878 INFO numerapi.utils: starting download
numerai_validation_data_int8_286.parquet: 228MB [00:18, 12.3MB/s]                           


0

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Ask for a file through the Colab upload widget and parse it as CSV into df1.
# `uploaded` is indexed by file name, so the uploaded file must be named
# exactly 'tournament_predictions_0.csv'.
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd
df1 = pd.read_csv(io.BytesIO(uploaded['tournament_predictions_0.csv']))

Saving tournament_predictions_0.csv to tournament_predictions_0.csv


In [None]:
# Ask for a file through the Colab upload widget and parse it as CSV into df3.
# `uploaded` is indexed by file name, so the uploaded file must be named
# exactly 'tournament_predictions_2.csv'.
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd
df3 = pd.read_csv(io.BytesIO(uploaded['tournament_predictions_2.csv']))

Saving tournament_predictions_2.csv to tournament_predictions_2.csv


In [None]:
# Ask for a file through the Colab upload widget and parse it as CSV into df4.
# `uploaded` is indexed by file name, so the uploaded file must be named
# exactly 'tournament_predictions_3.csv'.
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd
df4 = pd.read_csv(io.BytesIO(uploaded['tournament_predictions_3.csv']))

Saving tournament_predictions_3.csv to tournament_predictions_3.csv


In [None]:
# Name of the prediction column, both in the uploaded CSVs and in the frame written out below.
PREDICTION_NAME = 'prediction'

In [None]:
# Rank-sum ensemble: rank each model's predictions, add the ranks row by row
# (pandas aligns the three Series on their index) and rescale the sum to [0, 1].
ensemble = minmax_norm(
    df1[PREDICTION_NAME].rank()
    + df3[PREDICTION_NAME].rank()
    + df4[PREDICTION_NAME].rank()
)

2021-10-17 11:10:33,284 INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [None]:
# `.values` drops the ensemble's index, so predictions are attached by row position.
# NOTE(review): assumes the uploaded CSV rows are in the same order as tournament_data; confirm.
tournament_data[PREDICTION_NAME] = ensemble.values

In [None]:
# Display the prediction column that was just assigned (indexed by id).
tournament_data[PREDICTION_NAME]

id
n000101811a8a843    0.094478
n001e1318d5072ac    0.339292
n002a9c5ab785cbb    0.638921
n002ccf6d0e8c5ad    0.942604
n0051ab821295c29    0.273725
                      ...   
nffcd0cc3c25e2c0    0.732088
nffd081d09c25655    0.658461
nffd15c6801bb79e    0.042669
nffd58ce10668108    0.409653
nfffcdc415da8e62    0.564372
Name: prediction, Length: 1412932, dtype: float64

In [None]:
# Write the prediction column, together with the frame's index, to CSV in the
# working directory. Plain string literal: the f-prefix had no placeholders.
tournament_data[PREDICTION_NAME].to_csv("tournament_predictions.csv")

In [None]:
# Copy the CSV into the mounted Google Drive folder (requires the Drive mount at 'drive').
!cp tournament_predictions.csv "drive/My Drive/BeeChain Foundation/"

In [None]:
################################################################################

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Ask for a file through the Colab upload widget and parse it as CSV into df1_val.
# `uploaded` is indexed by file name, so the uploaded file must be named
# exactly 'validation_predictions_0.csv'.
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd
df1_val = pd.read_csv(io.BytesIO(uploaded['validation_predictions_0.csv']))

Saving validation_predictions_0.csv to validation_predictions_0.csv


In [None]:
# Ask for a file through the Colab upload widget and parse it as CSV into df3_val.
# `uploaded` is indexed by file name, so the uploaded file must be named
# exactly 'validation_predictions_2.csv'.
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd
df3_val = pd.read_csv(io.BytesIO(uploaded['validation_predictions_2.csv']))

Saving validation_predictions_2.csv to validation_predictions_2.csv


In [None]:
# Ask for a file through the Colab upload widget and parse it as CSV into df4_val.
# `uploaded` is indexed by file name, so the uploaded file must be named
# exactly 'validation_predictions_3.csv'.
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd
df4_val = pd.read_csv(io.BytesIO(uploaded['validation_predictions_3.csv']))

Saving validation_predictions_3.csv to validation_predictions_3.csv


In [None]:
# Name of the prediction column for the validation section (same value as assigned earlier in the notebook).
PREDICTION_NAME = 'prediction'

In [None]:
# Rank-sum ensemble on the validation predictions: rank each model's column,
# add the ranks row by row (pandas aligns on index) and rescale to [0, 1].
# This overwrites the `ensemble` computed for the tournament data.
ensemble = minmax_norm(
    df1_val[PREDICTION_NAME].rank()
    + df3_val[PREDICTION_NAME].rank()
    + df4_val[PREDICTION_NAME].rank()
)

In [None]:
# `.values` drops the ensemble's index, so predictions are attached by row position.
# NOTE(review): assumes the uploaded CSV rows are in the same order as validation_data; confirm.
validation_data[PREDICTION_NAME] = ensemble.values

In [None]:
# Display the prediction column that was just assigned (indexed by id).
validation_data[PREDICTION_NAME]

id
n000777698096000    0.371298
n0009793a3b91c27    0.659135
n00099ccd6698ab0    0.842873
n0019e36bbb8702b    0.793830
n0028cb874439df8    0.144877
                      ...   
nffbe5152c321f92    0.768686
nffc011b4baa54c3    0.434585
nffc12b2a846ab4e    0.511987
nffc3c5ab0235de0    0.247307
nffe714f0da4d819    0.180033
Name: prediction, Length: 539658, dtype: float64

In [None]:
# Write the prediction column, together with the frame's index, to CSV in the
# working directory. Plain string literal: the f-prefix had no placeholders.
validation_data[PREDICTION_NAME].to_csv("validation_predictions.csv")

In [None]:
# Copy the CSV into the mounted Google Drive folder (requires the Drive mount at 'drive').
!cp validation_predictions.csv "drive/My Drive/BeeChain Foundation/"