In [None]:
from fast_soft_sort.tf_ops import soft_rank

import tensorflow as tf

import numpy as np

def pearson_corr(x, y):
    """Pearson correlation between two row-vector tensors.

    `x` and `y` are concatenated along axis 0; every row of the stacked
    tensor is treated as one variable whose observations run along axis 1.

    NOTE(review): the final reduction only equals the off-diagonal
    correlation when the stacked tensor has exactly two rows, i.e. `x` and
    `y` are each of shape (1, n) -- confirm against callers.

    :param x: 2-D tensor; its static second dimension supplies the sample count
    :param y: 2-D tensor with the same second dimension as `x`
    :return: scalar tensor holding the correlation coefficient
    """
    stacked = tf.concat([x, y], axis=0)
    centered = stacked - tf.reduce_mean(stacked, axis=1, keepdims=True)

    # Sample covariance between rows, using n - 1 in the denominator.
    covariance = (centered @ tf.transpose(centered)) / (x.shape[1] - 1)

    # D = diag(1 / std_i); D @ cov @ D rescales the covariance to a correlation matrix.
    inv_std = tf.linalg.diag(1 / tf.sqrt(tf.linalg.diag_part(covariance)))
    correlation = inv_std @ covariance @ inv_std

    # For [[1, r], [r, 1]] the mean of all entries is (1 + r) / 2,
    # so 2 * mean - 1 recovers r without indexing into the matrix.
    return tf.reduce_mean(correlation) * 2 - 1

def spearman_corr(x, y):
    """Spearman-style correlation built on soft ranks.

    `x` is replaced by its soft ranks (`soft_rank` with regularization
    strength 0.1) and the Pearson correlation of those ranks with `y` is
    returned. `y` is passed through unchanged; it is not ranked here.

    :param x: tensor to be soft-ranked
    :param y: tensor correlated against the soft ranks of `x`
    :return: whatever `pearson_corr` returns for (soft ranks of x, y)
    """
    soft_ranks = soft_rank(x, regularization_strength=0.1)
    return pearson_corr(soft_ranks, y)

In [None]:
def pearson_r(y_true, y_pred):
    """Correlation distance (1 - Pearson r) between rows of two tensors.

    Each row of `y_true` and `y_pred` is mean-centred and L2-normalised
    along axis 1, so the dot product of two matching rows is their Pearson
    correlation. The value returned is ``1 - r`` averaged over all rows,
    i.e. a quantity to minimise, despite the function's name.

    :param y_true: 2-D tensor of reference values, one sample set per row
    :param y_pred: 2-D tensor of predictions with the same shape
    :return: scalar tensor, mean over rows of (1 - Pearson r)
    """
    x = y_true
    y = y_pred
    mx = tf.reduce_mean(x, axis=1, keepdims=True)
    my = tf.reduce_mean(y, axis=1, keepdims=True)
    xm, ym = x - mx, y - my
    t1_norm = tf.nn.l2_normalize(xm, axis=1)
    t2_norm = tf.nn.l2_normalize(ym, axis=1)

    # `tf.losses.cosine_distance` is a TF1-only symbol (it lives under
    # `tf.compat.v1.losses` in TF2), so the same value is computed directly:
    # 1 - sum(a * b) per row, then the mean over rows, which is what the
    # TF1 op's default reduction yields with unit weights.
    # Unlike the TF1 op, nothing is registered in a graph loss collection.
    row_distance = 1.0 - tf.reduce_sum(t1_norm * t2_norm, axis=1, keepdims=True)
    return tf.reduce_mean(row_distance)

In [None]:
def _get_ranks(arr: np.ndarray) -> np.ndarray:

    """
        Efficiently calculates the ranks of the data.
        Only sorts once to get the ranked data.
        :param arr: A 1D NumPy Array
        :return: A 1D NumPy Array containing the ranks of the data
    """
    temp = arr.argsort(kind='stable')
    ranks = np.empty_like(temp)
    ranks[temp] = np.arange(len(arr))
    return ranks

def spearmans_rho_custom(y_true: np.ndarray, y_pred: np.ndarray) -> np.float32:

    """
        Spearman's Rho computed as the Pearson correlation of rank-transformed
        inputs, using only NumPy.
        Ranks come from `_get_ranks` for both arguments.
        :param y_true: The ground truth labels
        :param y_pred: The predicted labels
        :return: The off-diagonal entry of the 2x2 `np.corrcoef` matrix of the ranks
    """
    rank_pair = [_get_ranks(y_true), _get_ranks(y_pred)]
    corr_matrix = np.corrcoef(*rank_pair)
    return corr_matrix[1, 0]

In [1]:
!pip install numerapi
!pip install vecstack;

Collecting numerapi
  Downloading numerapi-2.6.0-py3-none-any.whl (25 kB)
Installing collected packages: numerapi
Successfully installed numerapi-2.6.0
Collecting vecstack
  Downloading vecstack-0.4.0.tar.gz (18 kB)
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... done
  Created wheel for vecstack: filename=vecstack-0.4.0-py3-none-any.whl size=19880 sha256=dc42440f2c03d076156609e3020018de95cc8ffc22ae57ad9d1c87a8bdf6d48f
  Stored in directory: /root/.cache/pip/wheels/28/fe/0c/fe8e43660e3316d7ce204e59a79a72246c0ae9b6c5c79841c8
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0


In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################


In [2]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from keras.optimizers import SGD,Adam
from keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import matplotlib as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

import math
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.linear_model import SGDRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor 

import torch.nn as nn
import torch.nn.functional as F
import torch
from vecstack import stacking

from sklearn import metrics

def RMSLE(y, pred):
    """Return the square root of the mean squared error between `y` and `pred`.

    NOTE(review): despite the name, no logarithm is applied anywhere here,
    so this is plain RMSE rather than a root-mean-squared-log error --
    confirm which metric was intended.

    :param y: ground-truth values, passed to `metrics.mean_squared_error`
    :param pred: predicted values, passed to `metrics.mean_squared_error`
    :return: the mean squared error raised to the power 0.5
    """
    mse = metrics.mean_squared_error(y, pred)
    return mse ** 0.5


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn import feature_selection
from sklearn import linear_model
from sklearn import svm
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline, FeatureUnion

import keras
import theano
import tensorflow as tf

from collections import OrderedDict
import pickle
import time

In [4]:
pip install tsne

Collecting tsne
  Downloading tsne-0.3.1.tar.gz (547 kB)
[?25l[K     |▋                               | 10 kB 25.8 MB/s eta 0:00:01[K     |█▏                              | 20 kB 30.7 MB/s eta 0:00:01[K     |█▉                              | 30 kB 11.8 MB/s eta 0:00:01[K     |██▍                             | 40 kB 9.4 MB/s eta 0:00:01[K     |███                             | 51 kB 5.2 MB/s eta 0:00:01[K     |███▋                            | 61 kB 5.7 MB/s eta 0:00:01[K     |████▏                           | 71 kB 5.5 MB/s eta 0:00:01[K     |████▉                           | 81 kB 6.2 MB/s eta 0:00:01[K     |█████▍                          | 92 kB 6.3 MB/s eta 0:00:01[K     |██████                          | 102 kB 5.1 MB/s eta 0:00:01[K     |██████▋                         | 112 kB 5.1 MB/s eta 0:00:01[K     |███████▏                        | 122 kB 5.1 MB/s eta 0:00:01[K     |███████▉                        | 133 kB 5.1 MB/s eta 0:00:01[K     |████████▍ 

In [5]:
# t-SNE feature extraction
import tsne

In [6]:
# Global plot appearance for every figure drawn after this cell:
# "notebook" scaling with enlarged fonts, on the dark-grid style.
sns.set_context(context='notebook', font_scale=1.25)
sns.set_style(style='darkgrid')

In [7]:
##############################################################################
########################## DOWNLOAD DATA #####################################
##############################################################################

In [8]:
# Tournament identifier, plus the column names used for the label and for model output.
TOURNAMENT_NAME = "nomi"
TARGET_NAME = "target"
PREDICTION_NAME = "prediction"

# NOTE(review): how these two values are consumed is not visible in this
# part of the notebook -- confirm their meaning where they are used.
BENCHMARK = 0
BAND = 0.2

In [9]:
# Single seed for the whole notebook.
seed = 1

# Seed the global NumPy generator and Python's stdlib generator.
# NOTE(review): TensorFlow and PyTorch keep their own RNG state and are not
# seeded by this cell.
np.random.seed(seed)
random.seed(seed)

# np.random.seed() returns None, so `rand` has always been None. The name is
# kept, with that value made explicit, in case later cells reference it
# (e.g. as random_state=rand, which means "use the global NumPy generator").
rand = None

In [10]:
# API client with info-level logging; no credentials are passed here.
napi = numerapi.NumerAPI(verbosity="info")

# Fetch the current round's dataset archive and unzip it.
napi.download_current_dataset(unzip=True)

# Directory name the data-loading cell reads its CSV files from.
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

2021-07-31 17:42:57,206 INFO numerapi.utils: starting download
./numerai_dataset_274.zip: 414MB [00:08, 47.4MB/s]                           
2021-07-31 17:43:05,953 INFO numerapi.base_api: unzipping file...


In [11]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [12]:
%%time
print("# Loading data...")

training_data = pd.read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
tournament_data = pd.read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")
validation_data = tournament_data[tournament_data.data_type == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 1min 4s, sys: 11.2 s, total: 1min 15s
Wall time: 1min 18s


In [13]:
# Feature columns are identified purely by their "feature" name prefix.
feature_names = [col for col in training_data.columns if col.startswith("feature")]
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [19]:
# Xgboost Built-in Feature Importance

In [None]:
pip install shap

In [22]:
import numpy as np
import pandas as pd
import shap

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
import seaborn as sns # for correlation heatmap

from xgboost import XGBRegressor

In [None]:
# Fit directly on the loaded training frame. `X_train` / `y_train` are not
# defined in any visible cell, so this cell previously depended on hidden
# kernel state. The feature-importance cell pairs `feature_names` one-to-one
# with `xgb.feature_importances_`, so the model has to be trained on exactly
# those columns.
# NOTE(review): if the original X_train was a subset of the training rows,
# confirm that fitting on the full frame is acceptable.
xgb = XGBRegressor(n_estimators=100)
xgb.fit(training_data[feature_names], training_data[TARGET_NAME])

In [None]:
# One row per model input column, paired with the fitted model's
# built-in importance score, printed from most to least important.
importance_columns = {
    'features': feature_names,
    'feature_importance': xgb.feature_importances_,
}
feature_importances = pd.DataFrame(importance_columns)
print(feature_importances.sort_values(by='feature_importance', ascending=False))

In [50]:
# Keep only the features whose importance exceeds the cut-off. The mask is
# built from the frame's own 'feature_importance' column; the bare name
# `feature_importance` used before is not defined in any visible cell and
# would raise NameError on a fresh kernel.
IMPORTANCE_THRESHOLD = 0.003
feature_importances_best = feature_importances[
    feature_importances['feature_importance'] > IMPORTANCE_THRESHOLD
]

In [57]:
# NOTE: rebinds `feature_names` from the full feature list to the selected
# subset (as a NumPy array). Cells that pair `feature_names` with a model
# trained on the full list must not be re-run after this one.
feature_names = feature_importances_best.loc[:, 'features'].values