Created by Joan-Marc Fisa

- Numerai: [FisaGol](https://numer.ai/fisagol)

- Twitter: [@fisagol](https://twitter.com/fisagol)


# Evaluating Financial Machine Learning Models on Numerai

In [None]:
!pip install numerapi

Collecting numerapi
  Downloading https://files.pythonhosted.org/packages/81/9d/c583893e96721821560e48aea92dd22aef9fc727151f1efae8f8dc885555/numerapi-2.3.9-py3-none-any.whl
Installing collected packages: numerapi
Successfully installed numerapi-2.3.9


In [None]:
##################################################################
##################### LIBRARIES ##################################
##################################################################

In [None]:
import os
import gc
import csv
import sys
import glob
import time
from pathlib import Path
from multiprocessing import Pool

import numerapi

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import sklearn
from sklearn import (
    feature_extraction, feature_selection, decomposition, linear_model,
    model_selection, metrics, svm, preprocessing, utils
)
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OrdinalEncoder, LabelEncoder,OneHotEncoder
from keras.models import Sequential, model_from_json, load_model
from keras.layers import Dense, Dropout, Activation,LSTM,Bidirectional, MaxPooling2D, Flatten,GRU
from keras.optimizers import SGD,Adam
from keras.regularizers import l2
from sklearn.model_selection import StratifiedKFold, GroupKFold, GridSearchCV,cross_val_score,KFold, RepeatedStratifiedKFold,train_test_split
from sklearn.metrics import log_loss, make_scorer, mean_squared_error,classification_report,accuracy_score
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils 
from sklearn import preprocessing
from xgboost import XGBRegressor 
from sklearn.cluster import KMeans
import matplotlib as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,MinMaxScaler

import math
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from lightgbm import LGBMRegressor

In [None]:
##############################################################################
########################## DOWNLOAD DATA #####################################
##############################################################################

In [None]:
# Seed NumPy's global generator and keep the integer seed itself so it can be
# passed explicitly as a `random_state`. `np.random.seed` returns None, so its
# return value must not be used as the random state.
seed = 3
np.random.seed(seed)
rand = seed

In [None]:
# Client created without credentials; it is only used here to fetch the data.
napi = numerapi.NumerAPI(verbosity="info")

# Download the current round's archive and unzip it.
napi.download_current_dataset(unzip=True)

# The data-loading cell reads its CSV files from a folder named after the
# current round number.
current_ds = napi.get_current_round()
latest_round = f"numerai_dataset_{current_ds}"

./numerai_dataset_244.zip: 100%|█████████▉| 383M/385M [00:21<00:00, 20.4MB/s]2020-12-27 10:06:52,826 INFO numerapi.base_api: unzipping file...
./numerai_dataset_244.zip: 385MB [00:40, 20.4MB/s]                           

In [None]:
# Tournament identifier plus the target / prediction column names used by the
# cells below.
TOURNAMENT_NAME, TARGET_NAME, PREDICTION_NAME = "nomi", "target", "prediction"

In [None]:
##################################################################
##################### LOAD DATA ##################################
##################################################################

In [None]:
%%time
print("# Loading data...")

training_data = pd.read_csv(os.path.join(latest_round, "numerai_training_data.csv")).set_index("id")
tournament_data = pd.read_csv(os.path.join(latest_round, "numerai_tournament_data.csv")).set_index("id")
validation_data = tournament_data[tournament_data.data_type == "validation"]

print("# All Loaded...")

# Loading data...
# All Loaded...
CPU times: user 1min 2s, sys: 7.14 s, total: 1min 9s
Wall time: 1min 9s


In [None]:
# Feature columns are recognised purely by their "feature" name prefix.
feature_names = list(filter(lambda name: name.startswith("feature"), training_data.columns))
print(f"Loaded {len(feature_names)} features")

Loaded 310 features


In [None]:
##############################################################################
########################  DIVIDE DATA  in X and Y  ###########################
##############################################################################

In [None]:
# Tag every row with its position in the file; the stacked per-cluster
# predictions are later sorted on this column to restore the original order.
for frame in (training_data, tournament_data):
    frame["id_seq"] = list(range(len(frame)))

In [None]:
%%time
X = training_data[feature_names]
print('Call algorithm K-means')
kmeans = KMeans(n_clusters=10, random_state=rand, init = 'random')
print('Fitting algorithm K-means')
kmeans.fit(X)
print('Finished and Fitted')
X_clusters_kmeans = training_data
X_clusters_kmeans['k-means'] = kmeans.labels_

Call algorithm K-means
Fitting algorithm K-means
Finished and Fitted
CPU times: user 12min 44s, sys: 821 ms, total: 12min 45s
Wall time: 12min 45s


In [None]:
# One training subset per k-means label: label 0 -> X_group_1, ...,
# label 9 -> X_group_10.
(
    X_group_1, X_group_2, X_group_3, X_group_4, X_group_5,
    X_group_6, X_group_7, X_group_8, X_group_9, X_group_10,
) = (
    X_clusters_kmeans.loc[X_clusters_kmeans['k-means'] == cluster_id]
    for cluster_id in range(10)
)

2020-12-21 08:31:19,033 INFO numexpr.utils: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-12-21 08:31:19,035 INFO numexpr.utils: NumExpr defaulting to 8 threads.


In [None]:
def Grouping_Models(df_train):
    """Fit a default LGBMRegressor on one cluster's training rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Rows to train on. Must contain every column listed in the
        module-level ``feature_names`` plus the ``TARGET_NAME`` column.

    Returns
    -------
    LGBMRegressor
        The model fitted on ``df_train`` only.
    """
    model = LGBMRegressor()
    # Train on the subset that was passed in, so every cluster gets its own
    # model instead of ten copies of a model fitted on the full training set.
    model.fit(df_train[feature_names], df_train[TARGET_NAME])
    return model

In [None]:
# Fit one model per training cluster (clusters 1-5), in order.
model_group_1, model_group_2, model_group_3, model_group_4, model_group_5 = (
    Grouping_Models(cluster_rows)
    for cluster_rows in (X_group_1, X_group_2, X_group_3, X_group_4, X_group_5)
)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.34 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


In [None]:
# Fit one model per training cluster (clusters 6-10), in order.
model_group_6, model_group_7, model_group_8, model_group_9, model_group_10 = (
    Grouping_Models(cluster_rows)
    for cluster_rows in (X_group_6, X_group_7, X_group_8, X_group_9, X_group_10)
)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [None]:
########################## PREDICTIONS ON LABELS ##############################

In [None]:
# Assign each tournament row to one of the clusters learned on the training
# data, and store the label next to the row.
labels_tournament = tournament_data['k-means'] = kmeans.predict(tournament_data[feature_names])

In [None]:
# One tournament subset per k-means label: label 0 -> X_group_1_td, ...,
# label 9 -> X_group_10_td.
(
    X_group_1_td, X_group_2_td, X_group_3_td, X_group_4_td, X_group_5_td,
    X_group_6_td, X_group_7_td, X_group_8_td, X_group_9_td, X_group_10_td,
) = (
    tournament_data.loc[tournament_data['k-means'] == cluster_id]
    for cluster_id in range(10)
)

In [None]:
###########  PREDICTIONS  ##########

In [None]:
# Cluster 1: model output paired with each row's original position.
prediction_group_1 = pd.DataFrame({
    PREDICTION_NAME: model_group_1.predict(X_group_1_td[feature_names]),
    "id_seq": X_group_1_td["id_seq"].values,
})

In [None]:
# Cluster 2: model output paired with each row's original position.
prediction_group_2 = pd.DataFrame({
    PREDICTION_NAME: model_group_2.predict(X_group_2_td[feature_names]),
    "id_seq": X_group_2_td["id_seq"].values,
})

In [None]:
# Cluster 3: model output paired with each row's original position.
prediction_group_3 = pd.DataFrame({
    PREDICTION_NAME: model_group_3.predict(X_group_3_td[feature_names]),
    "id_seq": X_group_3_td["id_seq"].values,
})

In [None]:
# Cluster 4: model output paired with each row's original position.
prediction_group_4 = pd.DataFrame({
    PREDICTION_NAME: model_group_4.predict(X_group_4_td[feature_names]),
    "id_seq": X_group_4_td["id_seq"].values,
})

In [None]:
# Cluster 5: model output paired with each row's original position.
prediction_group_5 = pd.DataFrame({
    PREDICTION_NAME: model_group_5.predict(X_group_5_td[feature_names]),
    "id_seq": X_group_5_td["id_seq"].values,
})

In [None]:
# Cluster 6: model output paired with each row's original position.
prediction_group_6 = pd.DataFrame({
    PREDICTION_NAME: model_group_6.predict(X_group_6_td[feature_names]),
    "id_seq": X_group_6_td["id_seq"].values,
})

In [None]:
# Cluster 7: model output paired with each row's original position.
prediction_group_7 = pd.DataFrame({
    PREDICTION_NAME: model_group_7.predict(X_group_7_td[feature_names]),
    "id_seq": X_group_7_td["id_seq"].values,
})

In [None]:
# Cluster 8: model output paired with each row's original position.
prediction_group_8 = pd.DataFrame({
    PREDICTION_NAME: model_group_8.predict(X_group_8_td[feature_names]),
    "id_seq": X_group_8_td["id_seq"].values,
})

In [None]:
# Cluster 9: model output paired with each row's original position.
prediction_group_9 = pd.DataFrame({
    PREDICTION_NAME: model_group_9.predict(X_group_9_td[feature_names]),
    "id_seq": X_group_9_td["id_seq"].values,
})

In [None]:
# Cluster 10: model output paired with each row's original position.
prediction_group_10 = pd.DataFrame({
    PREDICTION_NAME: model_group_10.predict(X_group_10_td[feature_names]),
    "id_seq": X_group_10_td["id_seq"].values,
})

In [None]:
########################## UNION OF PREDICTIONS ##############################

In [None]:
# Stack the ten per-cluster prediction frames and put the rows back into the
# original tournament order using their id_seq position.
vertical_stack = pd.concat(
    [
        prediction_group_1, prediction_group_2, prediction_group_3,
        prediction_group_4, prediction_group_5, prediction_group_6,
        prediction_group_7, prediction_group_8, prediction_group_9,
        prediction_group_10,
    ],
    axis=0,
).sort_values("id_seq")

In [None]:
# Positional assignment: this relies on vertical_stack being sorted by id_seq,
# i.e. being in the same row order as tournament_data.
tournament_data[PREDICTION_NAME] = vertical_stack[PREDICTION_NAME].to_numpy()

In [None]:
# Single global model: a default LGBMRegressor fitted on every training row
# (no clustering involved).
model = LGBMRegressor()
model.fit(training_data[feature_names], training_data[TARGET_NAME])

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# NOTE(review): this replaces the prediction column that was previously filled
# from the stacked per-cluster predictions, so only the global model's output
# is carried forward from here. Confirm that this is intended.
tournament_data[PREDICTION_NAME]= model.predict(tournament_data[feature_names])

In [None]:
import scipy
import numpy
from sklearn.preprocessing import MinMaxScaler

def full_neutralization(df, feature_names=feature_names, pred_name=PREDICTION_NAME):
    """Neutralize the prediction column era by era, then min-max scale it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``"era"`` column, ``pred_name`` and every column in
        ``feature_names``. ``df[pred_name]`` is overwritten in place with the
        per-era normalized and neutralized values.
    feature_names : list of str
        Columns the predictions are neutralized against.
    pred_name : str
        Name of the prediction column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 1)`` holding the neutralized predictions
        rescaled to ``[0, 1]`` with ``MinMaxScaler``.
    """
    # group_keys=False keeps the original row index on the result. Without it,
    # recent pandas versions prepend the era as an extra index level and the
    # assignment below can no longer align on the row ids.
    neutralized = df.groupby("era", group_keys=False).apply(
        lambda era_df: normalize_and_neutralize(era_df, [pred_name], feature_names)
    )
    # The apply result is a one-column frame; assign that column explicitly so
    # the values are aligned to df by index.
    df[pred_name] = neutralized[pred_name]
    scaled_preds = MinMaxScaler().fit_transform(df[[pred_name]])
    return scaled_preds

def _neutralize(df, columns, by, proportion=1.0):
    """Remove the linear exposure of ``df[columns]`` to ``df[by]``.

    The least-squares projection of the scores onto the ``by`` columns is
    subtracted (scaled by ``proportion``) and each score column is divided by
    its standard deviation.
    """
    scores = df[columns]
    exposures = df[by].values
    # exposures @ pinv(exposures) @ scores is the part of the scores that is
    # linearly explained by the exposure columns.
    scores = scores - proportion * exposures.dot(numpy.linalg.pinv(exposures).dot(scores))
    return scores / scores.std()


def _normalize(df):
    """Map every column to standard-normal quantiles of its ranks."""
    # Ranks are shifted by 0.5 and divided by the row count so that they fall
    # strictly inside (0, 1) before the inverse normal CDF is applied.
    X = (df.rank(method="first") - 0.5) / len(df)
    return scipy.stats.norm.ppf(X)


def normalize_and_neutralize(df, columns, by, proportion=1.0):
    """Gaussianize ``df[columns]`` by rank and neutralize them against ``by``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the score columns and the exposure columns.
    columns : list of str
        Score columns to transform.
    by : list of str
        Exposure columns to neutralize against.
    proportion : float
        Fraction of the exposure to remove (1.0 removes all of it).

    Returns
    -------
    pandas.DataFrame
        The transformed ``columns``, indexed like ``df``. ``df`` itself is
        left untouched.
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is never modified as a side effect.
    work = df.copy()
    # Convert the scores to a normal distribution
    work[columns] = _normalize(work[columns])
    work[columns] = _neutralize(work, columns, by, proportion)
    return work[columns]


In [None]:
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.488480
n000920ed083903f    0.487922
n0038e640522c4a6    0.516811
n004ac94a87dc54b    0.499933
n0052fe97ea0c05f    0.502267
                      ...   
nffd1f8ec8ff3b23    0.490741
nffeab1e87d5bbb7    0.497121
nffeb444b05635f6    0.504650
nfffdcd9dba8e5d1    0.477709
nffff1a37ed8064a    0.506131
Name: prediction, Length: 1612518, dtype: float64

In [None]:
# full_neutralization overwrites tournament_data[PREDICTION_NAME] with the
# per-era neutralized values and returns the min-max scaled predictions.
x = full_neutralization(tournament_data)

2020-12-27 10:16:13,251 INFO numexpr.utils: NumExpr defaulting to 4 threads.


In [None]:
x

array([[0.45524988],
       [0.39301798],
       [0.48105306],
       ...,
       [0.46123103],
       [0.50313655],
       [0.57449347]])

In [None]:
# Store the min-max scaled, neutralized predictions as the prediction column.
tournament_data[PREDICTION_NAME] = x

In [None]:
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.455250
n000920ed083903f    0.393018
n0038e640522c4a6    0.481053
n004ac94a87dc54b    0.519810
n0052fe97ea0c05f    0.530415
                      ...   
nffd1f8ec8ff3b23    0.426283
nffeab1e87d5bbb7    0.403981
nffeb444b05635f6    0.461231
nfffdcd9dba8e5d1    0.503137
nffff1a37ed8064a    0.574493
Name: prediction, Length: 1612518, dtype: float64

In [None]:
import numerapi
import torch
from torch.nn import Linear
from torch.nn import Sequential
from torch.functional import F

In [None]:
def exposures(x, y):
    """Return the matrix product of the column-standardized inputs.

    Every column of ``x`` and ``y`` is mean-centered and scaled to unit norm
    along dim 0, so each entry of the result is the correlation between one
    column of ``x`` and one column of ``y``.
    """
    def _center_and_scale(t):
        centered = t - t.mean(dim=0)
        return centered / centered.norm(dim=0)

    return torch.matmul(_center_and_scale(x).T, _center_and_scale(y))

def reduce_exposure(prediction, features, max_exp, max_iter=100000, tol=1e-7):
    """Partially neutralize ``prediction`` so feature exposures stay within ``max_exp``.

    A bias-free linear model of the features is trained with Adamax and
    subtracted from the prediction until the exposure of the result to every
    feature is within its target.

    Parameters
    ----------
    prediction : array-like
        One prediction per row (1-D).
    features : array-like, shape (n_rows, n_features)
        Feature values; 0.5 is subtracted from them before use
        (presumably because the features lie in [0, 1] -- TODO confirm).
    max_exp : float
        Maximum absolute exposure allowed per feature.
    max_iter : int
        Upper bound on the number of optimizer steps.
    tol : float
        The optimization stops as soon as the loss drops below this value.

    Returns
    -------
    (torch.Tensor, list of numpy.ndarray)
        The neutralized prediction with shape ``(n_rows, 1)`` and the weights
        of the linear neutralizer. If ``max_iter`` is exhausted before the
        loss reaches ``tol``, the best-effort result of the last step is
        returned instead of failing.
    """
    # linear model of features that will be used to partially neutralize predictions
    lin = Linear(features.shape[1], 1, bias=False)
    lin.weight.data.fill_(0.)
    model = Sequential(lin)
    optimizer = torch.optim.Adamax(model.parameters(), lr=1e-4)
    feats = torch.tensor(np.float32(features) - .5)
    pred = torch.tensor(np.float32(prediction))
    start_exp = exposures(feats, pred[:, None])
    # Target exposure per feature: the current exposure when it is already
    # within [-max_exp, max_exp], otherwise the nearest bound.
    targ_exp = torch.clamp(start_exp, -max_exp, max_exp)

    for _ in range(max_iter):
        optimizer.zero_grad()
        # calculate feature exposures of current linear neutralization
        exps = exposures(feats, pred[:, None] - model(feats))
        # loss is positive when any exposures exceed their target
        loss = (F.relu(F.relu(exps) - F.relu(targ_exp)) + F.relu(F.relu(-exps) - F.relu(-targ_exp))).sum()
        print(f'       loss: {loss:0.7f}', end='\r')
        if loss < tol:
            break
        loss.backward()
        optimizer.step()

    # Built after the loop so the function always returns a result, whether or
    # not the tolerance was reached.
    neutralizer = [p.detach().numpy() for p in model.parameters()]
    neutralized_pred = pred[:, None] - model(feats)
    return neutralized_pred, neutralizer

def reduce_all_exposures(df, column, neutralizers=None,
                                     normalize=True,
                                     gaussianize=True,
                                     era_col="era",
                                     max_exp=0.1):
    """Apply ``reduce_exposure`` to every era of ``df`` separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the era column, the score column and the neutralizers.
    column : list of str
        One-element list naming the score column. Only the first column is
        processed.
    neutralizers : list of str, optional
        Feature columns whose exposure is limited. Defaults to no columns.
    normalize : bool
        Rank-transform the scores into (0, 1) within each era first.
    gaussianize : bool
        When normalizing, additionally map the ranks through the inverse
        normal CDF.
    era_col : str
        Name of the era column.
    max_exp : float
        Maximum absolute feature exposure passed to ``reduce_exposure``.

    Returns
    -------
    pandas.DataFrame
        Per-era standardized scores, indexed like ``df`` and in the same row
        order, even when the rows of an era are not contiguous.
    """
    if neutralizers is None:
        neutralizers = []
    computed = []
    positions = []
    for u in df[era_col].unique():
        print(u, '\r')
        # Remember the positions of this era's rows so the results can be put
        # back in the original row order afterwards.
        era_rows = np.flatnonzero((df[era_col] == u).to_numpy())
        df_era = df.iloc[era_rows]
        scores = df_era[column].values
        exposure_values = df_era[neutralizers].values

        if normalize:
            scores2 = []
            for x in scores.T:
                x = (scipy.stats.rankdata(x, method='ordinal') - .5) / len(x)
                if gaussianize:
                    x = scipy.stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2)[0]
        else:
            # reduce_exposure expects a 1-D prediction vector.
            scores = scores[:, 0]

        scores, neut = reduce_exposure(scores, exposure_values, max_exp)

        scores /= scores.std()

        computed.append(scores.detach().numpy())
        positions.append(era_rows)

    # Scatter the per-era blocks back to their original row positions before
    # attaching df.index.
    stacked = np.concatenate(computed)
    aligned = np.empty_like(stacked)
    aligned[np.concatenate(positions)] = stacked
    return pd.DataFrame(aligned, columns=column, index=df.index)

In [None]:
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.455250
n000920ed083903f    0.393018
n0038e640522c4a6    0.481053
n004ac94a87dc54b    0.519810
n0052fe97ea0c05f    0.530415
                      ...   
nffd1f8ec8ff3b23    0.426283
nffeab1e87d5bbb7    0.403981
nffeb444b05635f6    0.461231
nfffdcd9dba8e5d1    0.503137
nffff1a37ed8064a    0.574493
Name: prediction, Length: 1612518, dtype: float64

In [None]:
# Limit the exposure of the predictions to every feature to at most 0.10,
# computed separately for each era.
data_rfe_10 = reduce_all_exposures(tournament_data,
                                   [PREDICTION_NAME],
                                   neutralizers=feature_names,
                                   era_col="era",
                                   max_exp=0.10)

era121 
era122 
era123 
era124 
era125 
era126 
era127 
era128 
era129 
era130 
era131 
era132 
era575 
era576 
era577 
era578 
era579 
era580 
era581 
era582 
era583 
era584 
era585 
era586 
era587 
era588 
era589 
era590 
era591 
era592 
era593 
era594 
era595 
era596 
era597 
era598 
era599 
era600 
era601 
era602 
era603 
era604 
era605 
era606 
era607 
era608 
era609 
era610 
era611 
era612 
era613 
era614 
era615 
era616 
era617 
era618 
era619 
era620 
era621 
era622 
era623 
era624 
era625 
era626 
era627 
era628 
era629 
era630 
era631 
era632 
era633 
era634 
era635 
era636 
era637 
era638 
era639 
era640 
era641 
era642 
era643 
era644 
era645 
era646 
era647 
era648 
era649 
era650 
era651 
era652 
era653 
era654 
era655 
era656 
era657 
era658 
era659 
era660 
era661 
era662 
era663 
era664 
era665 
era666 
era667 
era668 
era669 
era670 
era671 
era672 
era673 
era674 
era675 
era676 
era677 
era678 
era679 
era680 
era681 
era682 
era683 
era684 
era685 
era686 
era687 

In [None]:
# Swap in the reduced-feature-exposure predictions, then rescale them to [0, 1]
# by subtracting the minimum and dividing by the resulting maximum.
tournament_data[PREDICTION_NAME] = data_rfe_10[PREDICTION_NAME]
shifted_predictions = tournament_data[PREDICTION_NAME] - tournament_data[PREDICTION_NAME].min()
tournament_data[PREDICTION_NAME] = shifted_predictions / shifted_predictions.max()

In [None]:
tournament_data[PREDICTION_NAME]

id
n0003aa52cab36c2    0.434823
n000920ed083903f    0.320781
n0038e640522c4a6    0.488313
n004ac94a87dc54b    0.563201
n0052fe97ea0c05f    0.578926
                      ...   
nffd1f8ec8ff3b23    0.383230
nffeab1e87d5bbb7    0.341325
nffeb444b05635f6    0.448802
nfffdcd9dba8e5d1    0.532270
nffff1a37ed8064a    0.649660
Name: prediction, Length: 1612518, dtype: float64

In [None]:
##############################################################################
######################### MAKE SUBMISSION #################################### 
##############################################################################

In [None]:
# Write the prediction column (indexed by row id) to the submission file named
# after the tournament and the current round.
tournament_data[PREDICTION_NAME].to_csv(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv")

In [None]:
# Numerai API credentials are read from the environment so that no secret is
# stored in the notebook. Set NUMERAI_PUBLIC_ID and NUMERAI_SECRET_KEY before
# running this cell. Any key pair that was ever saved in this notebook should
# be revoked and regenerated.
public_id = os.environ["NUMERAI_PUBLIC_ID"]
secret_key = os.environ["NUMERAI_SECRET_KEY"]
# model_id is passed to upload_predictions to choose the model that receives
# the submission; it can be overridden through NUMERAI_MODEL_ID.
model_id = os.environ.get("NUMERAI_MODEL_ID", "c6f3339f-f677-48a0-ac42-2995afc614a5")
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [None]:
# Upload the submission CSV written earlier for the model identified by
# model_id; the call returns a submission id.
print("Uploading DataFrame in Numerai...")
submission_id = napi.upload_predictions(f"{TOURNAMENT_NAME}_{current_ds}_submission.csv", model_id=model_id)
print("DataFrame Uploaded...")

2020-12-27 10:19:04,599 INFO numerapi.base_api: uploading predictions...


Uploading DataFrame in Numerai...
DataFrame Uploaded...


In [None]:
import numpy as np 
import pandas as pd 
from xgboost import XGBRegressor 
import torch
from torch.autograd import grad


# Load the combined file and keep only the rows flagged as 'train' for fitting.
trainval = pd.read_parquet("numerai_training_validation_target_nomi.parquet")
train = trainval.loc[trainval["data_type"] == 'train']

target = "target_nomi"
feature_columns = [col for col in trainval.columns if col.startswith("feature")]

# First-stage model.
model_init = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=2000, colsample_bytree=0.1, nthread=6)
model_init.fit(train[feature_columns], train[target])

# Its predictions on the training rows are the starting point (base margin)
# for the second-stage model.
base_margin = model_init.predict(train[feature_columns])

# Positional row indices of every era, in order of first appearance.
era_idx = [np.flatnonzero((train["era"] == uera).values) for uera in train["era"].unique()]


# define adjusted sharpe in terms of cost adjusted numerai sharpe
def numerai_sharpe(x):
    """Mean of ``x`` minus a fixed offset of 0.010415154, divided by its std."""
    offset = 0.010415154
    excess = x.mean() - offset
    return excess / x.std()

def skew(x):
    """Skewness of ``x``: third central moment over the second to the power 1.5."""
    deviations = x - x.mean()
    second_moment = (deviations ** 2).mean()
    third_moment = (deviations ** 3).mean()
    return third_moment / (second_moment ** 1.5)

def kurtosis(x):
    """Excess kurtosis of ``x``: fourth central moment over the squared second, minus 3."""
    deviations = x - x.mean()
    fourth_moment = (deviations ** 4).mean()
    second_moment = (deviations ** 2).mean()
    return (fourth_moment / (second_moment ** 2)) - 3

def adj_sharpe(x):
    """Sharpe ratio of ``x`` adjusted by a skewness term and a kurtosis term.

    Computes ``SR * (1 + skew / 6 * SR - kurtosis / 24 * SR ** 2)`` where
    ``SR`` is ``numerai_sharpe(x)``.
    """
    skew_term = (skew(x) / 6) * numerai_sharpe(x)
    kurtosis_term = (kurtosis(x) / 24) * (numerai_sharpe(x) ** 2)
    return numerai_sharpe(x) * (1 + skew_term - kurtosis_term)

# use correlation as the measure of fit
def corr(pred, target):
    """Correlation between ``pred`` and ``target`` along dim 0.

    Both inputs are mean-centered and scaled to unit norm; the result is the
    matrix product of the transposed standardized prediction with the
    standardized target.
    """
    def _standardize(t):
        centered = t - t.mean(dim=0)
        return centered / centered.norm(dim=0)

    return torch.matmul(_standardize(pred).T, _standardize(target))

# define a custom objective for XGBoost
def adj_sharpe_obj(ytrue, ypred):
    """Objective callback: gradient of the negative adjusted Sharpe of per-era correlations.

    Parameters
    ----------
    ytrue, ypred : array-like
        Targets and current predictions, indexed positionally by the
        module-level ``era_idx`` lists.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The gradient of the loss with respect to ``ypred`` and an array of
        ones of the same shape, used in place of the Hessian diagonal.
    """
    # Predictions must track gradients; targets are plain tensors.
    ypred_th = torch.tensor(ypred, requires_grad=True)
    ytrue_th = torch.tensor(ytrue)

    # One correlation score per era.
    all_corrs = torch.stack([corr(ypred_th[ee], ytrue_th[ee]) for ee in era_idx])

    # The loss is the negated adjusted Sharpe of those correlations.
    loss = -adj_sharpe(all_corrs)
    print(f'Current loss:{loss}')

    # Differentiate with autograd and hand the result over as numpy.
    (loss_grads,) = grad(loss, ypred_th, create_graph=True)
    loss_grads = loss_grads.detach().numpy()

    return loss_grads, np.ones(loss_grads.shape)


# Second-stage model: starts from the first model's predictions (base_margin)
# and is trained with the custom adjusted-Sharpe objective. The objective has
# to be passed explicitly, otherwise XGBoost falls back to its default
# regression loss and adj_sharpe_obj is never called.
model_adj_sharpe = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=200, nthread=6,
                                colsample_bytree=0.1, objective=adj_sharpe_obj)
model_adj_sharpe.fit(train[feature_columns], train[target], base_margin=base_margin)

In [None]:
# to neutralize a column in a df by many other columns on a per-era basis
def neutralize(df,
               columns,
               extra_neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Neutralize ``columns`` against ``extra_neutralizers`` within each era.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the era column, the score columns and the neutralizers.
    columns : list of str
        Score columns to neutralize.
    extra_neutralizers : list of str, optional
        Columns whose linear contribution is removed. Defaults to no columns.
    proportion : float
        Fraction of the linear contribution to remove.
    normalize : bool
        Replace the scores by their within-era ranks scaled into (0, 1) first.
    era_col : str
        Name of the era column.

    Returns
    -------
    pandas.DataFrame
        Neutralized scores, each era divided by its population standard
        deviation, indexed like ``df`` and in the same row order even when
        the rows of an era are not contiguous.
    """
    # need to do this for lint to be happy bc [] is a "dangerous argument"
    if extra_neutralizers is None:
        extra_neutralizers = []
    computed = []
    positions = []
    for u in df[era_col].unique():
        print(u, end="\r")
        # Keep the row positions of this era so the results can be written
        # back in the original order.
        era_rows = np.flatnonzero((df[era_col] == u).to_numpy())
        df_era = df.iloc[era_rows]
        if normalize:
            scores = np.array([
                (pd.Series(x).rank(method="first").values - .5) / len(x)
                for x in df_era[columns].values.T
            ]).T
        else:
            # Copy, because the scores are modified in place below.
            scores = np.array(df_era[columns].values)
        exposures = df_era[extra_neutralizers].values

        # Subtract the least-squares projection of the scores on the exposures.
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))

        scores /= scores.std(ddof=0)

        computed.append(scores)
        positions.append(era_rows)

    # Scatter the per-era blocks back to their original row positions before
    # attaching df.index.
    stacked = np.concatenate(computed)
    aligned = np.empty_like(stacked)
    aligned[np.concatenate(positions)] = stacked
    return pd.DataFrame(aligned,
                        columns=columns,
                        index=df.index)


In [None]:
# to neutralize any series by any other series
def neutralize_series(series, by, proportion=1.0):
    """Remove from ``series`` the part linearly explained by ``by`` and a constant.

    Parameters
    ----------
    series : pandas.Series
        Values to neutralize.
    by : pandas.Series
        Values to neutralize against (same length as ``series``).
    proportion : float
        Fraction of the fitted component to remove.

    Returns
    -------
    pandas.Series
        The corrected values, indexed like ``series``. With ``proportion=1.0``
        the result has zero mean and zero correlation with ``by``.
    """
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # Add an intercept column so the result is centered and has zero
    # correlation with the exposures. A column of ones is used instead of a
    # column filled with mean(series): the latter is all zeros whenever the
    # series already has zero mean, which silently drops the intercept.
    exposures = np.hstack((exposures, np.ones((len(exposures), 1))))

    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized


In [None]:
def unif(df):
    """Map the values of ``df`` to their ranks scaled into the open interval (0, 1).

    Ties are broken by order of appearance; the result keeps the input index.
    """
    ranks = df.rank(method="first")
    scaled = (ranks - 0.5) / len(df)
    return pd.Series(scaled, index=df.index)


def get_feature_neutral_mean(df):
    """Mean per-era score of the feature-neutralized predictions.

    The prediction column is neutralized against every column whose name
    starts with "feature", stored in ``df`` as ``"neutral_sub"`` (``df`` is
    modified in place), scored against the target in each era with
    ``correlation`` and averaged.

    NOTE(review): ``correlation`` is not defined in the visible part of this
    notebook -- confirm it is available before calling this function.
    """
    feature_cols = [c for c in df.columns if c.startswith("feature")]
    neutralized = neutralize(df, [PREDICTION_NAME], feature_cols)
    df.loc[:, "neutral_sub"] = neutralized[PREDICTION_NAME]
    per_era = df.groupby("era").apply(
        lambda era_df: correlation(era_df["neutral_sub"], era_df[TARGET_NAME]))
    scores = per_era.mean()
    return np.mean(scores)
