In [1]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# default_exp postprocessing

<IPython.core.display.Javascript object>

# Postprocessing

The postprocessing procedure is very similar to preprocessing.

The only difference between a postprocessing step and a preprocessing step is that preprocessing works on `feature_` columns while postprocessing manipulates `prediction_` columns.

Therefore, we also inherit from `BaseProcessor` for postprocessing. The PostProcessor should take a `Dataset` as input and output a `Dataset` where either:
1. `prediction_` columns are manipulated or
2. A new prediction column is added with prefix `prediction_`.

In [3]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [4]:
#export
from typing import Optional

import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
import scipy.stats as sp
from tqdm.auto import tqdm
from typeguard import typechecked
from rich import print as rich_print
from sklearn.preprocessing import MinMaxScaler

from numerai_blocks.preprocessing import BaseProcessor, display_processor_info
from numerai_blocks.dataset import Dataset

<IPython.core.display.Javascript object>

## 1. Common postprocessing steps

### 1.1. Version agnostic

#### 1.1.1. Ensembling

Multiple prediction results can be ensembled in multiple ways, but we provide the most common use cases here.

In [5]:
#export
@typechecked
class MeanEnsembler(BaseProcessor):
    """
    Take the simple (unweighted) mean of multiple columns and store it in a new column.

    :param cols: Names of the columns in `dataset.dataf` to average row-wise.
    :param final_col_name: Name of the column that receives the mean.
    Must start with 'prediction'.
    """
    def __init__(self, cols: list, final_col_name: str):
        super(MeanEnsembler, self).__init__()
        # Validate before storing anything so a rejected name leaves no half-configured instance.
        assert final_col_name.startswith("prediction"), f"final_col name should start with 'prediction'. Got {final_col_name}"
        self.cols = cols
        self.final_col_name = final_col_name

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Write the row-wise mean of `self.cols` into `self.final_col_name`.

        :param dataset: Dataset whose `dataf` contains all columns in `self.cols`.
        :return: A new Dataset built from the attributes of the (in-place updated) input.
        """
        dataset.dataf.loc[:, self.final_col_name] = dataset.dataf.loc[:, self.cols].mean(axis=1)
        # Rich markup tags are closed with '[/tag]'; repeating the opening tag leaves the style on.
        rich_print(f":stew: Ensembled [blue]'{self.cols}'[/blue] with simple mean and saved in [bold]'{self.final_col_name}'[/bold] :stew:")
        return Dataset(**dataset.__dict__)

<IPython.core.display.Javascript object>

#### 1.1.2. Feature Neutralization

Classic feature neutralization (subtracting linear model from scores)

In [6]:
#export
@typechecked
class FeatureNeutralizer(BaseProcessor):
    """
    Classic feature neutralization.

    Per era, the predictions are rank-gaussianized and a proportion of their
    least-squares linear fit on the given features is subtracted. The result is
    min-max scaled over the whole column and stored in a new column named
    '<pred_name>_neutralized_<proportion>'.

    :param feature_names: Columns to neutralize against.
    :param pred_name: Prediction column to neutralize.
    :param era_col: Column used to group the rows; neutralization runs per group.
    :param proportion: Fraction in [0...1] of the linear fit that is subtracted.
    """
    def __init__(self, feature_names: list,
                 pred_name: str = "prediction",
                 era_col: str = "era",
                 proportion: float = 0.5):
        super(FeatureNeutralizer, self).__init__()
        assert 0. <= proportion <= 1., f"'proportion' should be a float in range [0...1]. Got '{proportion}'."
        self.proportion = proportion
        self.feature_names = feature_names
        self.pred_name = pred_name
        self.era_col = era_col
        self.new_col_name = f"{self.pred_name}_neutralized_{self.proportion}"

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Add the neutralized prediction column to the Dataset.

        :param dataset: Dataset whose `dataf` holds `pred_name`, `era_col` and all `feature_names`.
        :return: A new Dataset built from the attributes of the (in-place updated) input.
        """
        # group_keys=False keeps the original row labels as the index of the result
        # instead of prepending the era as an extra index level.
        neutralized_preds = dataset.dataf.groupby(self.era_col, group_keys=False)\
            .apply(lambda x: self.normalize_and_neutralize(x, [self.pred_name], self.feature_names))
        # MinMaxScaler returns a bare array, so the values are written back by position.
        # Make sure the rows are in the DataFrame's order (not era order) before scaling.
        if not neutralized_preds.index.equals(dataset.dataf.index):
            neutralized_preds = neutralized_preds.reindex(dataset.dataf.index)
        dataset.dataf.loc[:, self.new_col_name] = MinMaxScaler().fit_transform(neutralized_preds).ravel()
        rich_print(f":robot: Neutralized [bold blue]'{self.pred_name}'[/bold blue] with proportion [bold]'{self.proportion}'[/bold] :robot:")
        rich_print(f"New neutralized column = [bold green]'{self.new_col_name}'[/bold green].")
        return Dataset(**dataset.__dict__)

    def _neutralize(self, df, columns, by):
        """
        Subtract `proportion` times the least-squares fit of `df[by]` from `df[columns]`.

        :param df: DataFrame of a single era.
        :param columns: List of score columns to neutralize.
        :param by: List of columns to neutralize against.
        :return: Neutralized scores, divided by their column-wise standard deviation.
        """
        scores = df[columns]
        exposures = df[by].values
        # exposures @ pinv(exposures) @ scores is the projection of the scores onto the features.
        scores = scores - self.proportion * exposures.dot(np.linalg.pinv(exposures).dot(scores))
        return scores / scores.std()

    @staticmethod
    def _normalize(dataf: pd.DataFrame):
        """
        Map every column to normal quantiles of its ranks.

        Ranks (ties broken by order of appearance) are shifted by 0.5 and divided
        by the number of rows so they fall strictly between 0 and 1, then passed
        through the inverse normal CDF.

        :return: NumPy array with the same shape as `dataf`.
        """
        normalized_ranks = (dataf.rank(method="first") - 0.5) / len(dataf)
        return sp.norm.ppf(normalized_ranks)

    def normalize_and_neutralize(self, df, columns, by):
        """
        Gaussianize `df[columns]` and neutralize them against `df[by]`.

        :param df: DataFrame of a single era. It is not modified.
        :param columns: List of score columns.
        :param by: List of columns to neutralize against.
        :return: DataFrame with the neutralized `columns`, indexed like `df`.
        """
        # Work on a copy: the group handed over by groupby is a slice of the caller's
        # frame and must not be written to.
        df = df.copy()
        # Convert the scores to a normal distribution
        df[columns] = self._normalize(df[columns])
        df[columns] = self._neutralize(df, columns, by)
        return df[columns]

<IPython.core.display.Javascript object>

In [7]:
# Load the mini version-1 test asset and attach dummy predictions.
test_dataset = Dataset(pd.read_csv("test_assets/mini_numerai_version_1_data.csv"))
# Seeded generator so the cell gives the same predictions on every Restart & Run All.
test_dataset.dataf.loc[:, 'prediction'] = np.random.default_rng(seed=0).uniform(size=len(test_dataset.dataf))

<IPython.core.display.Javascript object>

In [8]:
# Neutralize the dummy predictions against every feature column with proportion 0.8.
ft = FeatureNeutralizer(
    feature_names=test_dataset.feature_cols,
    pred_name='prediction',
    proportion=0.8,
)
new_dataset = ft.transform(test_dataset)

<IPython.core.display.Javascript object>

In [9]:
assert "prediction_neutralized_0.8" in new_dataset.prediction_cols
# `value in Series` checks the index labels, not the values, so compare the
# extremes of the column explicitly: min-max scaling must span [0, 1].
neutralized_col = new_dataset.get_prediction_data['prediction_neutralized_0.8']
assert np.isclose(neutralized_col.min(), 0.)
assert np.isclose(neutralized_col.max(), 1.)

<IPython.core.display.Javascript object>

In [10]:
# Show the prediction columns; the neutralized column should be listed next to 'prediction'.
new_dataset.prediction_cols

['prediction', 'prediction_neutralized_0.8']

<IPython.core.display.Javascript object>

In [11]:
# Display the original and the neutralized predictions side by side.
new_dataset.get_prediction_data

Unnamed: 0,prediction,prediction_neutralized_0.8
0,0.895532,1.0
1,0.894037,0.815053
2,0.170013,0.0
3,0.632977,0.461802
4,0.392452,0.184947
5,0.676905,0.538198
6,0.682949,0.617129
7,0.40292,0.29497
8,0.845842,0.70503
9,0.415415,0.382871


<IPython.core.display.Javascript object>

#### 1.1.3. Feature Penalization

In [12]:
#export
@typechecked
class FeaturePenalizer(BaseProcessor):
    """
    Feature penalization with Tensorflow.

    For every era a bias-free linear model on the features is trained and its
    output is subtracted from the predictions, so that the exposure
    (correlation) of the result to each feature is pushed inside
    [-max_exposure, max_exposure].

    :param model_list: Names used to label the generated prediction columns.
    :param max_exposure: Maximum absolute feature exposure allowed, in [0...1].
    :param risky_feature_names: Features to penalize. When None or empty,
    all feature columns of the Dataset are used.
    :param pred_name: Prediction column to penalize.
    :param era_col: Column holding the era of each row.
    """
    def __init__(self, model_list: list, max_exposure: float,
                 risky_feature_names: Optional[list] = None, pred_name: str = "prediction", era_col: str = 'era'):
        super(FeaturePenalizer, self).__init__()
        assert 0. <= max_exposure <= 1., f"'max_exposure' should be a float in range [0...1]. Got '{max_exposure}'."
        self.model_list = model_list
        self.max_exposure = max_exposure
        self.risky_feature_names = risky_feature_names
        self.pred_name = pred_name
        self.era_col = era_col

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Add one penalized prediction column per entry in `model_list`.

        :param dataset: Dataset whose `dataf` holds `pred_name`, `era_col` and the feature columns.
        :return: A new Dataset built from the attributes of the (in-place updated) input.
        """
        risky_feature_names = dataset.feature_cols if not self.risky_feature_names else self.risky_feature_names
        for model_name in self.model_list:
            # NOTE(review): the penalized column is always `self.pred_name`; `model_name`
            # only ends up in the name of the output column. Confirm whether each model
            # was meant to have its own prediction column penalized.
            penalized_data = self.reduce_all_exposures(
                df=dataset.dataf,
                column=self.pred_name,
                neutralizers=risky_feature_names,
            )
            new_pred_col = f"prediction_{self.pred_name}_{model_name}_FP_{self.max_exposure}"
            dataset.dataf.loc[:, new_pred_col] = penalized_data[self.pred_name]
        return Dataset(**dataset.__dict__)

    def reduce_all_exposures(self, df: pd.DataFrame,
                             column: str = "prediction",
                             neutralizers: Optional[list] = None,
                             normalize=True,
                             gaussianize=True,
                             ):
        """
        Penalize the feature exposure of `df[column]` era by era.

        :param df: DataFrame holding `column`, `self.era_col` and the neutralizer columns.
        :param column: Prediction column to penalize.
        :param neutralizers: Feature columns to limit exposure to. Defaults to
        every column whose name starts with 'feature'.
        :param normalize: Replace the scores by their shifted, scaled ranks per era.
        :param gaussianize: Map the normalized ranks to normal quantiles.
        Only applied when `normalize` is True.
        :return: Single-column DataFrame (named `column`, indexed like `df`)
        with the penalized scores, min-max scaled per era. Rows that belong to
        no era (e.g. missing era value) are NaN.
        """
        if neutralizers is None:
            neutralizers = [x for x in df.columns if x.startswith("feature")]

        # Results are written back by row position so every score lands on its own
        # row even when the rows of an era are not contiguous in `df`.
        # float32 matches the dtype of the TensorFlow scores.
        penalized = np.full((len(df), 1), np.nan, dtype=np.float32)

        for era in tqdm(df[self.era_col].unique()):
            era_mask = (df[self.era_col] == era).values
            if not era_mask.any():
                # e.g. a NaN era value, which never compares equal to itself
                continue
            df_era = df[era_mask]
            # 1-D scores, which is what `_reduce_exposure` expects with and without normalization.
            scores = df_era[column].values
            exposure_values = df_era[neutralizers].values

            if normalize:
                scores = (scipy.stats.rankdata(scores, method='ordinal') - .5) / len(scores)
                if gaussianize:
                    scores = scipy.stats.norm.ppf(scores)

            scores, _ = self._reduce_exposure(scores, exposure_values,
                                              len(neutralizers), None)

            # Rescale the penalized scores of this era to [0, 1].
            scores /= tf.math.reduce_std(scores)
            scores -= tf.reduce_min(scores)
            scores /= tf.reduce_max(scores)
            penalized[era_mask] = scores.numpy()

        predictions = pd.DataFrame(penalized, columns=[column], index=df.index)
        return predictions

    def _reduce_exposure(self, prediction, features, input_size=50, weights=None):
        """
        Subtract a linear model of the features from the prediction.

        :param prediction: 1-D array of scores for one era.
        :param features: 2-D array of feature values; 0.5 is subtracted before use.
        :param input_size: Number of feature columns.
        :param weights: Optional weights to load into the linear model. When
        None, the model is trained until the exposures respect `max_exposure`.
        :return: Tuple of (penalized scores as a column tensor, model weights).
        """
        model = tf.keras.models.Sequential([
            tf.keras.layers.Input(input_size),
            tf.keras.experimental.LinearModel(use_bias=False),
        ])
        feats = tf.convert_to_tensor(features - 0.5, dtype=tf.float32)
        pred = tf.convert_to_tensor(prediction, dtype=tf.float32)
        if weights is None:
            optimizer = tf.keras.optimizers.Adamax()
            start_exp = self.__exposures(feats, pred[:, None])
            # Exposures already inside the band stay as they are; larger ones are clipped to it.
            target_exps = tf.clip_by_value(start_exp, -self.max_exposure, self.max_exposure)
            self._train_loop(model, optimizer, feats, pred, target_exps)
        else:
            model.set_weights(weights)
        return pred[:, None] - model(feats), model.get_weights()

    def _train_loop(self, model, optimizer, feats, pred, target_exps, max_steps=1000000, tol=1e-7):
        """
        Run gradient steps until the loss drops below `tol` or `max_steps` is reached.

        :param max_steps: Upper bound on the number of optimizer steps.
        :param tol: Loss threshold below which training stops.
        """
        for _ in range(max_steps):
            loss, grads = self.__train_loop_body(model, feats, pred, target_exps)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            if loss < tol:
                break

    # NOTE(review): `experimental_relax_shapes` / `experimental_compile` are older spellings;
    # newer TensorFlow releases name them `reduce_retracing` / `jit_compile`. Confirm
    # against the TensorFlow version this project pins before changing them.
    @tf.function(experimental_relax_shapes=True)
    def __train_loop_body(self, model, feats, pred, target_exps):
        """ Compute the exposure loss of the current model and its gradients. """
        with tf.GradientTape() as tape:
            exps = self.__exposures(feats, pred[:, None] - model(feats, training=True))
            # Only exposure beyond the target (in either direction) is penalized.
            loss = tf.reduce_sum(tf.nn.relu(tf.nn.relu(exps) - tf.nn.relu(target_exps)) +
                                 tf.nn.relu(tf.nn.relu(-exps) - tf.nn.relu(-target_exps)))
        return loss, tape.gradient(loss, model.trainable_variables)

    @staticmethod
    @tf.function(experimental_relax_shapes=True, experimental_compile=True)
    def __exposures(x, y):
        """ Correlation of every column of `x` with `y` (both centered and scaled to unit norm). """
        x = x - tf.math.reduce_mean(x, axis=0)
        x = x / tf.norm(x, axis=0)
        y = y - tf.math.reduce_mean(y, axis=0)
        y = y / tf.norm(y, axis=0)
        return tf.matmul(x, y, transpose_a=True)

<IPython.core.display.Javascript object>

In [12]:
# TODO Test Feature penalizer

<IPython.core.display.Javascript object>

### 1.2. Version 1 specific

### 1.3. Version 2 specific

### 1.4. Signals specific

## 2. Custom PostProcessors

There are an almost unlimited number of ways to postprocess data. We invite the Numerai community to develop Numerai Classic and Signals postprocessors for `numerai-blocks`.

A new PostProcessor should inherit from `BaseProcessor` and implement a `transform` method. The `transform` method should take a `Dataset` as input and return a `Dataset` object as output. An example is given below.

We recommend adding `@typechecked` at the top of a new PostProcessor class to enforce types and provide useful debugging stacktraces.

To enable fancy logging output, add the `@display_processor_info` decorator to the `transform` method.

Note that arbitrary metadata can be added or changed in the `Dataset` class during a postprocessing step.



In [13]:
#export
@typechecked
class AwesomePostProcessor(BaseProcessor):
    """
    - TEMPLATE -
    Skeleton to copy when writing your own awesome postprocessing step.
    """
    def __init__(self, *args, **kwargs):
        super(AwesomePostProcessor, self).__init__()

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        # Your processing logic goes here
        ...
        # Optionally store the manipulated data in a new 'prediction_' column
        new_column_name = "NEW_COLUMN_NAME"
        target_col = f"prediction_{new_column_name}"
        dataset.dataf.loc[:, target_col] = ...
        ...
        # Hand every attribute of the Dataset on to the next pipeline step
        return Dataset(**dataset.__dict__)

<IPython.core.display.Javascript object>

------------------------------------------------------

In [14]:
# hide
# Run this cell to sync all changes with library
# The import lives in this cell (not the import cell at the top) because it is only
# needed for this maintenance step; each converted notebook is reported in the output.
from nbdev.export import notebook2script

notebook2script()

Converted 01_download.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staking.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>