In [1]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# default_exp model_pipeline

<IPython.core.display.Javascript object>

# ModelPipeline and ModelPipelineCollection

The functionality below uses the `NumerFrame`, `PreProcessor`, `Model` and `PostProcessor` objects to easily propagate
data, generate predictions and postprocess them in one go.

Specifically, this section introduces two objects:
1. `ModelPipeline`: Run all preprocessing, models and postprocessing that you define and return a `NumerFrame`.
2. `ModelPipelineCollection`: Manage and run multiple `ModelPipeline` objects.

In [3]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [4]:
#export
import uuid
import pandas as pd
from tqdm.auto import tqdm
from typing import List, Union
from typeguard import typechecked
from rich import print as rich_print

from numerai_blocks.numerframe import NumerFrame, create_numerframe
from numerai_blocks.preprocessing import BaseProcessor, CopyPreProcessor, GroupStatsPreProcessor, FeatureSelectionPreProcessor
from numerai_blocks.model import BaseModel, ConstantModel, RandomModel
from numerai_blocks.postprocessing import Standardizer, MeanEnsembler, FeatureNeutralizer

<IPython.core.display.Javascript object>

## 1. ModelPipeline

`ModelPipeline` handles all preprocessing, model prediction and postprocessing.
After the `ModelPipeline` completes it returns a `NumerFrame` with the preprocessed data, metadata and postprocessed prediction columns.

In [5]:
#export
@typechecked
class ModelPipeline:
    """
    Execute all preprocessing, prediction and postprocessing for a given setup.

    :param models: Initialized numerai-blocks Models (Objects inheriting from BaseModel)
    :param preprocessors: List of initialized Preprocessors. Defaults to no preprocessors.
    :param postprocessors: List of initialized Postprocessors. Defaults to no postprocessors.
    :param copy_first: Whether to copy the NumerFrame as a first preprocessing step.
    Highly recommended in order to avoid surprise behaviour by manipulating the original dataset.
    :param standardize: Whether to apply a Standardizer as the first postprocessing step.
    :param pipeline_name: Unique name for pipeline. Shown in progress bars and log messages.
    A random hex UUID is generated when no name is given.
    """
    def __init__(self,
                 models: List[BaseModel],
                 preprocessors: Union[List[BaseProcessor], None] = None,
                 postprocessors: Union[List[BaseProcessor], None] = None,
                 copy_first = True,
                 standardize = True,
                 pipeline_name: Union[str, None] = None):
        # Fall back to a random hex id so every pipeline has a usable name.
        self.pipeline_name = pipeline_name if pipeline_name else uuid.uuid4().hex
        self.models = models
        self.copy_first = copy_first
        self.standardize = standardize
        # Create a fresh list per instance. A `[]` default in the signature would be a
        # single list object shared by every pipeline created without these arguments.
        self.preprocessors = preprocessors if preprocessors is not None else []
        self.postprocessors = postprocessors if postprocessors is not None else []

    def preprocess(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        """
        Run all preprocessing steps. Copies input by default.

        :param dataf: Input data.
        :return: NumerFrame holding the output of the last preprocessing step.
        """
        if self.copy_first:
            dataf = CopyPreProcessor()(dataf)
        # tqdm appends ': ' to the description itself, so no trailing colon here.
        for preprocessor in tqdm(self.preprocessors,
                                 desc=f"{self.pipeline_name} Preprocessing",
                                 position=0):
            rich_print(f":construction: Applying preprocessing: '[bold]{preprocessor.__class__.__name__}[/bold]' :construction:")
            dataf = preprocessor(dataf)
        return NumerFrame(dataf)

    def postprocess(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        """
        Run all postprocessing steps. Standardizes model prediction by default.

        :param dataf: Data containing model predictions.
        :return: NumerFrame holding the output of the last postprocessing step.
        """
        if self.standardize:
            dataf = Standardizer()(dataf)
        for postprocessor in tqdm(self.postprocessors,
                                  desc=f"{self.pipeline_name} Postprocessing",
                                  position=0):
            rich_print(f":construction: Applying postprocessing: '[bold]{postprocessor.__class__.__name__}[/bold]' :construction:")
            dataf = postprocessor(dataf)
        return NumerFrame(dataf)

    def process_models(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        """
        Run all models in the order they were given.

        :param dataf: Preprocessed data.
        :return: NumerFrame holding the output of the last model call.
        """
        for model in tqdm(self.models,
                          desc=f"{self.pipeline_name} Model prediction",
                          position=0):
            rich_print(f":robot: Generating model predictions with '[bold]{model.__class__.__name__}[/bold]'. :robot:")
            dataf = model(dataf)
        return NumerFrame(dataf)

    def pipeline(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        """
        Process full pipeline and return resulting NumerFrame.

        Stages run in fixed order: preprocess -> process_models -> postprocess.
        """
        preprocessed_dataf = self.preprocess(dataf)
        prediction_dataf = self.process_models(preprocessed_dataf)
        processed_prediction_dataf = self.postprocess(prediction_dataf)
        # Close each rich markup tag explicitly so the styling does not bleed into the rest of the line.
        rich_print(f":checkered_flag: [green]Finished pipeline:[/green] [bold blue]'{self.pipeline_name}'[/bold blue]! :checkered_flag:")
        return processed_prediction_dataf

    def __call__(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        """ Shorthand for `pipeline(dataf)`. """
        return self.pipeline(dataf)

<IPython.core.display.Javascript object>

In [6]:
# Example setup: several preprocessors, two dummy models and two postprocessors.
model_names = ["test_0.5", "test_0.8"]

dataf = create_numerframe(
    "test_assets/mini_numerai_version_1_data.csv",
    metadata={'version': 1},
)

preprocessors = [
    GroupStatsPreProcessor(),
    FeatureSelectionPreProcessor(
        feature_cols=['feature_intelligence_mean', 'feature_intelligence_std']
    ),
]

# One constant-output model per entry in `model_names`.
models = [
    ConstantModel(constant=constant, model_name=name)
    for constant, name in zip([0.5, 0.8], model_names)
]

postprocessors = [
    MeanEnsembler(
        cols=[f"prediction_{name}" for name in model_names],
        final_col_name='prediction_ensembled',
    ),
    FeatureNeutralizer(
        feature_names=['feature_intelligence_mean', 'feature_intelligence_std'],
        pred_name='prediction_ensembled',
        proportion=0.8,
    ),
]

<IPython.core.display.Javascript object>

In [7]:
# Standardization is switched off for this pipeline; only the explicitly
# listed postprocessors run after the models.
test_pipeline = ModelPipeline(
    models=models,
    preprocessors=preprocessors,
    postprocessors=postprocessors,
    standardize=False,
    pipeline_name="test_pipeline",
)
processed_dataf = test_pipeline(dataf)

test_pipeline Preprocessing::   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Model prediction:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Postprocessing:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

In [8]:
# The pipeline must hand back a NumerFrame whose metadata matches the input's.
assert isinstance(processed_dataf, NumerFrame)
assert processed_dataf.meta == dataf.meta
processed_dataf

Unnamed: 0,feature_intelligence_mean,feature_intelligence_std,target,id,era,data_type,prediction_test_0.5,prediction_test_0.8,prediction_ensembled,prediction_ensembled_neutralized_0.8
0,0.333333,0.246183,0.5,n000315175b67977,era1,train,0.5,0.8,0.65,0.0
1,0.208333,0.234359,0.25,n0014af834a96cdd,era1,train,0.5,0.8,0.65,0.36088
2,0.479167,0.327843,0.25,n001c93979ac41d4,era1,train,0.5,0.8,0.65,0.350519
3,0.416667,0.288675,0.25,n0034e4143f22a13,era1,train,0.5,0.8,0.65,0.45091
4,0.270833,0.128732,0.75,n00679d1a636062f,era1,train,0.5,0.8,0.65,0.378539
5,0.5,0.213201,0.5,n009aa2d32389eca,era1,train,0.5,0.8,0.65,0.36154
6,0.604167,0.39107,0.25,n009ef1a5fe009b6,era1,train,0.5,0.8,0.65,0.719895
7,0.770833,0.270906,0.25,n00ae5d51f55fb0f,era1,train,0.5,0.8,0.65,0.339932
8,0.625,0.291937,0.5,n00b0ac86d77aed7,era1,train,0.5,0.8,0.65,0.71071
9,0.5625,0.284545,0.75,n00c63366aeaf76a,era1,train,0.5,0.8,0.65,1.0


<IPython.core.display.Javascript object>

## 2. ModelPipelineCollection

`ModelPipelineCollection` can be used to easily manage and run multiple `ModelPipeline` objects.

`ModelPipelineCollection` simply takes a list of `ModelPipeline` objects as input.

In [9]:
#export
@typechecked
class ModelPipelineCollection:
    """
    Execute multiple initialized ModelPipelines in a sequence.

    :param pipelines: List of initialized ModelPipelines. Pipeline names must be unique,
    because pipelines are stored and retrieved by name.
    """
    def __init__(self, pipelines: List[ModelPipeline]):
        names = [pipe.pipeline_name for pipe in pipelines]
        # Pipelines are keyed by name, so a repeated name would silently replace an
        # earlier pipeline. Fail loudly instead of dropping work.
        duplicates = sorted({name for name in names if names.count(name) > 1})
        assert not duplicates, f"Pipeline names in a collection must be unique, but found duplicates: '{duplicates}'."
        self.pipelines = {pipe.pipeline_name: pipe for pipe in pipelines}
        self.pipeline_names = list(self.pipelines.keys())

    def process_all_pipelines(self, dataf: Union[pd.DataFrame, NumerFrame]) -> List[NumerFrame]:
        """
        Process all pipelines and return list of resulting NumerFrames.

        :param dataf: Input data passed to every pipeline.
        :return: One NumerFrame per pipeline, in the order the pipelines were given.
        """
        result_datafs = []
        # Iterating the dict yields the pipeline names in insertion order.
        for name in tqdm(self.pipelines,
                         desc="Processing Pipeline Collection"):
            result_datafs.append(self.process_single_pipeline(dataf, name))
        return result_datafs

    def process_single_pipeline(self, dataf: Union[pd.DataFrame, NumerFrame], pipeline_name: str) -> NumerFrame:
        """
        Run full model pipeline for given name in collection.

        :param dataf: Input data.
        :param pipeline_name: Name of a pipeline in this collection.
        :return: NumerFrame produced by the selected pipeline.
        """
        rich_print(f":construction_worker: [bold green]Processing model pipeline:[/bold green] '{pipeline_name}' :construction_worker:")
        pipeline = self.get_pipeline(pipeline_name)
        dataf = pipeline(dataf)
        return NumerFrame(dataf)

    def get_pipeline(self, pipeline_name: str) -> ModelPipeline:
        """
        Retrieve model pipeline for given name.

        Fails an assertion if the name is not part of the collection.
        """
        available_pipelines = self.pipeline_names
        assert pipeline_name in available_pipelines, f"Requested pipeline '{pipeline_name}', but only the following pipelines are in the collection: '{available_pipelines}'."
        return self.pipelines[pipeline_name]

    def __call__(self, dataf: Union[pd.DataFrame, NumerFrame]) -> List[NumerFrame]:
        """ Shorthand for `process_all_pipelines(dataf)`. """
        return self.process_all_pipelines(dataf=dataf)

<IPython.core.display.Javascript object>

In [10]:
# A second pipeline with no custom preprocessors or postprocessors and a single RandomModel.
# The default copy and standardization steps still run (copy_first and standardize default to True).
test_pipeline2 = ModelPipeline(
    pipeline_name="test_pipeline2",
    models=[RandomModel()],
)

<IPython.core.display.Javascript object>

In this example we will process two `ModelPipeline`s with different characteristics on the same data.

In [11]:
collection = ModelPipelineCollection([test_pipeline, test_pipeline2])

# Pipelines can be looked up by the name they were created with.
retrieved_pipeline = collection.get_pipeline("test_pipeline2")
assert retrieved_pipeline.pipeline_name == 'test_pipeline2'

<IPython.core.display.Javascript object>

In [12]:
# Calling the collection runs every pipeline on the same input and returns one NumerFrame per pipeline.
result_datasets = collection(dataf=dataf)

Processing Pipeline Collection:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Preprocessing::   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Model prediction:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Postprocessing:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline2 Preprocessing:: 0it [00:00, ?it/s]

test_pipeline2 Model prediction:   0%|          | 0/1 [00:00<?, ?it/s]

test_pipeline2 Postprocessing: : 0it [00:00, ?it/s]

<IPython.core.display.Javascript object>

The `ModelPipelineCollection` returns a list of `NumerFrame` objects, retaining all metadata and adding prediction columns for each. Note that in this example, the first `NumerFrame` went through a feature selection step, so it did not retain all columns. The second `NumerFrame`, however, retained all feature columns, because its pipeline did not have a feature selection step.

In [13]:
# First result: produced by `test_pipeline` (results follow the order the pipelines were passed to the collection).
result_datasets[0]

Unnamed: 0,feature_intelligence_mean,feature_intelligence_std,target,id,era,data_type,prediction_test_0.5,prediction_test_0.8,prediction_ensembled,prediction_ensembled_neutralized_0.8
0,0.333333,0.246183,0.5,n000315175b67977,era1,train,0.5,0.8,0.65,0.0
1,0.208333,0.234359,0.25,n0014af834a96cdd,era1,train,0.5,0.8,0.65,0.36088
2,0.479167,0.327843,0.25,n001c93979ac41d4,era1,train,0.5,0.8,0.65,0.350519
3,0.416667,0.288675,0.25,n0034e4143f22a13,era1,train,0.5,0.8,0.65,0.45091
4,0.270833,0.128732,0.75,n00679d1a636062f,era1,train,0.5,0.8,0.65,0.378539
5,0.5,0.213201,0.5,n009aa2d32389eca,era1,train,0.5,0.8,0.65,0.36154
6,0.604167,0.39107,0.25,n009ef1a5fe009b6,era1,train,0.5,0.8,0.65,0.719895
7,0.770833,0.270906,0.25,n00ae5d51f55fb0f,era1,train,0.5,0.8,0.65,0.339932
8,0.625,0.291937,0.5,n00b0ac86d77aed7,era1,train,0.5,0.8,0.65,0.71071
9,0.5625,0.284545,0.75,n00c63366aeaf76a,era1,train,0.5,0.8,0.65,1.0


<IPython.core.display.Javascript object>

In [14]:
# Second result: produced by `test_pipeline2`.
result_datasets[1]

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target,prediction_random
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5,0.1
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25,0.5
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,...,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.25,0.9
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,...,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.25,0.4
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,...,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75,0.8
5,n009aa2d32389eca,era1,train,0.5,0.5,0.25,0.25,0.75,0.75,0.75,...,0.75,0.0,0.0,0.75,0.5,0.0,0.25,0.0,0.5,0.6
6,n009ef1a5fe009b6,era1,train,0.5,0.25,0.25,0.75,1.0,1.0,1.0,...,1.0,0.5,0.5,0.75,0.5,0.5,0.5,1.0,0.25,1.0
7,n00ae5d51f55fb0f,era1,train,0.25,1.0,1.0,0.75,1.0,0.75,0.75,...,0.25,0.75,0.75,0.0,0.25,0.75,0.5,0.25,0.25,0.7
8,n00b0ac86d77aed7,era1,train,0.5,0.5,0.5,1.0,1.0,0.25,0.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,0.2
9,n00c63366aeaf76a,era1,train,0.5,1.0,1.0,0.25,0.75,0.25,0.25,...,0.0,1.0,1.0,0.75,0.5,1.0,1.0,0.75,0.75,0.3


<IPython.core.display.Javascript object>

Since metadata is not manipulated in these pipelines, metadata should be the same as the original `NumerFrame` for all resulting `NumerFrame` objects.

In [15]:
# Every pipeline output must carry the same metadata as the input NumerFrame.
assert all(dataf.meta == result.meta for result in result_datasets)

<IPython.core.display.Javascript object>

In [16]:
# Metadata dict carried by the first resulting NumerFrame.
result_datasets[0].meta

{'era_col': 'era', 'era_col_verified': True, 'version': 1}

<IPython.core.display.Javascript object>

-----------------------------------------------------------------------------

In [17]:
# hide
# Run this cell to sync all changes with library
# (`notebook2script()` is called without arguments; it reports each converted notebook).
from nbdev.export import notebook2script

notebook2script()

Converted 01_download.ipynb.
Converted 02_numerframe.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staking.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>