In [1]:
#hide
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# default_exp numerframe

<IPython.core.display.Javascript object>

# NumerFrame

In [3]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [4]:
# export
import uuid
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from rich import print as rich_print
from pandas_profiling import ProfileReport
from typing import Union, Tuple, Any, List

from numerai_blocks.misc import AttrDict

<IPython.core.display.Javascript object>

### The NumerFrame

`NumerFrame` is a data structure that extends `pd.DataFrame` with functionality specific for Numerai users. The main benefits include:
1. Add, export, import metadata. Furthermore, dynamically update or manipulate metadata within your Numerai data pipeline.
2. Automatically track features, targets, prediction and other columns + easily retrieve these sections.
3. Convenient functionality based on `era` or `friday_date`.
4. Integrations with other `numerai-blocks` (Preprocessors, ModelPipelines, Postprocessors, Evaluation and Submission) to better support and increase reliability of Numerai inference pipelines.

Besides, all functionality of Pandas DataFrames is still available in the `NumerFrame`. You therefore don't have to create new pipelines to process your data and make predictions when using `NumerFrame`.

We adopt the convention:
 1. All feature column names should start with `'feature'`.
 2. All target column names should start with `'target'`.
 3. All prediction column names should start with `'prediction'`.
 4. Data should contain an `era` or `friday_date` column, as is almost always the case with Numerai datasets.

Every column for which this does not hold will be classified as an `'aux'` column.

In [5]:
# export
class NumerFrame(pd.DataFrame):
    """
    Data structure which extends Pandas DataFrames and
    allows for additional Numerai specific functionality.

    On construction the columns are grouped by name prefix:
    'feature*', 'target*' and 'prediction*'. Every other column
    is classified as an auxiliary ('aux') column.
    """
    # Custom attribute names registered with pandas (see pandas docs on subclassing).
    _metadata = ["meta", "feature_cols", "target_cols",
                 "prediction_cols", "not_aux_cols", "aux_cols"]
    # NOTE(review): `meta` is a class-level attribute, so all NumerFrame instances
    # read and write the same AttrDict object (metadata added on one frame is
    # visible on every other frame). `__init__` appears to rely on this: the era
    # column check is skipped once 'era_col_verified' has been set by any frame,
    # which is what lets column selections without an era column be constructed.
    # Making `meta` per-instance therefore needs a wider redesign -- confirm intent.
    meta = AttrDict()

    def __init__(self, *args, **kwargs):
        """ All arguments are passed on to the pd.DataFrame constructor. """
        super().__init__(*args, **kwargs)
        self.__init_meta_attrs()
        if "era_col_verified" not in self.meta:
            self.__set_era_col()

    @property
    def _constructor(self):
        """ Constructor hook so that results derived from a NumerFrame are NumerFrames. """
        return NumerFrame

    def __init_meta_attrs(self):
        """ Dynamically track column groups. """
        self.feature_cols = [col for col in self.columns if str(col).startswith("feature")]
        self.target_cols = [col for col in self.columns if str(col).startswith("target")]
        self.prediction_cols = [
            col for col in self.columns if str(col).startswith("prediction")
        ]
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        self.aux_cols = [
            col for col in self.columns if col not in self.not_aux_cols
        ]

    def __set_era_col(self):
        """
        Each NumerFrame should have an era column to benefit from all functionality.
        'era' takes precedence over 'friday_date' when both are present.
        :raises AttributeError: If neither column exists.
        """
        if "era" in self.columns:
            self.meta.era_col = "era"
        elif "friday_date" in self.columns:
            self.meta.era_col = "friday_date"
        else:
            raise AttributeError("NumerFrame must contain either an 'era' or 'friday_date' column.")
        self.meta.era_col_verified = True

    def add_metadata(self, *args, **kwargs):
        """
        Store arbitrary metadata (i.e. Python objects) in the meta attribute.
        *args, **kwargs are passed to the `update` method of the meta object.
        """
        self.meta.update(*args, **kwargs)

    def export_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Export all attributes in NumerFrame that can be serialized to json file.
        Values that cannot be serialized are written as "<not serializable>".
        :param file: Path of the json file to write.
        :param verbose: Also print the json text.
        **kwargs are passed to json.dumps.
        """
        rich_print(f":file_folder: Exporting metadata to {file} :file_folder:")
        json_txt = json.dumps(
            self.meta.__dict__, default=lambda o: "<not serializable>", **kwargs
        )
        if verbose:
            rich_print(json_txt)
        Path(file).write_text(json_txt)

    def import_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Load arbitrary data into NumerFrame object from json file.
        :param file: Path of the json file to read.
        :param verbose: Also print the loaded data.
        **kwargs are passed to json.load.
        """
        rich_print(f":file_folder: Importing metadata from {file} :file_folder:")
        with open(file) as json_file:
            json_data = json.load(json_file, **kwargs)
        if verbose:
            rich_print(json_data)
        self.meta.__dict__.update(json_data)

    def get_column_selection(self, cols: Union[str, list]):
        """
        Return NumerFrame from selection of columns.
        :param cols: A single column name or a list of column names.
        """
        return self.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self):
        """ All columns for which name starts with 'feature'."""
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self):
        """ All columns for which name starts with 'target'."""
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self):
        """ Column with name 'target' (Main Numerai target column). """
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self):
        """ All columns for which name starts with 'prediction'."""
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self):
        """ All columns that are not features, targets or predictions. """
        return self.get_column_selection(cols=self.aux_cols)

    def get_feature_target_pair(self, multi_target=False) -> Tuple[pd.DataFrame, Any]:
        """
        Get split of feature and target columns.
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        :return: Tuple (X, y) with the feature data and the target data.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def get_era_batch(self, eras: List[Any],
                      convert_to_tf = False,
                      aemlp_batch = False,
                      features: list = None,
                      targets: list = None,
                      *args, **kwargs) -> tuple:
        """
        Get feature target pair batch of 1 or multiple eras.
        :param eras: Selection of era names that should be present in era_col.
        :param convert_to_tf: Convert to tf.Tensor.
        :param aemlp_batch: Specific target batch for autoencoder training.
        y will contain three components: features, targets and targets.
        :param features: List of features to select. All by default
        :param targets: List of targets to select. All by default.
        *args, **kwargs are passed to initialization of Tensor.
        :return: Tuple (X, y). y is a list of three arrays/tensors when aemlp_batch is True.
        """
        era_col = self.meta.era_col
        # Computed once instead of once per requested era.
        available_eras = self[era_col].unique()
        valid_eras = []
        for era in eras:
            assert era in available_eras, f"Era '{era}' not found in era column ({era_col})"
            valid_eras.append(era)
        features = features if features else self.feature_cols
        targets = targets if targets else self.target_cols
        # Filter the rows once and reuse the selection for both X and y.
        era_rows = self.loc[self[era_col].isin(valid_eras)]
        X = era_rows[features].values
        y = era_rows[targets].values
        if aemlp_batch:
            y = [X.copy(), y.copy(), y.copy()]

        if convert_to_tf:
            X = tf.convert_to_tensor(X, *args, **kwargs)
            if aemlp_batch:
                y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
            else:
                y = tf.convert_to_tensor(y, *args, **kwargs)
        return X, y

    def profile_report(self, *args, **kwargs) -> ProfileReport:
        """
        DataFrame profiling. Might take a while to generate for large datasets.
        For more info: https://pandas-profiling.github.io/pandas-profiling/docs/master/index.html
        *args, **kwargs will be passed to ProfileReport initialization.
        """
        return ProfileReport(self, *args, **kwargs)

<IPython.core.display.Javascript object>

In [6]:
#export
def create_numerframe(file_path: str, metadata: dict = None, *args, **kwargs) -> NumerFrame:
    """
    Convenience function to initialize NumerFrame.
    Support most used file formats for Pandas DataFrames (CSV, Parquet, Pickle, JSON and Excel).
    The format is chosen from the file suffix, compared case-insensitively.
    For more details check https://pandas.pydata.org/docs/reference/io.html

    :param file_path: Relative or absolute path to data file.
    :param metadata: Metadata to be stored in NumerFrame.meta.
    *args, **kwargs will be passed to Pandas loading function.
    :return: NumerFrame with the loaded data (and metadata, if given).
    :raises NotImplementedError: If the file suffix is not supported.
    """
    assert Path(file_path).is_file(), f"{file_path} does not point to file."
    # Path.suffix keeps the leading dot (e.g. ".csv"). Lower-cased so that
    # extensions such as ".CSV" are recognised as well.
    suffix = Path(file_path).suffix.lower()
    if suffix in [".csv"]:
        dataf = pd.read_csv(file_path, *args, **kwargs)
    elif suffix in [".parquet"]:
        dataf = pd.read_parquet(file_path, *args, **kwargs)
    elif suffix in [".pkl", ".pickle"]:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        dataf = pd.read_pickle(file_path, *args, **kwargs)
    elif suffix in [".json"]:
        dataf = pd.read_json(file_path, *args, **kwargs)
    elif suffix in [".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"]:
        dataf = pd.read_excel(file_path, *args, **kwargs)
    else:
        raise NotImplementedError(f"Suffix '{suffix}' is not supported.")
    num_frame = NumerFrame(dataf)
    if metadata:
        num_frame.add_metadata(metadata)
    return num_frame

<IPython.core.display.Javascript object>

## NumerFrame Tests and Usage

A `NumerFrame` object can be initialized from memory just like you would with a Pandas DataFrame.
You then have the option to add metadata with `.add_metadata`. All metadata will be stored in the `meta` attribute.

### Initialize from memory

In [7]:
# Random DataFrame. The numeric columns come from a seeded generator so that
# a fresh "Restart & Run All" reproduces the same values (ids stay random).
rng = np.random.default_rng(42)
test_features = [f"feature_{letter}" for letter in "ABCDEFGHIK"]
id_col = [uuid.uuid4().hex for _ in range(100)]

df = pd.DataFrame(rng.uniform(size=(100, 10)), columns=test_features)
df["id"] = id_col
df[["target", "target_1", "target_2"]] = rng.normal(size=(100, 3))
df["era"] = range(100)

<IPython.core.display.Javascript object>

In [8]:
# Metadata to attach to the in-memory NumerFrame.
metadata = dict(
    version=42,
    additional_info="test_model",
    multi_target=False,
    tournament_type="random",
)
memory_dataset = NumerFrame(df)
memory_dataset.add_metadata(metadata)
# Stored values are reachable as attributes of `.meta`.
assert memory_dataset.meta.version == 42
assert memory_dataset.meta.tournament_type == "random"

<IPython.core.display.Javascript object>

Metadata is stored in `.meta` and can be accessed as a dictionary or as attributes.

In [9]:
memory_dataset.meta

{'era_col': 'era',
 'era_col_verified': True,
 'version': 42,
 'additional_info': 'test_model',
 'multi_target': False,
 'tournament_type': 'random'}

<IPython.core.display.Javascript object>

In [10]:
memory_dataset.meta.version

42

<IPython.core.display.Javascript object>

In [11]:
memory_dataset.meta['version']

42

<IPython.core.display.Javascript object>

In [12]:
assert memory_dataset.meta.version == memory_dataset.meta['version']

<IPython.core.display.Javascript object>

From a `NumerFrame` you can directly generate a `pandas-profiling` report with the same arguments that are available for `ProfileReport`.
Docs: [pandas-profiling](https://pandas-profiling.github.io/pandas-profiling/docs/master/index.html)

In [13]:
# slow
report = memory_dataset.profile_report(title="Mini Numerai V1 report", explorative=True)
assert isinstance(report, ProfileReport)

<IPython.core.display.Javascript object>

### Initialize from file

You can also use the convenience function `create_numerframe` so `NumerFrame` can be easily initialized. `create_numerframe` supports most extensions that Pandas offers, like `.csv`, `.parquet`, `.pkl`, `.pickle`, etc. Think of it as a more dynamic `pd.read_csv`, `pd.read_parquet`, etc. where you can also directly pass metadata.

In [14]:
metadata = {
    "version": 1,
    "multi_target": False,
    "tournament_type": "classic",
}

dataset = create_numerframe("test_assets/mini_numerai_version_1_data.csv",
                            metadata=metadata
                            )
assert dataset.meta.version == 1
assert not dataset.meta.multi_target
dataset.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25


<IPython.core.display.Javascript object>

In [15]:
dataset.meta

{'era_col': 'era',
 'era_col_verified': True,
 'version': 1,
 'additional_info': 'test_model',
 'multi_target': False,
 'tournament_type': 'classic'}

<IPython.core.display.Javascript object>

`.get_feature_data` will retrieve all columns where the column name starts with `feature`.

In [16]:
dataset.get_feature_data.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

`.get_target_data` retrieves all columns where the column name starts with "target". `get_single_target_data` only retrieves the column "target".

In [17]:
dataset.get_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [18]:
dataset.get_single_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

`.get_era_batch` will return a `tf.Tensor` or `np.ndarray` with feature data and target data for one or more eras. Convenient for creating neural network DataGenerators.

In [19]:
X, y = dataset.get_era_batch(['era1'], convert_to_tf=True, dtype=tf.float16)
X

2022-02-11 13:15:27.107475: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.Tensor: shape=(10, 310), dtype=float16, numpy=
array([[0.  , 0.5 , 0.25, ..., 1.  , 0.5 , 0.75],
       [0.  , 0.  , 0.  , ..., 0.  , 0.25, 1.  ],
       [0.25, 0.5 , 0.25, ..., 0.  , 0.25, 0.75],
       ...,
       [0.25, 1.  , 1.  , ..., 0.75, 0.5 , 0.25],
       [0.5 , 0.5 , 0.5 , ..., 0.  , 0.  , 0.  ],
       [0.5 , 1.  , 1.  , ..., 1.  , 1.  , 0.75]], dtype=float16)>

<IPython.core.display.Javascript object>

For people training autoencoders + MLP you can get a target that contains 3 elements: features, targets and targets. Just define `aemlp_batch=True`.
More info on this kind of setup: [AutoEncoder and multitask MLP on new dataset forum post](https://forum.numer.ai/t/autoencoder-and-multitask-mlp-on-new-dataset-from-kaggle-jane-street/4338).

In [20]:
X, y = dataset.get_era_batch(['era1'], convert_to_tf=True, aemlp_batch=True, dtype=tf.float16)
y

[<tf.Tensor: shape=(10, 310), dtype=float16, numpy=
 array([[0.  , 0.5 , 0.25, ..., 1.  , 0.5 , 0.75],
        [0.  , 0.  , 0.  , ..., 0.  , 0.25, 1.  ],
        [0.25, 0.5 , 0.25, ..., 0.  , 0.25, 0.75],
        ...,
        [0.25, 1.  , 1.  , ..., 0.75, 0.5 , 0.25],
        [0.5 , 0.5 , 0.5 , ..., 0.  , 0.  , 0.  ],
        [0.5 , 1.  , 1.  , ..., 1.  , 1.  , 0.75]], dtype=float16)>,
 <tf.Tensor: shape=(10, 1), dtype=float16, numpy=
 array([[0.5 ],
        [0.25],
        [0.25],
        [0.25],
        [0.75],
        [0.5 ],
        [0.25],
        [0.25],
        [0.5 ],
        [0.75]], dtype=float16)>,
 <tf.Tensor: shape=(10, 1), dtype=float16, numpy=
 array([[0.5 ],
        [0.25],
        [0.25],
        [0.25],
        [0.75],
        [0.5 ],
        [0.25],
        [0.25],
        [0.5 ],
        [0.75]], dtype=float16)>]

<IPython.core.display.Javascript object>

`.aux_cols` denotes all columns that are not features, targets or prediction columns.

In [21]:
dataset.aux_cols

['id', 'era', 'data_type']

<IPython.core.display.Javascript object>

In [22]:
dataset.get_aux_data.head(2)

Unnamed: 0,id,era,data_type
0,n000315175b67977,era1,train
1,n0014af834a96cdd,era1,train


<IPython.core.display.Javascript object>

Arbitrary `.json` metadata can be stored into the `NumerFrame`. All metadata can also be exported to a `json` file.

In [23]:
dataset.export_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [24]:
dataset.import_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [25]:
assert dataset.meta.version == 1
assert not dataset.meta.multi_target

<IPython.core.display.Javascript object>

Because `NumerFrame` inherits from `pd.DataFrame` you still have all functionality of a normal DataFrame at your disposal, including copying.

In [26]:
dataf2 = dataset.copy()
assert dataf2.equals(dataset)

<IPython.core.display.Javascript object>

`NumerFrame` dynamically tracks which feature, target, aux and prediction columns there are when it is initialized. For example, here we add predictions and upon `NumerFrame` initialization the column will be contained in `prediction_cols`. Prediction columns are all column names that start with `prediction`.

In [27]:
# Column groups are computed in NumerFrame.__init__, so the frame is re-wrapped
# in a new NumerFrame to pick up the prediction column added here.
dataset.loc[:, "prediction_test_1"] = np.random.uniform(size=len(dataset))
new_dataset = NumerFrame(dataset)
assert new_dataset.prediction_cols == ["prediction_test_1"]
# `meta` is declared as a class-level attribute on NumerFrame, so the metadata
# set on `dataset` is visible on `new_dataset` as well.
assert new_dataset.meta.version == 1

<IPython.core.display.Javascript object>

Arbitrary columns can be retrieved with `.get_column_selection`. The input argument can be either a string or a list with column names.

In [28]:
selection1 = dataset.get_column_selection("id")
selection1.head(2)

Unnamed: 0,id
0,n000315175b67977
1,n0014af834a96cdd


<IPython.core.display.Javascript object>

In [29]:
selection2 = dataset.get_column_selection(["id", "prediction_test_1"])
selection2.head(2)

Unnamed: 0,id,prediction_test_1
0,n000315175b67977,0.517459
1,n0014af834a96cdd,0.267303


<IPython.core.display.Javascript object>

In [30]:
assert isinstance(selection1, NumerFrame)

<IPython.core.display.Javascript object>

For convenience we can get a feature, target pair with one method. `X` will have all feature data and `y` all target data.

In [31]:
X, y = dataset.get_feature_target_pair(multi_target=False)
X.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

In [32]:
y.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

-----------------------------------------------

In [33]:
# hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

notebook2script()

Converted 01_download.ipynb.
Converted 02_numerframe.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staking.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>