In [1]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# default_exp numerframe

<IPython.core.display.Javascript object>

# NumerFrame

In [3]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [4]:
# export
import uuid
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from rich import print as rich_print
from typing import Union, Tuple, Any, List

<IPython.core.display.Javascript object>

### The NumerFrame Object

Goals:
1. Create data structure with Numerai-specific functionality.
2. Keep flexibility of Pandas DataFrames.
3. Dynamically update which columns are in data after each processing step.
4. Dynamically update generated metadata after each data processing step.

We adopt the convention:
 1. All feature column names should start with "feature".
 2. All target column names should start with "target".
 3. All prediction column names should start with "prediction".

Every column for which this does not hold will be classified as an "aux column".

In [5]:
# export
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the dict itself as the instance namespace, so `d.key` and
        # `d["key"]` always refer to the same storage.
        self.__dict__ = self

class NumerFrame(pd.DataFrame):
    """
    Pandas DataFrame subclass that sorts its columns into Numerai-specific groups.

    On construction every column is classified by the prefix of its name:
    - `feature_cols`: names starting with "feature".
    - `target_cols`: names starting with "target".
    - `prediction_cols`: names starting with "prediction".
    - `aux_cols`: all remaining columns (including columns with non-string labels).

    NOTE(review): `meta` is defined on the class, so unless an instance is
    assigned its own `meta`, every NumerFrame reads and writes the same AttrDict.
    Metadata added through one frame is therefore visible on other frames, and
    the era column is only detected for the first frame that gets constructed
    (later frames find "era_col_verified" already present).
    """
    # Custom attributes that pandas should carry over to derived frames
    # (see the pandas documentation on subclassing DataFrame).
    _metadata = ["meta", "feature_cols", "target_cols",
                 "prediction_cols", "not_aux_cols", "aux_cols"]
    meta = AttrDict()

    def __init__(self, *args, **kwargs):
        """Build the DataFrame, classify its columns and detect the era column once."""
        super().__init__(*args, **kwargs)
        self.__init_meta_attrs()
        if "era_col_verified" not in self.meta:
            self.__set_era_col()

    @property
    def _constructor(self):
        """Class pandas uses to build the results of DataFrame operations."""
        return NumerFrame

    def __init_meta_attrs(self):
        """Split the current columns into feature, target, prediction and aux groups."""

        def columns_with_prefix(prefix: str) -> list:
            # Column labels are not guaranteed to be strings (e.g. the integer
            # labels of a transposed frame); such labels cannot match a prefix
            # and end up in `aux_cols` instead of raising AttributeError.
            return [
                col for col in self.columns
                if isinstance(col, str) and col.startswith(prefix)
            ]

        self.feature_cols = columns_with_prefix("feature")
        self.target_cols = columns_with_prefix("target")
        self.prediction_cols = columns_with_prefix("prediction")
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        # Set lookup keeps this linear in the number of columns; this method
        # runs every time a NumerFrame is constructed.
        not_aux_lookup = set(self.not_aux_cols)
        self.aux_cols = [col for col in self.columns if col not in not_aux_lookup]

    def __set_era_col(self):
        """
        Store the name of the era column in `meta.era_col`.
        "era" takes precedence over "friday_date".
        :raises AttributeError: If neither column is present.
        """
        if "era" in self.columns:
            self.meta.era_col = "era"
        elif "friday_date" in self.columns:
            self.meta.era_col = "friday_date"
        else:
            raise AttributeError("NumerFrame must contain either an 'era' or 'friday_date' column.")
        self.meta.era_col_verified = True

    def add_metadata(self, *args, **kwargs):
        """Add or overwrite entries in `meta`. Accepts the same arguments as `dict.update`."""
        self.meta.update(*args, **kwargs)

    def export_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Export all metadata in NumerFrame.meta to a json file.
        Values that json cannot serialize are written as "<not serializable>".
        :param file: Path of the json file to write.
        :param verbose: Also print the generated json text.
        **kwargs are passed to json.dumps.
        """
        rich_print(f":file_folder: Exporting metadata to {file} :file_folder:")
        json_txt = json.dumps(
            self.meta.__dict__, default=lambda o: "<not serializable>", **kwargs
        )
        if verbose:
            rich_print(json_txt)
        # Explicit encoding so the file does not depend on the platform default.
        Path(file).write_text(json_txt, encoding="utf-8")

    def import_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Load metadata from a json file into NumerFrame.meta.
        Existing keys are overwritten by keys in the file.
        :param file: Path of the json file to read.
        :param verbose: Also print the loaded data.
        **kwargs are passed to json.load.
        """
        rich_print(f":file_folder: Importing metadata from {file} :file_folder:")
        with open(file, encoding="utf-8") as json_file:
            json_data = json.load(json_file, **kwargs)
        if verbose:
            rich_print(json_data)
        # A "dataf" entry is never imported.
        # NOTE(review): presumably guards a DataFrame attribute of that name - confirm.
        json_data.pop("dataf", None)
        self.meta.__dict__.update(json_data)

    def get_column_selection(self, cols: Union[str, list]) -> pd.DataFrame:
        """
        Return DataFrame given selection of columns.
        :param cols: Single column name or list of column names.
        A single name still yields a (one column) DataFrame.
        """
        selection = cols if isinstance(cols, list) else [cols]
        return self.loc[:, selection]

    @property
    def get_feature_data(self) -> pd.DataFrame:
        """All columns where the column name starts with 'feature'."""
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self) -> pd.DataFrame:
        """All columns where the column name starts with 'target'."""
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self) -> pd.DataFrame:
        """Only the column named 'target', as a one column DataFrame."""
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self) -> pd.DataFrame:
        """All columns where the column name starts with 'prediction'."""
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self) -> pd.DataFrame:
        """All columns that are not features, targets nor predictions."""
        return self.get_column_selection(cols=self.aux_cols)

    def get_feature_target_pair(self, multi_target=False) -> Tuple[pd.DataFrame, Any]:
        """
        Get split of feature and target columns.
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def get_era_batch(self, eras: List[Any], convert_to_tf: bool = False, aemlp_batch: bool = False, *args, **kwargs) -> tuple:
        """
        Get feature target pair batch of 1 or multiple eras.
        :param eras: List of era names. They need to be present in era_col.
        :param convert_to_tf: Convert to tf.Tensor.
        :param aemlp_batch: Specific target batch for autoencoder training.
        y will be a list with three components: features, targets and targets again.
        *args, **kwargs are passed to tf.convert_to_tensor.
        :return: (X, y) with X the feature values and y all target columns
        (or the three component list when aemlp_batch is set).
        """
        era_col = self.meta.era_col
        era_values = self[era_col]
        # Compute the available eras once instead of once per requested era.
        available_eras = era_values.unique()
        valid_eras = list(eras)
        for era in valid_eras:
            assert era in available_eras, f"Era '{era}' not found in era column ({self.meta.era_col})"

        # One boolean mask, reused for features and targets; rows and columns
        # are selected in a single .loc call rather than by chained indexing.
        era_mask = era_values.isin(valid_eras)
        X = self.loc[era_mask, self.feature_cols].values
        y = self.loc[era_mask, self.target_cols].values
        if aemlp_batch:
            y = [X.copy(), y.copy(), y.copy()]

        if convert_to_tf:
            X = tf.convert_to_tensor(X, *args, **kwargs)
            if aemlp_batch:
                y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
            else:
                y = tf.convert_to_tensor(y, *args, **kwargs)
        return X, y

<IPython.core.display.Javascript object>

In [6]:
#export
def create_numerframe(file_path: str, metadata: dict = None, *args, **kwargs):
    """
    Convenience function to initialize NumerFrame.
    Supports the most used file formats for Pandas DataFrames
    (.csv, .parquet, .pkl and .pickle). The suffix is matched case-insensitively.
    For more details check https://pandas.pydata.org/docs/reference/io.html

    :param file_path: Relative or absolute path to data file.
    :param metadata: Metadata to be stored in NumerFrame.meta.
    *args, **kwargs will be passed to Pandas loading function.
    :return: NumerFrame holding the loaded data.
    :raises AssertionError: If file_path does not point to an existing file.
    :raises NotImplementedError: If the file suffix is not supported.
    """
    path = Path(file_path)
    assert path.is_file(), f"{file_path} does not point to file."
    # Path.suffix keeps the leading dot (e.g. ".csv"). Lower-case it so that
    # files like "data.CSV" are recognized as well.
    suffix = path.suffix.lower()
    if suffix == ".csv":
        dataf = pd.read_csv(file_path, *args, **kwargs)
    elif suffix == ".parquet":
        dataf = pd.read_parquet(file_path, *args, **kwargs)
    elif suffix in (".pkl", ".pickle"):
        # NOTE: unpickling can execute arbitrary code. Only load pickle files
        # that come from a trusted source.
        dataf = pd.read_pickle(file_path, *args, **kwargs)
    else:
        raise NotImplementedError(
            f"Suffix '{suffix}' is not supported. "
            "Supported suffixes: .csv, .parquet, .pkl, .pickle"
        )
    num_frame = NumerFrame(dataf)
    if metadata:
        num_frame.add_metadata(metadata)
    return num_frame

<IPython.core.display.Javascript object>

## NumerFrame Tests

A `NumerFrame` object can be initialized from memory by providing a Pandas DataFrame. Metadata can then be added with `.add_metadata`. All metadata is stored in the `meta` attribute.

### Initialize from memory

In [7]:
# Random DataFrame.
# A local, seeded generator keeps the feature and target values reproducible
# across runs without touching the global NumPy random state.
rng = np.random.default_rng(42)
test_features = [f"feature_{letter}" for letter in "ABCDEFGHIK"]
# The ids stay random; they only serve as unique row identifiers.
id_col = [uuid.uuid4().hex for _ in range(100)]

df = pd.DataFrame(rng.uniform(size=(100, 10)), columns=test_features)
df["id"] = id_col
df[["target", "target_1", "target_2"]] = rng.normal(size=(100, 3))
df["era"] = range(100)

<IPython.core.display.Javascript object>

In [8]:
metadata = {
    "version": 42,
    "additional_info": "test_model",
    "multi_target": False,
    "tournament_type": "random",
}
memory_dataset = NumerFrame(df)
memory_dataset.add_metadata(metadata)
assert memory_dataset.meta.version == 42
assert memory_dataset.meta.tournament_type == "random"

<IPython.core.display.Javascript object>

In [9]:
memory_dataset.meta

{'era_col': 'era',
 'era_col_verified': True,
 'version': 42,
 'additional_info': 'test_model',
 'multi_target': False,
 'tournament_type': 'random'}

<IPython.core.display.Javascript object>

### Initialize from file

You can also use the convenience function `create_numerframe` to initialize a `NumerFrame` directly from a file. `create_numerframe` supports the `.csv`, `.parquet`, `.pkl` and `.pickle` file extensions.

In [10]:
metadata = {
    "version": 1,
    "multi_target": False,
    "tournament_type": "classic",
}

dataset = create_numerframe("test_assets/mini_numerai_version_1_data.csv",
                            metadata=metadata
                            )
assert dataset.meta.version == 1
assert not dataset.meta.multi_target
dataset.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25


<IPython.core.display.Javascript object>

In [11]:
dataset.meta

{'era_col': 'era',
 'era_col_verified': True,
 'version': 1,
 'additional_info': 'test_model',
 'multi_target': False,
 'tournament_type': 'classic'}

<IPython.core.display.Javascript object>

`get_feature_data` will retrieve all columns where the column name starts with `feature`.

In [12]:
dataset.get_feature_data.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

In [13]:
X, y = dataset.get_era_batch(['era1'], convert_to_tf=True, dtype=tf.float16)
X

2022-02-07 15:41:43.298024: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<tf.Tensor: shape=(10, 310), dtype=float16, numpy=
array([[0.  , 0.5 , 0.25, ..., 1.  , 0.5 , 0.75],
       [0.  , 0.  , 0.  , ..., 0.  , 0.25, 1.  ],
       [0.25, 0.5 , 0.25, ..., 0.  , 0.25, 0.75],
       ...,
       [0.25, 1.  , 1.  , ..., 0.75, 0.5 , 0.25],
       [0.5 , 0.5 , 0.5 , ..., 0.  , 0.  , 0.  ],
       [0.5 , 1.  , 1.  , ..., 1.  , 1.  , 0.75]], dtype=float16)>

<IPython.core.display.Javascript object>

In [14]:
X, y = dataset.get_era_batch(['era1'], convert_to_tf=True, aemlp_batch=True, dtype=tf.float16)
y

[<tf.Tensor: shape=(10, 310), dtype=float16, numpy=
 array([[0.  , 0.5 , 0.25, ..., 1.  , 0.5 , 0.75],
        [0.  , 0.  , 0.  , ..., 0.  , 0.25, 1.  ],
        [0.25, 0.5 , 0.25, ..., 0.  , 0.25, 0.75],
        ...,
        [0.25, 1.  , 1.  , ..., 0.75, 0.5 , 0.25],
        [0.5 , 0.5 , 0.5 , ..., 0.  , 0.  , 0.  ],
        [0.5 , 1.  , 1.  , ..., 1.  , 1.  , 0.75]], dtype=float16)>,
 <tf.Tensor: shape=(10, 1), dtype=float16, numpy=
 array([[0.5 ],
        [0.25],
        [0.25],
        [0.25],
        [0.75],
        [0.5 ],
        [0.25],
        [0.25],
        [0.5 ],
        [0.75]], dtype=float16)>,
 <tf.Tensor: shape=(10, 1), dtype=float16, numpy=
 array([[0.5 ],
        [0.25],
        [0.25],
        [0.25],
        [0.75],
        [0.5 ],
        [0.25],
        [0.25],
        [0.5 ],
        [0.75]], dtype=float16)>]

<IPython.core.display.Javascript object>

`aux_cols` denotes all columns that are not feature, target or prediction columns.

In [15]:
dataset.aux_cols

['id', 'era', 'data_type']

<IPython.core.display.Javascript object>

In [16]:
dataset.get_aux_data.head(2)

Unnamed: 0,id,era,data_type
0,n000315175b67977,era1,train
1,n0014af834a96cdd,era1,train


<IPython.core.display.Javascript object>

Arbitrary `.json` metadata can be loaded into the `NumerFrame`. All metadata can also be exported to a `.json` file.

In [17]:
dataset.export_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [18]:
dataset.import_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [19]:
assert dataset.meta.version == 1
assert not dataset.meta.multi_target

<IPython.core.display.Javascript object>

In [20]:
dataf2 = dataset.copy()
assert dataf2.equals(dataset)

<IPython.core.display.Javascript object>

`get_target_data` retrieves all columns where the column name starts with "target". `get_single_target_data` only retrieves the column "target".

In [21]:
dataset.get_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [22]:
dataset.get_single_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [23]:
dataset.loc[:, "prediction_test_1"] = np.random.uniform(size=len(dataset))
new_dataset = NumerFrame(dataset)
assert new_dataset.prediction_cols == ["prediction_test_1"]
assert new_dataset.meta.version == 1

<IPython.core.display.Javascript object>

Arbitrary columns can be retrieved with `.get_column_selection`. The input argument can be either a string or a list of column names.

In [24]:
dataset.get_column_selection("id").head(2)

Unnamed: 0,id
0,n000315175b67977
1,n0014af834a96cdd


<IPython.core.display.Javascript object>

In [25]:
dataset.get_column_selection(["id", "prediction_test_1"]).head(2)

Unnamed: 0,id,prediction_test_1
0,n000315175b67977,0.208435
1,n0014af834a96cdd,0.46119


<IPython.core.display.Javascript object>

For convenience we can get a feature, target pair with one method. `X` will contain all feature data. `y` contains only the `target` column by default, or all target columns when `multi_target=True`.

In [26]:
#hide_input
show_doc(NumerFrame.get_feature_target_pair)

<h4 id="NumerFrame.get_feature_target_pair" class="doc_header"><code>NumerFrame.get_feature_target_pair</code><a href="__main__.py#L92" class="source_link" style="float:right">[source]</a></h4>

> <code>NumerFrame.get_feature_target_pair</code>(**`multi_target`**=*`False`*)

Get split of feature and target columns.
:param multi_target: Returns only 'target' column by default.
Returns all target columns when set to True.

<IPython.core.display.Javascript object>

In [27]:
X, y = dataset.get_feature_target_pair(multi_target=False)
X.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

In [28]:
y.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

-----------------------------------------------

In [29]:
# hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

notebook2script()

Converted 01_download.ipynb.
Converted 02_numerframe.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staking.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>