In [1]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# default_exp dataset

<IPython.core.display.Javascript object>

# Dataset

In [3]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [4]:
# export
import uuid
import numpy as np
import pandas as pd
from copy import deepcopy
import json
from pathlib import Path
from typing import Union, Tuple
from rich import print as rich_print

<IPython.core.display.Javascript object>

### The Dataset Object

Goals:
1. Create dynamic Numerai dataset where we can add metadata.
2. Numerai specific functionality while keeping the flexibility of Pandas DataFrames.
3. Dynamically update which columns are in data after each processing step.
4. Dynamically update generated metadata after data processing step.

We adopt the convention:
 1. All feature column names should start with "feature".
 2. All target column names should start with "target".
 3. All prediction column names should start with "prediction".
 4. Every column for which this does not hold will be classified as an "aux column".

In [5]:
# export
class Dataset:
    def __init__(self, dataf: pd.DataFrame, *args, **kwargs):
        self.dataf = dataf
        if not 'era' in self.dataf.columns:
            rich_print(":warning: [bold red]Warning[/bold red]: No 'era' column found in DataFrame. \
'era' column is mandatory for certain numerai-blocks functionality. :warning:")
        else:
            self.eras = self.dataf['era']
        self.__dict__.update(*args, **kwargs)
        self.all_cols = list(self.dataf.columns)
        self.feature_cols = [col for col in self.all_cols if col.startswith("feature")]
        self.target_cols = [col for col in self.all_cols if col.startswith("target")]
        self.prediction_cols = [
            col for col in self.all_cols if col.startswith("prediction")
        ]
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        self.aux_cols = [
            col for col in self.all_cols if col not in self.not_aux_cols
        ]

    def copy_dataset(self):
        """Copy Dataset object"""
        return deepcopy(self)

    def copy_dataframe(self) -> pd.DataFrame:
        """Copy DataFrame part of Dataset"""
        return deepcopy(self.dataf)

    def export_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """Export all attributes in Dataset that can be serialized to json file."""
        rich_print(f":file_folder: Exporting metadata to {file} :file_folder:")
        json_txt = json.dumps(
            self.__dict__, default=lambda o: "<not serializable>", **kwargs
        )
        if verbose:
            rich_print(json_txt)
        Path(file).write_text(json_txt)

    def import_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """Load arbitrary data into Dataset object from json file."""
        rich_print(f":file_folder: Importing metadata from {file} :file_folder:")
        with open(file) as json_file:
            json_data = json.load(json_file, **kwargs)
        if verbose:
            rich_print(json_data)
        # Make sure there is no overwrite on DataFrame
        json_data.pop("dataf", None)
        self.__dict__.update(json_data)

    def get_column_selection(self, cols: Union[str, list]) -> pd.DataFrame:
        """Return DataFrame given selection of columns."""
        return self.dataf.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self) -> pd.DataFrame:
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self) -> pd.DataFrame:
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self) -> pd.DataFrame:
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self) -> pd.DataFrame:
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self) -> pd.DataFrame:
        """All columns that are not features, targets nor predictions."""
        return self.get_column_selection(cols=self.aux_cols)

    def get_feature_target_pair(self, multi_target=False) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Get split of feature and target columns.
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def merge_datasets(self, other, *args, **kwargs):
        """
        Merge Dataset with other Dataset.
        :param other: Another Dataset.
        WARNING: Metadata of original Dataset will be kept in case of duplicates.
        *args, **kwargs will be passed to DataFrame merge operation.
        :return: Dataset with dataf and metadata merged.
        Metadata of original has priority in case of duplicate keys
        """
        # Merge DataFrames
        new_dataset, other_copy = self.copy_dataset(), other.copy_dataset()
        new_dataset.dataf = self.dataf.merge(other.dataf, *args, **kwargs)
        # Merge metadata
        other_copy.__dict__.pop('dataf', None)
        new_dataset.__dict__.update(**other_copy.__dict__)
        return Dataset(**new_dataset.__dict__)

    def __repr__(self) -> str:
        return f"Dataset of shape {self.dataf.shape}. Columns: {self.all_cols}"

    def __str__(self):
        return self.__repr__()

<IPython.core.display.Javascript object>

In [6]:
#export
def create_dataset(file_path: str, *args, **kwargs):
    """
    Convenience function to initialize Dataset object with arbitrary metadata.
    Supports file formats for which Pandas has a 'read_' function.
    For example, .csv, .parquet, .json, .pickle, .html and .xml.
    For more details check https://pandas.pydata.org/docs/reference/io.html
    """
    assert Path(file_path).is_file(), f"{file_path} does not point to file."
    # Suffix without dot
    suffix = Path(file_path).suffix[1:]
    dataf = getattr(pd, f"read_{suffix}")(file_path)
    return Dataset(dataf, *args, **kwargs)

<IPython.core.display.Javascript object>

## Dataset Tests

A `Dataset` object can be initialized from memory by providing a Pandas DataFrame + metadata.

### Initialize from memory

In [7]:
# Random DataFrame
test_features = [f"feature_{l}" for l in "ABCDEFGHIK"]
id_col = [uuid.uuid4().hex for _ in range(100)]

df = pd.DataFrame(np.random.uniform(size=(100, 10)), columns=test_features)
df["id"] = id_col
df[["target", "target_1", "target_2"]] = np.random.normal(size=(100, 3))
df["era"] = range(100)

<IPython.core.display.Javascript object>

In [8]:
metadata = {
    "version": 42,
    "additional_info": "test_model",
    "multi_target": False,
    "tournament_type": "random",
}
memory_dataset = Dataset(df, **metadata)
assert memory_dataset.version == 42
assert memory_dataset.tournament_type == "random"

<IPython.core.display.Javascript object>

### Initialize from file

You can also use the convenience function `create_dataset` so data can be loaded in using a file name. `create_dataset` supports most extensions that Pandas offers, like `.csv`, `.parquet`, `.json`, etc.

In [9]:
metadata = {
    "version": 1,
    "multi_target": False,
    "tournament_type": "classic",
}

dataset = create_dataset("test_assets/mini_numerai_version_1_data.csv", **metadata)
assert dataset.version == 1
assert not dataset.multi_target
dataset.dataf.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25


<IPython.core.display.Javascript object>

`get_feature_data` will retrieve all columns where the column name starts with `feature`.

In [10]:
dataset.get_feature_data.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

`aux_cols` denotes all columns that are not features, targets of prediction columns.

In [11]:
dataset.aux_cols

['id', 'era', 'data_type']

<IPython.core.display.Javascript object>

In [12]:
dataset.get_aux_data.head(2)

Unnamed: 0,id,era,data_type
0,n000315175b67977,era1,train
1,n0014af834a96cdd,era1,train


<IPython.core.display.Javascript object>

Arbitrary `.json` metadata can be stored into the `Dataset`. All metadata can also be exported to a `json` file.

In [13]:
dataset.export_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [14]:
dataset.import_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [15]:
assert dataset.version == 1
assert not dataset.multi_target

<IPython.core.display.Javascript object>

In [16]:
dataf2 = dataset.copy_dataframe()
assert dataf2.equals(dataset.dataf)

<IPython.core.display.Javascript object>

`get_target_data` retrieves all columns where the column name starts with "target". `get_single_target_data` only retrieves the column "target".

In [17]:
dataset.get_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [18]:
dataset.get_single_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [19]:
dataset.dataf.loc[:, "prediction_test_1"] = np.random.uniform(size=len(dataset.dataf))
new_dataset = Dataset(dataset.dataf, dataset.__dict__)
assert new_dataset.prediction_cols == ["prediction_test_1"]
assert new_dataset.version == 1

<IPython.core.display.Javascript object>

Arbitrary columns van be retrieved with `.get_column_selection`. The input argument can be either a string or a list with column names.

In [20]:
new_dataset.get_column_selection("id").head(2)

Unnamed: 0,id
0,n000315175b67977
1,n0014af834a96cdd


<IPython.core.display.Javascript object>

In [21]:
new_dataset.get_column_selection(["id", "prediction_test_1"]).head(2)

Unnamed: 0,id,prediction_test_1
0,n000315175b67977,0.922156
1,n0014af834a96cdd,0.655724


<IPython.core.display.Javascript object>

For convenience we can get a feature, target pair with one method. `X` will have all feature data and `y` all target data.

In [22]:
#hide_input
show_doc(Dataset.get_feature_target_pair)

<h4 id="Dataset.get_feature_target_pair" class="doc_header"><code>Dataset.get_feature_target_pair</code><a href="__main__.py#L74" class="source_link" style="float:right">[source]</a></h4>

> <code>Dataset.get_feature_target_pair</code>(**`multi_target`**=*`False`*)

Get split of feature and target columns.
:param multi_target: Returns only 'target' column by default.
Returns all target columns when set to True.

<IPython.core.display.Javascript object>

In [23]:
X, y = new_dataset.get_feature_target_pair(multi_target=False)
X.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

In [24]:
y.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [25]:
merged_dataset = memory_dataset.merge_datasets(dataset, how='outer', on='id')
assert merged_dataset.version == 1
assert merged_dataset.additional_info

<IPython.core.display.Javascript object>

In [26]:
merged_dataset.dataf.head(2)

Unnamed: 0,feature_A,feature_B,feature_C,feature_D,feature_E,feature_F,feature_G,feature_H,feature_I,feature_K,...,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target_y,prediction_test_1
0,0.687675,0.179687,0.742095,0.580776,0.909322,0.932122,0.876093,0.074179,0.168657,0.178403,...,,,,,,,,,,
1,0.943355,0.737912,0.157486,0.836629,0.644688,0.020207,0.366998,0.384462,0.602673,0.786345,...,,,,,,,,,,


<IPython.core.display.Javascript object>

-----------------------------------------------

In [27]:
# hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

notebook2script()

Converted 01_download.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staking.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>