In [None]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
# default_exp dataset

<IPython.core.display.Javascript object>

# Dataset

In [None]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
# export
import uuid
import numpy as np
import pandas as pd
from copy import deepcopy
import json
from pathlib import Path
from typing import Union, Tuple
import datetime as dt
from functools import wraps
from rich import print as rich_print
from typeguard import typechecked

<IPython.core.display.Javascript object>

### Considerations

Goals:
1. Create dynamic Numerai dataset where we can add metadata.
2. Numerai specific functionality while keeping the flexibility of Pandas DataFrames.
3. Dynamically update which columns are in data after each processing step.
4. Dynamically update generated metadata after each data processing step.

__Options:__
__1.__ Subclass from DataFrame and add functionality. Add metadata to DataFrame through `df.attrs['some_metadata'] = "some_metadata"`
1.1. Can be confusing because it is still a DataFrame under the hood, and functions such as `pd.read_csv` and `pd.read_parquet` still return a plain DataFrame.
More info on subclassing DataFrames: [StackOverflow](https://stackoverflow.com/questions/22155951/how-can-i-subclass-a-pandas-dataframe), [Pandas Docs](https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas).
1.2. [Metadata not persistent with parquet](https://stackoverflow.com/questions/14688306/adding-meta-information-metadata-to-pandas-dataframe).

__2.__ Develop a dedicated `Dataset` class that holds the DataFrame as an attribute (`.dataf`).
2.1. Easy to add functionality and typecheck.
2.2. Does not work out of the box with sklearn Transformers, but can be easily made compatible with a single decorator.
2.3. Easy to export and import metadata.


We adopt the convention:
 1. All feature column names should start with "feature".
 2. All target column names should start with "target".
 3. All prediction column names should start with "prediction".
 4. Every column for which this does not hold will be classified as an "aux column".

In [None]:
#export
class Dataset:
    """
    Wrapper that keeps a Pandas DataFrame (`.dataf`) together with arbitrary metadata.

    Metadata is stored as plain instance attributes. Columns are classified by name prefix
    ("feature", "target", "prediction"); every other column is an "aux column".
    The column lists are computed once in `__init__` and are not refreshed when
    `.dataf` is modified afterwards.

    :param dataf: DataFrame to wrap.
    :param args: Metadata given as mappings (or iterables of key/value pairs).
    :param kwargs: Metadata given as keyword arguments.
    """

    # Placeholder written by export_json_metadata for values json cannot encode.
    _NOT_SERIALIZABLE = "<not serializable>"

    def __init__(self, dataf: pd.DataFrame, *args, **kwargs):
        self.dataf = dataf
        # Merge every metadata source in order. Each one is copied first so the caller's
        # object (e.g. another Dataset's __dict__) is never mutated, and a "dataf" key is
        # dropped so metadata cannot replace the DataFrame passed in explicitly.
        for metadata in (*args, kwargs):
            metadata = dict(metadata)
            metadata.pop("dataf", None)
            self.__dict__.update(metadata)
        # Column lists are derived after the metadata merge so they always reflect `dataf`.
        self.all_cols = list(self.dataf.columns)
        self.feature_cols = self._cols_with_prefix("feature")
        self.target_cols = self._cols_with_prefix("target")
        self.prediction_cols = self._cols_with_prefix("prediction")
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        not_aux_lookup = set(self.not_aux_cols)
        self.aux_cols = [col for col in self.all_cols if col not in not_aux_lookup]

    def _cols_with_prefix(self, prefix: str) -> list:
        """Columns in `all_cols` whose label is a string starting with `prefix`."""
        # Non-string labels (e.g. integer column names) have no prefix and end up as aux columns.
        return [
            col
            for col in self.all_cols
            if isinstance(col, str) and col.startswith(prefix)
        ]

    def copy_dataset(self) -> "Dataset":
        """Return a deep copy of the Dataset object (DataFrame and metadata)."""
        return deepcopy(self)

    def copy_dataframe(self) -> pd.DataFrame:
        """Return a deep copy of the DataFrame part of the Dataset."""
        return deepcopy(self.dataf)

    def export_json_metadata(self, file="config.json", verbose=False, **kwargs) -> None:
        """
        Export all attributes in Dataset that can be serialized to json file.

        :param file: Path of the json file to write.
        :param verbose: Print the generated json text when True.
        :param kwargs: Extra keyword arguments passed to `json.dumps`.
        Values json cannot encode (such as the DataFrame) are written as "<not serializable>".
        """
        rich_print(f":file_folder: Exporting metadata to {file} :file_folder:")
        json_txt = json.dumps(
            self.__dict__, default=lambda o: self._NOT_SERIALIZABLE, **kwargs
        )
        if verbose:
            rich_print(json_txt)
        # Explicit encoding so the file does not depend on the platform default.
        Path(file).write_text(json_txt, encoding="utf-8")

    def import_json_metadata(self, file="config.json", verbose=False, **kwargs) -> None:
        """
        Load arbitrary data into Dataset object from json file.

        :param file: Path of the json file to read.
        :param verbose: Print the loaded data when True.
        :param kwargs: Extra keyword arguments passed to `json.load`.
        The "dataf" key and any "<not serializable>" placeholder values are ignored,
        so existing attributes are not replaced by placeholders.
        """
        rich_print(f":file_folder: Importing metadata from {file} :file_folder:")
        with open(file, encoding="utf-8") as json_file:
            json_data = json.load(json_file, **kwargs)
        if verbose:
            rich_print(json_data)
        # Make sure there is no overwrite on DataFrame
        json_data.pop("dataf", None)
        # Placeholders carry no information; importing them would clobber live objects.
        restorable = {
            key: value
            for key, value in json_data.items()
            if value != self._NOT_SERIALIZABLE
        }
        self.__dict__.update(restorable)

    def get_column_selection(self, cols: Union[str, list]) -> pd.DataFrame:
        """Return DataFrame given selection of columns (a single name or a list of names)."""
        return self.dataf.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self) -> pd.DataFrame:
        """All columns listed in `feature_cols`."""
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self) -> pd.DataFrame:
        """All columns listed in `target_cols`."""
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self) -> pd.DataFrame:
        """Only the column named 'target'."""
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self) -> pd.DataFrame:
        """All columns listed in `prediction_cols`."""
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self) -> pd.DataFrame:
        """All columns that are not features, targets or predictions."""
        return self.get_column_selection(cols=self.aux_cols)

    def get_feature_target_pair(self, multi_target=False) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Get split of features and targets
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        :return: Tuple (X, y) of feature DataFrame and target DataFrame.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def __repr__(self) -> str:
        return f"Dataset of shape {self.dataf.shape}. Columns: {self.all_cols}"

    def __str__(self):
        return self.__repr__()

<IPython.core.display.Javascript object>

In [None]:
def create_dataset(file_path: Union[str, Path], *args, **kwargs) -> "Dataset":
    """
    Convenience function to initialize Dataset object with arbitrary metadata.
    Supports file formats for which Pandas has a 'read_' function.
    For example, .csv, .parquet, .json, .pickle, .html and .xml.
    For more details check https://pandas.pydata.org/docs/reference/io.html

    The suffix is matched case-insensitively, and a few common extensions whose
    name differs from the Pandas reader are mapped (.pkl, .xls, .xlsx, .htm).

    :param file_path: Path to the data file; its suffix selects the Pandas reader.
    :param args: Metadata mappings passed on to Dataset.
    :param kwargs: Metadata keyword arguments passed on to Dataset.
    :return: Dataset wrapping the loaded DataFrame.
    :raises AttributeError: If the file has no suffix or Pandas has no matching reader.
    """
    # Suffix without dot, lower-cased so 'DATA.CSV' is handled like 'data.csv'.
    suffix = Path(file_path).suffix[1:].lower()
    # Extensions that do not match the name of their Pandas reader.
    reader_aliases = {"pkl": "pickle", "xls": "excel", "xlsx": "excel", "htm": "html"}
    reader_name = f"read_{reader_aliases.get(suffix, suffix)}"
    reader = getattr(pd, reader_name, None) if suffix else None
    if reader is None:
        # AttributeError is kept because that is what the failed lookup raised before.
        raise AttributeError(
            f"Cannot load '{file_path}': no Pandas reader for suffix '{suffix}'."
        )
    dataf = reader(file_path)
    return Dataset(dataf, *args, **kwargs)

<IPython.core.display.Javascript object>

### Tests

#### Initialize from memory

In [None]:
# Synthetic frame: 10 uniform feature columns, an id, 3 normal target columns and an era index
test_features = ["feature_" + letter for letter in "ABCDEFGHIK"]
id_col = [uuid.uuid4().hex for _ in range(100)]

df = pd.DataFrame(data=np.random.uniform(size=(100, 10)), columns=test_features)
df = df.assign(id=id_col)
df[["target", "target_1", "target_2"]] = np.random.normal(size=(100, 3))
df = df.assign(era=range(100))

<IPython.core.display.Javascript object>

In [None]:
# Every metadata key passed at construction should surface as an attribute
metadata = dict(
    version=42,
    additional_info="test_model",
    multi_target=False,
    tournament_type="random",
)
dataset = Dataset(df, metadata)
assert (dataset.version, dataset.tournament_type) == (42, "random")

<IPython.core.display.Javascript object>

#### Initialize from file

In [None]:
# Load the bundled mini dataset from disk and attach metadata in one call
metadata = dict(
    version=1,
    additional_info="mini_numerai_data_version_1",
    multi_target=False,
    tournament_type="classic",
)

dataset = create_dataset("test_assets/mini_numerai_version_1_data.csv", metadata)
assert dataset.version == 1 and not dataset.multi_target
dataset.dataf.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25


<IPython.core.display.Javascript object>

In [None]:
# Preview the first two rows of the underlying DataFrame
dataset.dataf.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25


<IPython.core.display.Javascript object>

In [None]:
# `get_feature_data` is a property: only columns whose name starts with "feature"
dataset.get_feature_data.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

In [None]:
# Names of the columns that are neither features, targets nor predictions
dataset.aux_cols

['id', 'era', 'data_type']

<IPython.core.display.Javascript object>

In [None]:
# The aux columns as a DataFrame
dataset.get_aux_data.head(2)

Unnamed: 0,id,era,data_type
0,n000315175b67977,era1,train
1,n0014af834a96cdd,era1,train


<IPython.core.display.Javascript object>

In [None]:
# Metadata passed at construction is reachable as plain attributes
assert dataset.version == 1
assert not dataset.multi_target

<IPython.core.display.Javascript object>

In [None]:
# Write all Dataset attributes to config.json; values json cannot encode become "<not serializable>"
dataset.export_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [None]:
# Read config.json back into the Dataset's attributes; the "dataf" entry is discarded so the DataFrame is kept
dataset.import_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [None]:
# Inspect every attribute stored on the Dataset (this includes the full DataFrame under "dataf")
dataset.__dict__

{'dataf':                  id   era data_type  feature_intelligence1  \
 0  n000315175b67977  era1     train                   0.00   
 1  n0014af834a96cdd  era1     train                   0.00   
 2  n001c93979ac41d4  era1     train                   0.25   
 3  n0034e4143f22a13  era1     train                   1.00   
 4  n00679d1a636062f  era1     train                   0.25   
 5  n009aa2d32389eca  era1     train                   0.50   
 6  n009ef1a5fe009b6  era1     train                   0.50   
 7  n00ae5d51f55fb0f  era1     train                   0.25   
 8  n00b0ac86d77aed7  era1     train                   0.50   
 9  n00c63366aeaf76a  era1     train                   0.50   
 
    feature_intelligence2  feature_intelligence3  feature_intelligence4  \
 0                   0.50                   0.25                   0.00   
 1                   0.00                   0.00                   0.25   
 2                   0.50                   0.25                   0.25

<IPython.core.display.Javascript object>

In [None]:
# A copy must hold the same values but be a distinct object from the original DataFrame
dataf2 = dataset.copy_dataframe()
assert dataf2.equals(dataset.dataf)
assert dataf2 is not dataset.dataf

<IPython.core.display.Javascript object>

In [None]:
# All columns whose name starts with "target"
dataset.get_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [None]:
# Only the column named exactly "target"
dataset.get_single_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [None]:
# Add a prediction column to the underlying DataFrame in place
dataset.dataf.loc[:, "prediction_test_1"] = np.random.uniform(size=len(dataset.dataf))
# Column lists are computed in Dataset.__init__, so a new Dataset is built to pick up
# the added column; the existing attributes are passed along as metadata.
new_dataset = Dataset(dataset.dataf, dataset.__dict__)
assert new_dataset.prediction_cols == ["prediction_test_1"]
assert new_dataset.version == 1

<IPython.core.display.Javascript object>

In [None]:
# A single column name (str) is accepted and still yields a DataFrame
new_dataset.get_column_selection("id").head(2)

Unnamed: 0,id
0,n000315175b67977
1,n0014af834a96cdd


<IPython.core.display.Javascript object>

In [None]:
# Several columns are selected by passing a list of names
new_dataset.get_column_selection(["id", "prediction_test_1"]).head(2)

Unnamed: 0,id,prediction_test_1
0,n000315175b67977,0.697065
1,n0014af834a96cdd,0.257719


<IPython.core.display.Javascript object>

In [None]:
# multi_target=False: y contains only the "target" column
X, y = new_dataset.get_feature_target_pair(multi_target=False)

<IPython.core.display.Javascript object>

In [None]:
# Feature part of the pair
X.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


<IPython.core.display.Javascript object>

In [None]:
# Target part of the pair
y.head(2)

Unnamed: 0,target
0,0.5
1,0.25


<IPython.core.display.Javascript object>

In [None]:
# str() delegates to __repr__: shape of the DataFrame plus the stored `all_cols` list
str(dataset)

"Dataset of shape (10, 315). Columns: ['id', 'era', 'data_type', 'feature_intelligence1', 'feature_intelligence2', 'feature_intelligence3', 'feature_intelligence4', 'feature_intelligence5', 'feature_intelligence6', 'feature_intelligence7', 'feature_intelligence8', 'feature_intelligence9', 'feature_intelligence10', 'feature_intelligence11', 'feature_intelligence12', 'feature_charisma1', 'feature_charisma2', 'feature_charisma3', 'feature_charisma4', 'feature_charisma5', 'feature_charisma6', 'feature_charisma7', 'feature_charisma8', 'feature_charisma9', 'feature_charisma10', 'feature_charisma11', 'feature_charisma12', 'feature_charisma13', 'feature_charisma14', 'feature_charisma15', 'feature_charisma16', 'feature_charisma17', 'feature_charisma18', 'feature_charisma19', 'feature_charisma20', 'feature_charisma21', 'feature_charisma22', 'feature_charisma23', 'feature_charisma24', 'feature_charisma25', 'feature_charisma26', 'feature_charisma27', 'feature_charisma28', 'feature_charisma29', 'fe

<IPython.core.display.Javascript object>

-----------------------------------------------

In [None]:
# hide
# Run this cell to sync all changes with library.
# notebook2script() reports each notebook it converts (see the "Converted ..." lines below).
from nbdev.export import notebook2script

notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staker.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>