In [None]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
# default_exp dataset

<IPython.core.display.Javascript object>

# Dataset

In [None]:
#hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import uuid
import numpy as np
import pandas as pd
from copy import deepcopy
import json
from pathlib import Path
from typing import Union
import datetime as dt
from functools import wraps
from rich import print as rich_print
from typeguard import typechecked

<IPython.core.display.Javascript object>

### Considerations

Goal: Create a dynamic Numerai dataset to which we can add metadata and Numerai-specific functionality, while keeping the flexibility of Pandas DataFrames.

__Options:__
__1.__ Add metadata to DataFrame through `df.attrs['some_metadata'] = "some_metadata"`
1.1. Downside: [Not persistent with parquet](https://stackoverflow.com/questions/14688306/adding-meta-information-metadata-to-pandas-dataframe).

__2.__ Subclass from DataFrame and add functionality.
2.1. Downside: Cumbersome if you are used to loading data with `pd.read_csv`, `pd.read_parquet`, etc.
2.2. More info: [StackOverflow](https://stackoverflow.com/questions/22155951/how-can-i-subclass-a-pandas-dataframe), [Pandas Docs](https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas).

__3.__ Develop a dedicated `Dataset` class that holds the DataFrame as an attribute (`.dataf`)
3.1. Easy to add functionality and typecheck.
3.2. Does not work out of the box with sklearn Transformers, but can be easily made compatible with a single decorator.
3.3. Easy to export and import metadata.


We adopt the convention:
 1. All feature column names should start with "feature".
 2. All target column names should start with "target".
 3. All prediction column names should start with "prediction".
 4. Every column for which this does not hold will be classified as an "aux column".

In [None]:
@typechecked
class Dataset:
    """
    Container that pairs a Pandas DataFrame (`.dataf`) with arbitrary metadata attributes.

    Columns are grouped by name prefix:
    - `features`: column names starting with "feature".
    - `targets`: column names starting with "target".
    - `predictions`: column names starting with "prediction".
    - `aux_columns`: every other column (including columns with non-string labels).

    The column lists are computed on construction and after `import_json_metadata`.
    They are NOT refreshed automatically when `.dataf` is modified afterwards;
    create a new Dataset to pick up added or removed columns.

    :param dataf: DataFrame to wrap.
    :param args: Optional single mapping (or iterable of key/value pairs) whose items become attributes.
    :param kwargs: Additional metadata, stored as attributes.
    """

    def __init__(self, dataf: pd.DataFrame, *args, **kwargs):
        self.dataf = dataf
        self.__dict__.update(*args, **kwargs)
        # Metadata may carry its own 'dataf' entry (e.g. when another Dataset's
        # __dict__ is passed in); the explicitly passed DataFrame always wins.
        self.dataf = dataf
        self._set_column_groups()

    def _columns_with_prefix(self, prefix: str) -> list:
        """ Names in `all_columns` that are strings and start with `prefix`. """
        # Non-string labels (e.g. integer column names) cannot match a prefix
        # and therefore end up as aux columns instead of raising AttributeError.
        return [
            col
            for col in self.all_columns
            if isinstance(col, str) and col.startswith(prefix)
        ]

    def _set_column_groups(self) -> None:
        """ (Re)compute all column lists from the columns of the current DataFrame. """
        self.all_columns = list(self.dataf.columns)
        self.features = self._columns_with_prefix("feature")
        self.targets = self._columns_with_prefix("target")
        self.predictions = self._columns_with_prefix("prediction")
        self.not_aux_columns = self.features + self.targets + self.predictions
        # Set lookup keeps this linear for wide DataFrames.
        grouped = set(self.not_aux_columns)
        self.aux_columns = [col for col in self.all_columns if col not in grouped]

    def copy_dataset(self):
        """ Return a deep copy of the whole Dataset object (DataFrame and metadata). """
        return deepcopy(self)

    def copy_dataframe(self) -> pd.DataFrame:
        """ Return a deep copy of the DataFrame part of the Dataset. """
        return deepcopy(self.dataf)

    def export_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Export all attributes in Dataset that can be serialized to a json file.

        Attributes that json cannot serialize (such as the DataFrame itself)
        are written as the placeholder string '<not serializable>'.

        :param file: Path of the json file to write.
        :param verbose: Also print the json text.
        :param kwargs: Passed on to `json.dumps`.
        """
        rich_print(f":file_folder: Exporting metadata to {file} :file_folder:")
        json_txt = json.dumps(self.__dict__, default=lambda o: '<not serializable>', **kwargs)
        if verbose:
            rich_print(json_txt)
        Path(file).write_text(json_txt, encoding="utf-8")

    def import_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Load arbitrary data into the Dataset object from a json file.

        The DataFrame is never overwritten, and the column lists are recomputed
        from the wrapped DataFrame afterwards so that lists stored in the file
        cannot get out of sync with the actual columns.

        :param file: Path of the json file to read.
        :param verbose: Also print the loaded data.
        :param kwargs: Passed on to `json.load`.
        """
        rich_print(f":file_folder: Importing metadata from {file} :file_folder:")
        with open(file, encoding="utf-8") as json_file:
            json_data = json.load(json_file, **kwargs)
        if verbose:
            rich_print(json_data)
        # Make sure there is no overwrite on DataFrame
        json_data.pop('dataf', None)
        self.__dict__.update(json_data)
        # The file describes the columns of whatever DataFrame was exported;
        # derive the lists again for the DataFrame held by this object.
        self._set_column_groups()

    def get_column_selection(self, cols: Union[str, list]) -> pd.DataFrame:
        """ Return DataFrame given a single column name or a list of column names. """
        return self.dataf.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self) -> pd.DataFrame:
        """ All columns listed in `features`. """
        return self.dataf.loc[:, self.features]

    @property
    def get_target_data(self) -> pd.DataFrame:
        """ All columns listed in `targets`. """
        return self.dataf.loc[:, self.targets]

    @property
    def get_single_target_data(self) -> pd.DataFrame:
        """ Only the column named exactly 'target', as a one-column DataFrame. """
        return self.dataf.loc[:, ['target']]

    @property
    def get_prediction_data(self) -> pd.DataFrame:
        """ All columns listed in `predictions`. """
        return self.dataf.loc[:, self.predictions]

    @property
    def get_aux_data(self) -> pd.DataFrame:
        """ All columns that are not features, targets or predictions. """
        return self.dataf.loc[:, self.aux_columns]

    def __repr__(self) -> str:
        # Shape and column names are both read from the live DataFrame so the
        # two always agree, even when columns were added after construction.
        return f"Dataset of shape {self.dataf.shape}. Columns: {list(self.dataf.columns)}"

    def __str__(self) -> str:
        return self.__repr__()

<IPython.core.display.Javascript object>

### Tests

In [None]:
# Synthetic DataFrame for the tests: ten uniform feature columns plus
# a unique id, a normally distributed target and an era counter.
test_features = ["feature_" + letter for letter in "ABCDEFGHIK"]
id_col = [uuid.uuid4().hex for _ in range(100)]

df = pd.DataFrame(
    data=np.random.uniform(size=(100, 10)),
    columns=test_features,
).assign(
    id=id_col,
    target=np.random.normal(size=100),
    era=range(100),
)

<IPython.core.display.Javascript object>

In [None]:
# Arbitrary metadata; every key becomes an attribute on the Dataset.
metadata = dict(
    version=2,
    additional_info="test_model",
    multi_target=False,
    tournament_type="classic",
)
dataset = Dataset(df, metadata)

<IPython.core.display.Javascript object>

In [None]:
dataset.dataf.head(2)

Unnamed: 0,feature_A,feature_B,feature_C,feature_D,feature_E,feature_F,feature_G,feature_H,feature_I,feature_K,id,target,era
0,0.272877,0.307079,0.259438,0.131335,0.648017,0.435034,0.166647,0.276702,0.397706,0.310357,d20e20bc33054781987c8f4f3b5e9000,-2.594008,0
1,0.292326,0.809736,0.53878,0.127201,0.941278,0.420972,0.371896,0.169763,0.267573,0.010176,1e8243a1b8f54d54a62e1bd414633467,0.397229,1


<IPython.core.display.Javascript object>

In [None]:
dataset.get_feature_data.head(2)

Unnamed: 0,feature_A,feature_B,feature_C,feature_D,feature_E,feature_F,feature_G,feature_H,feature_I,feature_K
0,0.272877,0.307079,0.259438,0.131335,0.648017,0.435034,0.166647,0.276702,0.397706,0.310357
1,0.292326,0.809736,0.53878,0.127201,0.941278,0.420972,0.371896,0.169763,0.267573,0.010176


<IPython.core.display.Javascript object>

In [None]:
dataset.aux_columns

['id', 'era']

<IPython.core.display.Javascript object>

In [None]:
dataset.get_aux_data.head(2)

Unnamed: 0,id,era
0,d20e20bc33054781987c8f4f3b5e9000,0
1,1e8243a1b8f54d54a62e1bd414633467,1


<IPython.core.display.Javascript object>

In [None]:
# Metadata passed to the constructor is exposed as plain attributes.
assert dataset.version == 2
# Identity check: the value must be the boolean False, not merely something falsy-equal like 0.
assert dataset.multi_target is False

<IPython.core.display.Javascript object>

In [None]:
dataset.export_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [None]:
dataset.import_json_metadata("config.json")

<IPython.core.display.Javascript object>

In [None]:
# Show the stored attributes without dumping the full DataFrame into the output.
{key: value for key, value in dataset.__dict__.items() if key != "dataf"}

{'dataf':     feature_A  feature_B  feature_C  feature_D  feature_E  feature_F  \
 0    0.272877   0.307079   0.259438   0.131335   0.648017   0.435034   
 1    0.292326   0.809736   0.538780   0.127201   0.941278   0.420972   
 2    0.115231   0.570173   0.705241   0.474374   0.927222   0.543154   
 3    0.400020   0.153829   0.536381   0.331764   0.415466   0.159677   
 4    0.794389   0.668113   0.172011   0.682105   0.808155   0.468832   
 ..        ...        ...        ...        ...        ...        ...   
 95   0.967203   0.514418   0.940342   0.769946   0.417536   0.552767   
 96   0.294590   0.777028   0.751917   0.390948   0.549639   0.519838   
 97   0.355617   0.230165   0.345002   0.619607   0.623383   0.080130   
 98   0.664745   0.328989   0.080115   0.019733   0.537722   0.210853   
 99   0.125461   0.942354   0.174399   0.133732   0.010703   0.780507   
 
     feature_G  feature_H  feature_I  feature_K  \
 0    0.166647   0.276702   0.397706   0.310357   
 1    0.371

<IPython.core.display.Javascript object>

In [None]:
# The copy must hold the same values but be a separate object;
# the equality check alone would also pass if no copy were made.
dataf2 = dataset.copy_dataframe()
assert dataf2.equals(dataset.dataf)
assert dataf2 is not dataset.dataf

<IPython.core.display.Javascript object>

In [None]:
dataset.get_single_target_data.head(2)

Unnamed: 0,target
0,-2.594008
1,0.397229


<IPython.core.display.Javascript object>

In [None]:
# Append a prediction column in place, then rebuild a Dataset around the same
# DataFrame, handing over the existing attributes as metadata.
dataset.dataf.loc[:, "prediction_test_1"] = np.random.uniform(size=dataset.dataf.shape[0])
new_dataset = Dataset(dataset.dataf, dataset.__dict__)
assert new_dataset.version == 2
assert new_dataset.predictions == ["prediction_test_1"]

<IPython.core.display.Javascript object>

In [None]:
new_dataset.get_column_selection("id").head(2)

Unnamed: 0,id
0,d20e20bc33054781987c8f4f3b5e9000
1,1e8243a1b8f54d54a62e1bd414633467


<IPython.core.display.Javascript object>

In [None]:
new_dataset.get_column_selection(["id", "prediction_test_1"]).head(2)

Unnamed: 0,id,prediction_test_1
0,d20e20bc33054781987c8f4f3b5e9000,0.201969
1,1e8243a1b8f54d54a62e1bd414633467,0.878643


<IPython.core.display.Javascript object>

In [None]:
str(dataset)

"Dataset of shape (100, 14). Columns: ['feature_A', 'feature_B', 'feature_C', 'feature_D', 'feature_E', 'feature_F', 'feature_G', 'feature_H', 'feature_I', 'feature_K', 'id', 'target', 'era']"

<IPython.core.display.Javascript object>

-----------------------------------------------

In [None]:
#hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script; notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04a_model.ipynb.
Converted 04b_modelpipeline.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_prediction_dataset.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>