In [None]:
#hide
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
#default_exp download

<IPython.core.display.Javascript object>

# Download

> Downloaders for Numerai Classic and Numerai Signals.

In [None]:
#hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import os
import json
import shutil
from numerapi import NumerAPI, SignalsAPI
from pathlib import Path, PosixPath
from abc import ABC, abstractmethod
from rich.tree import Tree
from rich.console import Console
from rich import print as rich_print
from transparentpath import TransparentPath as GCSPath

<IPython.core.display.Javascript object>

## 1. Base

`BaseDownloader` is an object which implements logic common to all downloaders.

To implement a new Downloader, inherit from `BaseDownloader` and implement at least the methods `download_training_data` and `download_inference_data`.

In [None]:
#export
class BaseDownloader(ABC):
    """
    Abstract base class for downloaders.

    The base directory is created on construction if it does not exist yet.
    Subclasses must implement `download_training_data` and `download_inference_data`.

    :param directory_path: Base folder to download files to.
    """
    def __init__(self, directory_path: str):
        self.dir = Path(directory_path)
        self._create_directory()

    @abstractmethod
    def download_training_data(self, *args, **kwargs):
        """ Download all necessary files needed for training. """
        raise NotImplementedError(f"No method for downloading training data is implemented in '{self.__class__.__name__}'")

    @abstractmethod
    def download_inference_data(self, *args, **kwargs):
        """ Download minimal amount of files needed for weekly inference. """
        raise NotImplementedError(f"No method for downloading inference data is implemented in '{self.__class__.__name__}'.")

    def remove_base_directory(self):
        """ Remove download directory with all contents. """
        abs_path = self.dir.resolve()
        # Closing quote sits inside the [red] markup so the whole quoted class name is coloured.
        rich_print(f":warning: [red]Deleting directory for '{self.__class__.__name__}'[/red] :warning:\nPath: '{abs_path}'")
        shutil.rmtree(abs_path)

    def configure_gcs_path(self, bucket_name: str):
        """
        Connect to Google Cloud Storage (GCS) bucket.
        :param bucket_name: Valid GCS bucket that you have access to.

        Credentials are detected automatically with the following process:
        1. The environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set and points to a valid `.json` file.
        2. You have a valid Cloud SDK installation. In that case you might see the warning : UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. It is up to you to decide what to do with it.
        3. The machine running the code is itself a GCP machine.
        """
        GCSPath.set_global_fs("gcs", bucket=bucket_name)
        # From here on `self.dir` is a GCSPath instead of a local `pathlib.Path`.
        self.dir = GCSPath(self.dir)
        self._create_directory()
        rich_print(f":cloud: Path {self.dir} configured for Google Cloud Storage. :cloud:")

    def _append_folder(self, folder: str) -> Path:
        """
        Return base directory appended with 'folder'.
        Create directory if it does not exist.

        :param folder: Relative folder name. An empty string yields the base directory itself.
        :return: Path of the (now existing) folder, of the same path type as `self.dir`.
        """
        # Join directly on `self.dir` so the result keeps whatever path type the base
        # directory currently has (it is replaced by a GCSPath in `configure_gcs_path`)
        # instead of being coerced back into a local `pathlib.Path`.
        folder_path = self.dir / folder
        folder_path.mkdir(parents=True, exist_ok=True)
        return folder_path

    @staticmethod
    def _load_json(file_path: str, verbose = False, *args, **kwargs) -> dict:
        """
        Load JSON from file and return as dictionary.

        :param file_path: Path of the JSON file to read.
        :param verbose: Print the loaded data when True.
        *args, **kwargs will be passed to `json.load`.
        """
        with open(file_path) as json_file:
            json_data = json.load(json_file, *args, **kwargs)
        if verbose:
            rich_print(json_data)
        return json_data

    def _create_directory(self):
        """ Create base directory if it does not exist. """
        if not self.dir.is_dir():
            rich_print(f"No existing directory found at '[blue]{self.dir}[/blue]'. Creating directory...")
            self.dir.mkdir(parents=True, exist_ok=True)

    @property
    def get_all_files(self) -> list:
        """ Return all contents in directory. """
        return list(self.dir.iterdir())

    @property
    def is_empty(self) -> bool:
        """ Check if directory is empty."""
        return not self.get_all_files

    def __call__(self, *args, **kwargs):
        """
        The most common use case will be to get weekly inference data. So calling the class itself returns inference data.

        All *args, **kwargs are forwarded to `download_inference_data` and its result is returned.
        """
        return self.download_inference_data(*args, **kwargs)

<IPython.core.display.Javascript object>

## 2. Numerai Classic

In [None]:
#export
class NumeraiClassicDownloader(BaseDownloader):
    """
    Downloading from NumerAPI for Numerai Classic data

    :param directory_path: Base folder to download files to.
    All *args, **kwargs will be passed to NumerAPI initialization.
    """
    def __init__(self, directory_path: str, *args, **kwargs):
        super(NumeraiClassicDownloader, self).__init__(directory_path=directory_path)
        self.napi = NumerAPI(*args, **kwargs)
        # Fetched once at construction; used in the progress message of inference downloads.
        self.current_round = self.napi.get_current_round()
        # NumerAPI filenames corresponding to version, class and data type
        self.version_mapping = {1: {"train":
                                        {"int8": ['numerai_training_data_int8.csv', 'numerai_validation_data_int8.csv'],
                                         "float": ['numerai_training_data.csv', 'numerai_validation_data.csv']},
                                    "inference": {"int8": ['numerai_tournament_data_int8.csv'],
                                                  "float": ['numerai_tournament_data.csv']},
                                    "example": ['example_predictions.csv', 'example_validation_predictions.csv']},
                                2: {"train":
                                        {"int8": ['numerai_training_data_int8.parquet', 'numerai_validation_data_int8.parquet'],
                                         "float": ['numerai_training_data.parquet', 'numerai_validation_data.parquet']},
                                    "inference": {"int8": ['numerai_tournament_data_int8.parquet'],
                                                  "float": ['numerai_tournament_data.parquet']},
                                    "example": ['example_predictions.parquet', 'example_validation_predictions.parquet']}
                                }

    def download_training_data(self, subfolder: str = "", version: int = 2, int8: bool = False):
        """
        Get Numerai classic training and validation data.
        :param subfolder: Specify folder to create folder within directory root. Saves in directory root by default.
        :param version: Numerai version (1=classic, 2=super massive dataset (parquet))
        :param int8: Integer version of data
        """
        data_type = "int8" if int8 else "float"
        train_val_files = self._get_version_mapping(version)['train'][data_type]
        self._download_files(train_val_files, subfolder=subfolder)

    def download_inference_data(self, subfolder: str = "", version: int = 2, int8: bool = False, round_num: int = None):
        """
        Get Numerai classic inference data.
        :param subfolder: Specify folder to create folder within directory root. Saves in directory root by default.
        :param version: Numerai version (1=classic, 2=super massive dataset (parquet))
        :param int8: Integer version of data
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        data_type = "int8" if int8 else "float"
        inference_files = self._get_version_mapping(version)['inference'][data_type]
        rich_print(f":file_folder: [green]Downloading inference data for round[/green] '{round_num if round_num else self.current_round}'.")
        self._download_files(inference_files, subfolder=subfolder, round_num=round_num)

    def download_single_dataset(self, filename: str, dest_path: str, round_num: int = None):
        """
        Download one of the available datasets through NumerAPI.

        :param filename: Name as listed in NumerAPI (Check NumerAPI().list_datasets())
        :param dest_path: Full path where file will be saved.
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        :raises AssertionError: If 'filename' is not listed by NumerAPI for the requested round.
        """
        # Validate against the datasets of the round that is actually being downloaded.
        available_datasets = self.napi.list_datasets(round_num=round_num)
        # Explicit raise (rather than an `assert` statement) so the check also runs under `python -O`;
        # the exception type is kept as AssertionError for existing callers.
        if filename not in available_datasets:
            raise AssertionError(f"Dataset '{filename}' not available in NumerAPI. Available datasets are {available_datasets}.")
        rich_print(f":file_folder: [green]Downloading[/green] '{filename}' :file_folder:")
        self.napi.download_dataset(filename=filename,
                                   dest_path=dest_path,
                                   round_num=round_num)

    def download_example_data(self, subfolder: str = "", version: int = 2, round_num: int = None):
        """
        Download all example prediction data in specified folder for given version.

        :param subfolder: Specify folder to create folder within directory root. Saves in directory root by default.
        :param version: Numerai version (1=classic, 2=super massive dataset (parquet))
        :param round_num: Numerai tournament round number. Downloads latest round by default.
        """
        example_files = self._get_version_mapping(version)['example']
        self._download_files(example_files, subfolder=subfolder, round_num=round_num)

    def get_classic_features(self, subfolder: str = "", *args, **kwargs) -> dict:
        """
        Download feature overview (stats and feature sets) through NumerAPI and load.
        :param subfolder: Specify folder to create folder within directory root. Saves in directory root by default.
        *args, **kwargs will be passed to the JSON loader.
        :return: Contents of 'features.json' as a dictionary.
        """
        folder_path = self._append_folder(subfolder)
        filename = "features.json"
        dest_path = str(folder_path.joinpath(filename))
        self.download_single_dataset(filename=filename,
                                     dest_path=dest_path)
        return self._load_json(dest_path, *args, **kwargs)

    def _download_files(self, filenames: list, subfolder: str, round_num: int = None):
        """ Download every file in 'filenames' into 'subfolder' of the base directory. """
        folder_path = self._append_folder(subfolder)
        for filename in filenames:
            self.download_single_dataset(filename=filename,
                                         dest_path=str(folder_path.joinpath(filename)),
                                         round_num=round_num)

    def _get_version_mapping(self, version: int) -> dict:
        """
        Check if version is supported and return file mapping for version.

        :raises NotImplementedError: If 'version' is not a key of `self.version_mapping`.
        """
        try:
            mapping_dictionary = self.version_mapping[version]
        except KeyError:
            raise NotImplementedError(f"Version '{version}' is not implemented. Available versions are {list(self.version_mapping.keys())}")
        return mapping_dictionary

<IPython.core.display.Javascript object>

### NumeraiClassicDownloader and BaseDownloader tests

In [None]:
#slow
test_dir_classic = "test_numclassic_general"
numer_classic_downloader = NumeraiClassicDownloader(test_dir_classic)

try:
    # Test building class
    assert isinstance(numer_classic_downloader.dir, PosixPath)
    assert numer_classic_downloader.dir.is_dir()

    # Test is_empty
    (numer_classic_downloader.dir / "test.txt").write_text("test")
    rich_print(f"Directory contents:\n{numer_classic_downloader.get_all_files}")
    assert not numer_classic_downloader.is_empty

    # Downloading example data
    numer_classic_downloader.download_example_data("test1/", version=1)
    numer_classic_downloader.download_example_data("test2/", version=2, round_num=290)

    # Features
    feature_stats_test = numer_classic_downloader.get_classic_features()
    assert isinstance(feature_stats_test, dict)
    assert len(feature_stats_test['feature_sets']['legacy']) == 304
finally:
    # Remove contents even when a download or assertion above fails,
    # so a broken run does not leave downloaded files behind.
    numer_classic_downloader.remove_base_directory()

assert not os.path.exists(test_dir_classic)

2022-01-06 13:07:00,854 INFO numerapi.utils: starting download
test_numclassic_general/test1/example_predictions.csv: 51.2MB [00:24, 2.13MB/s]                            


2022-01-06 13:07:26,689 INFO numerapi.utils: starting download
test_numclassic_general/test1/example_validation_predictions.csv: 19.6MB [00:04, 4.61MB/s]                            


2022-01-06 13:07:33,721 INFO numerapi.utils: starting download
test_numclassic_general/test2/example_predictions.parquet: 33.5MB [00:35, 954kB/s]                             


2022-01-06 13:08:11,208 INFO numerapi.utils: starting download
test_numclassic_general/test2/example_validation_predictions.parquet: 13.0MB [00:03, 4.26MB/s]                            


2022-01-06 13:08:15,869 INFO numerapi.utils: starting download
test_numclassic_general/features.json: 441kB [00:00, 615kB/s]                            


<IPython.core.display.Javascript object>

### Example usage

#### Training

In [None]:
#slow
# Set up a downloader rooted at the training example directory
train_base_directory = "test_numclassic_train"
numer_classic_downloader = NumeraiClassicDownloader(directory_path=train_base_directory)

# Uncomment the call below to also fetch the training and validation data.
# numer_classic_downloader.download_training_data(subfolder="train_val", version=2, int8=False)

# Fetch features.json into the base directory and load it
numer_classic_downloader.get_classic_features(subfolder="");

2022-01-06 13:08:19,130 INFO numerapi.utils: starting download
test_numclassic_train/features.json: 441kB [00:00, 608kB/s]                            


{'feature_stats': {'feature_dichasial_hammier_spawner': {'legacy_uniqueness': 0.1778140232616494,
   'spearman_corr_w_target_nomi_20_mean': -0.0006870366768374209,
   'spearman_corr_w_target_nomi_20_sharpe': -0.06827874939205324,
   'spearman_corr_w_target_nomi_20_reversals': 7.490937957833377e-05,
   'spearman_corr_w_target_nomi_20_autocorr': -0.021193429374978014,
   'spearman_corr_w_target_nomi_20_arl': 3.3248407643312103},
  'feature_rheumy_epistemic_prancer': {'legacy_uniqueness': 0.2413505596136356,
   'spearman_corr_w_target_nomi_20_mean': 0.00024874591501294137,
   'spearman_corr_w_target_nomi_20_sharpe': 0.02533555481334663,
   'spearman_corr_w_target_nomi_20_reversals': 8.392516152487414e-05,
   'spearman_corr_w_target_nomi_20_autocorr': 0.10173904070244294,
   'spearman_corr_w_target_nomi_20_arl': 3.2830188679245285},
  'feature_pert_performative_hormuz': {'legacy_uniqueness': 0.6590919790936269,
   'spearman_corr_w_target_nomi_20_mean': -0.0005876714655874945,
   'spearman_

<IPython.core.display.Javascript object>

In [None]:
#hide
# Remove the base directory of the training example and everything in it (cleanup for tests)
numer_classic_downloader.remove_base_directory()

<IPython.core.display.Javascript object>

__For the training example the directory structure will be:__

In [None]:
#hide_input
console = Console(record=True, width=100)

# Root node is the base directory; the children mirror the layout of the training example.
tree = Tree(f":file_folder: {train_base_directory} (base_directory)", guide_style="bold bright_black")
folder_tree = tree.add(":page_facing_up: features.json")
train_val_tree = tree.add(":file_folder: train_val")
for train_val_label in (':page_facing_up: numerai_training_data.parquet',
                        ':page_facing_up: numerai_validation_data.parquet'):
    train_val_tree.add(train_val_label)

console.print(tree)

<IPython.core.display.Javascript object>

#### Inference

In [None]:
#slow
# Set up a downloader rooted at the inference example directory
inference_base_directory = "test_numclassic_inference"
numer_classic_downloader = NumeraiClassicDownloader(directory_path=inference_base_directory)

# Fetch the tournament (inference) data into the 'inference' subfolder
numer_classic_downloader.download_inference_data(subfolder="inference", version=2, int8=False)

# Delete the base directory once inference is finished
numer_classic_downloader.remove_base_directory()

2022-01-06 13:08:21,980 INFO numerapi.utils: starting download
test_numclassic_inference/inference/numerai_tournament_data.parquet: 582MB [06:16, 1.55MB/s]                              


<IPython.core.display.Javascript object>

__For the inference example the directory structure will be:__

In [None]:
#hide_input
# Recording console with a fixed width of 100 characters
console = Console(record=True, width=100)

# Root node is the base directory; it holds a single 'inference' folder with the tournament file.
tree = Tree(f":file_folder: {inference_base_directory} (base_directory)", guide_style="bold bright_black")
inference_tree = tree.add(":file_folder: inference")
inference_tree.add(':page_facing_up: numerai_tournament_data.parquet')

# Render the directory tree as the cell output
console.print(tree)

<IPython.core.display.Javascript object>

## 3. Yahoo Finance

<IPython.core.display.Javascript object>

## 4. FinnHub

<IPython.core.display.Javascript object>

## 5. Bloomberg?

<IPython.core.display.Javascript object>

## 6. Custom Downloader

We invite the Numerai Community to implement new downloaders using interesting APIs. This is especially important for creating innovative Numerai Signals models.

A new Downloader can be created by inheriting from `BaseDownloader`. One should also implement `download_inference_data` and `download_training_data` so we have a common interface for downloading data.

In [None]:
class AwesomeCustomDownloader(BaseDownloader):
    """
    Skeleton for a custom downloader of awesome financial data.

    Both download methods are placeholders that do nothing yet.

    :param directory_path: Base folder to download files to.
    """
    def __init__(self, directory_path: str, *args, **kwargs):
        # Extra *args/**kwargs are accepted but not used by this skeleton.
        super().__init__(directory_path=directory_path)

    def download_training_data(self, *args, **kwargs):
        """ Placeholder: put training dataset downloading here. """
        pass

    def download_inference_data(self, *args, **kwargs):
        """ Placeholder: put (minimal) weekly inference downloading here. """
        pass

<IPython.core.display.Javascript object>

In [None]:
#hide
# Run this cell to export the notebooks to the library modules
from nbdev.export import notebook2script

notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04a_model.ipynb.
Converted 04b_modelpipeline.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_prediction_dataset.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>