# Introduction

## General setup

Import the general-purpose libraries and set up the SageMaker environment.

In [20]:
import os
import pandas as pd
import numpy as np

from pandas import DataFrame

In [44]:
# We'll use typing to help our logic
import typing
from typing import List, Tuple, Optional

In [11]:
from boto3.session import Session as BotoSession
import sagemaker
from sagemaker.session import Session

# AWS credentials are taken from the environment; a variable that is not set
# is passed through as None.
boto_session = BotoSession(
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)
session = Session(boto_session=boto_session)
region = boto_session.region_name

# NOTE(review): this value does not look like a full IAM role ARN - confirm it is
# what the estimators below expect.
role = 'rolesAmazonSageMaker-ExecutionRole-20190703T193673'

# S3 location used by the upload step: <bucket>/<bucket_folder>/explore
bucket = 'sagemaker-eu-west-1-729071960169'
bucket_folder = "numerai"
prefix = f"{bucket_folder}/explore"

## Data

The second step is formatting the data: splitting it into several batches, and then splitting each batch into training, validation and test sets.

In [17]:
# Both CSV files are read with the "id" column as the dataframe index.
training_data = pd.read_csv(
    "data/numerai_training_data.csv",
    index_col="id",
)
tournament = pd.read_csv(
    "data/numerai_tournament_data.csv",
    index_col="id",
)

In [27]:
def create_batches(df: DataFrame, number_of_batches: int, random_state: Optional[int] = None) -> List[DataFrame]:
    """
    Shuffle the dataframe and split it into consecutive, near-equal batches.

    Arguments:
        df: The dataframe to split.
        number_of_batches: The number of batches; must be at least 1.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. With the default ``None`` the shuffle
            is not seeded.

    Returns:
        A list of ``number_of_batches`` dataframes. Together they hold every
        row of ``df`` exactly once, and their sizes differ by at most one row.

    Raises:
        ValueError: If ``number_of_batches`` is smaller than 1.
    """
    if number_of_batches < 1:
        raise ValueError(f"number_of_batches must be at least 1, got {number_of_batches}")

    # Shuffle the df first so batch membership is actually random.
    # The batches must be sliced from this shuffled copy, not from `df`.
    shuffled = df.sample(frac=1, random_state=random_state)
    number_of_rows = shuffled.shape[0]

    # Boundary i sits at floor(i * rows / batches): the slices are contiguous,
    # non-overlapping and cover every row even when the division is not exact.
    boundaries = [
        (i * number_of_rows) // number_of_batches
        for i in range(number_of_batches + 1)
    ]
    return [
        shuffled.iloc[start_index:end_index]
        for start_index, end_index in zip(boundaries[:-1], boundaries[1:])
    ]

In [75]:
number_of_batches = 10
list_of_dataframes = create_batches(df=training_data, number_of_batches=number_of_batches)

# Sanity check: the batch sizes must add up to the full row count
assert sum(len(item) for item in list_of_dataframes) == len(training_data)

In [63]:
from sklearn.model_selection import train_test_split
def split_train_validation_test(df: DataFrame, validation_frac: float, test_frac: float) -> Tuple[DataFrame, Optional[DataFrame], Optional[DataFrame]]:
    """
    Partition a dataframe into train, validation and test subsets.

    Both fractions must be greater than or equal to 0 and their sum must lie
    strictly between 0 and 1; otherwise an AssertionError is raised.

    Arguments:
        df: The dataframe to partition
        validation_frac: The fraction of rows for the validation set
        test_frac: The fraction of rows for the test set

    Returns:
        A tuple of 3 items: (train, validation, test).
        If validation_frac or test_frac was 0, that entry is None.
    """
    holdout_frac = validation_frac + test_frac
    assert 0 < holdout_frac < 1
    assert validation_frac >= 0 and test_frac >= 0

    # First cut: separate the combined validation+test holdout from the train rows
    train, holdout = train_test_split(df, test_size=holdout_frac)

    # Share of the holdout that goes to validation (1 means "all of it")
    if test_frac != 0:
        validation_share = validation_frac / holdout_frac
    else:
        validation_share = 1

    if validation_share == 1:
        # No test set requested: the whole holdout is the validation set
        return (train, holdout, None)
    if validation_share == 0:
        # No validation set requested: the whole holdout is the test set
        return (train, None, holdout)

    # Second cut: divide the holdout between validation and test
    validation, test = train_test_split(holdout, train_size=validation_share)
    return (train, validation, test)

In [76]:
# Break every batch into its train / validation / test partitions
list_of_split_dataframes = []
for i, batch in enumerate(list_of_dataframes):
    train, validation, test = split_train_validation_test(batch, 0.2, 0.2)
    list_of_split_dataframes.append(dict(train=train, validation=validation, test=test))

# list_of_dataframes now holds one dict per batch; the scratch name is cleared
list_of_dataframes, list_of_split_dataframes = list_of_split_dataframes, None

(10037, 313)

In [77]:
# Separate each partition into an output column (Y_*) and feature columns (X_*)
list_of_split_dataframes = []
for batch in list_of_dataframes:
    temp_dict = {}
    for data_type, dataframe in batch.items():
        if dataframe is None:
            # Partitions that were not created are left out of the result
            continue
        # Y is the last column; X is everything from position 2 up to,
        # but not including, that last column
        y_data, x_data = dataframe.iloc[:, -1], dataframe.iloc[:, 2: -1]
        temp_dict[f"Y_{data_type}"] = y_data
        temp_dict[f"X_{data_type}"] = x_data
    list_of_split_dataframes.append(temp_dict)

# list_of_dataframes now holds the X/Y dicts; the scratch name is cleared
list_of_dataframes, list_of_split_dataframes = list_of_split_dataframes, None

In [80]:
# Write every dataframe to a local CSV and push it to S3
list_of_s3_names = []
for i, batch in enumerate(list_of_dataframes):
    temp_dict = {}
    for data_type, dataframe in batch.items():
        if dataframe is None:
            # Nothing to upload for this entry; keep the key with a None value
            temp_dict[data_type] = None
            continue
        path_name = os.path.join("data", f"batch_{i}_{data_type}.csv")
        # Written without index and without header row
        dataframe.to_csv(path_name, index=False, header=False)
        temp_dict[data_type] = session.upload_data(path_name, bucket=bucket, key_prefix=prefix)
    list_of_s3_names.append(temp_dict)

# list_of_dataframes now maps each key to the value returned by upload_data
list_of_dataframes, list_of_s3_names = list_of_s3_names, None

In [81]:
# Display the per-batch mapping; after the upload cell the values are whatever
# session.upload_data returned, not dataframes.
list_of_dataframes

[{'Y_train': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_0_Y_train.csv',
  'X_train': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_0_X_train.csv',
  'Y_validation': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_0_Y_validation.csv',
  'X_validation': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_0_X_validation.csv',
  'Y_test': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_0_Y_test.csv',
  'X_test': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_0_X_test.csv'},
 {'Y_train': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_1_Y_train.csv',
  'X_train': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_1_X_train.csv',
  'Y_validation': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_1_Y_validation.csv',
  'X_validation': 's3://sagemaker-eu-west-1-729071960169/numerai\\explore/batch_1_X_validation.csv',
  'Y_test': 's3://sagemaker-eu-west-1-729071960169/nu

## Models 

The third step is setting up the models. We define the general parameters shared by all models, as well as each model we want to test.

In [14]:
# Estimators are appended to this list as they are defined
models = []

# Set some defaults so we don't need to type this every time
training_instance_type = "ml.m4.xlarge"
training_instance_count = 1

# Keyword arguments shared by every estimator constructor below
model_kwargs = dict(
    role=role,
    train_instance_count=training_instance_count,
    train_instance_type=training_instance_type,
    sagemaker_session=session,
)

In [19]:
# Linear learner estimator, configured as a regressor.
# output_path has to be an S3 URI. os.path.join(prefix, ...) produced a relative
# path without the bucket (and with backslashes on Windows), so the URI is
# built explicitly from bucket and prefix instead.
linear_learner = sagemaker.LinearLearner(
    predictor_type="regressor",  # Also try multiclass?
    epochs=10,
    optimizer="auto",  # Can also choose sgd, adam or rmsprop
    loss="auto",  # Can also choose ‘logistic’, ‘squared_loss’, ‘absolute_loss’, ‘hinge_loss’, ‘eps_insensitive_squared_loss’, ‘eps_insensitive_absolute_loss’, ‘quantile_loss’, ‘huber_loss’
    learning_rate=None,
    output_path=f"s3://{bucket}/{prefix}/linear_learner",
    **model_kwargs,
)
models.append(linear_learner)
# For multiclass there is also accuracy_top_k, f_beta and balance_multiclass_weights

In [20]:
# Empty placeholder dict; nothing in the visible cells reads it yet.
# NOTE(review): presumably intended for linear learner tuning settings - confirm or remove.
linear_learner_hyperparameters = {}

In [16]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
# XGBoost estimator built from the container image for the session's region.
xgboost_container = get_image_uri(region, 'xgboost')
xgboost = Estimator(
    xgboost_container,
    # output_path has to be an S3 URI; os.path.join(prefix, ...) gave a relative
    # path without the bucket (and with backslashes on Windows).
    output_path=f"s3://{bucket}/{prefix}/xgboost",
    # Hyperparameter values are passed as strings
    hyperparameters={
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "silent": "0",
        "objective": "reg:linear",
        "num_round": "50",
    },  # this comma was missing, which made the next line parse as `dict ** dict`
    **model_kwargs,
)
# Register the estimator in the shared list, as is done for linear_learner
models.append(xgboost)

## Training

# Scoring