# Introduction

## General setup

Import some general stuff, set up the sagemaker environment

In [1]:
import os
import pandas as pd
import numpy as np

from pandas import DataFrame

In [2]:
# We'll use typing to help our logic
import typing
from typing import List, Tuple, Optional

In [3]:
from boto3.session import Session as BotoSession
import sagemaker
from sagemaker.session import Session
from dotenv import load_dotenv

load_dotenv()
boto_session = BotoSession(
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"), 
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    profile_name="numerai",
    region_name="eu-west-1",
)
region = boto_session.region_name
session = Session(boto_session=boto_session)
role = 'arn:aws:iam::729071960169:role/service-role/AmazonSageMaker-ExecutionRole-20190703T193673'
bucket = 'sagemaker-eu-west-1-729071960169'
bucket_folder = "numerai"
prefix = "{}/explore".format(bucket_folder)

## Data

Second step is formatting the data, splitting it in several batches and splitting it for training, validation and testing

In [4]:
training_data = pd.read_csv("data/numerai_training_data.csv", index_col="id")
tournament = pd.read_csv("data/numerai_tournament_data.csv", index_col="id")

In [5]:
def create_batches(df: DataFrame, number_of_batches: int) -> DataFrame:
    """
    This method splits the dataframe into different batches.
    Arguments:
        df: The dataframe to split.
        number_of_batches: The number of batches
    """
    # Suffle the df first so it's actually random
    intermediate = df.sample(frac=1)
    number_of_rows = df.shape[0]
    list_of_dataframes = []
    
    for i in range(number_of_batches):
        start_index = (i * number_of_rows) // number_of_batches
        end_index = ((i + 1) * number_of_rows) // number_of_batches
        list_of_dataframes.append(df.iloc[start_index:end_index])
        
    return list_of_dataframes

In [6]:
number_of_batches = 10
list_of_dataframes = create_batches(training_data, number_of_batches)

# To be sure we have all items
assert sum([item.shape[0] for item in list_of_dataframes]) == training_data.shape[0]

In [7]:
from sklearn.model_selection import train_test_split
def split_train_validation_test(df: DataFrame, validation_frac: float, test_frac: float) -> Tuple[DataFrame, Optional[DataFrame], Optional[DataFrame]]:
    """
    Splits the dataframe in a train, validation and test dataframe, based on the parameters.
    validation_frac and test_frac must be larger or equal to 0, the sum must be between 0 and 1, not inclusive
    
    Arguments:
        df: The dataframe to split
        validation_frac: The fraction for the validation set
        test_frac: The fraction for the test test
        
    Returns:
        A tuple of 3 dataframes: Train, Validation, Test.
        If validation_frac or test_frac was 0, that dataframe will be None
    """
    assert 0 < validation_frac + test_frac < 1
    assert validation_frac >= 0 and test_frac >= 0
    test_val_frac = validation_frac + test_frac
    train, validation = train_test_split(df, test_size=test_val_frac, random_state=512) #Set the random state so we can get consistent results
    validation_over_test_frac = validation_frac / test_val_frac if test_frac != 0 else 1
    if validation_over_test_frac == 1:
        test = None
    elif validation_over_test_frac == 0:
        test = validation
        validation = None
    else:
        validation, test = train_test_split(validation, train_size=validation_over_test_frac, random_state=512) #Set the random state so we can get consistent results
    
    return (train, validation, test)

In [8]:
# Split the datasets into train, validation and test
list_of_split_dataframes = []
for i, batch in enumerate(list_of_dataframes):
    (train, validation, test) = split_train_validation_test(batch, 0.2, 0.2)
    list_of_split_dataframes.append({
        "train": train,
        "validation": validation,
        "test": test,
    })
list_of_dataframes = list_of_split_dataframes
list_of_split_dataframes = None

In [9]:
# Split dataframe into features and output
list_of_split_dataframes = []
for batch in list_of_dataframes:
    temp_dict = {}
    for data_type, dataframe in batch.items():
        if dataframe is not None:
            y_data = dataframe.iloc[:, -1].astype('float32')
            temp_dict[f"Y_{data_type}"] = y_data
            x_data = dataframe.iloc[:, 2: -1].astype('float32')
            temp_dict[f"X_{data_type}"] = x_data
    list_of_split_dataframes.append(temp_dict)
list_of_dataframes = list_of_split_dataframes
list_of_split_dataframes = None

In [15]:
# Upload everything to S3
list_of_s3_names = []
for i, batch in enumerate(list_of_dataframes):
    temp_dict = {}
    for data_type, dataframe in batch.items():
        if dataframe is not None:
            path_name = f"batch_{i}_{data_type}.csv"
            path_name = os.path.join("data", path_name)
            dataframe.to_csv(path_name, index=False, header=False)
            temp_dict[data_type] = session.upload_data(path_name, bucket=bucket, key_prefix=f"{prefix}/data")
        else:
            temp_dict[data_type] = None
    list_of_s3_names.append(temp_dict)

## Models 

The third step is setting up the models. We define general parameters as well as each model we want to test

In [16]:
models = []
# Set some defaults so we don't need to type this every time
training_instance_count = 1
training_instance_type = "ml.m4.xlarge"

model_kwargs = {
    "role": role,
    "train_instance_count": training_instance_count,
    "train_instance_type": training_instance_type,
    "sagemaker_session": session,
}

In [None]:
linear_learner = sagemaker.LinearLearner(
    predictor_type="regressor", #Also try multiclass?
    epochs=10,
    optimizer="auto", #Can also choose sgd, adam or rmsprop
    loss="auto", #Can also choose ‘logistic’, ‘squared_loss’, ‘absolute_loss’, ‘hinge_loss’, ‘eps_insensitive_squared_loss’, ‘eps_insensitive_absolute_loss’, ‘quantile_loss’, ‘huber_loss’ 
    learning_rate=None,
    output_path=f"s3://{bucket}/{prefix}/linear_learner/",
    data_location=f"s3://{bucket}/{prefix}/linear_learner/",
    **model_kwargs,
)
models.append(linear_learner)
# For multiclass there is also accuracy_top_k, f_beta and balance_multiclass_weights 

In [None]:
linear_learner_hyperparameters = {}

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
xgboost_container = get_image_uri(region, 'xgboost')
xgboost = Estimator(
    xgboost_container,
    output_path=f"s3://{bucket}/{prefix}/xgboost",
    hyperparameters={
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "silent":"0",
        "objective":"reg:linear",
        "num_round":"50",
    },
    **model_kwargs
)

## Training

In [None]:
first_batch = list_of_dataframes[0]
train = xgboost.record_set(first_batch["X_train"].astype('float32').values, labels=first_batch["Y_train"].astype('float32').values, channel="train")
validation = xgboost.record_set(first_batch["X_validation"].astype('float32').values, labels=first_batch["Y_validation"].astype('float32').values, channel="validation")

In [None]:
linear_learner.fit(train)

In [None]:
# Upload everything to S3
second_batch = list_of_dataframes[1]
train_location = pd.concat([second_batch["Y_train"], second_batch["X_train"]], axis=1)
local_location = "data/batch_1_train.csv"
train_location.to_csv(local_location, index=False, header=False)
train_location = session.upload_data(local_location, bucket=bucket, key_prefix=f"{prefix}/data")
train_location = sagemaker.s3_input(train_location, content_type="text/csv")

In [None]:
validation_location = pd.concat([second_batch["Y_validation"], second_batch["X_validation"]], axis=1)
local_location = "data/batch_1_validation.csv"
validation_location.to_csv(local_location, index=False, header=False)
validation_location = session.upload_data(local_location, bucket=bucket, key_prefix=f"{prefix}/data")
validation_location = sagemaker.s3_input(validation_location, content_type="text.csv")

In [None]:
xgboost.fit({"train": train_location, "validation": validation_location})

## Loading existing models

In [26]:
linear_learner_training_job_name = "linear-learner-2019-08-13-19-33-40-060"
linear_learner = Estimator.attach(linear_learner_training_job_name)

2019-08-13 19:36:57 Starting - Preparing the instances for training
2019-08-13 19:36:57 Downloading - Downloading input data
2019-08-13 19:36:57 Training - Training image download completed. Training in progress.
2019-08-13 19:36:57 Uploading - Uploading generated training model
2019-08-13 19:36:57 Completed - Training job completed[31mDocker entrypoint called with argument(s): train[0m
[31m[08/13/2019 19:36:42 INFO 140051453298496] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'0.01', u'lr_scheduler_minimum_lr': u'aut

In [27]:
xgboost_training_job_name = "xgboost-2019-08-13-20-12-48-123"
xgboost = Estimator.attach(xgboost_training_job_name)

2019-08-13 20:15:59 Starting - Preparing the instances for training
2019-08-13 20:15:59 Downloading - Downloading input data
2019-08-13 20:15:59 Training - Training image download completed. Training in progress.
2019-08-13 20:15:59 Uploading - Uploading generated training model
2019-08-13 20:15:59 Completed - Training job completed[31mArguments: train[0m
[31m[2019-08-13:20:15:37:INFO] Running standalone xgboost training.[0m
[31m[2019-08-13:20:15:37:INFO] File size need to be processed in the node: 52.0mb. Available memory size in the node: 8451.02mb[0m
[31m[2019-08-13:20:15:37:INFO] Determined delimiter of CSV input is ','[0m
[31m[20:15:37] S3DistributionType set as FullyReplicated[0m
[31m[20:15:37] 30108x310 matrix with 9333480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-08-13:20:15:37:INFO] Determined delimiter of CSV input is ','[0m
[31m[20:15:37] S3DistributionType set as FullyReplicated[0m
[31m[20:15:38] 10036x3

# Scoring

In [28]:
transformer_kwargs = {
    "instance_count": 1,
    "instance_type": "ml.c4.xlarge",
    "role": role,
}

In [29]:
linear_learner_transformer = linear_learner.transformer(
    output_path=f"s3://{bucket}/{prefix}/linear_learner/",
    **transformer_kwargs,
)



In [31]:
test_path = "data/batch_0_X_test.csv"
first_batch = list_of_dataframes[0]
first_batch["X_test"].iloc[:5000].to_csv(test_path, index=False, header=False)
test_data = session.upload_data(
    test_path,
    bucket=bucket,
    key_prefix=prefix,
)

In [None]:
linear_learner_transformer.transform(
    test_data, 
    content_type="text/csv", 
    split_type="Line"
)
linear_learner_transformer.wait()

In [36]:
ll_predictions = pd.read_csv('data/2019-08-15/linear_learner.csv', header=None)
for _, row in ll_predictions.iterrows():
    try:
        row[0] = row[0].replace('{"score":', '').replace("}", "")
    except IndexError:
        pass
ll_predictions = ll_predictions.astype('float32')

In [37]:
labels = pd.DataFrame(first_batch["Y_test"].iloc[:5000])

In [41]:
# Source: https://docs.google.com/document/d/1HvSw7VQZYUGPGYoI-y3s7P5LDtV6tNlIZd-4xwuyF58/edit#
def score_correlation(labels, prediction):
    ranked_prediction = prediction.rank(pct=True, method="first")
    return np.corrcoef(labels, ranked_prediction, rowvar=False)[0, 1]

In [None]:
score_correlation(labels, ll_predictions)

In [33]:
xgboost_transformer = xgboost.transformer(
    output_path=f"s3://{bucket}/{prefix}/xgboost/",
    **transformer_kwargs,
)



In [None]:
test_path = "data/batch_1_X_test.csv"
first_batch["X_test"].iloc[:5000].to_csv(test_path, index=False, header=False)
test_data = session.upload_data(
    test_path,
    bucket=bucket,
    key_prefix=prefix,
)

In [34]:
xgboost_transformer.transform(
    test_data, 
    content_type="text/csv", 
    split_type="Line"
)
xgboost_transformer.wait()

......................................!


In [39]:
xgb_predictions = pd.read_csv('data/2019-08-15/xgboost.csv', header=None)
xgb_predictions = xgb_predictions.astype('float32')

In [None]:
labels = pd.DataFrame(first_batch["Y_test"].iloc[:5000])

In [40]:
score_correlation(labels, predictions)

NameError: name 'score_correlation' is not defined

In [43]:
score_correlation(labels, ll_predictions)

0.023882117124156003

In [42]:
score_correlation(labels, xgb_predictions)

-0.008299822455109549

In [45]:
for i in range(11):
    new_predictions = (i/10) * ll_predictions +  (10-i)/10 * xgb_predictions
    print(f"{i} has corr score {score_correlation(labels, new_predictions)}")

0 has corr score -0.008299822455109549
1 has corr score 0.015451419313628829
2 has corr score 0.015452108909028086
3 has corr score 0.016332722233889364
4 has corr score 0.01936654793614102
5 has corr score 0.022465195605923586
6 has corr score 0.024592301871770365
7 has corr score 0.02515215482234501
8 has corr score 0.025047139294399803
9 has corr score 0.024442856697386817
10 has corr score 0.023882117124156003


### Combine models

In [50]:
for column in training_data.columns:
    print(column)

era
data_type
feature_intelligence1
feature_intelligence2
feature_intelligence3
feature_intelligence4
feature_intelligence5
feature_intelligence6
feature_intelligence7
feature_intelligence8
feature_intelligence9
feature_intelligence10
feature_intelligence11
feature_intelligence12
feature_charisma1
feature_charisma2
feature_charisma3
feature_charisma4
feature_charisma5
feature_charisma6
feature_charisma7
feature_charisma8
feature_charisma9
feature_charisma10
feature_charisma11
feature_charisma12
feature_charisma13
feature_charisma14
feature_charisma15
feature_charisma16
feature_charisma17
feature_charisma18
feature_charisma19
feature_charisma20
feature_charisma21
feature_charisma22
feature_charisma23
feature_charisma24
feature_charisma25
feature_charisma26
feature_charisma27
feature_charisma28
feature_charisma29
feature_charisma30
feature_charisma31
feature_charisma32
feature_charisma33
feature_charisma34
feature_charisma35
feature_charisma36
feature_charisma37
feature_charisma38
featur