In [None]:
## Load secrets from file
import os

import numerapi

FP_Secrets = 'Numerai.secrets'

if not os.path.exists(FP_Secrets):
    raise FileNotFoundError(f"'{FP_Secrets}' not found. Make sure the file exists.")

# Read API keys: the file is expected to hold one KEY=VALUE pair per line.
api_keys = {}
with open(FP_Secrets, 'r') as secrets_file:
    for line_number, raw_line in enumerate(secrets_file, start=1):
        entry = raw_line.strip()
        # Blank lines and '#' comment lines carry no key; skipping them avoids
        # an unpacking error on an otherwise valid file.
        if not entry or entry.startswith('#'):
            continue
        # Split on the FIRST '=' only, so values that themselves contain '=' survive intact.
        key, separator, value = entry.partition('=')
        if not separator:
            # Report the position only; never echo file contents, they may be secret.
            raise ValueError(f"Line {line_number} of '{FP_Secrets}' is not in KEY=VALUE format.")
        # Tolerate whitespace around the '=' (e.g. "PUBLIC_KEY = abc").
        api_keys[key.strip()] = value.strip()

# Set your Numerai API credentials
PUBLIC_KEY = api_keys.get('PUBLIC_KEY')
SECRET_KEY = api_keys.get('SECRET_KEY')

if not PUBLIC_KEY or not SECRET_KEY:
    # Name the file exactly as it is looked up above, so the message matches what is on disk.
    raise ValueError(f"API keys not found in the '{FP_Secrets}' file.")

# Authenticated client reused by the cells below.
napi = numerapi.NumerAPI(public_id=PUBLIC_KEY, secret_key=SECRET_KEY)


In [None]:
## Download the latest Numerai datasets
# Each remote "v4.1/<name>" file is saved locally under the bare <name>.
for dataset_file in (
    "train.parquet",
    "validation.parquet",
    "live.parquet",
    "live_example_preds.parquet",
    "validation_example_preds.parquet",
    "features.json",
    "meta_model.parquet",
):
    napi.download_dataset(f"v4.1/{dataset_file}", dataset_file)

# Challenge: How might you use the additional files like 'features.json' and 'meta_model.parquet' in your ML models?


In [None]:
# Load the data into pandas DataFrames using `pd.read_parquet`
import pandas as pd

# The three data splits.
train_data = pd.read_parquet("train.parquet")
validation_data = pd.read_parquet("validation.parquet")
live_data = pd.read_parquet("live.parquet")

# The example-prediction files downloaded alongside them.
live_example_preds = pd.read_parquet("live_example_preds.parquet")
validation_example_preds = pd.read_parquet("validation_example_preds.parquet")

# Display basic info about the data
for caption, shape in (
    ("Training data shape:", train_data.shape),
    ("Validation data shape:", validation_data.shape),
    ("Live data shape:", live_data.shape),
):
    print(caption, shape)

In [None]:
## Initializes Numerai Data and NumerAPI
import os
import re

import numpy as np
import pandas as pd
import numerapi

# Set your Numerai API credentials
napi = numerapi.NumerAPI(public_id=PUBLIC_KEY, secret_key=SECRET_KEY)

# Download the latest Numerai dataset
# napi.download_current_dataset(unzip=True)

# Locate the first entry of the working directory whose name starts with
# "numerai_dataset_<digits>"; it may be the extracted folder or its .zip archive.
f_pattern = r"numerai_dataset_\d+"
dir_entries = os.listdir()
print(dir_entries)
f_name = next((entry for entry in dir_entries if re.match(f_pattern, entry)), None)

# Fail with an explicit, descriptive error instead of a bare assert
# (asserts carry no message here and are skipped entirely under `python -O`).
if f_name is None:
    raise FileNotFoundError(
        f"No entry matching '{f_pattern}' found in '{os.getcwd()}'. "
        "Download and unzip the Numerai dataset first."
    )

# If the archive was matched, drop the extension to get the extracted folder name.
f_name = f_name.replace('.zip', '')


In [None]:
## Loads data by chunks (My laptop does not have enough RAM)
from itertools import islice

t_data = os.path.join(f_name, "numerai_training_data.csv")
tor_data = os.path.join(f_name, "numerai_tournament_data.csv")
chunk_size = 50000
num_chunks = 10


def _read_first_chunks(csv_path, rows_per_chunk, max_chunks):
    """Read at most ``max_chunks`` chunks of ``rows_per_chunk`` rows from a CSV.

    Args:
        csv_path: Path of the CSV file to read.
        rows_per_chunk: Number of rows requested per chunk.
        max_chunks: Upper bound on the number of chunks to load.

    Returns:
        A single DataFrame holding the loaded chunks concatenated row-wise.

    Raises:
        ValueError: From ``pd.concat`` if no chunk was read (e.g. ``max_chunks`` is 0).
    """
    reader = pd.read_csv(csv_path, chunksize=rows_per_chunk)
    try:
        # islice stops after exactly max_chunks items. The previous
        # "append, then break if i > num_chunks" loop let num_chunks + 2 chunks through.
        return pd.concat(list(islice(reader, max_chunks)), axis=0)
    finally:
        # Release the file handle even when we stop before the end of the file.
        reader.close()


train_data = _read_first_chunks(t_data, chunk_size, num_chunks)
tournament_data = _read_first_chunks(tor_data, chunk_size, num_chunks)

# Display basic info about the data
print("Training data shape:", train_data.shape)
print("Tournament data shape:", tournament_data.shape)


In [None]:
# Sanity check: print every target value that is not one of the five expected levels.
expected_targets = (0, 0.25, 0.5, 0.75, 1)
unexpected_targets = [value for value in train_data['target'] if value not in expected_targets]
for value in unexpected_targets:
    print(value)

In [None]:
## View Dataset
# Columns whose name starts with "feature" are model inputs.
feature_names = [f for f in train_data.columns if f.startswith("feature")]
# Every remaining column is listed under "Targets". Testing the prefix directly
# avoids a list-membership lookup per column.
target_names = [f for f in train_data.columns if not f.startswith("feature")]
print('Features:', feature_names, '\nLength of Features:', len(feature_names))
# The count label previously said "Features" here although it reports the target list.
print('Targets:', target_names, '\nLength of Targets:', len(target_names))


In [None]:
## More Dataset Viewing # Num Features = len(feature_names)
import torch as th
import torch.nn.functional as F

def augment_dataset(data):
    """Convert a Numerai DataFrame into ``(features, one_hot_targets)`` tensors.

    Args:
        data: DataFrame with a ``'target'`` column and at least one column whose
            name starts with ``"feature"``.

    Returns:
        A 2-tuple ``(x, y)``. ``x`` has shape ``(n_rows, n_feature_columns)``,
        one column per feature in DataFrame column order. ``y`` is an int64
        one-hot tensor of shape ``(n_rows, 5)``.

    Raises:
        RuntimeError: From torch if there are no feature columns, or if a
            scaled target falls outside the class range 0..4.
    """
    # Targets are multiples of 0.25 in [0, 1]; scaling by 4 maps them onto class ids 0..4.
    # Round before casting so floating-point noise cannot truncate into the class below.
    class_ids = data['target'].mul(4).round().to_numpy()
    # one_hot only accepts int64 indices. A plain astype(int) yields int32 on some
    # platforms (e.g. Windows with NumPy < 2), so the dtype is requested explicitly.
    y_outputs = F.one_hot(th.tensor(class_ids, dtype=th.long), num_classes=5)
    feature_columns = [
        th.tensor(data[name].to_numpy().reshape(-1, 1))
        for name in data.columns
        if name.startswith("feature")
    ]
    return th.cat(feature_columns, dim=1), y_outputs

# `dataset` is the 2-tuple returned by augment_dataset: (feature tensor, one-hot target tensor).
dataset = augment_dataset(train_data)

       

In [None]:
# Shapes of the feature tensor and of the target tensor, in that order.
for tensor_shape in (dataset[0].shape, dataset[1].shape):
    print(tensor_shape)

In [None]:
from torch.utils.data import Dataset
# Mini-batch size passed to the DataLoaders built from this dataset.
BATCH_SIZE = 128

class NumerAIdataset(Dataset):
    """Map-style dataset pairing item ``i`` of ``x_tensor`` with item ``i`` of ``y_tensor``."""

    def __init__(self, x_tensor, y_tensor):
        # Both containers are stored by reference; nothing is copied or validated.
        self.x, self.y = x_tensor, y_tensor

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(x, y)`` tuple stored at ``index``."""
        sample = self.x[index]
        label = self.y[index]
        return sample, label

# Unpack the (features, targets) tuple into the Dataset wrapper.
numerai_dataset = NumerAIdataset(*dataset)

# Show the first (x, y) sample as the cell output.
numerai_dataset[0]

In [None]:
from torch.utils.data import DataLoader, random_split
# Split the wrapped Dataset 90/10. The raw `dataset` variable is the
# (features, targets) tuple, whose len() is 2, so splitting it would yield
# one "sample" per side instead of splitting the rows.
train_size = int(0.9 * len(numerai_dataset))
validation_size = len(numerai_dataset) - train_size

train_dataset, validation_dataset = random_split(numerai_dataset, [train_size, validation_size])

train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
validation_data_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE)


In [None]:
# Training the Object Transformer!!
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import os
def train_model(
    model, train_loader, val_loader,
    batch_size = 16,
    epochs = 1000,
    learning_rate = 5e-3,
    log_interval = 50,
    no_cuda = False,
    seed = 1,
    is_lstm=False,
    patience = 10):
    """Train ``model`` with RMSprop and cross-entropy loss, then plot the loss curves.

    Validation runs every 10 epochs and ``model.save_checkpoint()`` is called
    every 50 epochs, so the model must provide that method when ``epochs >= 50``.
    After training, the logged losses are written to ``train_loss.png`` and
    ``val_loss.png`` and shown with ``plt.show()``.

    Args:
        model: Module to train; moved to the selected device.
        train_loader: Iterable of ``(data, target)`` batches. ``data`` may be a
            tensor or a dict of tensors.
        val_loader: Iterable of ``(data, target)`` batches used for validation.
        batch_size: Unused; kept for backward compatibility.
        epochs: Number of passes over ``train_loader``.
        learning_rate: RMSprop learning rate.
        log_interval: Print and record the running average loss every this many batches.
        no_cuda: Force CPU even when CUDA is available.
        seed: Seed passed to ``th.manual_seed`` before training starts.
        is_lstm: Unused; kept for backward compatibility.
        patience: Unused; kept for backward compatibility (no early stopping is done).

    Returns:
        None.
    """
    # Honour the seed argument so stochastic layers behave reproducibly.
    th.manual_seed(seed)

    use_cuda = not no_cuda and th.cuda.is_available()
    device = th.device("cuda" if use_cuda else "cpu")
    print(device)
    criterion = nn.CrossEntropyLoss()
    train_losses = []
    val_losses = []

    def to_device(batch):
        # Accept both a plain tensor batch and a dict of tensors.
        # The squeeze(dim=0) is kept from the original code; it only has an
        # effect when the leading dimension has size 1.
        if isinstance(batch, dict):
            return {k: v.to(device).squeeze(dim=0) for k, v in batch.items()}
        return batch.to(device).squeeze(dim=0)

    def batch_loss(data, target):
        prediction = model(to_device(data))
        target = target.to(device).squeeze(dim=0)
        # CrossEntropyLoss takes class indices or floating-point class
        # probabilities. An integer target shaped like the prediction is a
        # one-hot encoding, so turn it into class indices.
        if target.shape == prediction.shape and not th.is_floating_point(target):
            target = target.argmax(dim=-1)
        return criterion(prediction, target)

    def run_train_epoch():
        model.train()
        total_loss = 0.0
        step = 0
        for data, target in train_loader:
            step += 1
            optimizer.zero_grad()
            loss = batch_loss(data, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if step % log_interval == 0:
                # Running mean of the per-batch losses seen so far in this epoch.
                running_average = total_loss / step
                print(f'Avg Loss: {running_average}')
                train_losses.append(running_average)
        # The criterion already averages within a batch, so the epoch average
        # is taken over batches, not over samples.
        return total_loss / max(step, 1)

    def run_validation():
        model.eval()
        loss_total = 0.0
        num_batches = 0
        with th.no_grad():
            for data, target in val_loader:
                loss_total += batch_loss(data, target).item()
                num_batches += 1
        val_loss = loss_total / max(num_batches, 1)
        val_losses.append(val_loss)
        print('Validation_loss:', val_loss)
        return val_loss

    model.to(device)

    # NOTE: the previous version read an undefined `start_lr` and called an
    # undefined `lr_schedule()` after every step; the `learning_rate` argument
    # is now used and no schedule is applied.
    optimizer = optim.RMSprop(
        model.parameters(),
        lr=learning_rate,
        eps=1e-7,
        weight_decay=0.002,
    )
    print('Training...')
    for epoch in range(1, epochs + 1):
        run_train_epoch()
        if epoch % 10 == 0:
            run_validation()
        if epoch % 50 == 0:
            model.save_checkpoint()

    # The legend must be drawn before savefig, otherwise the saved image lacks it.
    plt.figure()
    plt.plot(range(len(train_losses)), train_losses, label='Train Loss')
    plt.xlabel('Seq')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('train_loss.png')

    plt.figure()
    plt.plot(range(len(val_losses)), val_losses, label='Val Loss')
    plt.xlabel('Seq')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('val_loss.png')
    plt.show()

In [None]:
### Initialize the Neural Networks
from Networks.NumeraiPredictionModels import * 
# Testing Network: build a decoder and push one random batch through it.
input_size = 310
batch_size = 3
expert_decoder = ExpertDecoder(num_experts=8, num_residuals=8)
parameter_count = sum(weights.numel() for weights in expert_decoder.parameters())
print(f'Model\'s Parameter Count w/ {expert_decoder.num_experts} Experts and {expert_decoder.num_residuals} Residuals Each:', parameter_count)
# Switch to evaluation mode before the forward pass.
expert_decoder.eval()
x = th.rand(size=(batch_size, input_size))
y = expert_decoder(x)
print(y)