In [None]:
## Load secrets from file
import os

import numerapi

FP_Secrets = 'Numerai.secrets'

if not os.path.exists(FP_Secrets):
    raise FileNotFoundError(f"'{FP_Secrets}' not found. Make sure the file exists.")

# Read API keys: the file holds one KEY=VALUE pair per line.
api_keys = {}
with open(FP_Secrets, 'r') as secrets_file:
    for line_number, line in enumerate(secrets_file, start=1):
        entry = line.strip()
        # Blank lines (e.g. a trailing newline) and '#' comment lines carry no credentials.
        if not entry or entry.startswith('#'):
            continue
        # Split on the FIRST '=' only, so values that themselves contain '=' stay intact.
        key, separator, value = entry.partition('=')
        if not separator:
            # Report only the line number: the line content may be a secret.
            raise ValueError(f"Line {line_number} of '{FP_Secrets}' is not in KEY=VALUE format.")
        # Tolerate whitespace around the separator ("PUBLIC_KEY = abc").
        api_keys[key.strip()] = value.strip()

# Numerai API credentials read from the secrets file
PUBLIC_KEY = api_keys.get('PUBLIC_KEY')
SECRET_KEY = api_keys.get('SECRET_KEY')

if not PUBLIC_KEY or not SECRET_KEY:
    raise ValueError(f"API keys not found in the '{FP_Secrets}' file.")

# Authenticated client used by the cells below
napi = numerapi.NumerAPI(public_id=PUBLIC_KEY, secret_key=SECRET_KEY)


In [None]:
## Download the latest Numerai datasets
# Each remote file "v4.1/<name>" is saved locally as "<name>", in this order.
for dataset_file in (
    "train.parquet",
    "validation.parquet",
    "live.parquet",
    "live_example_preds.parquet",
    "validation_example_preds.parquet",
    "features.json",
    "meta_model.parquet",
):
    napi.download_dataset("v4.1/" + dataset_file, dataset_file)

# Challenge: How might you use the additional files like 'features.json' and 'meta_model.parquet' in your ML models?


In [None]:
# Read the parquet files downloaded into the working directory into pandas DataFrames
import pandas as pd

train_data = pd.read_parquet("train.parquet")
validation_data = pd.read_parquet("validation.parquet")
live_data = pd.read_parquet("live.parquet")
live_example_preds = pd.read_parquet("live_example_preds.parquet")
validation_example_preds = pd.read_parquet("validation_example_preds.parquet")

# Report (rows, columns) for the three main frames
for shape_label, frame in (
    ("Training data shape:", train_data),
    ("Validation data shape:", validation_data),
    ("Live data shape:", live_data),
):
    print(shape_label, frame.shape)

In [None]:
## Initializes Numerai Data and NumerAPI
import os  # imported here too so the cell does not depend on an earlier cell for `os`
import re

import numpy as np
import pandas as pd
import numerapi

# Numerai API client; PUBLIC_KEY / SECRET_KEY come from the secrets-loading cell
napi = numerapi.NumerAPI(public_id=PUBLIC_KEY, secret_key=SECRET_KEY)

# Download the latest Numerai dataset
# napi.download_current_dataset(unzip=True)

# Look in the working directory for an entry named "numerai_dataset_<digits>..."
# (either the downloaded .zip or the extracted folder).
f_pattern = r"numerai_dataset_\d+"
# List the directory once so the printed listing is exactly what gets scanned.
dir_entries = os.listdir()
print(dir_entries)
# First matching entry in listing order, or None when nothing matches.
f_name = next((entry for entry in dir_entries if re.match(f_pattern, entry)), None)

assert f_name is not None, (
    f"No entry matching {f_pattern!r} found in {os.getcwd()!r}; download the dataset first."
)
# Drop the archive extension so f_name is the dataset folder name.
f_name = f_name.replace('.zip', '')


In [None]:
## Loads data by chunks (My laptop does not have enough RAM)
from itertools import islice

t_data = os.path.join(f_name, "numerai_training_data.csv")
tor_data = os.path.join(f_name, "numerai_tournament_data.csv")
chunk_size = 50000
num_chunks = 10


def _read_first_chunks(csv_path, chunk_size, num_chunks):
    """Read at most ``num_chunks`` chunks of ``chunk_size`` rows from a CSV file.

    Args:
        csv_path: Path of the CSV file to read.
        chunk_size: Number of rows per chunk.
        num_chunks: Maximum number of chunks to keep.

    Returns:
        A single DataFrame holding the first ``chunk_size * num_chunks`` rows
        (fewer if the file is shorter).
    """
    reader = pd.read_csv(csv_path, chunksize=chunk_size)
    try:
        # islice stops after exactly num_chunks chunks. The previous
        # "append, then break if i > num_chunks" loop kept num_chunks + 2.
        parts = list(islice(reader, num_chunks))
    finally:
        # Release the file handle even though the file was not read to the end.
        reader.close()
    return pd.concat(parts, axis=0)


train_data = _read_first_chunks(t_data, chunk_size, num_chunks)
tournament_data = _read_first_chunks(tor_data, chunk_size, num_chunks)

# Display basic info about the data
print("Training data shape:", train_data.shape)
print("Tournament data shape:", tournament_data.shape)

In [None]:
## View Dataset
# Feature columns are the ones whose name starts with "feature".
feature_names = [f for f in train_data.columns if f.startswith("feature")]
# Every remaining column is listed under "Targets".
# NOTE(review): this also picks up any non-feature bookkeeping columns present
# in the frame, not only target columns; confirm that is intended.
target_names = [f for f in train_data.columns if not f.startswith("feature")]
print('Features:', feature_names, '\nLength of Features:', len(feature_names))
# The label previously read "Length of Features" for the target count.
print('Targets:', target_names, '\nLength of Targets:', len(target_names))

In [None]:
## More Dataset Viewing
# Largest value found in the 'target' column (shown as the cell's output)
train_data.loc[:, 'target'].max()

In [24]:
### Initialize the Neural Networks
import torch as th
import torch.nn as nn
import os
"""
Architecture overview

Let x be a batch of features.
    opinions  = {expert_i(x) for each ResidualFeatureEncoder expert_i in ExpertDecoder.experts}
    consensus = LayerNorm(Sum(opinions))
    y_hat     = Decoder(consensus), which is meant to approximate y (the target)
"""

class ResidualBlock(nn.Module):
    """Four-layer MLP wrapped in a skip connection and followed by LayerNorm.

    The MLP maps ``residual_dim -> brodcast_dim -> brodcast_dim -> brodcast_dim
    -> residual_dim``. Every linear layer is followed by ``activation_fnc()``;
    all but the last activation are followed by dropout.

    Args:
        residual_dim: Width of the block's input and output.
        brodcast_dim: Width of the hidden layers.
        dropout_prob: Dropout probability used inside the MLP.
        activation_fnc: Zero-argument callable returning an activation module.
        device: Stored on ``self.device``. The ``.to()`` call below runs before
            any layer is registered, so it moves no parameters; call ``.to()``
            on the finished module to actually place it on a device.
    """

    def __init__(self, residual_dim=512, brodcast_dim=1048, dropout_prob=0.1, activation_fnc=nn.GELU, device='cuda'):
        super().__init__()
        self.device = device
        self.to(self.device)

        # Consecutive pairs of these widths are the (in, out) sizes of the linear layers.
        widths = (residual_dim, brodcast_dim, brodcast_dim, brodcast_dim, residual_dim)
        size_pairs = list(zip(widths[:-1], widths[1:]))
        final_index = len(size_pairs) - 1

        stack = []
        for position, (n_in, n_out) in enumerate(size_pairs):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(activation_fnc())
            # No dropout after the final activation.
            if position != final_index:
                stack.append(nn.Dropout(dropout_prob))

        self.residual_block = nn.Sequential(*stack)
        self._norm = nn.LayerNorm(residual_dim)

    def forward(self, x):
        """Return ``LayerNorm(x + MLP(x))``."""
        update = self.residual_block(x)
        return self._norm(x + update)
    
class ResidualFeatureEncoder(nn.Module):
    """One "expert": projects features up, applies residual blocks, projects down.

    Pipeline: ``Linear(input_dim, residual_dim) + GELU`` -> ``num_residuals``
    ``ResidualBlock`` modules (built with their default hidden width and
    dropout) -> ``Linear(residual_dim, output_dim)``.

    Constructing an instance creates the checkpoint directory
    ``./Networks/Checkpoints/NumeraiExperts/<expert_num>`` if it is missing.

    Args:
        expert_num: Identifier of this expert; used in checkpoint paths and log lines.
        num_residuals: Number of residual blocks.
        input_dim: Number of input features.
        output_dim: Width of the encoder output.
        residual_dim: Width of the residual stream.
        dataset_name: Prefix of the default checkpoint file name.
        device: Stored on ``self.device``. The ``.to()`` call in ``__init__`` runs
            before any layer is registered, so it moves no parameters; call
            ``.to()`` on the finished module to actually place it on a device.
    """

    def __init__(self, expert_num, num_residuals=5, input_dim=313, output_dim= 256, residual_dim=512, dataset_name="Dataset_563", device='cuda'):
        super(ResidualFeatureEncoder, self).__init__()
        self.device = device
        self.to(self.device)
        self.brodcast = nn.Sequential(
            nn.Linear(input_dim, residual_dim),
            nn.GELU(),
        )
        self.residuals = nn.Sequential(
            *[ResidualBlock(residual_dim=residual_dim,) for _ in range(num_residuals)]
        )
        self.outcast = nn.Linear(residual_dim, output_dim)

        self._expert_num = expert_num
        self._ds_name = dataset_name
        self.save_path = os.path.join('.', f'Networks','Checkpoints','NumeraiExperts', f"{self._expert_num}")
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(self.save_path, exist_ok=True)

    def _default_checkpoint_file(self):
        """Return the default checkpoint path shared by load and save."""
        return os.path.join(self.save_path, f'{self._ds_name}_{self._expert_num}_weights.pt')

    def forward(self, x):
        """Encode ``x``; the last dimension goes from ``input_dim`` to ``output_dim``."""
        x = self.brodcast(x)
        x = self.residuals(x)
        x = self.outcast(x)
        return x

    def load_checkpoint(self):
        """Load weights from the default checkpoint file; do nothing if it is absent."""
        checkpoint_file = self._default_checkpoint_file()
        if os.path.exists(checkpoint_file):
            # map_location='cpu' lets a checkpoint written on a GPU machine load on a
            # CPU-only one; load_state_dict then copies onto the module's own device.
            self.load_state_dict(th.load(checkpoint_file, map_location='cpu'))

    def save_checkpoint(self, file=None):
        """Save the state dict to ``file``, or to the default checkpoint file when ``file`` is None."""
        print(f'[Expert {self._expert_num}] Saving Checkpoint...')
        target = file if file is not None else self._default_checkpoint_file()
        th.save(self.state_dict(), target)

class ExpertDecoder(nn.Module):
    """Ensemble of ``ResidualFeatureEncoder`` experts followed by a shared decoder.

    Every expert encodes the same input. The encodings are summed,
    layer-normalised and passed through an MLP that ends in a Sigmoid, so the
    output lies in (0, 1).

    Constructing an instance creates the checkpoint directory
    ``./Networks/Checkpoints/NumeraiExperts/ExpertDecoder`` if it is missing.

    Args:
        num_experts: Number of experts to build.
        num_residuals: Number of residual blocks inside each expert.
        input_dim: Accepted for backward compatibility; not used (experts are
            sized by ``residual_input_dim``).
        output_dim: Width of the final prediction.
        residual_input_dim: Number of input features given to every expert.
        residual_output_dim: Width of each expert's output and of the decoder input.
        residual_dim: Width of the residual stream inside each expert.
        dropout_probs: Dropout probability used in the decoder.
        load_default: When True, load the default checkpoint if it exists.
        dataset_name: Suffix of the default checkpoint file name.
        device: Stored on ``self.device``. The ``.to()`` call in ``__init__`` runs
            before any layer is registered, so it moves no parameters; call
            ``.to()`` on the finished module to actually place it on a device.
    """

    def __init__(self, num_experts = 15, num_residuals=5, input_dim=313, output_dim=1, residual_input_dim=313, residual_output_dim= 256, residual_dim=512, dropout_probs=0.05, load_default=False, dataset_name="Dataset_563", device='cuda'):
        super(ExpertDecoder, self).__init__()
        self.device = device
        self.to(self.device)
        self.num_experts, self.num_residuals = num_experts, num_residuals
        self.experts = nn.ModuleList([ResidualFeatureEncoder(expert_num=expert_num, num_residuals=num_residuals, input_dim=residual_input_dim, output_dim=residual_output_dim, residual_dim=residual_dim) for expert_num in range(num_experts)])
        self._norm = nn.LayerNorm(residual_output_dim)
        self.decoder = nn.Sequential(
            nn.Linear(residual_output_dim, residual_output_dim),
            nn.GELU(),
            nn.Dropout(dropout_probs),
            nn.Linear(residual_output_dim, residual_output_dim),
            nn.GELU(),
            nn.Dropout(dropout_probs),
            nn.Linear(residual_output_dim, residual_output_dim//4),
            nn.GELU(),
            nn.Linear(residual_output_dim//4, residual_output_dim//8),
            nn.GELU(),
            nn.Linear(residual_output_dim//8, output_dim), # output
            nn.Sigmoid(),
        )
        self._ds_name = dataset_name
        self.save_path = os.path.join('.', f'Networks','Checkpoints','NumeraiExperts', 'ExpertDecoder')
        self._file_name = f'expert_decoder_{self.num_experts}#Experts_{self.num_residuals}#Residuals_{self._ds_name}.pt'
        # A freshly created directory cannot hold a checkpoint, so creating it first
        # and then attempting the optional load matches the old if/elif behaviour.
        os.makedirs(self.save_path, exist_ok=True)
        if load_default:
            self.load_checkpoint()
            # for expert in self.experts:
            #     expert.load_checkpoint()

    def _default_checkpoint_file(self):
        """Return the default checkpoint path shared by load and save."""
        return os.path.join(self.save_path, self._file_name)

    def forward(self, x):
        """Return the decoded consensus of all experts for the batch ``x``."""
        expert_opinions = [expert(x) for expert in self.experts]
        # Stack along a new leading "expert" axis, then sum that axis away.
        expert_consensus = th.stack(expert_opinions, dim=0).sum(dim=0)
        normalized_consensus = self._norm(expert_consensus)
        return self.decoder(normalized_consensus)

    def load_checkpoint(self):
        """Load weights from the default checkpoint file; do nothing if it is absent."""
        # Previously read the non-existent attribute `self.file_name` and raised AttributeError.
        checkpoint_file = self._default_checkpoint_file()
        if os.path.exists(checkpoint_file):
            # map_location='cpu' lets a checkpoint written on a GPU machine load on a
            # CPU-only one; load_state_dict then copies onto the module's own device.
            self.load_state_dict(th.load(checkpoint_file, map_location='cpu'))

    def save_checkpoint(self, file=None):
        """Save the state dict to ``file``, or to the default checkpoint file when ``file`` is None."""
        # Previously formatted `self._expert_num`, which this class never defines,
        # so every call raised AttributeError before anything was saved.
        print('[ExpertDecoder] Saving Checkpoint...')
        target = file if file is not None else self._default_checkpoint_file()
        th.save(self.state_dict(), target)

# Smoke test: build the network and push one random batch through it
input_size, batch_size = 313, 3
expert_decoder = ExpertDecoder(num_experts=8, num_residuals=8)
print(
    f'Model\'s Parameter Count w/ {expert_decoder.num_experts} Experts and {expert_decoder.num_residuals} Residuals Each:',
    sum(param.numel() for param in expert_decoder.parameters()),
)
# Evaluation mode turns dropout off for the forward pass below.
expert_decoder.eval()
x = th.rand(batch_size, input_size)
y = expert_decoder(x)
print(y)

Model's Parameter Count w/ 8 Experts and 8 Residuals Each: 212051585
tensor([[0.4811],
        [0.4800],
        [0.4790]], grad_fn=<SigmoidBackward0>)
