# Template Solution for Numerai

This solution is based on the `hello_numerai` project provided by Numerai.

Author: Calvin Min (202)

In [1]:
!python --version
!pip install -q numerapi pandas pyarrow matplotlib lightgbm scikit-learn cloudpickle python-dotenv scipy==1.10.1

zsh:1: command not found: python
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m


In [1]:
from numerapi import NumerAPI
import pandas as pd
import json
import lightgbm as lgb
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Retrieve API keys from environment variables (os.getenv yields None when unset)
PUBLIC_ID = os.getenv('PUBLIC_ID')
API_SECRET = os.getenv('SECRET')

# Initialize NumerAPI
napi = NumerAPI(PUBLIC_ID, API_SECRET)

# Use one of the latest data versions
DATA_VERSION = "v4.3"

# Download data
napi.download_dataset(f"{DATA_VERSION}/train_int8.parquet")
napi.download_dataset(f"{DATA_VERSION}/features.json")
napi.download_dataset(f"{DATA_VERSION}/validation_int8.parquet")

# Load feature metadata; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open(f"{DATA_VERSION}/features.json") as metadata_file:
    feature_metadata = json.load(metadata_file)
features = feature_metadata["feature_sets"]["medium"]

# Load data - Train and Validation (only the columns needed for fitting)
train = pd.read_parquet(f"{DATA_VERSION}/train_int8.parquet", columns=["era"]+features+["target"])
validation = pd.read_parquet(f"{DATA_VERSION}/validation_int8.parquet", columns=["era"]+features+["target"])

# Fit on train + validation together. Rows whose target is missing carry no
# supervised signal, so they are dropped from each part before concatenating.
train = pd.concat(
    [frame.dropna(subset=["target"]) for frame in (train, validation)]
)

# Train model
model = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=2**5-1,
    colsample_bytree=0.1
)

print("-----BEGIN TO TRAIN MODEL-----")
model.fit(
    train[features],
    train["target"]
)
print("-----FINISHED TRAINING-----")


2024-09-10 15:31:20,207 INFO numerapi.utils: target file already exists
2024-09-10 15:31:20,208 INFO numerapi.utils: download complete
2024-09-10 15:31:20,748 INFO numerapi.utils: target file already exists
2024-09-10 15:31:20,749 INFO numerapi.utils: download complete
2024-09-10 15:31:21,229 INFO numerapi.utils: target file already exists
2024-09-10 15:31:21,230 INFO numerapi.utils: download complete


-----BEGIN TO TRAIN MODEL-----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3525
[LightGBM] [Info] Number of data points in the train set: 5254695, number of used features: 705
[LightGBM] [Info] Start training from score 0.497215
-----FINISHED TRAINING-----


In [2]:
## -- Fetch the current live round from Numerai and read it into memory --
live_path = f"{DATA_VERSION}/live_int8.parquet"
napi.download_dataset(live_path)
live_data = pd.read_parquet(live_path)

# Keep only the feature columns the model was fitted on
live_features = live_data.loc[:, features]

2024-09-10 15:40:10,154 INFO numerapi.utils: target file already exists
2024-09-10 15:40:10,156 INFO numerapi.utils: download complete


In [9]:
### --- USING API to Submit --- ###

# Prediction function using trained model above
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Score live features with the trained model and save a submission file.

    Args:
        live_features: Frame containing at least the columns listed in the
            notebook-level ``features`` list. Its index is reused as the
            index of the predictions.

    Returns:
        A single-column frame named ``prediction`` indexed like
        ``live_features``. The same frame is also written to
        ``./submission.csv`` as a side effect.
    """
    live_predictions = model.predict(live_features[features])
    submission = pd.Series(live_predictions, index=live_features.index).to_frame("prediction")
    submission.to_csv("./submission.csv")
    # Return the frame so the function honours its declared return type;
    # it previously fell off the end and returned None.
    return submission

# Score the live features; predict() saves the result to ./submission.csv
predict(live_features)

# Send the saved CSV to Numerai; the call's return value is the cell output
submission_file = "./submission.csv"
napi.upload_predictions(submission_file)

2024-09-10 15:46:32,827 INFO numerapi.base_api: uploading predictions...


'a6533f7d-87f3-42b4-88da-6d7ba08e0f05'

In [5]:
### --- Pickle Submission --- ### 
from datetime import datetime
import cloudpickle

# Name the pickle after the moment it was created so earlier dumps are kept
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"predict_{timestamp}.pkl"

# Serialise the predict function with cloudpickle and write the bytes to disk
serialized_predict = cloudpickle.dumps(predict)
with open(filename, "wb") as pickle_file:
    pickle_file.write(serialized_predict)

print(f"Function pickled to {filename}")

Function pickled to predict_20240910_154309.pkl
