In [1]:
# Install dependencies into the running kernel's environment.
# NOTE: only scipy is version-pinned below; every other package resolves to
# whatever release is current at install time, so results may drift between runs.
%pip install --upgrade pip
%pip install -q numerapi pandas pyarrow matplotlib lightgbm scikit-learn cloudpickle seaborn scipy==1.10.1

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Disabled Colab-related installs, left here for reference only:
# %pip install colabcode
# %pip install googlecolab
# %pip install google



import lightgbm as lgb
import pandas as pd
import json
from numerapi import NumerAPI



# Download the v4.2 training set and the feature metadata via the Numerai API.
napi = NumerAPI()
napi.download_dataset("v4.2/train_int8.parquet")
napi.download_dataset("v4.2/features.json")

# Load the feature metadata. A context manager is used so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("v4.2/features.json", encoding="utf-8") as features_file:
    feature_metadata = json.load(features_file)

# Column lists taken straight from the metadata: the "all" feature set and
# every available target.
feature_cols = feature_metadata["feature_sets"]["all"]
target_cols = feature_metadata["targets"]

# Read only the columns that are needed: era label, features and targets.
train = pd.read_parquet("v4.2/train_int8.parquet", columns=["era"] + feature_cols + target_cols)

# How many features are in the selected feature set
len(feature_cols)

# Era label next to every target column
train[["era", *target_cols]]

# `target` duplicates `target_cyrus_v4_20` (checked here), so it is dropped by
# skipping the first entry of `target_cols`.
assert train["target"].equals(train["target_cyrus_v4_20"])
target_names = target_cols[1:]
targets_df = train[["era", *target_names]]

# Table of target names, one row per base name, with its 20-day and 60-day variants
pd.set_option('display.max_rows', 100)
t20s = list(filter(lambda name: name.endswith("_20"), target_names))
t60s = list(filter(lambda name: name.endswith("_60"), target_names))
# Strip the 7-character prefix and the 6-character suffix from each 20-day name
names = [name[7:-6] for name in t20s]
pd.DataFrame(
    {
        "name": names,
        "20": t20s,
        "60": t60s,
    }
).set_index("name")
# Histograms of four targets (cyrus and xerxes, 20- and 60-day) on a 2x2 grid
targets_df[
    [
        "target_cyrus_v4_20",
        "target_cyrus_v4_60",
        "target_xerxes_v4_20",
        "target_xerxes_v4_60",
    ]
].plot(
    kind="hist",
    bins=35,
    density=True,
    figsize=(8, 4),
    title="Target Distributions",
    subplots=True,
    layout=(2, 2),
    ylabel="",
    yticks=[],
);
# Count the missing values of every column within each era and plot them
nans_per_era = targets_df.groupby("era").apply(lambda era_rows: era_rows.isna().sum())
nans_per_era[target_names].plot(
    figsize=(64, 32),
    title="Number of NaNs per Era",
    legend=False,
)
# Heatmap of the pairwise correlations between targets
import seaborn as sns
sns.heatmap(
    targets_df[target_names].corr(),
    cmap="coolwarm",
    xticklabels=False,
    yticklabels=False,
)



# A hand-picked (arbitrary) set of 20-day targets to train on
target_candidates = [
    "target_cyrus_v4_20",
    "target_sam_v4_20",
    "target_caroline_v4_20",
    "target_xerxes_v4_20",
]

# Rank every target by its correlation with cyrus, strongest first
(
    targets_df[target_names]
    .corrwith(targets_df["target_cyrus_v4_20"])
    .sort_values(ascending=False)
    .to_frame("corr_with_cyrus_v4_20")
)


# Train one LightGBM regressor per candidate target, all on the same features.
# `models` maps target name -> fitted model and is read by every later step.
#
# NOTE: a stray `PYDEVD_DISABLE_FILE_VALIDATION=1` statement used to sit inside
# this loop. It was an ordinary Python assignment, not an environment variable,
# so it had no effect and has been removed. If the debugger warning needs
# silencing, set that variable in the kernel's environment before launch.
models = {}
for target in target_candidates:
    model = lgb.LGBMRegressor(
        # Requires a GPU-enabled LightGBM build; training fails otherwise.
        device='gpu',
        n_estimators=4000,
        learning_rate=0.01,
        max_depth=25,
        num_leaves=31,
        # Each tree sees a random 10% of the feature columns.
        colsample_bytree=0.1,
        # NOTE(review): LightGBM documents this flag as CPU-only; kept for
        # parity with the original configuration — confirm it is still wanted.
        force_col_wise=True,
    )
    model.fit(
        train[feature_cols],
        train[target]
    )
    models[target] = model

# Download validation data (runs once, after all models are trained)
napi.download_dataset("v4.2/validation_int8.parquet")

# Read era, data_type, features and targets, then keep only the rows flagged
# as "validation" and drop the helper column afterwards.
validation = pd.read_parquet(
    "v4.2/validation_int8.parquet",
    columns=["era", "data_type", *feature_cols, *target_cols],
)
validation = validation[validation["data_type"] == "validation"]
del validation["data_type"]

# Downsample every 4th era to reduce memory usage and speedup validation (suggested for Colab free tier)
# Comment out the line below to use all the data
# validation = validation[validation["era"].isin(validation["era"].unique()[::4])]

# Embargo: remove the eras numbered last_train_era + 0..3 (zero-padded to four
# digits) so validation does not overlap the end of the training data.
last_train_era = int(train["era"].unique()[-1])
eras_to_embargo = [str(last_train_era + offset).zfill(4) for offset in range(4)]
validation = validation[~validation["era"].isin(eras_to_embargo)]

# One prediction column per candidate target, named "prediction_<target>"
pred_cols = [f"prediction_{target}" for target in target_candidates]

# Score the validation features with the model trained on each target
for target in target_candidates:
    validation[f"prediction_{target}"] = models[target].predict(
        validation[feature_cols]
    )

validation[pred_cols]

from scipy import stats
import numpy as np

def numerai_corr(preds, target):
    """Correlation between tail-weighted, gaussianized predictions and target.

    Args:
        preds: pandas Series of predictions; only their rank order matters.
        target: pandas Series of target values aligned with ``preds``.

    Returns:
        The Pearson correlation (off-diagonal entry of ``np.corrcoef``) between
        the transformed predictions and the transformed target.
    """

    def _signed_pow15(values):
        # Raise magnitudes to the power 1.5 while keeping the original sign.
        return np.sign(values) * np.abs(values) ** 1.5

    # Average-rank the predictions (ties share a rank) and map them into (0, 1).
    uniform_preds = (preds.rank(method="average").values - 0.5) / preds.count()
    # Push the uniform ranks through the standard normal inverse CDF.
    gaussian_preds = stats.norm.ppf(uniform_preds)
    # Shift the target so that its mean is zero.
    demeaned_target = target - target.mean()
    return np.corrcoef(
        _signed_pow15(gaussian_preds),
        _signed_pow15(demeaned_target),
    )[0, 1]


# Per-era score of every model's predictions against the main `target`
# column, plus the running (cumulative) total of those scores.
correlations = {}
cumulative_correlations = {}
for target in target_candidates:
    pred_col = f"prediction_{target}"
    correlations[pred_col] = validation.groupby("era").apply(
        lambda era_rows: numerai_corr(era_rows[pred_col], era_rows["target"])
    )
    cumulative_correlations[pred_col] = correlations[pred_col].cumsum()

cumulative_correlations = pd.DataFrame(cumulative_correlations)
cumulative_correlations.plot(
    title="Cumulative Correlation of validation Predictions",
    figsize=(10, 6),
    xticks=[],
)

# Summary statistics for each model
summary_metrics = {}
for target in target_candidates:
    pred_col = f"prediction_{target}"
    # Average, over eras, of the correlation between this target and cyrus
    mean_corr_with_cryus = (
        validation.groupby("era")
        .apply(lambda era_rows: era_rows[target].corr(era_rows["target_cyrus_v4_20"]))
        .mean()
    )
    # Mean, spread and mean/std ratio of the per-era prediction scores
    mean = correlations[pred_col].mean()
    std = correlations[pred_col].std()
    sharpe = mean / std
    # Largest gap between the running peak of the cumulative score and the
    # cumulative score itself
    rolling_max = cumulative_correlations[pred_col].expanding(min_periods=1).max()
    max_drawdown = (rolling_max - cumulative_correlations[pred_col]).max()
    summary_metrics[pred_col] = dict(
        mean=mean,
        std=std,
        sharpe=sharpe,
        max_drawdown=max_drawdown,
        mean_corr_with_cryus=mean_corr_with_cryus,
    )
pd.set_option('display.float_format', lambda value: '%f' % value)
summary = pd.DataFrame(summary_metrics).T
summary

# Simple-average ensemble: rank each favorite model's predictions within its
# era (as percentiles), then average those ranks across the models.
favorite_targets = ["target_cyrus_v4_20", "target_xerxes_v4_20"]
ensemble_cols = [f"prediction_{target}" for target in favorite_targets]
validation["ensemble"] = (
    validation.groupby("era")[ensemble_cols]
    .rank(pct=True)
    .mean(axis=1)
)

# Individual predictions alongside the ensemble
pred_cols = [*ensemble_cols, "ensemble"]
validation[pred_cols]

# Per-era score and its running total for each favorite model and the ensemble
correlations = {}
cumulative_correlations = {}
for col in pred_cols:
    per_era_scores = validation.groupby("era").apply(
        lambda era_rows: numerai_corr(era_rows[col], era_rows["target"])
    )
    correlations[col] = per_era_scores
    cumulative_correlations[col] = per_era_scores.cumsum()

cumulative_correlations = pd.DataFrame(cumulative_correlations)
cumulative_correlations.plot(
    title="Cumulative Correlation of validation Predictions",
    figsize=(10, 6),
    xticks=[],
)

# Mean, spread, mean/std ratio and maximum drawdown for every column
summary_metrics = {}
for col in pred_cols:
    mean = correlations[col].mean()
    std = correlations[col].std()
    sharpe = mean / std
    # Drawdown: distance of the cumulative score below its running peak
    rolling_max = cumulative_correlations[col].expanding(min_periods=1).max()
    max_drawdown = (rolling_max - cumulative_correlations[col]).max()
    summary_metrics[col] = dict(
        mean=mean,
        std=std,
        sharpe=sharpe,
        max_drawdown=max_drawdown,
    )
pd.set_option('display.float_format', lambda value: '%f' % value)
summary = pd.DataFrame(summary_metrics).T
summary

def predict_ensemble(live_features: pd.DataFrame) -> pd.DataFrame:
    """Produce ensemble predictions for a frame of live features.

    Reads the module-level ``favorite_targets``, ``models`` and
    ``feature_cols``.

    Args:
        live_features: frame containing at least the ``feature_cols`` columns.

    Returns:
        A single-column frame named ``"prediction"`` that shares the index of
        ``live_features`` and holds percentile ranks of the ensemble score.
    """
    features = live_features[feature_cols]
    # One column of raw model output per favorite target
    per_model = pd.DataFrame(index=live_features.index)
    for name in favorite_targets:
        per_model[name] = models[name].predict(features)
    # Percentile-rank each model's output, then average across the models
    blended = per_model.rank(pct=True).mean(axis=1)
    # Final percentile rank; method="first" breaks ties by order of appearance
    ranked = blended.rank(pct=True, method="first")
    return ranked.to_frame("prediction")

# Smoke test: fetch the current live features and run the ensemble on them
napi.download_dataset("v4.2/live_int8.parquet")
live_features = pd.read_parquet(
    "v4.2/live_int8.parquet",
    columns=feature_cols,
)
predict_ensemble(live_features)

# Use the cloudpickle library to serialize your function and its dependencies
import cloudpickle
p = cloudpickle.dumps(predict_ensemble)
with open("predict_ensembleTodayTxr.pkl", "wb") as f:
    f.write(p)

# Download the pickle if running in Google Colab.
# Fixes relative to the previous version:
#   * the Colab helper module is `files` (not `file`), and the call is
#     `files.download(...)`;
#   * the downloaded name now matches the file actually written above;
#   * only ImportError (i.e. "not on Colab") is ignored, so a genuine download
#     failure is no longer silently swallowed by a bare `except`.
try:
    from google.colab import files
except ImportError:
    pass  # not running in Colab: the pickle simply stays on local disk
else:
    files.download("predict_ensembleTodayTxr.pkl")








Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


2023-12-12 21:21:28,952 INFO numerapi.utils: target file already exists
2023-12-12 21:21:28,952 INFO numerapi.utils: download complete
2023-12-12 21:21:30,824 INFO numerapi.utils: target file already exists
2023-12-12 21:21:30,825 INFO numerapi.utils: download complete


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 10660
[LightGBM] [Info] Number of data points in the train set: 2420521, number of used features: 2132
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2070, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2132 dense feature groups (2465.36 MB) transferred to GPU in 4.066455 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.500015
