# Numerai

In [None]:
# Create the NumerAPI client (the official Python API client for Numerai)
from numerapi import NumerAPI

napi = NumerAPI()

# Show every downloadable file whose path starts with "v4.2"
[
    dataset_path
    for dataset_path in napi.list_datasets()
    if dataset_path.startswith("v4.2")
]

In [None]:
import json

import pandas as pd
from numerapi import NumerAPI

napi = NumerAPI()

# Download the training data and feature metadata.
# This may take a few minutes.
train = "v4.2/train_int8.parquet"
features = "v4.2/features.json"

napi.download_dataset(train)
napi.download_dataset(features)

# Load the training data but only the "small" subset of features to save time and memory.
# In practice you will want to use all the features to maximize your model's performance.
# The metadata file is opened through a context manager so its handle is closed
# as soon as the JSON has been parsed.
with open(features) as metadata_file:
    feature_metadata = json.load(metadata_file)
feature_cols = feature_metadata["feature_sets"]["small"]
training_data = pd.read_parquet(train, columns=["era"] + feature_cols + ["target"])

# When True, the labelled validation rows are appended to the training set.
# NOTE: in that case the validation eras are part of the data the models are
# fitted on, so validation scores computed later in this notebook are in-sample.
real_submition = True
if real_submition:
    # For better models, join train and validation data and train on all of it
    napi.download_dataset("v4.2/validation_int8.parquet")
    validation = pd.read_parquet(
        "v4.2/validation_int8.parquet",
        columns=["era", "data_type"] + feature_cols + ["target"],
    )
    # Keep only rows marked "validation" (drop rows which don't have targets yet),
    # then drop the helper column so the concatenated frame has the same columns
    # as the training data instead of gaining a mostly-NaN "data_type" column.
    validation = validation[validation["data_type"] == "validation"].drop(columns="data_type")
    training_data = pd.concat([training_data, validation])

# Optional: downsample to every 4th era to shrink the training set.
# training_data = training_data[training_data["era"].isin(training_data["era"].unique()[::4])]

training_data

In [None]:
# Count the rows belonging to each era and plot the counts.
(
    training_data
    .groupby("era")
    .size()
    .plot(title="Number of Rows per Era", figsize=(5, 3), xlabel="Era")
);

# Modeling
O objetivo aqui é criar um modelo para submeter no Numerai (numer.ai).



## LightGBM
O LightGBM treina modelos baseados em árvores de decisão (gradient boosting).

Primeiro vou criar um modelo com as opções default dadas pelo Numerai para ver como se sai.


In [None]:
import lightgbm as lgb

# Load-or-train: `model` ends up as an `lgb.Booster` when the saved file loads,
# and as an `lgb.LGBMRegressor` when it has to be trained here. Both are only
# used through `.predict(...)` in the cells below.
try:
    # Reuse the previously trained model if its file can be loaded
    model = lgb.Booster(model_file="small_lgbm_tree.model")
except lgb.basic.LightGBMError:
    # Loading failed (e.g. the model file does not exist yet): build the model,
    # train it on the training set and save it for the next run.
    lgbm_params = dict(
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=5,
        num_leaves=2**5 - 1,
        colsample_bytree=0.1,
    )
    model = lgb.LGBMRegressor(**lgbm_params)

    # Train the model
    model.fit(training_data[feature_cols], training_data["target"])

    # Save the underlying booster to disk
    model.booster_.save_model("small_lgbm_tree.model")

Testar o modelo agora.



Definir a função de scoring para a CORR.
Esta é a principal métrica de scoring do Numerai.

In [None]:
from scipy import stats
import numpy as np

# Numerai's primary scoring metric
def numerai_corr(preds, target):
    """Return the tail-weighted Pearson correlation between predictions and target.

    Args:
        preds: predictions; must provide ``.rank(method=...)`` and ``.count()``
            (e.g. a pandas Series).
        target: target values; must provide ``.mean()`` and support arithmetic
            (e.g. a pandas Series aligned with ``preds``).

    Returns:
        The off-diagonal entry of ``np.corrcoef`` computed on the transformed
        predictions and the transformed target.
    """

    def _accentuate_tails(values):
        # Signed power of 1.5: keeps the sign, stretches large magnitudes
        return np.sign(values) * np.abs(values) ** 1.5

    # Rank the predictions (ties share their average rank) and scale into (0, 1)
    average_ranks = preds.rank(method="average").values
    uniform_preds = (average_ranks - 0.5) / preds.count()
    # Map the scaled ranks through the inverse normal CDF to gaussianize them
    normal_preds = stats.norm.ppf(uniform_preds)
    # Center the target around 0
    demeaned_target = target - target.mean()
    # Pearson correlation of the two tail-accentuated series
    correlation_matrix = np.corrcoef(
        _accentuate_tails(normal_preds),
        _accentuate_tails(demeaned_target),
    )
    return correlation_matrix[0, 1]

In [None]:
# Download the validation data and prepare it for scoring.
# This will take a few minutes.
napi.download_dataset("v4.2/validation_int8.parquet")

# Load the validation data, keep only rows with data_type == "validation",
# then drop the helper column (returns a new frame instead of deleting in place).
validation_data = pd.read_parquet(
    "v4.2/validation_int8.parquet",
    columns=["era", "data_type"] + feature_cols + ["target"],
)
validation_data = validation_data[validation_data["data_type"] == "validation"].drop(columns="data_type")

# Downsample to every 4th era to reduce memory usage and speed up evaluation (suggested for Colab free tier).
# Comment out the line below to use all the data (higher memory usage, slower inference, more accurate evaluation).
validation_data = validation_data[validation_data["era"].isin(validation_data["era"].unique()[::4])]

# Eras are 1 week apart, but targets look 4 weeks into the future, so eras close to
# the end of the training data are "embargoed" to avoid data leakage.
# NOTE(review): range(4) starts at offset 0, so this list holds the last training era
# itself plus the next three eras - confirm this is the intended embargo window.
last_train_era = int(training_data["era"].unique()[-1])
eras_to_embargo = [str(last_train_era + offset).zfill(4) for offset in range(4)]

# .copy() detaches the filtered result from its parent frame, so adding the
# "prediction" column below does not raise pandas' SettingWithCopyWarning.
validation_data = validation_data[~validation_data["era"].isin(eras_to_embargo)].copy()

# Generate predictions against the validation features.
# This will take a few minutes.
validation_data["prediction"] = model.predict(validation_data[feature_cols])
validation_data[["era", "prediction", "target"]]

In [None]:
# Cumulative per-era correlation plot: more useful than the raw per-era values
# because it gives a sense of "how much is earned" over time.
per_era_corr = validation_data.groupby("era").apply(
    lambda era_rows: numerai_corr(era_rows["prediction"], era_rows["target"])
)
per_era_corr.cumsum().plot.line(title="Cumulative Validation Correlation", figsize=(10, 6));


In [None]:
# scoring data function
def scoring_data(model, validation_data, feature_cols):
    """Predict on `validation_data` with `model` and summarise the per-era correlation.

    Side effect: the predictions are written into ``validation_data["prediction"]``
    in place, replacing any column of that name the caller's frame already had.

    Args:
        model: object exposing ``predict(features)``.
        validation_data: DataFrame with an "era" column, a "target" column and
            the columns listed in ``feature_cols``.
        feature_cols: names of the feature columns passed to ``model.predict``.

    Returns:
        A DataFrame with rows "mean", "std", "sharpe" and "max_drawdown" and a
        single column named "Value".
    """
    # Generate predictions against the validation features (can take a few minutes)
    validation_data["prediction"] = model.predict(validation_data[feature_cols])
    per_era_corr = validation_data.groupby("era").apply(
        lambda era_rows: numerai_corr(era_rows["prediction"], era_rows["target"])
    )

    # Compute performance metrics
    corr_mean = per_era_corr.mean()
    corr_std = per_era_corr.std(ddof=0)  # population standard deviation
    corr_sharpe = corr_mean / corr_std
    # Largest gap between the running maximum of the cumulative correlation and
    # the cumulative correlation itself; the cumulative sum is computed once.
    cumulative_corr = per_era_corr.cumsum()
    max_drawdown = (cumulative_corr.expanding(min_periods=1).max() - cumulative_corr).max()

    return pd.DataFrame({
        "mean": corr_mean,
        "std": corr_std,
        "sharpe": corr_sharpe,
        "max_drawdown": max_drawdown
    }, index=["Value"]).T


# scoring_data(model, validation_data, feature_cols)

    

In [None]:
# Define predict function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Return a one-column "prediction" frame for `live_features`.

    Uses the notebook-level `model` and `feature_cols`; the result keeps the
    index of `live_features`.
    """
    feature_matrix = live_features[feature_cols]
    scores = pd.Series(model.predict(feature_matrix), index=live_features.index)
    return scores.to_frame("prediction")

# Pickle the function
import cloudpickle

p = cloudpickle.dumps(predict)
with open("small_lgbm_tree.pkl", "wb") as pickle_file:
    pickle_file.write(p)


## XGBoost
Mesma coisa, mas com XGBoost.




In [None]:
import numpy as np
import xgboost as xgb


# Hyper-parameters for the XGBoost regression trees
xgb_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    max_leaves=2**5,
    colsample_bytree=0.1,
    # tree_method='gpu_hist',
    # gpu_id=0,
    random_state=420,
)

# Build the regressor; note that this rebinds the notebook-level `model`
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training features and target
model.fit(training_data[feature_cols], training_data["target"])

In [None]:
# Define predict function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Return a one-column "prediction" frame for `live_features`.

    Uses the notebook-level `model` and `feature_cols`; the result keeps the
    index of `live_features`.
    """
    live_predictions = model.predict(live_features[feature_cols])
    submission = pd.Series(live_predictions, index=live_features.index)
    return submission.to_frame("prediction")

# Pickle the function
import cloudpickle

p = cloudpickle.dumps(predict)
# Write to an XGBoost-specific file: this cell previously wrote to
# "small_lgbm_tree.pkl", which overwrote the pickle saved by the LightGBM cell.
with open("small_xgb_tree.pkl", "wb") as f:
    f.write(p)

## Scoring and stuff

In [None]:
# Per-era correlation between the stored predictions and the target, shown as a bar chart.
# NOTE(review): this reads whatever is currently in validation_data["prediction"].
# Unless predictions were regenerated after `model` was rebound, these are still
# the ones written by the earlier validation cell - confirm before interpreting.
per_era_corr = validation_data.groupby("era").apply(
    lambda era_rows: numerai_corr(era_rows["prediction"], era_rows["target"])
)
per_era_corr.plot.bar(title="Validation Correlation", figsize=(10, 6), xticks=[]);

In [None]:
# Cumulative per-era correlation plot: more useful than the raw per-era values
# because it gives a sense of "how much is earned" over time.
per_era_corr.cumsum().plot.line(title="Cumulative Validation Correlation", figsize=(10, 6));

# Linear regression


In [None]:
from sklearn.linear_model import LinearRegression

# The tournament data is the `training_data` frame loaded earlier in the notebook
# training_data = training_data

# Feature matrix and target vector
X, y = training_data[feature_cols], training_data["target"]

# Create the linear regression model (rebinds the notebook-level `model`) and fit it
model = LinearRegression()
model.fit(X, y)


In [None]:
# Define predict function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Return a one-column "prediction" frame for `live_features`.

    Uses the notebook-level `model` and `feature_cols`; the result keeps the
    index of `live_features`.
    """
    feature_matrix = live_features[feature_cols]
    scores = pd.Series(model.predict(feature_matrix), index=live_features.index)
    return scores.to_frame("prediction")

# Pickle the function
import cloudpickle

p = cloudpickle.dumps(predict)
with open("small_lin_reg.pkl", "wb") as pickle_file:
    pickle_file.write(p)

## Quadratic regression


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Training features and target. `y` is defined here explicitly so the cell does
# not depend on the linear-regression cell having been run first.
X = training_data[feature_cols]
y = training_data["target"]

# Expand the features with all degree-2 polynomial terms.
# NOTE(review): this multiplies the number of columns considerably; check the
# memory footprint before running on the full training set.
polynomial_features = PolynomialFeatures(degree=2)
X_poly = polynomial_features.fit_transform(X)

# Fit a linear model on the expanded features (rebinds the notebook-level `model`)
model = LinearRegression()
model.fit(X_poly, y)

# Predict on the validation features. `transform` requires the data to expand;
# the previous call passed no argument and raised a TypeError.
X_test_poly = polynomial_features.transform(validation_data[feature_cols])
predictions = model.predict(X_test_poly)


In [None]:
# Define predict function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Return a one-column "prediction" frame for `live_features`.

    The selected feature columns are first expanded with the notebook-level
    `polynomial_features` transformer and then passed to the notebook-level
    `model`; the result keeps the index of `live_features`.
    """
    expanded_features = polynomial_features.transform(live_features[feature_cols])
    scores = pd.Series(model.predict(expanded_features), index=live_features.index)
    return scores.to_frame("prediction")

# Pickle the function
import cloudpickle

p = cloudpickle.dumps(predict)
with open("small_quad_reg.pkl", "wb") as pickle_file:
    pickle_file.write(p)

# Octo Reg