In [None]:
from pytorch_lightning import seed_everything
from config import create_config, create_data_loaders
import numpy as np
import torch
from tqdm import tqdm

# Run settings handed to `create_config`.
args = dict(
    model='mtt',                 # dummy model name
    dataset='higgs',             # name of the dataset
    data_root='/path/to/data/',  # root directory of the data
    run_name='auto',
    cuda=1,
    seed=1,
)

# Build the configuration object from the settings above
# (per the original note, the configuration comes from a YAML file).
config = create_config(args)

# Apply the configured seed; the call's return value is the cell output.
seed_everything(config['seed'])

Seed set to 1


1

## XGBoost

In [27]:
import xgboost as xgb    

# Load the dataset and flatten it into NumPy feature / label matrices.
# Labels are stacked column-wise, one column per task.
# Fail fast on an unknown dataset name: otherwise neither branch runs and later
# cells would hit undefined (or stale) x_train / y_train / x_test / y_test.
if config['dataset'] not in ('aliexpress', 'acs_income', 'higgs'):
    raise ValueError(
        f"Unsupported dataset {config['dataset']!r}; "
        "expected 'aliexpress', 'acs_income' or 'higgs'."
    )

dataset = create_data_loaders(config, return_splits=True)

if config['dataset'] == 'aliexpress':
    # Features: categorical and numerical parts side by side.
    x_train = np.concatenate((dataset['train'].x_cat, dataset['train'].x_num), axis=1)
    x_test = np.concatenate((dataset['test'].x_cat, dataset['test'].x_num), axis=1)
    # Labels: column 0 = click, column 1 = conversion.
    y_train = np.concatenate((dataset['train'].y_click.unsqueeze(-1), dataset['train'].y_conversion.unsqueeze(-1)), axis=1)
    y_test = np.concatenate((dataset['test'].y_click.unsqueeze(-1), dataset['test'].y_conversion.unsqueeze(-1)), axis=1)
else:
    # 'acs_income' and 'higgs' expose a feature tensor plus a mapping of per-task targets.
    x_train = dataset['train'].features.numpy()
    x_test = dataset['test'].features.numpy()
    # NOTE(review): label columns follow the iteration order of `targets`; the
    # evaluation cells name columns via `config.data.tasks` - confirm both orders agree.
    y_train = np.concatenate([target.unsqueeze(-1).numpy() for target in dataset['train'].targets.values()], axis=1)
    y_test = np.concatenate([target.unsqueeze(-1).numpy() for target in dataset['test'].targets.values()], axis=1)

In [28]:
# Display the `data` section of the loaded configuration as the cell output.
config.data

{'format': 'h5',
 'name': 'higgs',
 'num_features': 21,
 'path': '/higgs11M/',
 'short_name': 'hig',
 'task_out_dim': {'Target': 1,
  'm_bb': 1,
  'm_jj': 1,
  'm_jjj': 1,
  'm_jlv': 1,
  'm_lv': 1,
  'm_wbb': 1,
  'm_wwbb': 1},
 'task_type': {'Target': 'binary',
  'm_bb': 'regression',
  'm_jj': 'regression',
  'm_jjj': 'regression',
  'm_jlv': 'regression',
  'm_lv': 'regression',
  'm_wbb': 'regression',
  'm_wwbb': 'regression'},
 'tasks': ['Target',
  'm_jj',
  'm_jjj',
  'm_lv',
  'm_jlv',
  'm_bb',
  'm_wbb',
  'm_wwbb'],
 'seperate_ft_types': False,
 'feature_dims': {'0': 3,
  '1': 3,
  '2': 3,
  '3': 3,
  '4': 1,
  '5': 1,
  '6': 1,
  '7': 1,
  '8': 1,
  '9': 1,
  '10': 1,
  '11': 1,
  '12': 1,
  '13': 1,
  '14': 1,
  '15': 1,
  '16': 1,
  '17': 1,
  '18': 1,
  '19': 1,
  '20': 1}}

In [29]:
# Sanity check: report the shape of every feature / label matrix.
shape_report = [
    f"x_train shape: {x_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"x_test shape: {x_test.shape}",
    f"y_test shape: {y_test.shape}",
]
print("\n".join(shape_report))

x_train shape: (9900000, 21)
y_train shape: (9900000, 8)
x_test shape: (550000, 21)
y_test shape: (550000, 8)


## AliExpress

In [10]:
# AliExpress baseline: a single XGBoost model fitted on both binary targets at once.
# Guarded so that Restart & Run All with a different dataset selected skips this
# section instead of fitting a logistic objective to unrelated label columns.
if config['dataset'] == 'aliexpress':
    # Convert the training and testing data into DMatrix format (XGBoost specific data structure).
    # The label matrices carry one column per task (click, conversion).
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)

    # Set parameters for XGBoost
    params = {
        'max_depth': 6,
        'learning_rate': 0.2,
        'objective': 'binary:logistic',  # binary objective
        # NOTE(review): no `evals` list is passed to xgb.train below, so this
        # metric is presumably never reported during training - confirm.
        'eval_metric': 'auc',  # Evaluation metric
    }

    # Train the XGBoost model
    num_rounds = 200  # Number of training rounds
    bst = xgb.train(params, dtrain, num_rounds)

    # Make predictions on the testing data (indexed per task column downstream)
    y_pred = bst.predict(dtest)

In [11]:
from torchmetrics.classification import AUROC

# Score the AliExpress predictions: log loss and AUC-ROC for every task column.
# Guarded so the section is skipped when another dataset is selected.
if config['dataset'] == 'aliexpress':
    # Initialize the binary AUROC metric and the log-loss criterion
    auroc = AUROC(task='binary')
    bce = torch.nn.BCELoss()

    for i, task in enumerate(config.data.tasks):
        # Build each tensor once per task instead of once per metric call.
        task_pred = torch.tensor(y_pred[:, i])
        task_true = torch.tensor(y_test[:, i])
        # torchmetrics documents binary targets as an integer tensor; BCELoss
        # needs float targets, hence the two explicit casts.
        auc_roc_score = auroc(task_pred, task_true.long())
        bce_loss = bce(task_pred, task_true.to(torch.float32))
        print(f"Task: {task}, LogLoss: {bce_loss:.5f}, AUC-ROC: {auc_roc_score:.5f}")

Task: click, LogLoss: 0.10830, AUC-ROC: 0.72272
Task: conversion, LogLoss: 0.00626, AUC-ROC: 0.84519


## ACS Income

In [16]:
# ACS Income baseline: one binary model on label column 0 and one 5-class model
# on label column 1.
# Guarded so that Restart & Run All with a different dataset selected skips this
# section instead of fitting classification objectives to unrelated label columns.
if config['dataset'] == 'acs_income':
    # Convert the training and testing data into DMatrix format (XGBoost specific data structure)
    dtrain_binary = xgb.DMatrix(x_train, label=y_train[:, 0])
    dtest_binary = xgb.DMatrix(x_test, label=y_test[:, 0])

    params_binary = {
        'max_depth': 6,
        'learning_rate': 0.2,
        'objective': 'binary:logistic',  # binary objective
        'eval_metric': 'auc',  # Evaluation metric
    }

    dtrain_multi = xgb.DMatrix(x_train, label=y_train[:, 1])
    dtest_multi = xgb.DMatrix(x_test, label=y_test[:, 1])

    params_multiclass = {
        'max_depth': 6,
        'learning_rate': 0.1,
        'objective': 'multi:softprob',  # multi-class objective
        'num_class': 5,  # Number of classes for multi-class classification
        'eval_metric': 'auc',  # Evaluation metric
    }

    # Train the XGBoost models (same number of rounds for both)
    num_rounds = 200  # Number of training rounds
    bst_binary = xgb.train(params_binary, dtrain_binary, num_rounds)
    bst_multi = xgb.train(params_multiclass, dtrain_multi, num_rounds)

    # Make predictions on the testing data
    y_pred_binary = bst_binary.predict(dtest_binary)
    y_pred_multi = bst_multi.predict(dtest_multi)

In [24]:
from torchmetrics.classification import AUROC

# Score the ACS Income predictions with AUC-ROC (binary task, then 5-class task).
# Guarded so the section is skipped when another dataset is selected.
if config['dataset'] == 'acs_income':
    # Initialize the binary AUROC metric
    auroc = AUROC(task='binary')

    # Initialize the multiclass AUROC metric
    auroc_multiclass = AUROC(task='multiclass', num_classes=5)

    # Cast the binary labels to integers, matching the multiclass call below;
    # torchmetrics documents classification targets as integer tensors.
    auc_roc_score = auroc(torch.tensor(y_pred_binary), torch.tensor(y_test[:, 0]).long())
    print(f"Task: {config.data.tasks[0]}, AUC-ROC: {auc_roc_score:.5f}")

    auc_roc_score = auroc_multiclass(torch.tensor(y_pred_multi), torch.tensor(y_test[:, 1]).long())
    print(f"Task: {config.data.tasks[1]}, AUC-ROC: {auc_roc_score:.5f}")

Task: PINCP, AUC-ROC: 0.89738
Task: MAR, AUC-ROC: 0.88438


## Higgs

In [30]:
# Higgs baseline: one binary model on label column 0 and one multi-output
# regression model on the remaining label columns.
# Guarded so that Restart & Run All with a different dataset selected skips this
# section instead of training on label columns with a different meaning.
if config['dataset'] == 'higgs':
    # Convert the training and testing data into DMatrix format (XGBoost specific data structure)
    dtrain_binary = xgb.DMatrix(x_train, label=y_train[:, 0])
    dtest_binary = xgb.DMatrix(x_test, label=y_test[:, 0])

    # Set parameters for XGBoost
    params_binary = {
        'max_depth': 6,
        'learning_rate': 0.2,
        'objective': 'binary:logistic',  # binary objective
        'eval_metric': 'auc',  # Evaluation metric
    }

    # All regression targets are passed together as a 2-D label matrix.
    dtrain_regression = xgb.DMatrix(x_train, label=y_train[:, 1:])
    dtest_regression = xgb.DMatrix(x_test, label=y_test[:, 1:])

    # Config for XGBoost on regression
    params = {
        'max_depth': 6,
        'learning_rate': 0.1,
        'objective': 'reg:squarederror',  # regression objective
        'eval_metric': 'rmse',  # Evaluation metric
    }

    # Train the XGBoost models
    num_rounds = 100  # Number of training rounds
    bst_binary = xgb.train(params_binary, dtrain_binary, num_rounds)
    bst_regression = xgb.train(params, dtrain_regression, num_rounds)

    # Make predictions on the testing data; the regression predictions are
    # indexed per target column downstream.
    y_pred_binary = bst_binary.predict(dtest_binary)
    y_pred_regression = bst_regression.predict(dtest_regression)

In [31]:
from torchmetrics.classification import AUROC
from torchmetrics.regression import ExplainedVariance

# Score the Higgs predictions: AUC-ROC for the binary task, explained variance
# for every regression task.
# Guarded so the section is skipped when another dataset is selected.
if config['dataset'] == 'higgs':
    # Initialize the metrics
    auroc = AUROC(task='binary')
    explained_variance = ExplainedVariance()

    # The label matrix also holds the regression columns, so the binary column
    # is cast to integers explicitly; torchmetrics documents binary targets as
    # an integer tensor.
    auc_roc_score = auroc(torch.tensor(y_pred_binary), torch.tensor(y_test[:, 0]).long())
    print(f"Task: {config.data.tasks[0]}, AUC-ROC: {auc_roc_score:.5f}")

    # Prediction column i corresponds to label column i + 1 (column 0 is the binary task).
    for i, task in enumerate(config.data.tasks[1:]):
        explained_variance_score = explained_variance(torch.tensor(y_pred_regression[:, i]), torch.tensor(y_test[:, i + 1]))
        print(f"Task: {task}, Explained Variance: {explained_variance_score:.5f}")

Task: Target, AUC-ROC: 0.71872
Task: m_jj, Explained Variance: 0.24298
Task: m_jjj, Explained Variance: 0.29085
Task: m_lv, Explained Variance: 0.22060
Task: m_jlv, Explained Variance: 0.40948
Task: m_bb, Explained Variance: 0.77195
Task: m_wbb, Explained Variance: 0.59887
Task: m_wwbb, Explained Variance: 0.54457
