In [2]:
import copy
import iminuit as iminuit
import numba
import optuna
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, StratifiedKFold

from model import LightningFullyConnected, BigLightningModel, FullyConnectedModel
from verstack import LGBMTuner
from lightgbm import LGBMClassifier
import lightning as L
import numpy as np
import pandas as pd
from dataset import ParticleDataset
import torch
from torch import optim
from torch import nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
import pickle
from matplotlib import pyplot as plt

2023-05-20 14:16:20.403331: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-20 14:16:22.289548: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-05-20 14:16:22.289686: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [3]:
APP_ML_PATH = "/home/amh/Documents/Coding/GitHub/AppliedML2023"

# NN hyperparameters
hidden_channels = 20
decode_channels = 6
hidden_layers = 5
p_dropout = 0.2
lr = 0.0003
activation=nn.LeakyReLU
final_activation=nn.Sigmoid
batch_size=2500
optimizer=optim.AdamW
scheduler=optim.lr_scheduler.CosineAnnealingLR
loss_fn=F.binary_cross_entropy
in_channels=15
out_channels=1
use_wandb=True

#LGBM hyperparameters
lgbm_hyper = {'task': 'train', 'learning_rate': 0.04, 'num_leaves': 158, 'feature_fraction': 0.7758858169778262, 'bagging_fraction': 0.6447189399303701, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'lambda_l1': 6.846114915590905, 'lambda_l2': 1.8801187817986014, 'min_split_gain': 0.0, 'zero_as_missing': False, 'max_bin': 255, 'min_data_in_bin': 3, 'random_state': 42, 'device_type': 'cpu', 'num_classes': 1, 'objective': 'binary', 'metric': 'binary_logloss', 'num_threads': 6, 'min_sum_hessian_in_leaf': 1.016024682289675, 'num_iterations': 519}

# Catboost hyperparameters
best_params = {'iterations': 2000, 'learning_rate': 0.03254880658174435, 'depth': 7, 'l2_leaf_reg': 7.565788405198633, 'bootstrap_type': 'Bayesian', 'random_strength': 4.118449381050315e-06, 'bagging_temperature': 0.36985835084484797, 'od_type': 'Iter', 'od_wait': 40}

# NN classifier - Fully trained
model1: LightningFullyConnected = LightningFullyConnected.load_from_checkpoint("../data/initial/nn_clf_final.ckpt")
model1.to("cpu")

# LGBM classifier
lgbm = LGBMClassifier(**lgbm_hyper)

# Catboost classifier
cat_clf = CatBoostClassifier(verbose=False,
                             task_type="GPU",
                             loss_function="Logloss",
                             eval_metric="Logloss",
                             **best_params,)

# Data
data_train = ParticleDataset()
data_test = ParticleDataset(path=f"{APP_ML_PATH}/data/initial/train",
                            target="ALL")
data_train[:][0].to("cpu")
data_train[:][1].to("cpu")
data_test = data_test[:][0].to("cpu")


lgbm.fit(data_train[:][0].detach().numpy(), data_train[:][1].detach().numpy())
cat_clf.fit(data_train[:][0].detach().numpy(), data_train[:][1].detach().numpy())

lgbm_proba = lgbm.predict_proba(data_test.detach().numpy())[:, 1]
cat_proba = cat_clf.predict_proba(data_test.detach().numpy())[:, 1]
nn_proba = model1(data_test).detach().numpy().reshape(-1)
print(lgbm_proba.shape)
print(cat_proba.shape)
print(nn_proba.shape)

(162500,)
(162500,)
(162500,)


In [4]:
# Create ensembles
w1 = lgbm.score(data_train[:][0].detach().numpy(), data_train[:][1].numpy())
print(w1)
w2 = cat_clf.score(data_train[:][0].detach().numpy(), data_train[:][1].numpy())
print(w2)
w3 = np.mean(np.round(model1(data_train[:][0]).detach().numpy().reshape(-1)) == data_train[:][1].numpy())
print(w3)

0.9650276923076923
0.9582461538461539
0.9269723076923077


In [7]:
ensemble_avg2 = (lgbm_proba + cat_proba) / 2
ensemble_avg3 = (lgbm_proba + cat_proba + nn_proba) / 3
ensemble_avg_w2 = (w1 * lgbm_proba + w2 * cat_proba) / (w1 + w2)
ensemble_avg_w3 = (w1 * lgbm_proba + w2 * cat_proba + w3 * nn_proba) / (w1 + w2 + w3)

In [9]:
lgbm_results = pd.Series(lgbm_proba)
cat_results = pd.Series(cat_proba)
nn_results = pd.Series(nn_proba)
ensemble_avg3_results = pd.Series(ensemble_avg3)
ensemble_wavg3_results = pd.Series(ensemble_avg_w3)
ensemble_avg2_results = pd.Series(ensemble_avg2)
ensemble_wavg2_results = pd.Series(ensemble_avg_w2)

SOLUTION_DIR = "data/initial/solutions/classification"

lgbm_results.to_csv(f"{APP_ML_PATH}/{SOLUTION_DIR}/Classification_AndreasMHermansen_LGBM.txt")
cat_results.to_csv(f"{APP_ML_PATH}/{SOLUTION_DIR}/Classification_AndreasMHermansen_Catboost.txt")
nn_results.to_csv(f"{APP_ML_PATH}/{SOLUTION_DIR}/Classification_AndreasMHermansen_NeuralNet.txt")
ensemble_avg3_results.to_csv(f"{APP_ML_PATH}/{SOLUTION_DIR}/Classification_AndreasMHermansen_3ModelLinearEnsemble.txt")
ensemble_wavg3_results.to_csv(f"{APP_ML_PATH}/{SOLUTION_DIR}/Classification_AndreasMHermansen_3ModelWeightedLinearEnsemble.txt")
ensemble_avg2_results.to_csv(f"{APP_ML_PATH}/{SOLUTION_DIR}/Classification_AndreasMHermansen_2ModelLinearEnsemble.txt")
ensemble_wavg2_results.to_csv(f"{APP_ML_PATH}/{SOLUTION_DIR}/Classification_AndreasMHermansen_2ModelWeightedLinearEnsemble.txt")


# Models ranked on validation: a2, w2, cat, lgbm, w3, a3, w4, a4, nn
# Final models??? (a2, w3, cat)