In [2]:
import uproot
import torch
import numpy as np
import mplhep
from matplotlib import pyplot as plt
import torch.nn as nn
import xgboost as xgb
import torch.optim as optim
import sklearn.datasets
import sklearn.metrics
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import optuna

file = uproot.open('/Users/augielarson/Desktop/LHCb_CERN_Datasets_Research/Gen_DKKPi_1.root')

tree = file['fitTree']


pred_metric = torch.tensor(charge_array, dtype=torch.float32)


variables = [
    'mPrime', 'thPrime', 'charge', 'genSig', 'genqqbar', 'iEvtWithinExpt', 'iExpt', 
    'cosHel12', 'cosHel13', 'cosHel23', 'efficiency', 'evtWeight', 'm12', 'm12Sq', 
    'm13', 'm13Sq', 'm23', 'm23Sq'
]

# Array Extraction and Storage
arrays = [tree[var].array(library='np') for var in variables]

# Stacking Input Arrays (Charge Agnostic)
input_data_arr = np.stack(arrays[:-1], axis=1)
charge_arr = arrays[2]
prediction_metric = np.where(charge_arr > 0, 1, 0)

# Split; Training (70%), Validation (15%), Test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(input_data_arr, pred_metric, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Creating DMatrices for XGBoost Operators
DM_train = xgb.DMatrix(X_train, label=y_train)
DM_eval = xgb.DMatrix(X_val, label=y_val)
DM_test = xgb.DMatrix(X_test, label=y_test)

# Defining Objective Function for Optuna Studies
def objective(trial):
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "tree_method": "exact",
        "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 9, step=2),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
        "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
    }

    evals = [(DM_train, 'train'), (dval, 'eval')]
    num_epochs = 30
    best = xgb.train(param, DM_train, num_epochs, evals, early_stopping_rounds=10)
    predictions = bst.predict(DM_eval)
    prediction_labels = np.rint(predictions)
    accuracy = accuracy_score(y_val, prediction_labels)
    if accuracy > 0.95:
        raise optuna.exceptions.TrialPruned()

    return accuracy

# Running Optuna
if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Best Parameters
    best_params = trial.params

    # Training Final Model with Best Study
    evals = [(dtrain, 'train'), (DM_eval, 'eval')]
    final_best = xgb.train(best_params, DM_train, num_epochs, evals, early_stopping_rounds=10)

    # Evaluating Test Set
    y_test_pred = final_bst.predict(DM_test)

    # Computing ROC and AUROC (Test Set)
    fpr, tpr, _ = roc_curve(y_test, y_test_pred)
    roc_auc = auc(fpr, tpr)

    # Plotting
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # Predicting new Data Points
    X_new = np.random.randn(1, X_train.shape[1])
    data_new = xgb.DMatrix(X_new)
    output = final_bst.predict(data_new)
    prediction = 1 if output[0] > 0.5 else 0
    print("Prediction:", prediction)


NameError: name 'charge_array' is not defined