In [2]:
import uproot
import torch
import numpy as np
import mplhep
from matplotlib import pyplot as plt
import torch.nn as nn
import xgboost as xgb
import torch.optim as optim
import sklearn.datasets
import sklearn.metrics
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import optuna
from xgboost import XGBClassifier

In [3]:
file = uproot.open('/Users/augielarson/Desktop/LHCb_CERN_Datasets_Research/Gen_DKKPi_1.root')
tree = file['fitTree']
data_df = tree.arrays(library='pd')

In [4]:
data_df.head()

Unnamed: 0,charge,genSig,genqqbar,iEvtWithinExpt,iExpt,cosHel12,cosHel13,cosHel23,efficiency,evtWeight,m12,m12Sq,m13,m13Sq,m23,m23Sq,mPrime,thPrime
0,-1,1,0,0,0,-0.600779,-0.021703,-0.618265,1.0,1.0,0.82971,0.688418,1.616485,2.613025,0.837294,0.701062,0.656096,0.705143
1,-1,1,0,1,0,-0.256318,0.576692,0.517862,1.0,1.0,1.238359,1.533533,1.264775,1.599655,0.932372,0.869317,0.283286,0.582509
2,-1,1,0,2,0,-0.616962,0.612876,0.245549,1.0,1.0,1.139805,1.299155,1.42175,2.021373,0.825819,0.681977,0.381394,0.711637
3,-1,1,0,3,0,0.924468,0.827218,0.972939,1.0,1.0,1.31434,1.727489,1.018129,1.036586,1.112848,1.23843,0.186025,0.12451
4,-1,1,0,4,0,-0.268246,0.781068,0.778652,1.0,1.0,1.314281,1.727334,1.199678,1.439226,0.9143,0.835945,0.186117,0.586444


In [5]:
variables = [
    'mPrime', 'thPrime', 'charge', 'genSig', 'genqqbar', 'iEvtWithinExpt', 'iExpt', 
    'cosHel12', 'cosHel13', 'cosHel23', 'efficiency', 'evtWeight', 'm12', 'm12Sq', 
    'm13', 'm13Sq', 'm23', 'm23Sq'
]

In [10]:
# Array Extraction and Storage
arrays = [tree[var].array(library='np') for var in variables]

# Stacking Input Arrays (Charge Agnostic)
input_data_arr = np.stack(arrays[:-1], axis=1)
charge_arr = arrays[2]
prediction_metric = np.where(charge_arr > 0, 1, 0)


# Split; Training (70%), Validation (15%), Test (15%)
x_train, x_temp, y_train, y_temp = train_test_split(input_data_arr, prediction_metric, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [11]:
# Objective Func
num_epochs = 30
def objective(trial):
    # Hyperparams Suggestions
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 10, step=2)
    n_estimators = trial.suggest_int("n_estimators", 100, 300, step=100)
    gamma = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
    min_child_weight = trial.suggest_int("min_child_weight", 2, 10)
    max_delta_step = trial.suggest_int("max_delta_step", 0, 10)
    subsample = trial.suggest_float("subsample", 0.2, 1.0)
    
    # XGB Hyperparams
    model = XGBClassifier(
        objective='binary:logistic',
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        gamma=gamma,
        min_child_weight=min_child_weight,
        max_delta_step=max_delta_step,
        subsample=subsample,
        seed=42,
        use_label_encoder=False,
        eval_metric='logloss'  # THis avoids the wanring for label encoding
    )

    # Train Model
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=10, verbose=False)

    # Valid Pred Set
    y_pred = model.predict(x_val)

    # Accuracy
    accuracy = accuracy_score(y_val, y_pred)

    return accuracy

# Running Optuna
if __name__ == "__main__":
    # Creating a study to maximize accuracy
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=1000, timeout=600)

    # Output Results
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Retrieve best Hyperparams
    best_params = trial.params

    # Train w/ best Hyperparams
    final_model = XGBClassifier(
        objective='binary:logistic',
        learning_rate=best_params['learning_rate'],
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        gamma=best_params['gamma'],
        min_child_weight=best_params['min_child_weight'],
        max_delta_step=best_params['max_delta_step'],
        subsample=best_params['subsample'],
        seed=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Final Training
    final_model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=10, verbose=False)

    # Preds on Test
    y_test_pred = final_model.predict(x_test)

    # ROC and AUROC for Test
    fpr, tpr, _ = roc_curve(y_test, y_test_pred)
    roc_auc = auc(fpr, tpr)

    # Plotting ROC
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # New Data Point Preds
    X_new = np.random.randn(1, x_train.shape[1])
    output = final_model.predict(X_new)
    print("Prediction:", output[0])


[I 2024-10-06 23:28:10,974] A new study created in memory with name: no-name-7ff93f71-8972-4ceb-9948-9c0b5b7f5a6f
[I 2024-10-06 23:28:12,492] Trial 0 finished with value: 0.6341333333333333 and parameters: {'learning_rate': 0.00036809302513860534, 'max_depth': 4, 'n_estimators': 300, 'gamma': 5.0910969153166114e-08, 'min_child_weight': 5, 'max_delta_step': 5, 'subsample': 0.6909219590169432}. Best is trial 0 with value: 0.6341333333333333.
[I 2024-10-06 23:28:13,462] Trial 1 finished with value: 0.6341333333333333 and parameters: {'learning_rate': 4.768785558590724e-05, 'max_depth': 2, 'n_estimators': 200, 'gamma': 0.00018710393192242566, 'min_child_weight': 2, 'max_delta_step': 10, 'subsample': 0.710695144217468}. Best is trial 0 with value: 0.6341333333333333.
[I 2024-10-06 23:28:14,421] Trial 2 finished with value: 1.0 and parameters: {'learning_rate': 0.002221943080608557, 'max_depth': 2, 'n_estimators': 200, 'gamma': 0.24648397431573266, 'min_child_weight': 8, 'max_delta_step': 0,

KeyboardInterrupt: 