***
# Starting Kit - Black Swan HiggsML Course
***

In [1]:
# Flag for Colab-only setup: checks whether the string form of the active
# IPython shell mentions "google.colab".
COLAB = "google.colab" in str(get_ipython())

In [2]:
if COLAB:
    ! git clone --depth 1 https://github.com/AboodJamal/Higgs_collaborations.git

    ! git status
    %cd Higgs_collaborations

fatal: destination path 'Higgs_collaborations' already exists and is not an empty directory.
fatal: not a git repository (or any of the parent directories): .git
/content/Higgs_collaborations


In [3]:
# HiggsML utility package should not be modified
# %pip install HiggsML
# %pip install xgboost

In [4]:
# !pip install mlflow

In [5]:
import sys
# Path of the Python interpreter that backs this kernel.
print(sys.executable)

/usr/bin/python3


In [6]:
!pip install mlflow

### Imports

In [7]:
from sys import path
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
from itertools import product
from numpy.random import RandomState
import warnings
import os
import sys
import mlflow
import mlflow.keras

# Silence every warning category for the rest of the session. This keeps cell output
# short, but it also hides deprecation and numerical warnings.
warnings.filterwarnings("ignore")

### Directories

In [None]:
!pip install mplhep

In [None]:
!pip install HiggsML

In [None]:
!pip install iminuit

In [None]:
import os
import sys

# Resolve every directory relative to where the notebook is running.
root_dir = os.getcwd()
print("Root directory is", root_dir)

# Folder that contains model.py, which is imported right after this setup.
submission_dir = os.path.join(root_dir, "sample_code_submission")

# The directory where results will be written. exist_ok makes the call safe on
# re-runs and avoids a separate check-then-create step.
output_dir = os.path.join(root_dir, "sample_result_submission")
os.makedirs(output_dir, exist_ok=True)

# Add submission directory to sys.path, but only once: re-running the cell must
# not stack duplicate entries.
if submission_dir not in sys.path:
    sys.path.append(submission_dir)

# Now import the model
from model import Model


## Import Submission Model
We import a class named `Model` from the submission file (`model.py`). This `Model` class has the following methods:
- `__init__`: receives the train set and the systematics class as input
- `fit`: can be used for training
- `predict`: receives one test set and outputs a dictionary with the following keys
    - `mu_hat` : predicted mu $\hat{\mu}$
    - `delta_mu_hat`: $\Delta{\hat{\mu}}$ bound for $\mu$
    - `p16`: 16th percentile
    - `p84`: 84th percentile

In this example code, the `Model` class implements a basic setup with two different models, each trained to predict the class label.

* 1 XGBoost BDT ( [see](sample_code_submission/boosted_decision_tree.py) )
* 2 TensorFlow NN  ( [see](sample_code_submission/neural_network.py) )

Feature engineering is where you can add derived quantities and decide which features to use. ( [see](sample_code_submission/feature_engineering.py) )

The statistical analysis part is where you write the calculation that finds mu from the output of the classifier. ( [see](sample_code_submission/statistical_analysis.py) )

If running in Colab, click the folder icon in the left sidebar to open the file browser.


## Data
### Available data sets
1. blackSwan_data
2. sample_data
3. neurips2024_data

In [None]:
from HiggsML.datasets import download_dataset

# The name must be one of the entries listed under "Available data sets".
data = download_dataset(
    "blackSwan_data"
)  # currently the Black Swan set; swap the name to load another listed data set

### ⚠️ Note:
The data used here is a small subset of the full data; it is for demonstration only, to give a view of what the data looks like.

In [None]:
# load train set
data.load_train_set()
# The cells below index data_set by column name and call .head()/.describe() on it,
# so it is presumably a pandas DataFrame — confirm against HiggsML.datasets.
data_set = data.get_train_set()

***
## Visualize the Data Set
***

In [None]:
from tabulate import tabulate

target = data_set["labels"]
weights = data_set["weights"]
detailed_label = data_set["detailed_labels"]
keys = np.unique(detailed_label)


def _summary_row(name, event_weights):
    """Build one table row: [name, total weight, number of events, mean weight]."""
    return [
        name,
        np.sum(event_weights),
        len(event_weights),
        np.mean(event_weights),
    ]


# Event weights grouped by detailed label.
weight_keys = {key: weights[detailed_label == key] for key in keys}

# One row per detailed label, then the two class totals
# (labels == 1 is reported as signal, labels == 0 as background).
table_data = [_summary_row(key, weight_keys[key]) for key in keys]
table_data.append(_summary_row("Total Signal", weights[target == 1]))
table_data.append(_summary_row("Total Background", weights[target == 0]))


print("[*] --- Detailed Label Summary")
print(
    tabulate(
        table_data,
        headers=[
            "Detailed Label",
            "Total Weight",
            "Number of events",
            "Average Weight",
        ],
        tablefmt="grid",
    )
)

In [None]:
print("\n[*] --- Examples of all features\n")
# head() keeps the preview to the first rows of the train set.
display(data_set.head())

In [None]:
print("\n[*] --- Description of all features\n")
# describe() shows per-column summary statistics instead of the raw rows.
display(data_set.describe())

In [None]:
print("\n[*] --- Labels vs. Detailed Labels\n")
# First 70 rows of the coarse label shown next to the detailed label.
display(data_set[["labels", "detailed_labels"]].head(70))

In [None]:
# !pip install mplhep

In [None]:
from utils import histogram_dataset

# this function is defined in utils.py in the sample_code_submission directory. feel free to modify it as needed

# Pass the full frame plus the label and weight columns extracted in the summary
# cell; only the four listed columns are requested.
histogram_dataset(
    data_set,
    target,
    weights,
    columns=["PRI_lep_phi", "PRI_met", "DER_mass_vis", "DER_deltaeta_jet_jet"],
)

In [None]:
import seaborn as sns

sns.set_theme(rc={"figure.figsize": (10, 10)}, style="whitegrid")

caption = ["Signal feature", "Background feature"]
# Label value that belongs to each caption. The summary table treats
# `labels == 1` as signal and `labels == 0` as background, so pairing the
# captions with range(2) would print them the wrong way round.
caption_labels = [1, 0]

# The four-column frame is identical for both classes, so build it once
# instead of on every loop iteration.
dfplot = pd.DataFrame(
    data_set,
    columns=[
        "PRI_lep_phi",
        "PRI_met",
        "DER_pt_ratio_lep_had",
        "DER_deltaeta_jet_jet",
    ],
)

for name, label in zip(caption, caption_labels):
    print(name, " correlation matrix")
    corrMatrix = dfplot[target == label].corr()
    sns.heatmap(corrMatrix, annot=True)
    plt.title("Correlation matrix of features")
    plt.show()

# Release the temporary frame.
del dfplot

In [None]:
from HiggsML.visualization import stacked_histogram

# Stacked histogram of the "PRI_jet_subleading_pt" column; labels, weights and
# detailed labels are passed along with the frame.
stacked_histogram(data_set, target, weights, detailed_label, "PRI_jet_subleading_pt")

In [None]:
from HiggsML.visualization import pair_plots

# Show data summary: pair plots of four selected columns.
# sample_size=100 is passed through — presumably the number of events drawn for
# the plot; confirm in HiggsML.visualization.
pair_plots(
    data_set,
    target,
    sample_size=100,
    columns=[
        "PRI_lep_phi",
        "PRI_met",
        "DER_lep_eta_centrality",
        "DER_deltaeta_jet_jet",
    ],
)

### Ingestion



Ingestion is part of your competition framework (from HiggsML.ingestion). Its job is to:

| Responsibility                   | Explanation                                                |
| -------------------------------- | ---------------------------------------------------------- |
| Standardize model interface      | Calls `Model.__init__`, `fit`, `predict` with correct args |
| Pass dataset correctly           | Gives your model access to `get_train_set()`               |
| Enforce submission rules         | Checks naming, format, timing, and required outputs        |
| Log outputs and monitor training | Might save logs, errors, or performance                    |
| Run evaluation (sometimes)       | Possibly computes metrics like `mu_hat` or AUC             |


In [None]:
from HiggsML.ingestion import Ingestion

# Wrap the downloaded dataset object; every later ingestion step goes through this instance.
ingestion = Ingestion(data)

In [None]:
# initialize submission
# NOTE(review): "NN" presumably selects the neural-network variant of Model rather
# than the BDT — confirm against model.py.
ingestion.init_submission(Model,"NN")

In [None]:
# fit submission (per the Ingestion table above, ingestion is what calls the model's `fit`)
ingestion.fit_submission()

In [None]:
# load test set
# NOTE(review): presumably has to happen before predict_submission — confirm in HiggsML.
data.load_test_set()

### Test Settings
The test settings define the test conditions for ingestion.
This includes which systematics to apply and how many pseudo-experiments to run.

In [None]:
# Seed up front so the ground-truth mu values drawn below are reproducible.
RANDOM_SEED = 42
random_state = np.random.RandomState(RANDOM_SEED)

test_settings = {
    # Systematic effects to apply; every one of them is switched off here.
    "systematics": {
        "tes": False,  # tau energy scale
        "jes": False,  # jet energy scale
        "soft_met": False,  # soft term in MET
        "ttbar_scale": False,  # ttbar scale factor
        "diboson_scale": False,  # diboson scale factor
        "bkg_scale": False,  # background scale factor
    },
    # Pseudo-experiments to run per set, and how many sets to run.
    "num_pseudo_experiments": 20,
    "num_of_sets": 1,
}

# One ground-truth mu per set, drawn uniformly from [0.1, 3).
test_settings["ground_truth_mus"] = random_state.uniform(
    low=0.1, high=3, size=test_settings["num_of_sets"]
).tolist()

# Store the settings as JSON inside the result directory.
random_settings_file = os.path.join(output_dir, "test_settings.json")
with open(random_settings_file, "w") as f:
    f.write(json.dumps(test_settings))

In [None]:
# predict submission using the systematics / pseudo-experiment settings defined above
ingestion.predict_submission(test_settings)

In [None]:
# NOTE(review): presumably post-processes the predictions into `ingestion.results_dict`,
# which the scatter plot further down reads — confirm in HiggsML.ingestion.
ingestion.process_results_dict()

In [None]:
# save result into output_dir, the same directory the scoring step is pointed at
ingestion.save_result(output_dir)

## Score
1. Compute Scores
2. Visualize Scores


In [None]:
from HiggsML.score import Scoring

In [None]:
# Initialize Score (no constructor arguments; results are attached via load_ingestion_results)
score = Scoring()

In [None]:
print(output_dir)
# The same directory is passed as both prediction_dir and score_dir.
score.load_ingestion_results(prediction_dir=output_dir, score_dir=output_dir)

In [None]:
# !pip install pydot graphviz

In [None]:
from utils import visualize_model_architecture
from neural_network import NeuralNetwork
from HiggsML.datasets import download_dataset

# Load a separate copy of the training data under its own names, so the `data`
# object already wired into `ingestion` (and `data_set`) is not overwritten.
viz_data = download_dataset("blackSwan_data")
viz_data.load_train_set()
viz_train_set = viz_data.get_train_set()

# Features only: the train set also carries the label and weight columns read in
# the summary cell, so slicing off just the last column would leave them in.
# NOTE(review): if model.py applies feature engineering before building the
# network, apply the same step here so the input width matches.
non_feature_columns = ["labels", "weights", "detailed_labels"]
X_train = viz_train_set.drop(columns=non_feature_columns)

model_instance = NeuralNetwork(X_train)
visualize_model_architecture(model_instance.model, filename="nn_architecture.png")


In [None]:
import os
# Print the kernel's current working directory.
print("Current notebook working directory:", os.getcwd())


In [None]:
# Compute Score with the same test_settings that were passed to predict_submission
score.compute_scores(test_settings)

In [None]:
from HiggsML.visualization import visualize_scatter

# Visualize scatter plot of ground truth mu and predicted mu.
# The ground-truth values come from the same test_settings used for prediction.
visualize_scatter(
    ingestion_result_dict=ingestion.results_dict,
    ground_truth_mus=test_settings["ground_truth_mus"],
)

In [None]:
!python -m HiggsML.score --prediction $output_dir --output $output_dir