# Classical Model Training Using the CONNIE Dataset

In [1]:
%run ./notebook_init.py

import os
import uproot

import numpy as np
import pandas as pd

from pathlib import Path
from glob import glob
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

from core import MAIN_DIR
from scripts.connie_training_utils import Seed

In [None]:
# Folder holding the processed dataset, located directly under the project's MAIN_DIR.
# The loading cell looks for one sub-folder per category inside it.
processed_data_folder = os.path.join(MAIN_DIR, "processed_data")

In [None]:
# Project seed helper (scripts.connie_training_utils). Later cells call
# seed.get_seed() to set random_state for the split, the folds and the models.
seed = Seed()

In [None]:
# Class names. Each one is used both as a sub-folder name under
# processed_data_folder and as the value written to the "label" column.
categories = ["Alpha", "Blob", "Diffusion_Hit", "Electron", "Muon", "Others"]
# Key looked up in every ROOT file when loading the data.
branch_name = "hitSumm"

In [None]:
# Load every ROOT file of every category into its own DataFrame, tagging the
# rows with the category name. The frames are concatenated in a later cell.
all_data_list = []
all_data_list_excluded_vars = []  # not filled in this cell

print("Starting data loading")
for category in categories:
    category_path = os.path.join(processed_data_folder, category)
    # glob() yields paths in arbitrary, filesystem-dependent order. Sorting
    # keeps the row order of the combined data stable across runs and
    # machines, so the seeded train/test split and CV folds are reproducible.
    root_files = sorted(glob(os.path.join(category_path, "*.root")))

    if not root_files:
        print(f"Warning: No .root files found in {category_path}")
        continue

    print(f"Processing category: {category} ({len(root_files)} files)")
    for file_path in root_files:
        try:
            with uproot.open(file_path) as file:
                if branch_name not in file:
                    print(f"Warning: TTree '{branch_name}' not found in {file_path}. Skipping.")
                    continue
                file_branch = file[branch_name]
                df = file_branch.arrays(library="pd")
                df['label'] = category
                all_data_list.append(df)

        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing file {file_path}: {e}")


Combine all DataFrames into a single DataFrame

In [None]:
# Merge the per-file frames into one table, renumbering the rows from 0.
if not all_data_list:
    # Stop here with a clear message. Otherwise the following cells would fail
    # with a NameError, or silently reuse a df_combined left over from an
    # earlier run in the same kernel.
    raise RuntimeError(
        "No data loaded. Check that processed_data_folder contains .root files "
        "with the expected tree for at least one category."
    )

df_combined = pd.concat(all_data_list, ignore_index=True)
print(f"Successfully loaded {len(df_combined)} rows of data.")

* Calculate the mean of `ePix` and `level` for each row, to be used as scalar features
* Remove features with more than one dimension, such as xPix and yPix
* Remove "flag", as we already filtered for only valid events
* Drop columns with no variance

In [None]:
# Columns removed once the summary features exist: the label, the
# multi-valued columns, and "flag".
columns_to_discard = ["label", "xPix", "yPix", "level", "ePix", "flag"]

# Build the feature table from a copy of the combined data: append the row-wise
# means of "ePix" and "level", then drop the columns listed above.
df_processed = (
    df_combined
    .assign(
        ePixMean=lambda frame: frame["ePix"].apply(np.mean),
        levelMean=lambda frame: frame["level"].apply(np.mean),
    )
    .drop(columns=columns_to_discard)
)

# Keep only the columns that take more than one distinct value.
has_variance = df_processed.nunique() > 1
df_processed = df_processed.loc[:, has_variance]


Calculating the correlation between features

In [None]:
# Pairwise correlation matrix of the remaining features.
corr_df_combined = df_processed.corr()
corr_pairs = corr_df_combined.unstack()

# Keep each unordered feature pair exactly once by position: only the strict
# upper triangle of the matrix survives (no diagonal, no mirrored entries).
# Filtering by value instead (`!= 1.0` followed by drop_duplicates()) would
# discard genuinely perfectly correlated pairs and collapse distinct pairs
# that happen to share the same coefficient.
upper_triangle = np.triu(np.ones(corr_df_combined.shape, dtype=bool), k=1)
filtered = corr_df_combined.where(upper_triangle).unstack().dropna()

# Report the pairs whose absolute correlation exceeds 0.9.
high_corr = filtered[filtered.abs() > 0.9]
print(high_corr.sort_values(ascending=False))

Removing features from the dataframe

In [None]:
# Features excluded from the model inputs.
features_to_drop = [
    "yBary0", "yBary1", "yVar1", "xBary0", "xBary1",
    "ohdu", "E1", "n1", "NpixAC", "DeltaT",
    "chid", "runID", "imgID", "skpID", "xMax",
]

df_processed_final = df_processed.drop(columns=features_to_drop)

* Binarize the labels: muons become label 1 and every other class becomes label 0
* Split the data into training and test sets
* Use k-fold cross-validation for training and validation

In [None]:
# Feature matrix and integer-encoded class labels. Both come from the rows of
# df_combined, so they are expected to stay row-aligned.
x_data = df_processed_final.copy()
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(df_combined["label"])

# Encoded id of the class used as the positive class below.
class_name = "Muon"
class_id = label_encoder.transform([class_name])[0]
print(f"Class ID for '{class_name}':{class_id}\n")

# Use a separate loop variable so class_name still refers to the target class
# after this cell has run.
for i, encoded_class in enumerate(label_encoder.classes_):
    print(f"Class ID {i}: {encoded_class}")

# Hold out 15% of the rows as the final test set; the rest is used for CV.
x_train_cv, x_test_final, y_train_cv, y_test_final = train_test_split(
    x_data, y_data, test_size=0.15, random_state=seed.get_seed())

# Binary target for cross-validation: 1 for the target class, 0 otherwise.
y_binary_muon = (y_train_cv == class_id).astype(int)

k_folds = 5
kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=seed.get_seed())

Train the models

In [None]:
# Accuracy collected per fold, on the fold's training rows and validation rows.
xgb_train_scores = []
xgb_val_scores = []
rf_train_scores = []
rf_val_scores = []

# Hyperparameters shared by all folds; random_state is supplied inside the loop.
xgb_params = dict(
    n_estimators=300,
    use_label_encoder=False,
    eval_metric='logloss',
    gamma=0.98,
    learning_rate=0.05,
)
rf_params = dict(
    n_estimators=300,
    max_features=0.3,
    min_samples_split=10,
)

for train_idx, val_idx in kf.split(x_train_cv, y_binary_muon):
    # Slice the features by position and the binary target by index array.
    x_train = x_train_cv.iloc[train_idx]
    X_val = x_train_cv.iloc[val_idx]
    y_train = y_binary_muon[train_idx]
    y_val = y_binary_muon[val_idx]

    # XGBoost: fit on the fold, then score the training and validation rows.
    xgb = XGBClassifier(**xgb_params, random_state=seed.get_seed())
    xgb.fit(x_train, y_train)
    xgb_train_pred = xgb.predict(x_train)
    xgb_train_scores.append(accuracy_score(y_train, xgb_train_pred))
    xgb_val_pred = xgb.predict(X_val)
    xgb_val_scores.append(accuracy_score(y_val, xgb_val_pred))

    # Random Forest: same procedure on the same fold.
    rf = RandomForestClassifier(**rf_params, random_state=seed.get_seed())
    rf.fit(x_train, y_train)
    rf_train_pred = rf.predict(x_train)
    rf_train_scores.append(accuracy_score(y_train, rf_train_pred))
    rf_val_pred = rf.predict(X_val)
    rf_val_scores.append(accuracy_score(y_val, rf_val_pred))


# Summary over folds: mean training accuracy, mean ± std validation accuracy.
print(f"XGBoost - Train: {np.mean(xgb_train_scores):.4f}, Val: {np.mean(xgb_val_scores):.4f} ± {np.std(xgb_val_scores):.4f}")
print(f"Random Forest - Train: {np.mean(rf_train_scores):.4f}, Val: {np.mean(rf_val_scores):.4f} ± {np.std(rf_val_scores):.4f}")