In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"ashishrajput88","key":"7091cd62a789835f9aacd6822fff56bb"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle competitions download -c higgs-boson
!unzip higgs-boson.zip

Downloading higgs-boson.zip to /content
  0% 0.00/54.3M [00:00<?, ?B/s]
100% 54.3M/54.3M [00:00<00:00, 999MB/s]
Archive:  higgs-boson.zip
  inflating: HiggsBosonCompetition_AMSMetric_rev1.py  
  inflating: random_submission.zip   
  inflating: test.zip                
  inflating: training.zip            


In [4]:
!ls

HiggsBosonCompetition_AMSMetric_rev1.py  random_submission.zip	training.zip
higgs-boson.zip				 sample_data
kaggle.json				 test.zip


In [5]:
!unzip training.zip

Archive:  training.zip
  inflating: training.csv            


In [6]:
!ls

HiggsBosonCompetition_AMSMetric_rev1.py  random_submission.zip	training.csv
higgs-boson.zip				 sample_data		training.zip
kaggle.json				 test.zip


# Higgs Boson Classification using Deep Neural Network (ANN)

This notebook presents a deep learning approach to classify particle collision events as signal (Higgs boson) or background using the [Higgs Boson Machine Learning Challenge Dataset](https://www.kaggle.com/c/higgs-boson).

The goal is to build a binary classification model that can distinguish rare signal events from dominant background noise using 28 physics-engineered features.


In [7]:
import pandas as pd

In [8]:
import numpy as np

df = pd.read_csv('training.csv')

## Dataset Summary

- Total samples: 250,000+
- Features: 30 (28 float + 2 categorical: EventID and Weight)
- Label: 's' (signal) and 'b' (background)

**Challenge:** The dataset is heavily imbalanced. Signal events are much fewer than background.


In [9]:
df.head(5)

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [10]:
df.tail(5)

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
249995,349995,-999.0,71.989,36.548,5.042,-999.0,-999.0,-999.0,1.392,5.042,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,4.505083,b
249996,349996,-999.0,58.179,68.083,22.439,-999.0,-999.0,-999.0,2.585,22.439,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,2.497259,b
249997,349997,105.457,60.526,75.839,39.757,-999.0,-999.0,-999.0,2.39,22.183,...,1,41.992,1.8,-0.166,-999.0,-999.0,-999.0,41.992,0.018636,s
249998,349998,94.951,19.362,68.812,13.504,-999.0,-999.0,-999.0,3.365,13.504,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.681611,b
249999,349999,-999.0,72.756,70.831,7.479,-999.0,-999.0,-999.0,2.025,7.479,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.877474,b


In [12]:
df.shape

(250000, 31)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 31 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   DER_mass_MMC                 250000 non-null  float64
 1   DER_mass_transverse_met_lep  250000 non-null  float64
 2   DER_mass_vis                 250000 non-null  float64
 3   DER_pt_h                     250000 non-null  float64
 4   DER_deltaeta_jet_jet         250000 non-null  float64
 5   DER_mass_jet_jet             250000 non-null  float64
 6   DER_prodeta_jet_jet          250000 non-null  float64
 7   DER_deltar_tau_lep           250000 non-null  float64
 8   DER_pt_tot                   250000 non-null  float64
 9   DER_sum_pt                   250000 non-null  float64
 10  DER_pt_ratio_lep_tau         250000 non-null  float64
 11  DER_met_phi_centrality       250000 non-null  float64
 12  DER_lep_eta_centrality       250000 non-null  float64
 13 

In [14]:
df.describe()

Unnamed: 0,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,-49.023079,49.239819,81.181982,57.895962,-708.420675,-601.237051,-709.356603,2.3731,18.917332,158.432217,...,-0.010119,209.797178,0.979176,-348.329567,-399.254314,-399.259788,-692.381204,-709.121609,-709.118631,73.064591
std,406.345647,35.344886,40.828691,63.655682,454.480565,657.972302,453.019877,0.782911,22.273494,115.706115,...,1.812223,126.499506,0.977426,532.962789,489.338286,489.333883,479.875496,453.384624,453.389017,98.015662
min,-999.0,0.0,6.329,0.0,-999.0,-999.0,-999.0,0.208,0.0,46.104,...,-3.142,13.678,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
25%,78.10075,19.241,59.38875,14.06875,-999.0,-999.0,-999.0,1.81,2.841,77.55,...,-1.575,123.0175,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0
50%,105.012,46.524,73.752,38.4675,-999.0,-999.0,-999.0,2.4915,12.3155,120.6645,...,-0.024,179.739,1.0,38.96,-1.872,-2.093,-999.0,-999.0,-999.0,40.5125
75%,130.60625,73.598,92.259,79.169,0.49,83.446,-4.593,2.961,27.591,200.47825,...,1.561,263.37925,2.0,75.349,0.433,0.503,33.703,-2.457,-2.275,109.93375
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433


## Preprocessing Steps

- Dropped irrelevant columns: `EventId`, `Weight`
- Replaced invalid values (-999.0) with NaNs
- Imputed missing values using `most_frequent` strategy
- Standardized all numerical features using `StandardScaler`
- Converted labels: `'s' → 1` and `'b' → 0`
- Split data into Train/Validation/Test using stratified sampling


In [11]:
df = df.drop(columns={'EventId', 'Weight'})

In [15]:
df['Label'].unique()

array(['s', 'b'], dtype=object)

In [16]:
df.isnull().sum()

Unnamed: 0,0
DER_mass_MMC,0
DER_mass_transverse_met_lep,0
DER_mass_vis,0
DER_pt_h,0
DER_deltaeta_jet_jet,0
DER_mass_jet_jet,0
DER_prodeta_jet_jet,0
DER_deltar_tau_lep,0
DER_pt_tot,0
DER_sum_pt,0


In [17]:
df.duplicated().value_counts()

Unnamed: 0,count
False,250000


In [18]:
df = df.replace(-999.0, np.nan)

In [19]:
for i in df.columns :
  if df[i].dtype == 'int64' :
    df[i] = df[i].astype('int32')
  elif df[i].dtype == 'float64' :
    df[i] = df[i].astype('float32')

In [20]:
from sklearn.impute import SimpleImputer

In [21]:
si = SimpleImputer(strategy='most_frequent')

In [22]:
null_cols = []

for i in df.columns :
  if df[i].isnull().sum() > 0 :
    null_cols.append(i)

print("Columns: ", null_cols)
print("Number of null columns: ", len(null_cols))

Columns:  ['DER_mass_MMC', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_lep_eta_centrality', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi']
Number of null columns:  11


In [23]:
df[null_cols] = si.fit_transform(df[null_cols])

In [24]:
df['Label'] = df['Label'].replace({'s':1, 'b':0}).astype('int32')

  df['Label'] = df['Label'].replace({'s':1, 'b':0}).astype('int32')


In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
ss = StandardScaler()

In [27]:
X = df.drop(columns={'Label'})

In [28]:
X = ss.fit_transform(X)

In [29]:
y = df['Label']

In [30]:
y

Unnamed: 0,Label
0,1
1,0
2,0
3,0
4,0
...,...
249995,0
249996,0
249997,1
249998,0


In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [33]:
from imblearn.over_sampling import SMOTE

In [34]:
smt = SMOTE(random_state=42)

In [35]:
#X_train, y_train = smt.fit_resample(X_train, y_train)

In [36]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, test_size=0.2, stratify=y_train)

In [37]:
y_train = y_train.values.ravel().astype('int32')

In [38]:
y_test = y_test.values.ravel().astype('int32')

In [39]:
y_val = y_val.values.ravel().astype('int32')

## Deep Neural Network (ANN) Architecture

- Input: 28 features
- Hidden Layers:
  - Dense(700) → BatchNorm → Dropout(0.5)
  - Dense(500) → BatchNorm → Dropout(0.3)
  - Dense(250) → BatchNorm
  - Dense(100) → BatchNorm
  - Dense(50)  → BatchNorm → Dropout(0.1)
  - Dense(25)  → BatchNorm
- Output: Dense(1, activation='sigmoid')

> Optimizer: Adam  
> Loss: Binary Crossentropy  
> Metrics: Accuracy, AUC  
> Class Weighting: `{0: 1.0, 1: 2.0}`  
> EarlyStopping: Based on `val_auc` with patience=5


In [40]:
import tensorflow as tf

In [41]:
import keras

In [194]:
model = keras.Sequential([
    (tf.keras.layers.Dense(700, activation='relu')),
    (tf.keras.layers.BatchNormalization()),
    (tf.keras.layers.Dropout(0.5)),
    (tf.keras.layers.Dense(500, activation='relu')),
    (tf.keras.layers.BatchNormalization()),
    (tf.keras.layers.Dropout(0.3)),
    (tf.keras.layers.Dense(250, activation='relu')),
    (tf.keras.layers.BatchNormalization()),
    (tf.keras.layers.Dense(100, activation='relu')),
    (tf.keras.layers.BatchNormalization()),
    (tf.keras.layers.Dense(50, activation='relu')),
    (tf.keras.layers.BatchNormalization()),
    (tf.keras.layers.Dropout(0.1)),
    (tf.keras.layers.Dense(25, activation='relu')),
    (tf.keras.layers.BatchNormalization()),
    (tf.keras.layers.Dense(1, activation='sigmoid'))
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', keras.metrics.AUC(name='auc', from_logits=False)])

## Training Performance

Model was trained with EarlyStopping on the validation AUC. Training stopped at epoch 30 and restored the best weights.

The model showed stable convergence without significant overfitting.


In [195]:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=5,
    mode='max',
    restore_best_weights=True,
    verbose=1
)


In [196]:
model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), callbacks=[early_stop], class_weight={0: 1.0, 1: 2.0},)

Epoch 1/100
[1m4375/4375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 4ms/step - accuracy: 0.7509 - auc: 0.8377 - loss: 0.6670 - val_accuracy: 0.8189 - val_auc: 0.8989 - val_loss: 0.4128
Epoch 2/100
[1m4375/4375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5ms/step - accuracy: 0.7949 - auc: 0.8807 - loss: 0.5837 - val_accuracy: 0.8231 - val_auc: 0.9032 - val_loss: 0.3990
Epoch 3/100
[1m4375/4375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4ms/step - accuracy: 0.8036 - auc: 0.8869 - loss: 0.5694 - val_accuracy: 0.8158 - val_auc: 0.9029 - val_loss: 0.4059
Epoch 4/100
[1m4375/4375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - accuracy: 0.8047 - auc: 0.8898 - loss: 0.5630 - val_accuracy: 0.8256 - val_auc: 0.9083 - val_loss: 0.3923
Epoch 5/100
[1m4375/4375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - accuracy: 0.8113 - auc: 0.8931 - loss: 0.5556 - val_accuracy: 0.8180 - val_auc: 0.9077 - val_loss: 0.3992
Epoch 6/10

<keras.src.callbacks.history.History at 0x7cfaaf7ec610>

In [197]:
loss, accuracy, auc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}, Test Loss: {loss:.4f}")

[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8287 - auc: 0.9106 - loss: 0.3796
Test Accuracy: 0.8305, Test Loss: 0.3791


In [310]:
from sklearn.metrics import roc_auc_score

y_pred = model.predict(X_test)

[1m2344/2344[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


## Threshold Tuning

Instead of using a default 0.5 classification threshold, we tuned and fixed the decision threshold to **0.67**, which provided a better precision-recall trade-off for the minority class (signal events).


In [311]:
y_pred = (y_pred >= 0.67).astype('int32')

In [312]:
y_pred

array([[1],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int32)

In [313]:
from sklearn.metrics import classification_report, accuracy_score

## Final Evaluation on Test Set

### Classification Report:

| Class          | Precision | Recall | F1-Score |
|----------------|-----------|--------|----------|
| 0 (background) |   0.86    |  0.91  |   0.88   |
| 1 (signal)     |   0.80    |  0.71  |   0.75   |

- **Accuracy**: 0.84
- **AUC Score**: ~0.91
- **Macro F1**: 0.82

### Key Insight:
The model performs significantly well on both majority and minority classes, even under class imbalance.


In [314]:
print("Report: ", classification_report(y_test, y_pred))

Report:                precision    recall  f1-score   support

           0       0.86      0.91      0.88     49555
           1       0.80      0.71      0.75     25445

    accuracy                           0.84     75000
   macro avg       0.83      0.81      0.82     75000
weighted avg       0.84      0.84      0.84     75000



In [315]:
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.84284
