# To get started with Data Understanding:
>Kaggle Link:
https://www.kaggle.com/competitions/higgs-boson/data
>
>Journal Article about the dataset (*Documentation about this data is good.*):
https://proceedings.mlr.press/v42/cowa14.pdf
>
>Offical dataset (*if you don't want to use Kaggle*):
https://opendata.cern.ch/record/328

Type this into your notebook:
>>`!curl -O https://opendata.cern.ch/record/328/files/atlas-higgs-challenge-2014-v2.csv.gz`
>>
>>`!ls`
>>
>>`!gunzip 'atlas-higgs-challenge-2014-v2.csv.gz'`


In [4]:
!curl -O https://opendata.cern.ch/record/328/files/atlas-higgs-challenge-2014-v2.csv.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 62.5M  100 62.5M    0     0  6148k      0  0:00:10  0:00:10 --:--:-- 6307k


In [5]:
!ls # sanity check.

atlas-higgs-challenge-2014-v2.csv.gz  DATA_3402_Rough_Draft.ipynb  README.md


In [6]:
!gunzip 'atlas-higgs-challenge-2014-v2.csv.gz'

In [7]:
!ls # sanity check.

atlas-higgs-challenge-2014-v2.csv  DATA_3402_Rough_Draft.ipynb	README.md


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [40]:
df = pd.read_csv('atlas-higgs-challenge-2014-v2.csv')
df.drop(['KaggleSet', 'KaggleWeight', 'EventId'], axis=1, inplace=True) # drop columns that we may not need.

In [41]:
df.head(5) # sanity check

Unnamed: 0,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,197.76,1.582,1.396,0.2,32.638,1.017,0.381,51.626,2.273,-2.414,16.824,-0.277,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s
1,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,125.157,0.879,1.414,-999.0,42.014,2.039,-3.011,36.918,0.501,0.103,44.704,-1.916,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b
2,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,197.814,3.776,1.414,-999.0,32.154,-0.705,-2.093,121.409,-0.953,1.052,54.283,-2.186,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b
3,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,75.968,2.354,-1.285,-999.0,22.647,-1.655,0.01,53.321,-0.522,-3.1,31.082,0.06,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b
4,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,57.983,1.056,-1.385,-999.0,28.209,-2.197,-2.231,29.774,0.798,1.569,2.723,-0.871,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b


#### By running this code, we can identify where to start our filtering for signal and background

In [42]:
df.Label.unique()

array(['s', 'b'], dtype=object)

### Import necessary ML libraries.

In [43]:
import sklearn.discriminant_analysis as DA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LinearRegression

### I looped through the names to get VarNames. Excluding `Label` which contains the sig/bkg

In [44]:
VarNames=[i for i in df.columns if i != 'Label']

VarNames
# Note: After reading the article: "The subject of the Challenge
# was to study the H to tau tau channel."
# So maybe we only look at ['DER_pt_h','DER_deltar_tau_lep','DER_pt_ratio_lep_tau','PRI_tau_pt','PRI_tau_eta','PRI_tau_phi']

['DER_mass_MMC',
 'DER_mass_transverse_met_lep',
 'DER_mass_vis',
 'DER_pt_h',
 'DER_deltaeta_jet_jet',
 'DER_mass_jet_jet',
 'DER_prodeta_jet_jet',
 'DER_deltar_tau_lep',
 'DER_pt_tot',
 'DER_sum_pt',
 'DER_pt_ratio_lep_tau',
 'DER_met_phi_centrality',
 'DER_lep_eta_centrality',
 'PRI_tau_pt',
 'PRI_tau_eta',
 'PRI_tau_phi',
 'PRI_lep_pt',
 'PRI_lep_eta',
 'PRI_lep_phi',
 'PRI_met',
 'PRI_met_phi',
 'PRI_met_sumet',
 'PRI_jet_num',
 'PRI_jet_leading_pt',
 'PRI_jet_leading_eta',
 'PRI_jet_leading_phi',
 'PRI_jet_subleading_pt',
 'PRI_jet_subleading_eta',
 'PRI_jet_subleading_phi',
 'PRI_jet_all_pt',
 'Weight']

In [45]:
# Remove rows where any value is -999
df_clean = df[(df != -999).all(axis=1)]

N = len(df_clean) # total size of cleaned data
Train_sample = df_clean.iloc[:round(N * .8)] # training sample size
Test_sample = df_clean.iloc[round(N * .8):] # testing sample size

X_Train = Train_sample[VarNames] # EXPLANATORY training sample
y_Train = Train_sample["Label"].apply(lambda x: 1 if x == 's' else 0) # RESPONSE training sample

X_Test = Test_sample[VarNames] # EXPLANATORY testing sample
y_Test = Test_sample["Label"].apply(lambda x: 1 if x == 's' else 0) # RESPONSE testing sample

Test_sig = Test_sample[Test_sample.Label == 's'] # filter for signal
Test_bkg = Test_sample[Test_sample.Label == 'b'] # filter for background

In [48]:
# Least Squres Regression Model

lr = LinearRegression()
lr.fit(X_Train, y_Train)

from sklearn.metrics import r2_score
y_pred = lr.predict(X_Test)
# Get the R-squared score
r2 = r2_score(y_Test, y_pred)
print(f"R-squared (R2) Score: {r2:.4f}")

R-squared (R2) Score: 0.4443


Low score as expected. Let's try Ridge Regression

In [49]:
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X_Train)
X_Test_scaled = scaler.transform(X_Test)

alphas = [0.1, 1.0, 10.0, 100]
rcv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', cv=5)
rcv.fit(x_scaled, y_Train)


best_alpha = rcv.alpha_
print(f'Best Alpha: {best_alpha}')
y_pred = rcv.predict(X_Test_scaled)
r2 = r2_score(y_Test, y_pred)
print(f'R2 Score: {r2:.4f}')

Best Alpha: 10.0
R2 Score: 0.4443


Not much better, this means no feature is highly correlleted to the data and the model is not overfit

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

input_dim = x_scaled.shape[1]

model = Sequential([
    Dense(64, activation='relu', input_shape=(input_dim,)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1)
])

model.summary()

ValueError: Cannot convert '31' to a shape.

In [59]:
model.compile(
    optimizer='adam', # A popular and effective choice for gradient descent
    loss='mse',       # Mean Squared Error, the standard loss for regression
    metrics=['mae', 'R2Score'] # Track Mean Absolute Error and R2 score
)

In [60]:
history = model.fit(
    x_scaled,
    y_Train,
    epochs=50,          # The number of times to iterate over the entire dataset
    batch_size=32,      # The number of samples per gradient update
    validation_split=0.1, # Use 10% of the training data to monitor performance
    verbose=1           # Display training progress
)

Epoch 1/50


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(None, 31), dtype=float32). Expected shape (None, 178859, 31), but input has incompatible shape (None, 31)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 31), dtype=float32)
  • training=True
  • mask=None
  • kwargs=<class 'inspect._empty'>