# Step 1: Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
import tensorflow
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LeakyReLU, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam, Nadam
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2


#for data preprocessing
from google.colab import drive

#for loading the model
from tensorflow.keras.models import load_model



# Step 2: Data preperation



1.   Connecting google drive




In [2]:

drive.mount('/content/drive')


Mounted at /content/drive


2. Load the data into a pandas dataframe

In [3]:
path = '/content/drive/MyDrive/Artifical Minds - CMPS261/HIGGS_train.csv'
data = pd.read_csv(path)

data.info()

  data = pd.read_csv(path)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599999 entries, 0 to 599998
Data columns (total 29 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   1.00E+00    599999 non-null  float64
 1   8.69E-01    599999 non-null  float64
 2   -6.35E-01   599999 non-null  float64
 3   2.26E-01    599999 non-null  float64
 4   3.27E-01    599999 non-null  float64
 5   -6.90E-01   599999 non-null  float64
 6   7.54E-01    599999 non-null  float64
 7   -2.49E-01   599999 non-null  float64
 8   -1.09E+00   599999 non-null  object 
 9   0.00E+00    599999 non-null  float64
 10  1.37E+00    599999 non-null  float64
 11  -6.54E-01   599999 non-null  float64
 12  9.30E-01    599999 non-null  float64
 13  1.11E+00    599999 non-null  float64
 14  1.14E+00    599999 non-null  float64
 15  -1.58E+00   599999 non-null  float64
 16  -1.05E+00   599999 non-null  float64
 17  0.00E+00.1  599998 non-null  float64
 18  6.58E-01    599999 non-null  float64
 19  -1

3. Naming the columns

In [4]:
column_names = ['class_label', 'lepton_pt', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude', 'missing_energy_phi',
                'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag', 'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag',
                'jet_3_pt', 'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta', 'jet_4_phi', 'jet_4_b-tag',
                'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']
data.columns = column_names

4. Checking for null values

In [5]:
print(data.isna().sum())
data['jet_1_phi'] = pd.to_numeric(data['jet_1_phi'], errors='coerce')
data['jet_4_b-tag'] = pd.to_numeric(data['jet_4_b-tag'], errors='coerce')
data['class_label'] = data['class_label'].astype(int)

data = data.dropna();
data.drop_duplicates(inplace=True)
data.info()

class_label                 0
lepton_pt                   0
lepton_eta                  0
lepton_phi                  0
missing_energy_magnitude    0
missing_energy_phi          0
jet_1_pt                    0
jet_1_eta                   0
jet_1_phi                   0
jet_1_b-tag                 0
jet_2_pt                    0
jet_2_eta                   0
jet_2_phi                   0
jet_2_b-tag                 0
jet_3_pt                    0
jet_3_eta                   0
jet_3_phi                   0
jet_3_b-tag                 1
jet_4_pt                    0
jet_4_eta                   0
jet_4_phi                   0
jet_4_b-tag                 0
m_jj                        0
m_jjj                       0
m_lv                        0
m_jlv                       0
m_bb                        0
m_wbb                       0
m_wwbb                      0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 599193 entries, 0 to 599998
Data columns (total 29 columns):
 #   C

# Step 3: Data Scaling

1.  Split the data into features and labels


In [6]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]
print(y.values)

[1 1 0 ... 1 1 0]


2. Feature scaling

In [8]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

#Step 4: Model architecture 

In [10]:
def create_improved_model(activation='relu', optimizer='adam'):
    inputs = Input(shape=(X.shape[1],))
    x = Dense(1024, activation=activation)(inputs)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation=activation)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    x = Dense(256, activation=activation)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(128, activation=activation)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create the model
improved_model = KerasClassifier(build_fn=create_improved_model, epochs=50, batch_size=256 , verbose=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Add EarlyStopping and ReduceLROnPlateau callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5, min_lr=0.00001)

# Train and evaluate the model
history = improved_model.fit(X_train, y_train, validation_split=0.33, callbacks=[early_stopping, lr_reduction], verbose=1)
test_accuracy = improved_model.score(X_test, y_test)
print("Test accuracy:", test_accuracy)


  improved_model = KerasClassifier(build_fn=create_improved_model, epochs=50, batch_size=256 , verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.7595190405845642


#Step 5: Saving, loading and printing the info of our model



In [None]:
# Save the model
improved_model.model.save('my_model.h5')

In [16]:

# Load the saved model
loaded_model = load_model('my_model.h5')

# Print the summary of the loaded model
loaded_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 28)]              0         
                                                                 
 dense_5 (Dense)             (None, 1024)              29696     
                                                                 
 batch_normalization_4 (Batc  (None, 1024)             4096      
 hNormalization)                                                 
                                                                 
 dropout_4 (Dropout)         (None, 1024)              0         
                                                                 
 dense_6 (Dense)             (None, 512)               524800    
                                                                 
 batch_normalization_5 (Batc  (None, 512)              2048      
 hNormalization)                                           