<a href="https://colab.research.google.com/github/AAA530/Higgs_boson_classification/blob/main/AI_project_higgs_boson.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting up the colab notebook

In [1]:
! pip install -q kaggle

In [2]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"aaa530","key":"810688b283394e65d41f44aa8e38f705"}'}

In [3]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
! kaggle datasets list

ref                                                                   title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
arnabchaki/data-science-salaries-2023                                 Data Science Salaries 2023 💸                         25KB  2023-04-13 09:55:16          33750        919  1.0              
tawfikelmetwally/automobile-dataset                                   Car information dataset                               6KB  2023-05-28 18:26:48           4408        129  0.9411765        
fatihb/coffee-quality-data-cqi                                        Coffee Quality Data (CQI May-2023)                   22KB  2023-05-12 13:06:39           6489        139  1.0              
mohithsairamreddy/salary-data 

In [6]:
! kaggle competitions download -c higgs-boson

Downloading higgs-boson.zip to /content
100% 54.0M/54.3M [00:03<00:00, 17.8MB/s]
100% 54.3M/54.3M [00:03<00:00, 15.2MB/s]


In [7]:
! mkdir higgs-boson

In [8]:
!unzip higgs-boson.zip -d higgs-boson/

Archive:  higgs-boson.zip
  inflating: higgs-boson/HiggsBosonCompetition_AMSMetric_rev1.py  
  inflating: higgs-boson/random_submission.zip  
  inflating: higgs-boson/test.zip    
  inflating: higgs-boson/training.zip  


In [9]:
! mkdir higgs-boson/train

In [10]:
!unzip higgs-boson/training.zip -d higgs-boson/train/

Archive:  higgs-boson/training.zip
  inflating: higgs-boson/train/training.csv  


## Starting with data visualization and data preprocessing

### Project Objective
The objective of the project is to classify an event produced in the particle accelerator as background or signal. As described earlier, a background event is explained by the existing theories and previous observations. A signal event, however, indicates a process that cannot be described by previous observations and leads to the potential discovery of a new particle.

In [11]:
# File system manangement
import time, psutil, os

# Mathematical functions
import math

# Data manipulation
import numpy as np
import pandas as pd

# Plotting and visualization
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patches as mpatches

from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
from matplotlib import cm
from mpl_toolkits.mplot3d.axes3d import get_test_data

import seaborn as sns
sns.set_theme()
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [12]:
# Loading the training data
data_train = pd.read_csv('higgs-boson/training.zip')
data_test = pd.read_csv('higgs-boson/test.zip')

print(pd.Series({"Memory usage": "{:.2f} MB".format(data_train.memory_usage().sum()/(1024*1024)),
                 "Dataset shape": "{}".format(data_train.shape)}).to_string())
data_train.head()

Memory usage         62.94 MB
Dataset shape    (250000, 33)


Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [13]:
data_train.loc[data_train.index[0]]

EventId                          100000
DER_mass_MMC                     138.47
DER_mass_transverse_met_lep      51.655
DER_mass_vis                     97.827
DER_pt_h                          27.98
DER_deltaeta_jet_jet               0.91
DER_mass_jet_jet                124.711
DER_prodeta_jet_jet               2.666
DER_deltar_tau_lep                3.064
DER_pt_tot                       41.928
DER_sum_pt                       197.76
DER_pt_ratio_lep_tau              1.582
DER_met_phi_centrality            1.396
DER_lep_eta_centrality              0.2
PRI_tau_pt                       32.638
PRI_tau_eta                       1.017
PRI_tau_phi                       0.381
PRI_lep_pt                       51.626
PRI_lep_eta                       2.273
PRI_lep_phi                      -2.414
PRI_met                          16.824
PRI_met_phi                      -0.277
PRI_met_sumet                   258.733
PRI_jet_num                           2
PRI_jet_leading_pt               67.435


In [14]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()


data_train['Label'] = enc.fit_transform(data_train['Label'])
data_train.head()


Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,1
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,0
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,0
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,0
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,0


In [15]:
data_train.drop(['Weight'], axis=1,inplace=True)

In [16]:
data_train.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,1
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,0
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,0
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0


In [17]:
data_train.shape,data_test.shape

((250000, 32), (550000, 31))

In [18]:
X = data_train.drop(columns = "Label")
y = data_train['Label']

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)


In [20]:
from sklearn.tree import DecisionTreeClassifier  
classifier_DT= DecisionTreeClassifier(criterion='gini', random_state=0)  
classifier_DT.fit(X_train, y_train) 

In [21]:
y_predict_dt = classifier_DT.predict(X_test) 
from sklearn.metrics import accuracy_score,r2_score
print("Accuracy : "+ str(accuracy_score(y_test,y_predict_dt)))
print("R2 score : "+ str(r2_score(y_test,y_predict_dt)))


Accuracy : 0.7640533333333334
R2 score : -0.045952231065285565


# ANN for the data

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [None]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense


In [None]:
classifier = Sequential()
#add input layer and first hidden layer
classifier.add(Dense(6, kernel_initializer = "uniform", activation = "relu", input_dim = 31))
#add 2nd hidden layer
classifier.add(Dense(6, kernel_initializer = "uniform", activation = "relu"))

classifier.add(Dense(1, kernel_initializer = "uniform", activation = "sigmoid"))

classifier.compile(optimizer = "Adam", loss ="binary_crossentropy", metrics = ["accuracy"])


In [None]:
classifier.fit(X_train, y_train, batch_size = 30, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f106c661c90>

In [None]:
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)




In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


In [None]:
print(cm)

[[43525  5543]
 [ 7704 18228]]


# PyTorch

In [None]:
X = data_train.drop(columns = "Label")
y = data_train['Label']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

# Define the model class
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        
        # Define the layers
        self.fc1 = nn.Linear(31, 6)
        self.fc2 = nn.Linear(6, 6)
        self.fc3 = nn.Linear(6, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Create an instance of the model
classifier = Classifier()

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(classifier.parameters())

# Create an instance of StandardScaler
sc = StandardScaler()

# Convert the data to torch tensors and perform data preprocessing
X_train = torch.tensor(sc.fit_transform(X_train),dtype=torch.float32)
X_test = torch.tensor(sc.transform(X_test),dtype=torch.float32)
y_train = torch.tensor(y_train.to_numpy(),dtype=torch.float32).reshape(-1,1)
y_test = torch.tensor(y_test.to_numpy(),dtype=torch.float32).reshape(-1,1)



In [None]:

num_epochs = 10
batch_size = 10000

# Train the model
for epoch in range(num_epochs):
  for i in range(0, len(X_train), batch_size):
    # Zero the gradients
    optimizer.zero_grad()

    # Forward pass
    outputs = classifier(X_train)
    loss = criterion(outputs, y_train)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()
  print(f'Finished epoch {epoch}, latest loss {loss}')

Finished epoch 0, latest loss 0.6589263677597046
Finished epoch 1, latest loss 0.6477974653244019
Finished epoch 2, latest loss 0.6356185078620911
Finished epoch 3, latest loss 0.6224154829978943
Finished epoch 4, latest loss 0.611514687538147
Finished epoch 5, latest loss 0.6025620698928833
Finished epoch 6, latest loss 0.5931715369224548
Finished epoch 7, latest loss 0.5831978917121887
Finished epoch 8, latest loss 0.5728825926780701
Finished epoch 9, latest loss 0.5620506405830383


In [None]:
with torch.no_grad():
    y_pred = classifier(X_test)
accuracy = (y_pred.round() == y_test).float().mean()
print(f"Accuracy {accuracy}")

Accuracy 0.6767200231552124


#Semi Supervised Decision Tree

In [22]:
# Loading the training data labelled

data_train = pd.read_csv('higgs-boson/training.zip')
data_test = pd.read_csv('higgs-boson/test.zip')

print(pd.Series({"Memory usage": "{:.2f} MB".format(data_train.memory_usage().sum()/(1024*1024)),
                 "Dataset shape": "{}".format(data_train.shape)}).to_string())
data_train.head()


from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()


data_train['Label'] = enc.fit_transform(data_train['Label'])
data_train.drop(['Weight'], axis=1,inplace=True)
data_train.head()

Memory usage         62.94 MB
Dataset shape    (250000, 33)


Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,1
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,0
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,0
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,0


In [23]:
labelled_data = data_train.iloc[:1000]
unlabelled_data = data_train.iloc[1000:10000]

unlabelled_data


Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Label
1000,101000,213.398,83.423,122.297,2.217,-999.000,-999.000,-999.000,3.648,2.217,...,129.555,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000,0
1001,101001,74.239,31.364,56.832,50.391,-999.000,-999.000,-999.000,1.800,1.745,...,251.981,1,49.277,-2.121,-1.753,-999.000,-999.000,-999.000,49.277,0
1002,101002,56.977,44.013,43.921,57.708,-999.000,-999.000,-999.000,1.402,26.208,...,215.719,1,38.466,1.740,-1.001,-999.000,-999.000,-999.000,38.466,0
1003,101003,140.563,9.658,95.006,6.068,-999.000,-999.000,-999.000,3.060,6.068,...,193.413,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000,1
1004,101004,125.036,3.000,74.697,134.903,6.535,1991.475,-10.537,1.472,1.253,...,280.138,2,80.741,2.893,2.871,71.395,-3.643,1.876,152.137,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,109995,-999.000,63.778,48.528,4.306,-999.000,-999.000,-999.000,1.718,4.306,...,105.207,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000,0
9996,109996,95.998,13.293,55.290,59.265,-999.000,-999.000,-999.000,2.255,11.367,...,137.655,1,62.098,3.498,-3.113,-999.000,-999.000,-999.000,62.098,1
9997,109997,110.956,12.936,84.361,108.394,2.723,297.556,-0.852,1.502,45.155,...,473.755,3,89.863,-0.360,2.347,71.268,2.363,1.709,221.210,0
9998,109998,103.270,35.624,71.944,18.157,-999.000,-999.000,-999.000,2.850,18.157,...,133.645,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000,0


In [37]:

X = labelled_data.drop(columns = "Label")
y = labelled_data['Label']


# from sklearn.tree import DecisionTreeClassifier  
# classifier_DT= DecisionTreeClassifier(criterion='gini', random_state=0)  
# classifier_DT.fit(X.values, y) 

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
classifier_DT = gnb.fit(X.values, y)


In [38]:

X_test = unlabelled_data.drop(columns = "Label")
y_test = unlabelled_data['Label']


y_predict_dt = classifier_DT.predict_proba(X_test) 




In [39]:
print(y_predict_dt)

[[9.99717059e-01 2.82941395e-04]
 [7.95451322e-01 2.04548678e-01]
 [9.68324950e-01 3.16750496e-02]
 ...
 [1.17586518e-08 9.99999988e-01]
 [2.43227800e-03 9.97567722e-01]
 [4.33246759e-03 9.95667532e-01]]


In [28]:
X

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,138.470,51.655,97.827,27.980,0.91,124.711,2.666,3.064,41.928,...,-0.277,258.733,2,67.435,2.150,0.444,46.062,1.24,-2.475,113.497
1,100001,160.937,68.768,103.235,48.146,-999.00,-999.000,-999.000,3.473,2.078,...,-1.916,164.546,1,46.226,0.725,1.158,-999.000,-999.00,-999.000,46.226
2,100002,-999.000,162.172,125.953,35.635,-999.00,-999.000,-999.000,3.148,9.336,...,-2.186,260.414,1,44.251,2.053,-2.028,-999.000,-999.00,-999.000,44.251
3,100003,143.905,81.417,80.943,0.414,-999.00,-999.000,-999.000,3.310,0.414,...,0.060,86.062,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,-0.000
4,100004,175.864,16.915,134.805,16.405,-999.00,-999.000,-999.000,3.891,16.405,...,-0.871,53.131,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,100995,138.306,1.453,93.058,1.597,-999.00,-999.000,-999.000,3.235,1.597,...,0.570,69.225,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,0.000
996,100996,71.303,31.438,55.612,2.110,-999.00,-999.000,-999.000,2.719,2.110,...,-1.553,58.185,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,0.000
997,100997,-999.000,77.947,87.548,1.396,-999.00,-999.000,-999.000,2.464,1.396,...,0.932,55.672,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,0.000
998,100998,188.996,114.478,93.671,10.506,-999.00,-999.000,-999.000,3.144,34.687,...,-3.049,243.343,1,39.852,-0.418,-0.411,-999.000,-999.00,-999.000,39.852


In [29]:
X_test

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
1000,101000,213.398,83.423,122.297,2.217,-999.000,-999.000,-999.000,3.648,2.217,...,-0.306,129.555,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000
1001,101001,74.239,31.364,56.832,50.391,-999.000,-999.000,-999.000,1.800,1.745,...,-2.751,251.981,1,49.277,-2.121,-1.753,-999.000,-999.000,-999.000,49.277
1002,101002,56.977,44.013,43.921,57.708,-999.000,-999.000,-999.000,1.402,26.208,...,1.087,215.719,1,38.466,1.740,-1.001,-999.000,-999.000,-999.000,38.466
1003,101003,140.563,9.658,95.006,6.068,-999.000,-999.000,-999.000,3.060,6.068,...,-2.486,193.413,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000
1004,101004,125.036,3.000,74.697,134.903,6.535,1991.475,-10.537,1.472,1.253,...,-0.179,280.138,2,80.741,2.893,2.871,71.395,-3.643,1.876,152.137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,109995,-999.000,63.778,48.528,4.306,-999.000,-999.000,-999.000,1.718,4.306,...,-2.987,105.207,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000
9996,109996,95.998,13.293,55.290,59.265,-999.000,-999.000,-999.000,2.255,11.367,...,-0.244,137.655,1,62.098,3.498,-3.113,-999.000,-999.000,-999.000,62.098
9997,109997,110.956,12.936,84.361,108.394,2.723,297.556,-0.852,1.502,45.155,...,0.063,473.755,3,89.863,-0.360,2.347,71.268,2.363,1.709,221.210
9998,109998,103.270,35.624,71.944,18.157,-999.000,-999.000,-999.000,2.850,18.157,...,-0.046,133.645,0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000


In [30]:
y

0      1
1      0
2      0
3      0
4      0
      ..
995    1
996    0
997    0
998    0
999    0
Name: Label, Length: 1000, dtype: int64

In [None]:
# X_test.loc(1001)
# print(X_test.loc[X_test['EventId'] == 101001.000])
X_test.drop(X_test.index[0])
X_test.iloc[0]

EventId                        101001.000
DER_mass_MMC                       74.239
DER_mass_transverse_met_lep        31.364
DER_mass_vis                       56.832
DER_pt_h                           50.391
DER_deltaeta_jet_jet             -999.000
DER_mass_jet_jet                 -999.000
DER_prodeta_jet_jet              -999.000
DER_deltar_tau_lep                  1.800
DER_pt_tot                          1.745
DER_sum_pt                        125.622
DER_pt_ratio_lep_tau                2.262
DER_met_phi_centrality             -1.307
DER_lep_eta_centrality           -999.000
PRI_tau_pt                         23.406
PRI_tau_eta                        -0.124
PRI_tau_phi                        -0.022
PRI_lep_pt                         52.940
PRI_lep_eta                        -0.688
PRI_lep_phi                         1.688
PRI_met                             7.312
PRI_met_phi                        -2.751
PRI_met_sumet                     251.981
PRI_jet_num                       

In [31]:

# # Assuming you have an ndarray with shape (9000, 2) called 'ndarray'
# ndarray = np.random.rand(9000, 2)

# # Assuming you have a DataFrame with shape (9000, 31) called 'df_original'
# df_original = pd.DataFrame(np.random.rand(9000, 31))

# Assuming you have an empty DataFrame with shape (1000, 31) called 'df_appended'
pred = classifier_DT.predict_proba(X_test)

df_appended = pd.DataFrame(columns=X_test.columns)
df_appended_y = pd.Series([])


# Iterate over the ndarray and check conditions
indices_to_remove = []
for index, values in enumerate(pred):
    # value1 =   # Access the first value
    # print(values[0])
    if values[0] > 0.8:
        df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
        # df_appended_y = df_appended_y.append(pd.Series({'Label': 0}), ignore_index=True)
        df_appended_y = pd.concat([df_appended_y, pd.Series({'Label': 0})], ignore_index=True)
        indices_to_remove.append(index)

    if values[1]>0.8:
        df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
        # df_appended_y = df_appended_y.append(pd.Series({'Label': 1}), ignore_index=True)
        df_appended_y = pd.concat([df_appended_y, pd.Series({'Label': 1})], ignore_index=True)

        indices_to_remove.append(index)

# Remove the appended entries from the original DataFrame
X_test = X_test.drop(X_test.index[indices_to_remove])
# y = y.drop()

# Print the appended DataFrame
print(df_appended)
print(df_appended_y)

# Print the updated original DataFrame
print(X_test)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[

       EventId  DER_mass_MMC  DER_mass_transverse_met_lep  DER_mass_vis  \
0     101000.0       213.398                       83.423       122.297   
1     101001.0        74.239                       31.364        56.832   
2     101002.0        56.977                       44.013        43.921   
3     101003.0       140.563                        9.658        95.006   
4     101004.0       125.036                        3.000        74.697   
...        ...           ...                          ...           ...   
8995  109995.0      -999.000                       63.778        48.528   
8996  109996.0        95.998                       13.293        55.290   
8997  109997.0       110.956                       12.936        84.361   
8998  109998.0       103.270                       35.624        71.944   
8999  109999.0       115.644                       30.715        83.097   

      DER_pt_h  DER_deltaeta_jet_jet  DER_mass_jet_jet  DER_prodeta_jet_jet  \
0        2.217      

  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)
  df_appended = df_appended.append(X_test.iloc[index], ignore_index=True)


In [32]:
# Print the appended DataFrame
print(df_appended)
print(df_appended_y)

# Print the updated original DataFrame
print(X_test)

       EventId  DER_mass_MMC  DER_mass_transverse_met_lep  DER_mass_vis  \
0     101000.0       213.398                       83.423       122.297   
1     101001.0        74.239                       31.364        56.832   
2     101002.0        56.977                       44.013        43.921   
3     101003.0       140.563                        9.658        95.006   
4     101004.0       125.036                        3.000        74.697   
...        ...           ...                          ...           ...   
8995  109995.0      -999.000                       63.778        48.528   
8996  109996.0        95.998                       13.293        55.290   
8997  109997.0       110.956                       12.936        84.361   
8998  109998.0       103.270                       35.624        71.944   
8999  109999.0       115.644                       30.715        83.097   

      DER_pt_h  DER_deltaeta_jet_jet  DER_mass_jet_jet  DER_prodeta_jet_jet  \
0        2.217      

In [33]:
concatenated_df = pd.concat([X, df_appended])
concatenated_series = pd.concat([y, df_appended_y])



In [34]:
concatenated_df

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000.0,138.470,51.655,97.827,27.980,0.910,124.711,2.666,3.064,41.928,...,-0.277,258.733,2.0,67.435,2.150,0.444,46.062,1.240,-2.475,113.497
1,100001.0,160.937,68.768,103.235,48.146,-999.000,-999.000,-999.000,3.473,2.078,...,-1.916,164.546,1.0,46.226,0.725,1.158,-999.000,-999.000,-999.000,46.226
2,100002.0,-999.000,162.172,125.953,35.635,-999.000,-999.000,-999.000,3.148,9.336,...,-2.186,260.414,1.0,44.251,2.053,-2.028,-999.000,-999.000,-999.000,44.251
3,100003.0,143.905,81.417,80.943,0.414,-999.000,-999.000,-999.000,3.310,0.414,...,0.060,86.062,0.0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,-0.000
4,100004.0,175.864,16.915,134.805,16.405,-999.000,-999.000,-999.000,3.891,16.405,...,-0.871,53.131,0.0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,109995.0,-999.000,63.778,48.528,4.306,-999.000,-999.000,-999.000,1.718,4.306,...,-2.987,105.207,0.0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000
8996,109996.0,95.998,13.293,55.290,59.265,-999.000,-999.000,-999.000,2.255,11.367,...,-0.244,137.655,1.0,62.098,3.498,-3.113,-999.000,-999.000,-999.000,62.098
8997,109997.0,110.956,12.936,84.361,108.394,2.723,297.556,-0.852,1.502,45.155,...,0.063,473.755,3.0,89.863,-0.360,2.347,71.268,2.363,1.709,221.210
8998,109998.0,103.270,35.624,71.944,18.157,-999.000,-999.000,-999.000,2.850,18.157,...,-0.046,133.645,0.0,-999.000,-999.000,-999.000,-999.000,-999.000,-999.000,0.000


In [35]:
concatenated_series

0       1
1       0
2       0
3       0
4       0
       ..
8995    0
8996    0
8997    1
8998    0
8999    1
Length: 10000, dtype: int64

In [36]:
X_test

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt


In [None]:

pred = classifier_DT.predict_proba(X_test)
for index, row in enumerate(pred):
  # if(index == 1000):
  #   print(index)
    
    if pred[index][0] > 0.8:
      X = pd.concat([X, X_test.iloc[index]], ignore_index=True)
      # X.append(X_test.loc[index],ignore_index=True)

      y = pd.concat([y, pd.Series({'Label': 0})], ignore_index=True)
      # y.append(pd.Series({'Label': 0}),ignore_index=True)
      X_test = X_test.drop(X_test.index[index])
    if pred[index][1] > 0.8:
      X = pd.concat([X, X_test.iloc[index]], ignore_index=True)
      # X.append(X_test.loc[index],ignore_index=True)
      
      y = pd.concat([y, pd.Series({'Label': 1})], ignore_index=True)
      # y.append(pd.Series({'Label': 1}),ignore_index=True)

      X_test = X_test.drop(X_test.index[index])



IndexError: ignored

In [None]:
X

In [None]:
y