<a href="https://colab.research.google.com/github/Ava100rav/CERN_proton_collision/blob/main/1_CERN_Proton_Collision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# DATASET : -   https://www.kaggle.com/fedesoriano/multijet-primary-dataset

CONTEXT

MultiJet primary dataset in AOD format from RunB of 2010. This dataset contains runs from 2010 RunB; the list of all validated runs must be applied to all analyses. This file contains events from the MultiJet primary dataset of the CMS open data release, together with the computed razor variables MR and Rsq, which are used in supersymmetric particle searches.

In [None]:
# 1) Run: The run number of the event.
# 2) Lumi: The lumi section of the event.
# 3) Event: The event number of the event.
# 4) MR: First razor kinematic variable, the MR variable is an estimate of an overall mass scale, 
#                  which in the limit of massless decay products equals the mass of the heavy parent particle.
# 5) Rsq: Second razor kinematic variable, the Rsq variable is the square of the ratio R, 
#   which quantifies the flow of energy in the plane perpendicular to the beam and the partitioning of momentum between visible and invisible particles.
# 6,7,8,9) E1,Px1,Py1,Pz1: The four-vector of the leading megajet (with the largest transverse momentum).
# 10,11,12,13) E2,Px2,Py2,Pz2: The four-vector of the subleading megajet (with the second-largest transverse momentum).
# 14) HT: The scalar sum of the transverse momentum of the jets.
# 15) MET: The magnitude of the vector sum of the transverse energy of the particles in the event.
# 16) nJets: The number of jets with transverse momentum above 40 GeV.
# 17) nBJets: The number of b-tagged jets with transverse momentum above 40 GeV.

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt   
import matplotlib.style 

In [None]:
# Lift pandas' display truncation: a value of None removes the cap on how many
# columns and rows are rendered when a DataFrame is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the MultiJet CSV from the mounted Drive folder and preview its first three rows.
csv_path = '/content/drive/MyDrive/28 feb/MultiJetRun2010B.csv'
df = pd.read_csv(csv_path)
df.head(n=3)

Unnamed: 0,Run,Lumi,Event,MR,Rsq,E1,Px1,Py1,Pz1,E2,Px2,Py2,Pz2,HT,MET,nJets,nBJets
0,148029,388,302318745,215.553,0.031977,136.71,-109.893,-54.0342,-58.9032,142.179,70.0254,41.1225,-116.513,203.666,18.311,2,0
1,148029,388,302323641,155.437,0.042157,83.3865,81.15,6.88361,-12.9688,73.9025,-72.2472,11.8835,3.0899,154.659,14.7747,2,0
2,148029,388,302336217,400.563,0.026938,253.184,139.902,102.64,-101.935,535.551,-110.379,-89.0929,-516.179,343.28,25.2211,3,0


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(21726, 17)

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21726 entries, 0 to 21725
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Run     21726 non-null  int64  
 1   Lumi    21726 non-null  int64  
 2   Event   21726 non-null  int64  
 3   MR      21726 non-null  float64
 4   Rsq     21726 non-null  float64
 5   E1      21726 non-null  float64
 6   Px1     21726 non-null  float64
 7   Py1     21726 non-null  float64
 8   Pz1     21726 non-null  float64
 9   E2      21726 non-null  float64
 10  Px2     21726 non-null  float64
 11  Py2     21726 non-null  float64
 12  Pz2     21726 non-null  float64
 13  HT      21726 non-null  float64
 14  MET     21726 non-null  float64
 15  nJets   21726 non-null  int64  
 16  nBJets  21726 non-null  int64  
dtypes: float64(12), int64(5)
memory usage: 2.8 MB


In [None]:
# Total number of missing cells in the frame: per-column counts first, then their sum.
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

In [None]:
# Take an independent copy of the data for the Rsq experiment. A plain
# `df2 = df` only binds a second name to the same object, so any in-place
# change made through either name would silently appear in the other.
df2 = df.copy()
df2.shape

(21726, 17)

In [None]:
# Predictors: every column of df except the regression target MR.
X = df.drop(columns=['MR'])
# Target: MR, selected with a list so it stays a single-column DataFrame
# rather than collapsing to a Series.
y = df.loc[:, ['MR']]
X.head(n=5)

Unnamed: 0,Run,Lumi,Event,Rsq,E1,Px1,Py1,Pz1,E2,Px2,Py2,Pz2,HT,MET,nJets,nBJets
0,148029,388,302318745,0.031977,136.71,-109.893,-54.0342,-58.9032,142.179,70.0254,41.1225,-116.513,203.666,18.311,2,0
1,148029,388,302323641,0.042157,83.3865,81.15,6.88361,-12.9688,73.9025,-72.2472,11.8835,3.0899,154.659,14.7747,2,0
2,148029,388,302336217,0.026938,253.184,139.902,102.64,-101.935,535.551,-110.379,-89.0929,-516.179,343.28,25.2211,3,0
3,148029,388,302382289,0.094192,175.486,-156.024,-62.9535,-47.7434,112.851,89.0843,3.45025,67.9007,257.397,46.0288,2,0
4,148029,388,302403873,0.018804,833.795,100.41,-16.659,-827.498,445.612,-91.1991,15.5583,-390.144,269.492,8.11345,3,0


In [None]:
# (rows, columns) of the predictor frame; one column fewer than df because MR was dropped.
X.shape

(21726, 16)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 35% of the rows for testing and train on the remaining 65%.
# random_state pins the shuffle so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear model for MR on the training split; fit() returns the estimator
# itself, so construction and fitting are chained.
regression_model = LinearRegression().fit(X_train, y_train)
# Score of the fitted model on the same data it was trained on.
regression_model.score(X_train, y_train)

0.5865211320153703

In [None]:
# Score of the linear model on the held-out test split (X_test, y_test).
regression_model.score(X_test, y_test)

0.57358486643931

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-capped regression tree for MR. random_state is pinned because the tree
# permutes features at each split, so equally good candidate splits can
# otherwise be resolved differently from run to run and the reported scores
# would not be reproducible.
dtree = DecisionTreeRegressor(max_depth=25, random_state=0)
dtree.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=25)

In [None]:

# Score of the decision tree on the training split it was fitted on.
dtree.score(X_train, y_train)

0.9999999914052359

In [None]:
# Score of the decision tree on the held-out test split.
dtree.score(X_test, y_test)

0.8790927337542787

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# y_train is a single-column DataFrame. GradientBoostingRegressor expects a 1-D
# target and warns (via column_or_1d) when handed a column vector, so flatten
# the target explicitly before fitting. The fitted values are unchanged.
m = GradientBoostingRegressor(random_state=0).fit(X_train, y_train.values.ravel())
# Score of the boosted model on the training split.
m.score(X_train, y_train)

  y = column_or_1d(y, warn=True)


0.9580603442745916

In [None]:
# Score of the gradient-boosting model on the held-out test split.
m.score(X_test, y_test)

0.9425727580512294

In [None]:
# Predictors for the second experiment: every column of df2 except the target Rsq.
X2 = df2.drop(columns=['Rsq'])
# Target: Rsq, selected with a list so it stays a single-column DataFrame
# rather than collapsing to a Series.
y2 = df2.loc[:, ['Rsq']]
X2.head(n=5)

Unnamed: 0,Run,Lumi,Event,MR,E1,Px1,Py1,Pz1,E2,Px2,Py2,Pz2,HT,MET,nJets,nBJets
0,148029,388,302318745,215.553,136.71,-109.893,-54.0342,-58.9032,142.179,70.0254,41.1225,-116.513,203.666,18.311,2,0
1,148029,388,302323641,155.437,83.3865,81.15,6.88361,-12.9688,73.9025,-72.2472,11.8835,3.0899,154.659,14.7747,2,0
2,148029,388,302336217,400.563,253.184,139.902,102.64,-101.935,535.551,-110.379,-89.0929,-516.179,343.28,25.2211,3,0
3,148029,388,302382289,286.245,175.486,-156.024,-62.9535,-47.7434,112.851,89.0843,3.45025,67.9007,257.397,46.0288,2,0
4,148029,388,302403873,204.514,833.795,100.41,-16.659,-827.498,445.612,-91.1991,15.5583,-390.144,269.492,8.11345,3,0


In [None]:
# (rows, columns) of the Rsq predictor frame.
X2.shape

(21726, 16)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 35% of the rows for testing and train on the remaining 65%.
# random_state pins the shuffle so the same partition is produced on every run.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.35,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear model for the Rsq target on the Rsq training split.
# The target here must be y2_train (Rsq). Fitting against y_train (MR) while MR
# is itself a column of X2_train lets the model simply reproduce that column,
# which yields a meaningless perfect score of 1.0.
regression_model = LinearRegression()
regression_model.fit(X2_train, y2_train)

# Score of the fitted model on the same data it was trained on.
regression_model.score(X2_train, y2_train)

1.0

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-capped regression tree for Rsq. random_state is pinned because the tree
# permutes features at each split, so equally good candidate splits can
# otherwise be resolved differently from run to run and the reported scores
# would not be reproducible.
dtree = DecisionTreeRegressor(max_depth=25, random_state=0)
dtree.fit(X2_train, y2_train)

DecisionTreeRegressor(max_depth=25)

In [None]:
# Score of the Rsq decision tree on the training split it was fitted on.
dtree.score(X2_train, y2_train)

0.9999907921656159

In [None]:
# Score of the Rsq decision tree on the held-out test split.
dtree.score(X2_test, y2_test)

0.8673291712227369

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# y2_train is a single-column DataFrame. GradientBoostingRegressor expects a 1-D
# target and warns (via column_or_1d) when handed a column vector, so flatten
# the target explicitly before fitting. The fitted values are unchanged.
m = GradientBoostingRegressor(random_state=0).fit(X2_train, y2_train.values.ravel())
# Score of the boosted model on the training split.
m.score(X2_train, y2_train)

  y = column_or_1d(y, warn=True)


0.9610011410513603

In [None]:
# Score of the Rsq gradient-boosting model on the held-out test split.
m.score(X2_test, y2_test)

0.9359445009523338