In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sweetviz as sv
import dtale

%matplotlib inline

In [19]:
# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public `pd.set_option` entry point rather than `pd.pandas.set_option`,
# which only works through an undocumented self-reference inside the package.
pd.set_option('display.max_columns', None)

## Reading Data:

In [20]:
# Google Colab variant (CSV files sit next to the notebook):
# df = pd.read_csv(r'training.csv', na_values=-999.00)
# df_test = pd.read_csv(r'test.csv', na_values=-999.00)


# Local run: read the raw train/test CSVs; values equal to -999.0 are parsed as NaN.
df = pd.read_csv(r'../data/raw/training.csv', na_values=-999.00)
df_test = pd.read_csv(r'../data/raw/test.csv', na_values=-999.00)

# pop() returns the EventId column and removes it from the frame in one step,
# so the ids are kept aside while both frames lose the column.
train_event_id = df.pop('EventId')
test_event_id = df_test.pop('EventId')

In [21]:
# Split the frames into model inputs:
# the test frame is used as-is, the training frame loses its Weight and Label columns.
X_test = df_test
X_train = df.drop(columns=['Weight', 'Label'])

In [22]:
# Load the interim train/test tables and preview the first training rows.
df1, df1_test = map(
    pd.read_csv,
    ('../data/interim/train_MICE_median_ind.csv',
     '../data/interim/test_MICE_median_ind.csv'),
)
df1.head()

Unnamed: 0,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,ind_15,ind_40,ind_70,Weight,Label
0,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,197.76,1.582,1.396,0.2,32.638,1.017,0.381,51.626,2.273,-2.414,16.824,-0.277,258.733,2.0,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,False,False,False,0.002653,1
1,160.937,68.768,103.235,48.146,3.404602,456.387937,-2.262194,3.473,2.078,125.157,0.879,1.414,0.621629,42.014,2.039,-3.011,36.918,0.501,0.103,44.704,-1.916,164.546,1.0,46.226,0.725,1.158,43.309039,0.271125,0.400691,46.226,False,False,True,2.233584,0
2,207.11754,162.172,125.953,35.635,2.760164,338.231864,-1.264787,3.148,9.336,197.814,3.776,1.414,0.558697,32.154,-0.705,-2.093,121.409,-0.953,1.052,54.283,-2.186,260.414,1.0,44.251,2.053,-2.028,43.37634,-1.070836,0.732583,44.251,True,False,True,2.347389,0
3,143.905,81.417,80.943,0.414,3.40734,477.658851,-2.081679,3.31,0.414,75.968,2.354,-1.285,0.617685,22.647,-1.655,0.01,53.321,-0.522,-3.1,31.082,0.06,86.062,0.0,43.781292,-0.452922,0.841083,42.313622,-0.333805,0.495673,-0.0,False,True,True,5.446378,0
4,175.864,16.915,134.805,16.405,3.697092,512.224159,-2.519068,3.891,16.405,57.983,1.056,-1.385,0.606945,28.209,-2.197,-2.231,29.774,0.798,1.569,2.723,-0.871,53.131,0.0,42.013222,-0.199417,0.220767,38.810338,-0.208926,0.188463,0.0,False,True,True,6.245333,0


In [23]:
# Encode the target column of the raw training frame as integers.
# LabelEncoder assigns 0..n_classes-1 to the sorted unique values of `Label`;
# the fitted encoder is kept in `le` so the mapping can be inspected or inverted later.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
target = le.fit_transform(df['Label'])

In [24]:
# Overwrite the Label column of the raw training frame with the encoded values.
# A copy is stored so that later changes to `target` do not touch the column.
df['Label'] = target.copy()

## Data Visualization using D-Tale:

In [None]:
import dtale

# Alternative ways to launch D-Tale (kept for reference, not executed):
# dtale.show(df, ignore_duplicate=True)   # original note: displays directly in the notebook
# dtale.show(pd.DataFrame([1,2,3]), app_root='/user/johndoe/proxy/40000/')   # behind a proxy prefix


# Start a D-Tale session on the interim training frame and keep the handle in `d`.
d = dtale.show(df1)


# Replacing the data behind a running D-Tale session.
# FYI: this clears any front-end settings of the session (filters, sorts, formatting).
# d.data = tmp

# Shutting down the D-Tale session:
# d.kill()

# Open the session in the machine's default browser (via Python's `webbrowser` package).
d.open_browser()

# Last expression of the cell, so the session URL is displayed.
# NOTE(review): `_main_url` is a private attribute; example value: /user/johndoe/proxy/40000/dtale/main/1
d._main_url # /user/johndoe/proxy/40000/dtale/main/1

# d

# Multicollinearity:

## Checking for high correlation in entire dataset (> 0.9 or < - 0.9):

In [25]:
# Absolute correlation above which two features are treated as collinear.
factor = 0.9

dfcorr = df1.corr()

# Hide the self-correlation diagonal explicitly instead of filtering on `< 1.0`:
# the value filter also hid perfectly correlated (duplicate) features and was not
# applied to the negative side, so the two tails were treated inconsistently.
off_diagonal = ~np.eye(len(dfcorr), dtype=bool)
strong_pairs = (dfcorr.abs() > factor) & off_diagonal

# Keep only rows/columns that take part in at least one strong pair.
# Chained (non in-place) calls keep the cell idempotent on re-run.
dfcorr1 = (
    dfcorr.where(strong_pairs)
    .dropna(how='all')
    .dropna(axis=1, how='all')
)
dfcorr1

Unnamed: 0,DER_mass_MMC,DER_mass_vis,DER_sum_pt,PRI_met_sumet,PRI_jet_all_pt
DER_mass_MMC,,0.9275,,,
DER_mass_vis,0.9275,,,,
DER_sum_pt,,,,0.904481,0.965628
PRI_met_sumet,,,0.904481,,
PRI_jet_all_pt,,,0.965628,,


## Removing multicollinear features using manual method:

From the table above, every highly correlated pair involves either `DER_mass_MMC` or `DER_sum_pt`, so removing these two features is enough.

**Note:** `DER_mass_MMC` already had a high correlation (`0.91`) with `DER_mass_vis` before imputation, but at that point there was a chance that the estimate was biased by the missing values. After imputation the high correlation is clear.

In [29]:
# Features removed to break the highly correlated pairs; the same list is applied
# to the train and test frames so that both keep identical feature sets.
collinear_features = ['DER_mass_MMC', 'DER_sum_pt']

df2, df2_test = (frame.drop(columns=collinear_features) for frame in (df1, df1_test))

In [30]:
# Absolute correlation above which two features are treated as collinear.
factor = 0.9

dfcorr = df2.corr()

# Hide the self-correlation diagonal explicitly instead of filtering on `< 1.0`:
# the value filter also hid perfectly correlated (duplicate) features and was not
# applied to the negative side, so the two tails were treated inconsistently.
off_diagonal = ~np.eye(len(dfcorr), dtype=bool)
strong_pairs = (dfcorr.abs() > factor) & off_diagonal

# Keep only rows/columns that take part in at least one strong pair.
# An empty result means no pair exceeds the threshold any more.
dfcorr1 = (
    dfcorr.where(strong_pairs)
    .dropna(how='all')
    .dropna(axis=1, how='all')
)
dfcorr1

## Variance Inflation Factor:

In [36]:
# !pip install statsmodels

In [35]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

## MissingIndicator:

In [None]:
from sklearn.impute import MissingIndicator

In [None]:
# Build missing-value indicator columns for three features of the raw frames.
# The indicator is fitted on the training inputs and re-used unchanged on the test inputs.
missing_flag_cols = ['DER_mass_MMC', 'PRI_jet_leading_pt', 'DER_deltaeta_jet_jet']

mi = MissingIndicator(sparse=False)
train_temp_ind = mi.fit_transform(X_train[missing_flag_cols])
test_temp_ind = mi.transform(X_test[missing_flag_cols])

## Transforming Data to Normal Distribution:

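**Note**: Since the data is not normally distributed, plain scalers (`StandardScaler`, `MinMaxScaler`, `RobustScaler`, `MaxAbsScaler`) will not be useful here: they only shift and rescale each feature and leave the shape of its distribution unchanged.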

### PowerTransformer:

In [12]:
from sklearn.preprocessing import PowerTransformer

In [13]:
# Columns that must bypass the transform: the missing-value indicators and,
# on the training side, the event weight and the encoded label.
indicator_cols = ['ind_15', 'ind_40', 'ind_70']
target_cols = ['Weight', 'Label']

# Select the features by name instead of by position (`iloc[:, :-5]` / `[:, :-3]`):
# a positional slice silently transforms the wrong columns as soon as the column
# layout of the interim files changes, whereas a missing name fails loudly.
train_features = df2.drop(columns=indicator_cols + target_cols)
test_features = df2_test.drop(columns=indicator_cols)

# Transformer (Yeo-Johnson also accepts zero and negative values)
pt = PowerTransformer(method='yeo-johnson')

# Training set: fit and transform; the original index is kept so the
# Weight/Label assignment below aligns row by row.
train_temp = pd.DataFrame(pt.fit_transform(train_features),
                          columns=train_features.columns,
                          index=train_features.index)

# Test set: re-use the parameters learned on the training set.
test_temp = pd.DataFrame(pt.transform(test_features),
                         columns=test_features.columns,
                         index=test_features.index)

# Re-attach the untransformed columns in the original order.
train_temp[indicator_cols] = train_temp_ind
test_temp[indicator_cols] = test_temp_ind
train_temp[target_cols] = df2[target_cols]

# Save data
train_temp.to_csv('../data/interim/train_MICE_median_ind_PowerTransformer.csv', index=False)
test_temp.to_csv('../data/interim/test_MICE_median_ind_PowerTransformer.csv', index=False)

NameError: name 'df2' is not defined

In [None]:
# EDA report with Sweetviz on `train_temp`
# (the PowerTransformer output when the notebook is run top to bottom).
sweet_report = sv.analyze(train_temp)

# Save the report as an HTML file under ../reports (path relative to this notebook).
sweet_report.show_html("../reports/op_sv_PowerTransformer.html")

# To view the report, open the HTML file written above in a browser.

### QuantileTransformer:

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Columns that must bypass the transform: the missing-value indicators and,
# on the training side, the event weight and the encoded label.
indicator_cols = ['ind_15', 'ind_40', 'ind_70']
target_cols = ['Weight', 'Label']

# Select the features by name instead of by position (`iloc[:, :-5]` / `[:, :-3]`):
# a positional slice silently transforms the wrong columns as soon as the column
# layout of the interim files changes, whereas a missing name fails loudly.
train_features = df2.drop(columns=indicator_cols + target_cols)
test_features = df2_test.drop(columns=indicator_cols)

# Transformer
# QuantileTransformer estimates its quantiles on a random subsample when the input
# has more rows than its `subsample` limit, so the seed is fixed to make the saved
# files reproducible across runs.
qt = QuantileTransformer(output_distribution='normal', random_state=42)

# Training set: fit and transform; the original index is kept so the
# Weight/Label assignment below aligns row by row.
train_temp = pd.DataFrame(qt.fit_transform(train_features),
                          columns=train_features.columns,
                          index=train_features.index)

# Test set: re-use the quantiles learned on the training set.
test_temp = pd.DataFrame(qt.transform(test_features),
                         columns=test_features.columns,
                         index=test_features.index)

# Re-attach the untransformed columns in the original order.
train_temp[indicator_cols] = train_temp_ind
test_temp[indicator_cols] = test_temp_ind
train_temp[target_cols] = df2[target_cols]

# Save data
train_temp.to_csv('../data/interim/train_MICE_median_ind_QuantileTransformer.csv', index=False)
test_temp.to_csv('../data/interim/test_MICE_median_ind_QuantileTransformer.csv', index=False)

In [None]:
# EDA report with Sweetviz on `train_temp`
# (the QuantileTransformer output when the notebook is run top to bottom).
sweet_report = sv.analyze(train_temp)

# Save the report as an HTML file under ../reports (path relative to this notebook).
sweet_report.show_html("../reports/op_sv_QuantileTransformer.html")

# To view the report, open the HTML file written above in a browser.