In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [4]:
# Show every column when a DataFrame is displayed (None disables the column limit).
# Uses the public pd.set_option entry point rather than the pd.pandas self-reference.
pd.set_option('display.max_columns', None)

In [5]:
# Load the interim test set from the local project layout.
# On Colab the file is read from the working directory instead, i.e.
#   'test_MICE_median_ind_QuantileTransformer.csv'
interim_test_path = '../data/interim/test_MICE_median_ind_QuantileTransformer.csv'
df = pd.read_csv(interim_test_path)

## Dropping unimportant columns:

In [10]:
# Drop the less important feature columns (all of the *_phi columns).
drop_cols = [
    'PRI_jet_leading_phi',
    'PRI_jet_subleading_phi',
    'PRI_tau_phi',
    'PRI_met_phi',
    'PRI_lep_phi',
]

# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=drop_cols, errors='ignore')

## Loading saved model:

In [7]:
import pickle

# Deserialize the saved regression model from disk.
# CAUTION: unpickling executes arbitrary code, so only load model files
# that come from a trusted source.
model_path = '../models/regg/xgb_without_outliers.pkl'
with open(model_path, 'rb') as model_file:
    xgb = pickle.load(model_file)

## Predict Weight:

In [11]:
# Predict the weight for every row and store it in the 'Weight' column.
# Columns that are not model inputs are excluded first: 'Weight' exists after a
# previous run of this cell and 'EventId' is inserted later in the notebook.
# Without this, re-running the cell would pass them to the model as features.
non_feature_cols = ['EventId', 'Weight']
features = df.drop(columns=non_feature_cols, errors='ignore')

weight = xgb.predict(features)
df['Weight'] = weight

In [13]:
# Preview the first rows, which now include the predicted 'Weight' column.
df.head()

Unnamed: 0,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_lep_pt,PRI_lep_eta,PRI_met,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_all_pt,ind_15,ind_40,ind_70,Weight
0,0.870957,-2.216645,-1.119281,-0.247752,0.04577,0.31478,-1.760258,-0.628724,0.37763,-1.08194,-0.154389,-1.152662,-0.381,-0.514836,-0.312376,0.559906,-1.058504,-5.199338,-0.054166,-0.22182,0.395689,-0.253346,-5.199338,True,True,True,3.96982
1,0.485897,0.537209,0.240473,-0.764985,-0.688471,0.732956,-0.447305,-0.710987,0.671605,-0.285482,-0.885844,-0.073666,-1.378114,0.723435,-0.078281,-0.689173,-0.033037,0.137177,0.161513,-0.513904,0.598879,-0.266867,0.114521,False,False,True,2.273312
2,0.207364,0.803335,-0.981127,0.772278,0.723645,-0.682084,0.328223,-0.429568,-0.304482,-1.207532,0.606136,0.744433,-0.203027,0.568913,-0.346188,-0.784166,-0.845672,-5.199338,-0.238294,-0.158376,-0.21375,-0.15443,-5.199338,False,True,True,1.388584
3,-0.356748,0.830878,-0.753776,0.658167,0.476624,-0.588021,0.410169,-0.07785,-0.797044,-0.761353,0.554211,0.935067,0.295951,0.118736,-0.166791,-0.466263,-0.158631,-5.199338,-0.067167,0.005652,-0.438161,-0.013566,-5.199338,False,True,True,0.306692
4,0.980058,-0.708379,0.803889,-1.257833,1.104476,1.00834,-1.564141,2.039303,0.598435,-0.148161,-1.456896,0.682761,-1.156012,1.43748,-1.436244,-0.625119,2.856842,5.199338,2.16349,-0.88924,2.772217,0.539375,2.834838,False,False,False,0.684285


In [14]:
# (rows, columns) of the frame after the 'Weight' column was added.
df.shape

(550000, 27)

Note: at the beginning we dropped the EventId column, which is an incorrect approach. However, since we have not shuffled the test dataset, we can add the column back from the initial (raw) test data.

## Inserting EventId field:

In [15]:
# Read the raw test set, which still contains the EventId column.
raw_test_path = '../data/raw/test.csv'
test = pd.read_csv(raw_test_path)
test.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,350000,-999.0,79.589,23.916,3.036,-999.0,-999.0,-999.0,0.903,3.036,56.018,1.536,-1.404,-999.0,22.088,-0.54,-0.609,33.93,-0.504,-1.511,48.509,2.022,98.556,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0
1,350001,106.398,67.49,87.949,49.994,-999.0,-999.0,-999.0,2.048,2.679,132.865,1.777,-1.204,-999.0,30.716,-1.784,3.054,54.574,-0.169,1.795,21.093,-1.138,176.251,1,47.575,-0.553,-0.849,-999.0,-999.0,-999.0,47.575
2,350002,117.794,56.226,96.358,4.137,-999.0,-999.0,-999.0,2.755,4.137,97.6,1.096,-1.408,-999.0,46.564,-0.298,3.079,51.036,-0.548,0.336,19.461,-1.868,111.505,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
3,350003,135.861,30.604,97.288,9.104,-999.0,-999.0,-999.0,2.811,9.104,94.112,0.819,-1.382,-999.0,51.741,0.388,-1.408,42.371,-0.295,2.148,25.131,1.172,164.707,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,350004,74.159,82.772,58.731,89.646,1.347,536.663,-0.339,1.028,77.213,721.552,1.713,-0.913,0.004,45.087,-1.548,1.877,77.252,-1.913,2.838,22.2,-0.231,869.614,3,254.085,-1.013,-0.334,185.857,0.335,2.587,599.213


In [16]:
# Restore EventId from the raw test file as the first column.
# Rows are matched by position, which relies on the interim test set having
# kept the raw file's row order (it was not shuffled).
if len(test) != len(df):
    raise ValueError(
        'Row count mismatch: raw test has {} rows, df has {}'.format(len(test), len(df))
    )

# Guard against re-running the cell: DataFrame.insert raises if the column
# already exists.
if 'EventId' not in df.columns:
    # .to_numpy() assigns by position instead of aligning on the index.
    df.insert(0, column='EventId', value=test['EventId'].to_numpy())
df.head()

Unnamed: 0,EventId,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,DER_pt_ratio_lep_tau,DER_met_phi_centrality,DER_lep_eta_centrality,PRI_tau_pt,PRI_tau_eta,PRI_lep_pt,PRI_lep_eta,PRI_met,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_all_pt,ind_15,ind_40,ind_70,Weight
0,350000,0.870957,-2.216645,-1.119281,-0.247752,0.04577,0.31478,-1.760258,-0.628724,0.37763,-1.08194,-0.154389,-1.152662,-0.381,-0.514836,-0.312376,0.559906,-1.058504,-5.199338,-0.054166,-0.22182,0.395689,-0.253346,-5.199338,True,True,True,3.96982
1,350001,0.485897,0.537209,0.240473,-0.764985,-0.688471,0.732956,-0.447305,-0.710987,0.671605,-0.285482,-0.885844,-0.073666,-1.378114,0.723435,-0.078281,-0.689173,-0.033037,0.137177,0.161513,-0.513904,0.598879,-0.266867,0.114521,False,False,True,2.273312
2,350002,0.207364,0.803335,-0.981127,0.772278,0.723645,-0.682084,0.328223,-0.429568,-0.304482,-1.207532,0.606136,0.744433,-0.203027,0.568913,-0.346188,-0.784166,-0.845672,-5.199338,-0.238294,-0.158376,-0.21375,-0.15443,-5.199338,False,True,True,1.388584
3,350003,-0.356748,0.830878,-0.753776,0.658167,0.476624,-0.588021,0.410169,-0.07785,-0.797044,-0.761353,0.554211,0.935067,0.295951,0.118736,-0.166791,-0.466263,-0.158631,-5.199338,-0.067167,0.005652,-0.438161,-0.013566,-5.199338,False,True,True,0.306692
4,350004,0.980058,-0.708379,0.803889,-1.257833,1.104476,1.00834,-1.564141,2.039303,0.598435,-0.148161,-1.456896,0.682761,-1.156012,1.43748,-1.436244,-0.625119,2.856842,5.199338,2.16349,-0.88924,2.772217,0.539375,2.834838,False,False,False,0.684285


## Saving test data:

In [17]:
# Write the test set (EventId, features, predicted Weight) to the processed
# data folder; the row index is not written to the file.
output_path = '../data/processed/test_weight_regg.csv'
df.to_csv(output_path, index=False)