# Derivation of L2L3 MC corrections


In [None]:
import time

from coffea import hist, processor, nanoevents
from coffea import util
from coffea.nanoevents.methods import candidate
from coffea.nanoevents import BaseSchema, NanoAODSchema
import awkward as ak
import numpy as np
import glob as glob
import itertools
import json

In [None]:
# Merge coffea's `candidate` behaviors into awkward's global behavior registry.
ak.behavior.update(candidate.behavior)

### Change to your xrootd username, unless you're AC Williams ;)

In [None]:
# Prefix prepended to every file path when the fileset is built further down.
# The part before '@' is the xrootd username - replace it with your own.
xrootdstr = 'root://acwillia@cmsxrootd.fnal.gov/'

### Try the experimental dask processor

If you'd like to run the job on the dask cluster instead of locally, set `useDask = True` in the cell below.

`useColumnClient` should only be set to `True` if working in a singularity shell with the latest coffea-columnservice, rather than coffea-dask

In [None]:
# True: connect to the dask scheduler and run with the dask executor.
# False: run locally with the iterative executor.
useDask = False
# Only consulted when useDask is True. Keep False unless working in a
# singularity shell with coffea-columnservice (rather than coffea-dask).
useColumnClient = False

In [None]:
# Create the dask `client` only when dask execution was requested; the two
# branches differ in how the client for coffea-dask.fnal.gov is obtained.
if useDask and useColumnClient:
    # Column-service route: the ColumnClient hands out the dask client.
    from columnservice.client import ColumnClient
    cc = ColumnClient("coffea-dask.fnal.gov")
    client = cc.get_dask()
elif useDask:
    # Plain dask route: connect straight to the scheduler port.
    from distributed import Client
    client = Client('coffea-dask.fnal.gov:8786')

### Define the processor, or import one from a separate `.py` file

In [None]:
# Look at ProcessorABC to see the expected methods and what they are supposed to do
class FancyJECL2L3Processor(processor.ProcessorABC):
    """Histogram the RECO / GEN jet pT response of dijet-like events.

    For every chunk of events the processor

    1. builds reco jets (pT scaled by ``1 - Jet_rawFactor``) and gen jets,
    2. keeps reco jets with ``jetId > 0`` and events with at least two reco
       and two gen jets,
    3. requires ``|delta phi| > 1.8`` between the two leading reco jets,
    4. matches each of the two leading reco jets to its closest gen jet and
       keeps matches with ``delta R < 0.2``,
    5. fills the ``pt``, ``eta`` and ``r_pt_ptveta`` histograms and the
       ``cutflow`` counters.

    All histograms are filled unweighted.  ``r_m_ptveta`` and ``r_m_ptvm``
    are booked but not filled by this processor.
    """

    def __init__(self):
        dataset_axis = hist.Cat("dataset", "Primary dataset")
        eta_axis = hist.Bin("eta", r"$\eta$", 20, -5, 5)
        # Variable-width pT binning: fine at low pT, coarse at high pT.
        pt_axis = hist.Bin("pt", r"$p_{T}$ [GeV]",
                           np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90,
                                     100, 120, 140, 160, 180,
                                     200, 250, 300, 350, 400, 450, 500,
                                     600, 700, 800, 900, 1000,
                                     1500, 2000, 3000, 4000, 5000]))
        # Mass axis (the label previously read p_T by copy-paste).
        m_axis = hist.Bin("m", r"$m$ [GeV]", 200, 0, 500)
        r_axis = hist.Bin("r", "RECO / GEN response", 200, 0, 5)

        self._accumulator = processor.dict_accumulator({
            'pt': hist.Hist("Counts", dataset_axis, pt_axis),
            'eta': hist.Hist("Counts", dataset_axis, eta_axis),
            'r_pt_ptveta': hist.Hist("Counts", dataset_axis, pt_axis, eta_axis, r_axis),
            'r_m_ptveta': hist.Hist("Counts", dataset_axis, pt_axis, eta_axis, r_axis),
            'r_m_ptvm': hist.Hist("Counts", dataset_axis, pt_axis, m_axis, r_axis),
            'cutflow': processor.defaultdict_accumulator(int),
        })

    @property
    def accumulator(self):
        """The accumulator template holding all histograms and the cutflow."""
        return self._accumulator

    def process(self, events):
        """Select events in one chunk and return the filled accumulator.

        ``events`` must expose the flat branches ``Jet_pt``, ``Jet_rawFactor``,
        ``Jet_eta``, ``Jet_phi``, ``Jet_mass``, ``Jet_jetId`` and
        ``GenJet_pt/eta/phi/mass`` as attributes, plus ``metadata['dataset']``.
        """
        output = self.accumulator.identity()

        dataset = events.metadata['dataset']

        # NOTE(review): presumably this is the uncorrected (raw) jet pT; the
        # mass is left as stored, which only matters if p4.mass is ever used.
        raw_pt = events.Jet_pt * (1 - events.Jet_rawFactor)

        Jets = ak.zip({
            "pt": raw_pt,
            "eta": events.Jet_eta,
            "phi": events.Jet_phi,
            "mass": events.Jet_mass,
            "jetId": events.Jet_jetId,
            "p4": ak.zip({
                "pt": raw_pt,
                "eta": events.Jet_eta,
                "phi": events.Jet_phi,
                "mass": events.Jet_mass,
                }, with_name="PtEtaPhiMLorentzVector"),
            })

        GenJets = ak.zip({
            "pt": events.GenJet_pt,
            "eta": events.GenJet_eta,
            "phi": events.GenJet_phi,
            "mass": events.GenJet_mass,
            "p4": ak.zip({
                "pt": events.GenJet_pt,
                "eta": events.GenJet_eta,
                "phi": events.GenJet_phi,
                "mass": events.GenJet_mass,
                }, with_name="PtEtaPhiMLorentzVector"),
        })

        # One entry per event in this chunk.
        output['cutflow']['all events'] += len(Jets)

        jetId_cut = (Jets.jetId > 0)
        Jets = Jets[jetId_cut]
        # Events with at least one jet passing the id requirement.
        output['cutflow']['>=1 with loose id'] += int(ak.sum(ak.any(jetId_cut, axis=-1)))
        twoJets = (ak.num(Jets, axis=-1) >= 2)
        output['cutflow']['>=2 reco jets'] += int(ak.sum(twoJets))
        twoGens = (ak.num(GenJets, axis=-1) >= 2)
        output['cutflow']['>=2 gen jets'] += int(ak.sum(twoGens))

        has_two_of_each = twoJets & twoGens
        Jets = Jets[has_two_of_each]
        GenJets = GenJets[has_two_of_each]

        # delta_phi is signed, so compare its magnitude; otherwise back-to-back
        # pairs with negative delta phi would be rejected.
        dphi_index = np.abs(Jets.p4[:, 0].delta_phi(Jets.p4[:, 1])) > 1.8
        output['cutflow']['dPhi > 1.8'] += int(ak.sum(dphi_index))

        Jets = Jets[dphi_index]
        GenJets = GenJets[dphi_index]

        # All (reco, gen) combinations, grouped per leading reco jet:
        # shape is events x (<=2 reco jets) x (gen jets).
        reco, gen = ak.unzip(ak.cartesian([Jets.p4[:, 0:2], GenJets.p4], nested=True))
        metric = reco.delta_r(gen)
        # keepdims=True keeps a length-1 innermost axis so the result can be
        # used to pick the closest gen jet *within* each reco jet's list.
        # Without it the indices would select whole events instead.
        index_of_minimized = ak.argmin(metric, axis=-1, keepdims=True)

        dr_cut = (metric[index_of_minimized] < 0.2)
        # Flatten completely: one entry per matched reco jet.
        reco_pt = ak.flatten(reco.pt[index_of_minimized][dr_cut], axis=None)
        gen_pt = ak.flatten(gen.pt[index_of_minimized][dr_cut], axis=None)
        gen_eta = ak.flatten(gen.eta[index_of_minimized][dr_cut], axis=None)
        ptresponse = reco_pt / gen_pt

        output['pt'].fill(dataset=dataset,
                          pt=ak.flatten(Jets.pt))
        output['eta'].fill(dataset=dataset,
                           eta=ak.flatten(Jets.eta))
        # The response is binned in the GEN jet's pT and eta.
        output['r_pt_ptveta'].fill(dataset=dataset, pt=gen_pt, eta=gen_eta, r=ptresponse)
        return output

    def postprocess(self, accumulator):
        """Return the merged accumulator unchanged."""
        return accumulator

### Get the root files and store them in a dictionary `fileset` to be used in the uproot job

In [None]:
# Get the sample metadata; the context manager closes the file after reading.
with open('samples_qcdflat.json') as sample_file:
    samples = json.load(sample_file)

# Map each sample name to its list of files, each prefixed with `xrootdstr`.
# New lists are built so the loaded `samples` structure is left untouched.
fileset = {
    sample['name']: [xrootdstr + path for path in sample['files']]
    for sample in samples["samples"]
}

print(fileset)

In [None]:
# Time the whole job so the event throughput can be reported afterwards.
tstart = time.time()

# Pick the executor and its options; only the local run limits the chunk
# size and the number of chunks.
if useDask:
    executor = processor.dask_executor
    executor_args = {
        'skipbadfiles': True,
        'client': client,
        'schema': BaseSchema, #NanoAODSchema,
        'workers': 2,
    }
    job_options = {}
else:
    executor = processor.iterative_executor
    executor_args = {
        'skipbadfiles': False,
        'schema': BaseSchema, #NanoAODSchema,
        'workers': 4,
    }
    job_options = {'chunksize': 50000, 'maxchunks': 100}

output = processor.run_uproot_job(
    fileset,
    treename='Events',
    processor_instance=FancyJECL2L3Processor(),
    executor=executor,
    executor_args=executor_args,
    **job_options
)

elapsed = time.time() - tstart
print(output)

In [None]:
# Throughput: the 'all events' cutflow counter divided by the wall-clock
# time measured around the job (`elapsed`, in seconds).
print("Events/s:", output['cutflow']['all events']/elapsed)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Default colormap for the image-style plots drawn in the cells below.
plt.rcParams["image.cmap"] = 'Blues'

In [None]:
# Density-normalised jet pT spectrum, one overlaid curve per dataset.
ax = hist.plotgrid(output['pt'], overlay="dataset", stack=False, density=True)

for iax in ax.flatten():
    iax.set_title(r'Jet $p_T$')
    # Switch to a log y-axis first, then let matplotlib pick the y-range.
    iax.set_yscale('log')
    iax.autoscale(axis='y')

In [None]:
# Density-normalised jet eta distribution, one overlaid curve per dataset.
ax = hist.plotgrid(output['eta'], overlay="dataset", stack=False, density=True)

for iax in ax.flatten():
    iax.set_title(r'Jet $\eta$')
    iax.autoscale(axis='y')

In [None]:
# One 2D (pT vs response) plot per eta bin, summed over datasets.
# The dataset sum does not depend on the bin, so it is taken once up front.
response_all_datasets = output['r_pt_ptveta'].sum('dataset')
for i in output['r_pt_ptveta'].axis('eta'):
    title = r'$\eta$ range ' + str(i)
    eta_slice = response_all_datasets.integrate('eta', i)
    ax = hist.plot2d(eta_slice, xaxis='pt')
    ax.set_title(title)

In [None]:
# One 2D (eta vs response) plot per pT bin, summed over datasets.
# The dataset sum does not depend on the bin, so it is taken once up front.
response_all_datasets = output['r_pt_ptveta'].sum('dataset')
for i in output['r_pt_ptveta'].axis('pt'):
    title = r'$p_T$ range ' + str(i)
    pt_slice = response_all_datasets.integrate('pt', i)
    ax = hist.plot2d(pt_slice, xaxis='eta')
    ax.set_title(title)

In [None]:
# Show the eta axis of the response histogram.
print(output['r_pt_ptveta'].axis('eta'))

In [None]:
# Length of the eta histogram's bin contents for the 'QCDFlat' dataset.
# NOTE(review): assumes samples_qcdflat.json defines a sample named
# 'QCDFlat'; any other name raises KeyError here - confirm.
len(output['eta'].values()[('QCDFlat',)])