In [1]:
!pip install git+https://github.com/oshadura/topcoffea.git@coffea-casa-analysis
#! pip install -e .

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting git+https://github.com/oshadura/topcoffea.git@coffea-casa-analysis
  Cloning https://github.com/oshadura/topcoffea.git (to revision coffea-casa-analysis) to /tmp/pip-req-build-93rj5dyo
Building wheels for collected packages: topcoffea
  Building wheel for topcoffea (setup.py) ... [?25ldone
[?25h  Created wheel for topcoffea: filename=topcoffea-0.0.0-py3-none-any.whl size=4514059 sha256=3cd0a496c2daa738d2ff062ea7d3219ae2643305a661cf600445cbc2470beea3
  Stored in directory: /tmp/pip-ephem-wheel-cache-hympjofx/wheels/3a/95/24/bfbb2d1dc4571114a0106e7d0d6a8124bcd2ad9e72ec9ea14e
Successfully built topcoffea


In [2]:
import lz4.frame as lz4f
import pickle
import json
import time
import cloudpickle
import gzip
import os
from optparse import OptionParser

import uproot
import numpy as np
from coffea import hist, processor
from coffea.util import load, save
from coffea.nanoevents import NanoAODSchema

from topcoffea.modules import samples
from topcoffea.modules import fileReader

#FIXME: analysis is not installed anywhere (should be installed as well)
import topcoffea.analysis.topEFT.topeft

import importlib.resources

# When running under IPython (detected through the __IPYTHON__ builtin), replace
# the process arguments with a single empty entry.
# NOTE(review): presumably this keeps the argparse call in the next cell from
# choking on the kernel's own command-line arguments -- confirm.
if hasattr(__builtins__,'__IPYTHON__'):
    import sys
    sys.argv = ['']

In [3]:
import argparse
# Run configuration, expressed as command-line options so the same code works as
# a script. Numeric options are declared with type=int so that an invalid value
# is rejected by argparse itself instead of failing later in a manual int() call.
parser = argparse.ArgumentParser(description='You can customize your run')
parser.add_argument('jsonFiles'           , nargs='?', help = 'Json file(s) containing files and metadata')
parser.add_argument('--prefix', '-r'     , nargs='?', help = 'Prefix or redirector to look for the files')
parser.add_argument('--test','-t'       , action='store_true'  , help = 'To perform a test, run over a few events in a couple of chunks')
parser.add_argument('--pretend'        , action='store_true'  , help = 'Read json files but, not execute the analysis')
#parser.add_argument('--nworkers','-n'   , default=8  , help = 'Number of workers')
parser.add_argument('--chunksize','-s'   , default=500000, type=int, help = 'Number of events per chunk')
parser.add_argument('--nchunks','-c'   , default=None, type=int, help = 'You can choose to run only a number of chunks')
parser.add_argument('--outname','-o'   , default='plotsTopEFT', help = 'Name of the output file with histograms')
parser.add_argument('--outpath','-p'   , default='histos', help = 'Name of the output directory')
parser.add_argument('--treename'   , default='Events', help = 'Name of the tree inside the files')
parser.add_argument('--do-errors', action='store_true', help = 'Save the w**2 coefficients')

args = parser.parse_args()

# Input samples: use the positional argument when given, otherwise fall back to
# the TTZToLLNuNu_M10 json shipped inside the topcoffea package.
if args.jsonFiles is not None:
  jsonFiles    = args.jsonFiles
  print('jsonFiles {}'.format(args.jsonFiles))
else:
  with importlib.resources.path("topcoffea.json", "TTZToLLNuNu_M10.json") as path:
    jsonFiles = str(path)
    print('jsonFile was selected for UNL {}'.format(jsonFiles))

# Redirector prepended to every file name; defaults to the xcache endpoint.
prefix = args.prefix if args.prefix is not None else "root://xcache//"

dotest     = args.test
#nworkers   = int(args.nworkers)
chunksize  = args.chunksize   # already an int thanks to type=int
nchunks    = args.nchunks     # int, or None to process every chunk
outname    = args.outname
outpath    = args.outpath
pretend    = args.pretend
treename   = args.treename
do_errors = args.do_errors

# --test overrides the chunking options with a small, fast configuration.
if dotest:
  nchunks = 2
  chunksize = 10000
  nworkers = 1
  print('Running a fast test with %i workers, %i chunks of %i events'%(nworkers, nchunks, chunksize))

jsonFile was selected for UNL /opt/conda/lib/python3.8/site-packages/topcoffea/json/TTZToLLNuNu_M10.json


In [4]:

# Sample name -> metadata dictionary, filled in by LoadJsonToSampleName below.
samplesdict = {}
# Paths of all input files gathered from `jsonFiles` (directories are expanded).
allInputFiles = []

def LoadJsonToSampleName(jsonFile, prefix):
  """Load one sample json into the module-level `samplesdict`.

  The sample name is the file name of `jsonFile` (text after the last '/')
  without a trailing '.json'. The parsed json content is stored under that
  name and its 'redirector' entry is set to `prefix`.
  """
  name = jsonFile.rsplit('/', 1)[-1]
  if name.endswith('.json'):
    name = name[:-len('.json')]
  with open(jsonFile) as handle:
    samplesdict[name] = json.load(handle)
  samplesdict[name]['redirector'] = prefix

# Normalise `jsonFiles` to a list of paths: a string may hold a single path or
# a comma-separated list (spaces are dropped in the comma-separated case).
if isinstance(jsonFiles, str):
  jsonFiles = jsonFiles.replace(' ', '').split(',') if ',' in jsonFiles else [jsonFiles]

# Expand directories into the .json files they contain; keep other paths as is.
for jsonFile in jsonFiles:
  if os.path.isdir(jsonFile):
    # os.listdir is the directory-listing call (os.path has no listdir);
    # sorting makes the resulting sample order reproducible.
    for f in sorted(os.listdir(jsonFile)):
      if f.endswith('.json'): allInputFiles.append(os.path.join(jsonFile, f))
  else:
    allInputFiles.append(jsonFile)

print(allInputFiles)
# Read from cfg files
for f in allInputFiles:
  if not os.path.isfile(f):
    print('[WARNING] Input file "%s" not found!'%f)
    continue
  # This input file is a json file, not a cfg
  if f.endswith('.json'):
    LoadJsonToSampleName(f, prefix)
    continue
  # Open cfg files: each line holds one entry or several comma-separated entries;
  # anything after '#' is a comment.
  with open(f) as fin:
    print(' >> Reading json from cfg file...')
    for l in fin:
      l = l.split('#', 1)[0]
      l = l.replace(' ', '').replace('\n', '')
      if l == '': continue
      for entry in l.split(','):
        # A stray comma must not reset the prefix to an empty string.
        if entry == '': continue
        # An entry that is an existing file is a sample json; any other entry
        # becomes the prefix used for the samples that follow it.
        if os.path.isfile(entry): LoadJsonToSampleName(entry, prefix)
        else: prefix = entry

# Build the per-sample list of full file paths (redirector + file name), coerce
# the numeric metadata fields to numbers and print a summary of every sample.
flist = {}
for sname, info in samplesdict.items():
  redirector = info['redirector']
  flist[sname] = [redirector + fpath for fpath in info['files']]
  for field, cast in (('year', int),
                      ('xsec', float),
                      ('nEvents', int),
                      ('nGenEvents', int),
                      ('nSumOfWeights', float)):
    info[field] = cast(info[field])

  # Print file info
  print('>> '+sname)
  print('   - isData?      : %s'   %('YES' if info['isData'] else 'NO'))
  print('   - year         : %i'   %info['year'])
  print('   - xsec         : %f'   %info['xsec'])
  print('   - histAxisName : %s'   %info['histAxisName'])
  print('   - options      : %s'   %info['options'])
  print('   - tree         : %s'   %info['treeName'])
  print('   - nEvents      : %i'   %info['nEvents'])
  print('   - nGenEvents   : %i'   %info['nGenEvents'])
  print('   - SumWeights   : %f'   %info['nSumOfWeights'])
  print('   - Prefix       : %s'   %info['redirector'])
  print('   - nFiles       : %i'   %len(info['files']))
  for fname in info['files']:
    print('     %s'%fname)

# Dry run (--pretend): stop here, after the sample summary has been printed and
# before the checks below run.
# NOTE(review): exit() inside a notebook presumably shuts the kernel down rather
# than just ending this cell -- confirm that this is the intended behaviour.
if pretend: 
  print('pretending...')
  exit() 

# Every dataset must carry the same 'WCnames' list: the first dataset's list is
# kept as `wc_lst` and each dataset is compared against it.
for position, dataset in enumerate(samplesdict):
  dataset_wcs = samplesdict[dataset]['WCnames']
  if position == 0:
    wc_lst = dataset_wcs
  if wc_lst != dataset_wcs:
    raise Exception("Not all of the datasets have the same list of WCs.")

['/opt/conda/lib/python3.8/site-packages/topcoffea/json/TTZToLLNuNu_M10.json']
>> TTZToLLNuNu_M10
   - isData?      : NO
   - year         : 2018
   - xsec         : 0.252900
   - histAxisName : 
   - options      : 
   - tree         : Events
   - nEvents      : 14542666
   - nGenEvents   : 4876491
   - SumWeights   : 19992000.000000
   - Prefix       : root://xcache//
   - nFiles       : 18
     /store/user/jrgonzal/nanoAODcrab/TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo-pythia8/mc2018_28apr2021_TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo-pythia8/210427_230816/0000/tree_1.root
     /store/user/jrgonzal/nanoAODcrab/TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo-pythia8/mc2018_28apr2021_TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo-pythia8/210427_230816/0000/tree_10.root
     /store/user/jrgonzal/nanoAODcrab/TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo-pythia8/mc2018_28apr2021_TTZToLLNuNu_M-10_TuneCP5_13TeV-amcatnlo-pythia8/210427_230816/0000/tree_11.root
     /store/user/jrgonzal/nanoAODcrab/TTZToLLNuNu_M-10_T

In [None]:
# Create the topEFT analysis processor from the sample metadata, the common WC
# name list and the --do-errors flag.
processor_instance = topcoffea.analysis.topEFT.topeft.AnalysisProcessor(samplesdict,wc_lst,do_errors)

from dask.distributed import Client, Worker, WorkerPlugin
import os
from typing import List
class DependencyInstaller(WorkerPlugin):
    """Dask worker plugin that pip-installs extra packages on a worker.

    Parameters
    ----------
    dependencies : list of str
        pip requirement specifiers (names, pinned versions or VCS URLs).
    """
    def __init__(self, dependencies: List[str]):
        # Kept as a list so every specifier reaches pip as its own argument,
        # with no shell quoting involved.
        self._dependencies = list(dependencies)
    def setup(self, worker: Worker):
        """Install the configured packages; called by Dask when the plugin is set up on `worker`."""
        # Imported locally so the method carries everything it needs when the
        # plugin is shipped to a worker process.
        import subprocess
        import sys
        if not self._dependencies:
            return
        # `sys.executable -m pip` installs into the interpreter that runs the
        # worker; the argument list form bypasses the shell entirely.
        status = subprocess.call([sys.executable, "-m", "pip", "install", *self._dependencies])
        if status != 0:
            # Stay best-effort (no exception), but make the failure visible.
            print("DependencyInstaller: pip install exited with status %d" % status, file=sys.stderr)
# Packages every worker needs: the analysis code plus a pinned awkward release.
worker_requirements = [
    "git+https://github.com/oshadura/topcoffea.git@coffea-casa-analysis",
    "awkward==1.3.0",
]
dependency_installer = DependencyInstaller(worker_requirements)

# Connect to the scheduler and register the installer plugin with it.
client = Client("tls://localhost:8786")
client.register_worker_plugin(dependency_installer)

executor_args = {
                 'schema': NanoAODSchema,
                 'client': client,
                 'savemetrics': True
}

# NOTE(review): a Dask client is passed in executor_args, yet futures_executor is
# selected below; presumably processor.dask_executor was intended -- confirm
# before switching, since it changes where the job runs.

# Run the processor and get the output
tic = time.time()
# With 'savemetrics' enabled the job returns a pair: the accumulated output and
# the run metrics. Unpack it so `output` is the histogram dictionary itself.
output, metrics = processor.run_uproot_job(flist,
                                           treename=treename,
                                           processor_instance=processor_instance,
                                           executor=processor.futures_executor,
                                           executor_args=executor_args,
                                           chunksize=chunksize,
                                           maxchunks=nchunks
                                          )
toc = time.time()
elapsed = toc - tic

# Timing records are appended to `results`; create it on first use so the cell
# also works on a fresh kernel, while re-runs keep accumulating entries.
try:
    results
except NameError:
    results = {}
results.setdefault('time', []).append(elapsed)
results.setdefault('events/s/thread', []).append(metrics['entries'].value / metrics['processtime'].value)
results.setdefault('events/s', []).append(metrics['entries'].value / elapsed)

# Bin occupancy over all histograms found in the output.
filled_hists = [item for item in output.values() if isinstance(item, hist.Hist)]
nbins = sum(arr.size for item in filled_hists for arr in item._sumw.values())
nfilled = sum(np.sum(arr > 0) for item in filled_hists for arr in item._sumw.values())
# Guard the percentage against an output that contains no histogram bins.
print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins if nbins else 0.0,))

# Write the histograms to <outpath>/<outname>.pkl.gz, creating the directory if needed.
os.makedirs(outpath, exist_ok=True)
outfile = os.path.join(outpath, outname + ".pkl.gz")
print('Saving output in %s...'%outfile)
with gzip.open(outfile, "wb") as fout:
    cloudpickle.dump(output, fout)
print('Done!')

Preprocessing:   0%|          | 0/18 [00:00<?, ?file/s]

Processing:   0%|          | 0/32 [00:00<?, ?chunk/s]



Same-sign events [ee, emu, mumu] = [5907, 10436, 4663]




Same-sign events [ee, emu, mumu] = [5851, 10626, 4732]




Same-sign events [ee, emu, mumu] = [6031, 10488, 4745]




Same-sign events [ee, emu, mumu] = [5863, 10530, 4748]




Same-sign events [ee, emu, mumu] = [6242, 11020, 4941]




Same-sign events [ee, emu, mumu] = [6231, 11196, 4865]




Same-sign events [ee, emu, mumu] = [6154, 10726, 4804]




Same-sign events [ee, emu, mumu] = [5953, 10632, 4771]




Same-sign events [ee, emu, mumu] = [5913, 10634, 4767]




Same-sign events [ee, emu, mumu] = [5952, 10631, 4699]




Same-sign events [ee, emu, mumu] = [8618, 15148, 6914]




Same-sign events [ee, emu, mumu] = [9841, 17403, 7805]




Same-sign events [ee, emu, mumu] = [6089, 10465, 4666]




Same-sign events [ee, emu, mumu] = [5925, 10627, 4748]




Same-sign events [ee, emu, mumu] = [5894, 10619, 4803]




Same-sign events [ee, emu, mumu] = [5843, 10488, 4660]




Same-sign events [ee, emu, mumu] = [6038, 10628, 4828]


In [None]:
from __future__ import print_function, division
from collections import defaultdict, OrderedDict
import gzip
import pickle
import json
import os
import uproot
import matplotlib.pyplot as plt
import numpy as np
from coffea import hist, processor
from coffea.hist import plot
from cycler import cycler

from topcoffea.plotter.OutText import OutText


# Input histogram file and name of the image to produce.
path = 'histos/plotsTopEFT.pkl.gz'
outname = 'temp.png'

# Select variable, channel and cuts
var = 'met'
channel = [
  'eemSSonZ', 'eemSSoffZ',
  'mmeSSonZ', 'mmeSSoffZ',
  'eeeSSonZ', 'eeeSSoffZ',
  'mmmSSonZ', 'mmmSSoffZ',
]
cut = 'base'

# Read the pickled histogram dictionary and collect its entries in `hists`,
# adding together entries that share a key.
print('Opening path: ', path)
hists = {}
with gzip.open(path) as fin:
  hin = pickle.load(fin)
  print(' >> looking for histograms...')
for k, loaded in hin.items():
  if k not in hists:
    hists[k] = loaded
  else:
    hists[k] += loaded


# Create a figure with two stacked panels sharing the x axis (height ratio 3:1).
fig, (ax, rax) = plt.subplots(
  nrows=2,
  ncols=1,
  figsize=(14,7),
  gridspec_kw={"height_ratios": (3, 1)},
  sharex=True,
)
fig.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9)

# Pick the histogram for `var`, integrate over the selected channels and cut,
# then sum over all samples.
h = hists[var].integrate('channel', channel).integrate('cut', cut).sum('sample')

# Draw on the upper panel and write the figure to `outname`.
hist.plot1d(h, ax=ax, line_opts={'color':'orange'})
fig.savefig(outname)
print('Output histogram saved in %s'%outname)