Skip to content

Commit

Permalink
Add a tool to convert data to the .raw format used as input by the HLT
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Jun 27, 2022
1 parent 767a838 commit 7a095d4
Show file tree
Hide file tree
Showing 3 changed files with 370 additions and 0 deletions.
26 changes: 26 additions & 0 deletions HLTrigger/Tools/README.md
@@ -0,0 +1,26 @@
# convertToRaw

Convert RAW data stored in one or more EDM .root files into the .raw file used as input by the HLT.

```
usage: convertToRaw [-h] [-o PATH] [-f EVENTS] [-l EVENTS] [--one-file-per-lumi] FILES [FILES ...]
Convert RAW data from .root format to .raw format.
positional arguments:
FILES input files in .root format
optional arguments:
-h, --help show this help message and exit
-o PATH, --output PATH
base path to store the output files; subdirectories based on the run number are automatically created (default: )
-f EVENTS, --events_per_file EVENTS
split the output into files with at most EVENTS events (default: 50)
-l EVENTS, --events_per_lumi EVENTS
process at most EVENTS events in each lumisection (default: 11650)
--one-file-per-lumi assume that lumisections are not split across files (and disable --events_per_lumi) (default: False)
```

The default behaviour is to process a single luminosity section at a time, in order to support luminosity sections split across multiple files and a limit on the number of events in each lumisection.

If neither of these features is needed (_i.e._ if lumisections are not split, and all events should be converted) the `--one-file-per-lumi` option can be used to process all data with a single job, speeding up the conversion considerably.
134 changes: 134 additions & 0 deletions HLTrigger/Tools/python/convertToRaw.py
@@ -0,0 +1,134 @@
# Convert the RAW data from EDM .root files into DAQ .raw format
#
# usage: cmsRun $CMSSW_RELEASE_BASE/src/HLTrigger/Tools/python/convertToRaw.py \
# inputFiles=/store/path/file.root[,/store/path/file.root,...] \
# runNumber=NNNNNN \
# [lumiNumber=NNNN] \
# [eventsPerFile=50] \
# [eventsPerLumi=11650] \
# [outputPath=output_directory]
#
# The output files will appear as output_directory/runNNNNNN/runNNNNNN_lsNNNN_indexNNNNNN.raw .

import sys
import os
import FWCore.ParameterSet.Config as cms
import FWCore.ParameterSet.VarParsing as VarParsing

# Conversion process: events are read by a PoolSource and handed to the
# RawStreamFileWriterForBU output module, with the EvFDaqDirector service
# configured alongside. Values marked as placeholders are replaced further
# down, once the command line options have been parsed.
process = cms.Process("FAKE")

process.maxEvents = cms.untracked.PSet(
    # may be overwritten after parsing the command line options
    input = cms.untracked.int32(-1)
)

process.source = cms.Source(
    "PoolSource",
    # placeholder, overwritten after parsing the command line options
    fileNames = cms.untracked.vstring()
)

process.EvFDaqDirector = cms.Service(
    "EvFDaqDirector",
    # placeholders, overwritten after parsing the command line options
    runNumber = cms.untracked.uint32(0),
    baseDir = cms.untracked.string(""),
    buBaseDir = cms.untracked.string(""),
    # fixed settings
    useFileBroker = cms.untracked.bool(False),
    fileBrokerKeepAlive = cms.untracked.bool(True),
    fileBrokerPort = cms.untracked.string("8080"),
    fileBrokerUseLocalLock = cms.untracked.bool(True),
    fuLockPollInterval = cms.untracked.uint32(2000),
    requireTransfersPSet = cms.untracked.bool(False),
    selectedTransferMode = cms.untracked.string(""),
    mergingPset = cms.untracked.string(""),
    outputAdler32Recheck = cms.untracked.bool(False),
)

process.writer = cms.OutputModule(
    "RawStreamFileWriterForBU",
    source = cms.InputTag('rawDataCollector'),
    # placeholder, overwritten after parsing the command line options
    numEventsPerFile = cms.uint32(0)
)

process.endpath = cms.EndPath(process.writer)

process.load('FWCore.MessageService.MessageLogger_cfi')
# placeholder, overwritten after parsing the command line options
process.MessageLogger.cerr.FwkReport.reportEvery = 0

# parse command line options
options = VarParsing.VarParsing('python')

# Unregister the options that come with the 'python' preset and are not used by
# this configuration, by removing them from the internal tables of the parser.
for name in ('filePrepend', 'maxEvents', 'outputFile', 'secondaryOutputFile',
             'section', 'tag', 'storePrepend', 'totalSections'):
    # tables in which every one of these options is expected to be present
    for table in (options._register, options._beenSet, options._info, options._types):
        del table[name]
    # tables in which an option may or may not be present
    for table in (options._singletons, options._lists, options._noCommaSplit, options._noDefaultClear):
        if name in table:
            del table[name]


# Register the options understood by this configuration; all of them take a
# single value. The entries are (name, default, type, description).
_singleton = VarParsing.VarParsing.multiplicity.singleton
_int = VarParsing.VarParsing.varType.int
_string = VarParsing.VarParsing.varType.string

for name, default, kind, description in (
    ('runNumber', 0, _int, "Run number to use"),
    ('lumiNumber', None, _int, "Luminosity section number to use"),
    ('eventsPerLumi', 11650, _int, "Number of events in the given luminosity section to process"),
    ('eventsPerFile', 50, _int, "Split the output into files with at most this number of events"),
    ('outputPath', os.getcwd(), _string, "Output directory for the FED RAW data files"),
):
    options.register(name, default, _singleton, kind, description)

options.parseArguments()

# check that the option values are valid: report the first invalid one and stop
_invalid_options = (
    # a run number is mandatory and must be positive
    (options.runNumber <= 0,
     'Invalid run number\n'),
    # the luminosity section is optional, but must be positive when given
    (options.lumiNumber is not None and options.lumiNumber <= 0,
     'Invalid luminosity section number\n'),
    # only -1 or a positive number of events per luminosity section is accepted
    (options.eventsPerLumi == 0 or options.eventsPerLumi < -1,
     'Invalid number of events per luminosity section\n'),
    # the number of events per output file must be positive
    (options.eventsPerFile <= 0,
     'Invalid number of events per output file\n'),
)

for is_invalid, message in _invalid_options:
    if is_invalid:
        sys.stderr.write(message)
        sys.exit(1)

# configure the job based on the command line options
process.source.fileNames = options.inputFiles
if options.lumiNumber is not None:
    # process only one lumisection, with a limit on the number of events
    lumi_range = '%d:%d' % (options.runNumber, options.lumiNumber)
    process.source.lumisToProcess = cms.untracked.VLuminosityBlockRange(lumi_range)
    process.maxEvents.input = options.eventsPerLumi

# the same directory is used for both base paths of the EvFDaqDirector
process.EvFDaqDirector.runNumber = options.runNumber
process.EvFDaqDirector.baseDir = options.outputPath
process.EvFDaqDirector.buBaseDir = options.outputPath

# the progress report is printed once per output file
process.writer.numEventsPerFile = options.eventsPerFile
process.MessageLogger.cerr.FwkReport.reportEvery = options.eventsPerFile

# create the output directory and the per-run directory, if they do not exist
run_directory = '%s/run%06d' % (options.outputPath, options.runNumber)
os.makedirs(options.outputPath, exist_ok=True)
os.makedirs(run_directory, exist_ok=True)

# create (or truncate) an empty fu.lock file in the per-run directory
# NOTE(review): presumably required by the EvFDaqDirector - confirm
with open(run_directory + '/fu.lock', 'w'):
    pass
210 changes: 210 additions & 0 deletions HLTrigger/Tools/scripts/convertToRaw
@@ -0,0 +1,210 @@
#! /usr/bin/env python3

import argparse
import glob
import json
import os, os.path
import re
import shutil
import socket
import subprocess
import sys

def cmsRun(config, **args):
    """Run cmsRun on the given configuration file and stop the script if it fails.

    Each keyword argument is passed to cmsRun as a `name=value` command line
    option. The full command line is echoed to stdout before it is executed,
    and the output of cmsRun is not captured.

    Returns None if cmsRun exits successfully. If cmsRun is killed by a signal
    or exits with a non-zero code, an error message is written to stderr and
    sys.exit() is called with the return code of cmsRun.
    """
    cmd = [ 'cmsRun', config ] + [ arg + '=' + str(val) for (arg, val) in args.items() ]
    sys.stdout.write(' \\\n '.join(cmd))
    sys.stdout.write('\n\n')
    # cmsRun writes directly to the inherited stdout: flush the echoed command
    # first, so that it shows up before the output of the job
    sys.stdout.flush()
    status = subprocess.run(cmd, stdout=None, stderr=None)

    # handle error conditions
    # Note: status.check_returncode() is deliberately not used, because it would
    # raise CalledProcessError before any of the messages below could be printed.
    if status.returncode < 0:
        sys.stderr.write('error: cmsRun was killed by signal %d\n' % -status.returncode)
        sys.exit(status.returncode)
    elif status.returncode > 0:
        sys.stderr.write('error: cmsRun exited with error code %d\n' % status.returncode)
        sys.exit(status.returncode)


# default values
events_per_file = 50
events_per_lumi = 11650
# Default to the current directory, matching the default of the outputPath
# option of convertToRaw.py. With an empty base path the per-run directories
# would be removed and re-created directly under the filesystem root ('/runNNNNNN').
output_directory = os.getcwd()

parser = argparse.ArgumentParser(description='Convert RAW data from .root format to .raw format.', formatter_class = argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('files', type=str, metavar='FILES', nargs='+', help='input files in .root format')
parser.add_argument('-o', '--output', type=str, dest='output_directory', metavar='PATH', default=output_directory, help='base path to store the output files; subdirectories based on the run number are automatically created')
parser.add_argument('-f', '--events_per_file', type=int, dest='events_per_file', metavar='EVENTS', default=events_per_file, help='split the output into files with at most EVENTS events')
parser.add_argument('-l', '--events_per_lumi', type=int, dest='events_per_lumi', metavar='EVENTS', default=events_per_lumi, help='process at most EVENTS events in each lumisection')
parser.add_argument('--one-file-per-lumi', action='store_true', dest='one_file_per_lumi', default=False, help='assume that lumisections are not split across files (and disable --events_per_lumi)')

# parse the command line arguments and options
args = parser.parse_args()
if not args.output_directory:
    # an explicitly empty path also means the current directory
    args.output_directory = output_directory
# drop a trailing slash: the subdirectories are appended as '/runNNNNNN'
if args.output_directory.endswith('/'):
    args.output_directory = args.output_directory[:-1]

# read the list of input files from the command line arguments; the 'file:' prefix is
# added to existing local files given without a protocol and not starting with '/store/'
files = [ 'file:' + f if (':' not in f and not f.startswith('/store/') and os.path.exists(f)) else f for f in args.files ]

# extract the list of runs and lumisections in the input files
class FileInfo:
    """Number of events found for one entry, and the set of files they were found in."""

    def __init__(self):
        # start empty: the set of files and the event count are filled in by the caller
        self.files = set()
        self.events = 0

# header line and terminating empty line of the table printed by edmFileUtil
header = re.compile(r'^ +Run +Lumi +# Events$')
empty = re.compile(r'^ *$')
# content[run][lumi] holds the number of events and the files of each lumisection
content = {}

for f in files:

    # run edmFileUtil --eventsInLumis ... and stop at the first failure
    result = subprocess.run(['edmFileUtil', '--eventsInLumis', f], capture_output=True, text=True)
    if result.returncode != 0:
        if result.returncode < 0:
            sys.stderr.write('error: edmFileUtil was killed by signal %d\n' % -result.returncode)
        else:
            sys.stderr.write('error: edmFileUtil exited with error code %d\n' % result.returncode)
        sys.stderr.write('\n')
        sys.stderr.write(result.stderr)
        sys.exit(result.returncode)

    # parse the output of edmFileUtil: the rows between the header and the next empty line
    in_table = False
    for row in result.stdout.splitlines():
        if not in_table:
            # skip everything up to and including the header
            in_table = header.match(row) is not None
            continue

        if empty.match(row):
            # an empty line marks the end of the table
            in_table = False
            continue

        # each row holds the run number, the lumisection number and the number of events
        run, lumi, events = (int(field) for field in row.split())
        info = content.setdefault(run, {}).setdefault(lumi, FileInfo())
        info.events += events
        info.files.add(f)

# drop empty lumisections (the keys are collected first, as the dictionary cannot be modified while iterating over it)
for lumis_in_run in content.values():
    for empty_lumi in [ number for number, info in lumis_in_run.items() if info.events == 0 ]:
        del lumis_in_run[empty_lumi]

# drop the runs that are left without any lumisection
for empty_run in [ number for number, lumis_in_run in content.items() if not lumis_in_run ]:
    del content[empty_run]

# locate the CMSSW configuration file, looking first in the area given by
# CMSSW_BASE and then in the one given by CMSSW_RELEASE_BASE
config_name = 'HLTrigger/Tools/python/convertToRaw.py'
current_area = os.environ['CMSSW_BASE']
release_area = os.environ['CMSSW_RELEASE_BASE']

for area in (current_area, release_area):
    config_py = area + '/src/' + config_name
    if os.path.exists(config_py):
        break
else:
    # the configuration file was not found in either area
    sys.stderr.write('error: cannot find the configuration file %s\n' % config_name)
    sys.exit(1)

# convert the input data to FED RAW data format

# process each run
# print the number of events and the input files of each lumisection, ordered by run and lumisection
for run_number in sorted(content):
    lumis_in_run = content[run_number]
    for lumi_number in sorted(lumis_in_run):
        info = lumis_in_run[lumi_number]
        print("events: %d" % info.events)
        print("file: %s" % ', '.join(info.files))

# Convert each run, either with a single job for the whole run or with one job
# per lumisection followed by a merge of the per-lumisection output.
for run in sorted(content):

    # create the output directory structure, discarding what is already at that path
    run_path = args.output_directory + f'/run{run:06d}'
    shutil.rmtree(run_path, ignore_errors=True)
    os.makedirs(run_path)

    if args.one_file_per_lumi:
        # process the whole run with a single job, without any limit on the events per lumisection
        lumis = sorted(content[run])
        print('found run %d, lumis %d-%d, with %d events' % (run, min(lumis), max(lumis), sum(content[run][lumi].events for lumi in lumis)))
        cmsRun(config_py, inputFiles = ','.join(files), runNumber = run, eventsPerFile = args.events_per_file, outputPath = args.output_directory)
        continue

    # process lumisections individually, then merge the output
    summary = {
        'data': [0, 0, 0, 0], # [ 'events', 'files', 'lumisections', 'last lumisection' ]
        'definition': run_path + '/jsd/EoR.jsd',
        'source': socket.getfqdn() + '_' + str(os.getpid())
    }

    for lumi in sorted(content[run]):

        # process individual lumisections, each one in its own intermediate directory
        print('found run %d, lumi %d, with %d events' % (run, lumi, content[run][lumi].events))
        lumi_base_path = args.output_directory + f'/run{run:06d}_ls{lumi:04d}'
        shutil.rmtree(lumi_base_path, ignore_errors=True)
        os.makedirs(lumi_base_path)
        cmsRun(config_py, inputFiles = ','.join(content[run][lumi].files), runNumber = run, lumiNumber = lumi, eventsPerLumi = args.events_per_lumi, eventsPerFile = args.events_per_file, outputPath = lumi_base_path)

        # merge the data of this lumisection into the per-run path

        # number of events expected to be processed (a negative limit means all events)
        if args.events_per_lumi < 0:
            expected_events = content[run][lumi].events
        else:
            expected_events = min(args.events_per_lumi, content[run][lumi].events)

        # number of files expected to be created (rounded up)
        expected_files = (expected_events + args.events_per_file - 1) // args.events_per_file

        # find the files produced by the conversion job and move them to the per-run path
        lumi_path = lumi_base_path + f'/run{run:06d}'

        # jsd files: keep those of the first lumisection, discard the others
        jsd_path = lumi_path + '/jsd'
        if not os.path.exists(run_path + '/jsd'):
            shutil.move(jsd_path, run_path)
        else:
            shutil.rmtree(jsd_path)

        # lumisection data and EoLS files
        lumi_files = glob.glob(lumi_path + f'/run{run:06d}_ls{lumi:04d}_*')
        for f in lumi_files:
            shutil.move(f, run_path + '/')

        # read the partial EoR file
        eor_file = lumi_path + f'/run{run:06d}_ls0000_EoR.jsn'
        with open(eor_file) as f:
            eor = json.load(f)

        # Check that the job produced what was expected. This is an explicit check
        # rather than an assert, so that it is not skipped when running with -O
        # and so that a mismatch is reported with a meaningful message.
        produced = [ int(value) for value in eor['data'][:4] ]
        expected = [ expected_events, expected_files, 1, lumi ]
        if produced != expected:
            sys.stderr.write('error: unexpected content in %s: found [events, files, lumisections, last lumisection] = %s, expected %s\n' % (eor_file, produced, expected))
            sys.exit(1)

        # update the per-run summary
        summary['data'][0] += expected_events
        summary['data'][1] += expected_files
        summary['data'][2] += 1
        summary['data'][3] = lumi
        os.remove(eor_file)

        # remove the intermediate directory
        shutil.rmtree(lumi_base_path, ignore_errors=True)

    # write the final EoR file
    # implemented by hand instead of using json.dump() to match the style used by the DAQ tools
    eor_file = run_path + f'/run{run:06d}_ls0000_EoR.jsn'
    with open(eor_file, 'w') as f:
        f.write('{\n "data" : [ "%d", "%d", "%d", "%d" ],\n "definition" : "%s",\n "source" : "%s"\n}\n' % (summary['data'][0], summary['data'][1], summary['data'][2], summary['data'][3], summary['definition'], summary['source']))

0 comments on commit 7a095d4

Please sign in to comment.