From 7a095d4991d18c40bd4791e6475f22938fe41aa7 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 27 Jun 2022 15:27:46 +0200
Subject: [PATCH] Add a tool to convert data to the .raw format used as input by the HLT

---
 HLTrigger/Tools/README.md              |  26 +++
 HLTrigger/Tools/python/convertToRaw.py | 134 ++++++++++++++++
 HLTrigger/Tools/scripts/convertToRaw   | 236 ++++++++++++++++++++++++++++
 3 files changed, 396 insertions(+)
 create mode 100644 HLTrigger/Tools/README.md
 create mode 100644 HLTrigger/Tools/python/convertToRaw.py
 create mode 100755 HLTrigger/Tools/scripts/convertToRaw

diff --git a/HLTrigger/Tools/README.md b/HLTrigger/Tools/README.md
new file mode 100644
index 0000000000000..b240b7b8432c5
--- /dev/null
+++ b/HLTrigger/Tools/README.md
@@ -0,0 +1,26 @@
+# convertToRaw
+
+Convert RAW data stored in one or more EDM .root files into the .raw file used as input by the HLT.
+
+```
+usage: convertToRaw [-h] [-o PATH] [-f EVENTS] [-l EVENTS] [--one-file-per-lumi] FILES [FILES ...]
+
+Convert RAW data from .root format to .raw format.
+
+positional arguments:
+  FILES                 input files in .root format
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -o PATH, --output PATH
+                        base path to store the output files; subdirectories based on the run number are automatically created (default: )
+  -f EVENTS, --events_per_file EVENTS
+                        split the output into files with at most EVENTS events (default: 50)
+  -l EVENTS, --events_per_lumi EVENTS
+                        process at most EVENTS events in each lumisection (default: 11650)
+  --one-file-per-lumi   assume that lumisections are not split across files (and disable --events_per_lumi) (default: False)
+```
+
+The default behaviour is to process a single luminosity section at a time, in order to support luminosity sections split across multiple files and a limit on the number of events in each lumisection.
+
+If neither of these features is needed (_i.e._ if lumisections are not split, and all events should be converted) the `--one-file-per-lumi` option can be used to process all data with a single job, speeding up the conversion considerably.
diff --git a/HLTrigger/Tools/python/convertToRaw.py b/HLTrigger/Tools/python/convertToRaw.py
new file mode 100644
index 0000000000000..36163907e8207
--- /dev/null
+++ b/HLTrigger/Tools/python/convertToRaw.py
@@ -0,0 +1,134 @@
+# Convert the RAW data from EDM .root files into DAQ .raw format
+#
+# usage: cmsRun $CMSSW_RELEASE_BASE/HLTrigger/Tools/python/convertToRaw.py \
+#   inputFiles=/store/path/file.root[,/store/path/file.root,...] \
+#   runNumber=NNNNNN \
+#   [lumiNumber=NNNN] \
+#   [eventsPerFile=50] \
+#   [eventsPerLumi=11650] \
+#   [outputPath=output_directory]
+#
+# The output files will appear as output_directory/runNNNNNN/runNNNNNN_lumiNNNN_indexNNNNNN.raw .
+
+import sys
+import os
+import FWCore.ParameterSet.Config as cms
+import FWCore.ParameterSet.VarParsing as VarParsing
+
+process = cms.Process("FAKE")
+
+process.maxEvents = cms.untracked.PSet(
+    input = cms.untracked.int32(-1)  # to be overwritten after parsing the command line options
+)
+
+process.source = cms.Source("PoolSource",
+    fileNames = cms.untracked.vstring()  # to be overwritten after parsing the command line options
+)
+
+process.EvFDaqDirector = cms.Service( "EvFDaqDirector",
+    runNumber = cms.untracked.uint32( 0 ),  # to be overwritten after parsing the command line options
+    baseDir = cms.untracked.string( "" ),  # to be overwritten after parsing the command line options
+    buBaseDir = cms.untracked.string( "" ),  # to be overwritten after parsing the command line options
+    useFileBroker = cms.untracked.bool( False ),
+    fileBrokerKeepAlive = cms.untracked.bool( True ),
+    fileBrokerPort = cms.untracked.string( "8080" ),
+    fileBrokerUseLocalLock = cms.untracked.bool( True ),
+    fuLockPollInterval = cms.untracked.uint32( 2000 ),
+    requireTransfersPSet = cms.untracked.bool( False ),
+    selectedTransferMode = cms.untracked.string( "" ),
+    mergingPset = cms.untracked.string( "" ),
+    outputAdler32Recheck = cms.untracked.bool( False ),
+)
+
+process.writer = cms.OutputModule("RawStreamFileWriterForBU",
+    source = cms.InputTag('rawDataCollector'),
+    numEventsPerFile = cms.uint32(0)  # to be overwritten after parsing the command line options
+)
+
+process.endpath = cms.EndPath(process.writer)
+
+process.load('FWCore.MessageService.MessageLogger_cfi')
+process.MessageLogger.cerr.FwkReport.reportEvery = 0  # to be overwritten after parsing the command line options
+
+# parse command line options
+options = VarParsing.VarParsing ('python')
+for name in 'filePrepend', 'maxEvents', 'outputFile', 'secondaryOutputFile', 'section', 'tag', 'storePrepend', 'totalSections':
+    del options._register[name]
+    del options._beenSet[name]
+    del options._info[name]
+    del options._types[name]
+    if name in options._singletons:
+        del options._singletons[name]
+    if name in options._lists:
+        del options._lists[name]
+    if name in options._noCommaSplit:
+        del options._noCommaSplit[name]
+    if name in options._noDefaultClear:
+        del options._noDefaultClear[name]
+
+
+options.register('runNumber',
+                 0,
+                 VarParsing.VarParsing.multiplicity.singleton,
+                 VarParsing.VarParsing.varType.int,
+                 "Run number to use")
+
+options.register('lumiNumber',
+                 None,
+                 VarParsing.VarParsing.multiplicity.singleton,
+                 VarParsing.VarParsing.varType.int,
+                 "Luminosity section number to use")
+
+options.register('eventsPerLumi',
+                 11650,
+                 VarParsing.VarParsing.multiplicity.singleton,
+                 VarParsing.VarParsing.varType.int,
+                 "Number of events in the given luminosity section to process")
+
+options.register('eventsPerFile',
+                 50,
+                 VarParsing.VarParsing.multiplicity.singleton,
+                 VarParsing.VarParsing.varType.int,
+                 "Split the output into files with at most this number of events")
+
+options.register('outputPath',
+                 os.getcwd(),
+                 VarParsing.VarParsing.multiplicity.singleton,
+                 VarParsing.VarParsing.varType.string,
+                 "Output directory for the FED RAW data files")
+
+options.parseArguments()
+
+# check that the option values are valid
+if options.runNumber <= 0:
+    sys.stderr.write('Invalid run number\n')
+    sys.exit(1)
+
+if options.lumiNumber is not None and options.lumiNumber <= 0:
+    sys.stderr.write('Invalid luminosity section number\n')
+    sys.exit(1)
+
+if options.eventsPerLumi == 0 or options.eventsPerLumi < -1:
+    sys.stderr.write('Invalid number of events per luminosity section\n')
+    sys.exit(1)
+
+if options.eventsPerFile <= 0:
+    sys.stderr.write('Invalid number of events per output file\n')
+    sys.exit(1)
+
+# configure the job based on the command line options
+process.source.fileNames = options.inputFiles
+if options.lumiNumber is not None:
+    # process only one lumisection
+    process.source.lumisToProcess = cms.untracked.VLuminosityBlockRange('%d:%d' % (options.runNumber, options.lumiNumber))
+    process.maxEvents.input = options.eventsPerLumi
+process.EvFDaqDirector.runNumber = options.runNumber
+process.EvFDaqDirector.baseDir = options.outputPath
+process.EvFDaqDirector.buBaseDir = options.outputPath
+process.writer.numEventsPerFile = options.eventsPerFile
+process.MessageLogger.cerr.FwkReport.reportEvery = options.eventsPerFile
+
+# create the output directory, if it does not exist
+os.makedirs(options.outputPath, exist_ok=True)
+os.makedirs('%s/run%06d' % (options.outputPath, options.runNumber), exist_ok=True)
+open('%s/run%06d/fu.lock' % (options.outputPath, options.runNumber), 'w').close()
diff --git a/HLTrigger/Tools/scripts/convertToRaw b/HLTrigger/Tools/scripts/convertToRaw
new file mode 100755
index 0000000000000..6db19061d0403
--- /dev/null
+++ b/HLTrigger/Tools/scripts/convertToRaw
@@ -0,0 +1,236 @@
+#! /usr/bin/env python3
+
+import argparse
+import glob
+import json
+import os
+import os.path
+import re
+import shutil
+import socket
+import subprocess
+import sys
+
+def check_status(name, status):
+    """Report the failure of the external command `name` and exit with its return code.
+
+    `status` is the subprocess.CompletedProcess object returned by subprocess.run().
+    Do nothing if the command was successful. If the standard error of the command
+    was captured, it is forwarded to the standard error of this script.
+    """
+    if status.returncode == 0:
+        return
+
+    if status.returncode < 0:
+        sys.stderr.write('error: %s was killed by signal %d\n' % (name, -status.returncode))
+    else:
+        sys.stderr.write('error: %s exited with error code %d\n' % (name, status.returncode))
+    if status.stderr:
+        sys.stderr.write('\n')
+        sys.stderr.write(status.stderr)
+    sys.exit(status.returncode)
+
+
+def cmsRun(config, **args):
+    """Run cmsRun on `config`, passing each keyword argument as a `name=value` option.
+
+    The command line is written to the standard output before running it.
+    If cmsRun fails or is killed by a signal, report it and exit with its return code.
+    """
+    cmd = [ 'cmsRun', config ] + [ arg + '=' + str(val) for (arg, val) in args.items() ]
+    sys.stdout.write(' \\\n '.join(cmd))
+    sys.stdout.write('\n\n')
+    # flush the buffered output, so that it appears before the messages written by cmsRun
+    sys.stdout.flush()
+
+    # do not call status.check_returncode(): it would raise a CalledProcessError
+    # before the failure can be reported by check_status()
+    status = subprocess.run(cmd, stdout=None, stderr=None)
+    check_status('cmsRun', status)
+
+
+# default values
+events_per_file = 50
+events_per_lumi = 11650
+output_directory = ''
+
+parser = argparse.ArgumentParser(description='Convert RAW data from .root format to .raw format.', formatter_class = argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('files', type=str, metavar='FILES', nargs='+', help='input files in .root format')
+parser.add_argument('-o', '--output', type=str, dest='output_directory', metavar='PATH', default=output_directory, help='base path to store the output files; subdirectories based on the run number are automatically created')
+parser.add_argument('-f', '--events_per_file', type=int, dest='events_per_file', metavar='EVENTS', default=events_per_file, help='split the output into files with at most EVENTS events')
+parser.add_argument('-l', '--events_per_lumi', type=int, dest='events_per_lumi', metavar='EVENTS', default=events_per_lumi, help='process at most EVENTS events in each lumisection')
+parser.add_argument('--one-file-per-lumi', action='store_true', dest='one_file_per_lumi', default=False, help='assume that lumisections are not split across files (and disable --events_per_lumi)')
+
+# parse the command line arguments and options
+args = parser.parse_args()
+
+# fail early on the values that would be rejected by the conversion job
+if args.events_per_file <= 0:
+    parser.error('invalid number of events per output file')
+if not args.one_file_per_lumi and (args.events_per_lumi == 0 or args.events_per_lumi < -1):
+    parser.error('invalid number of events per luminosity section')
+
+# remove any trailing slash from the output path; an empty path (the default)
+# means the current directory, not the root of the filesystem
+if args.output_directory:
+    args.output_directory = os.path.normpath(args.output_directory)
+else:
+    args.output_directory = os.getcwd()
+
+# read the list of input files from the command line arguments
+files = [ 'file:' + f if (':' not in f and not f.startswith('/store/') and os.path.exists(f)) else f for f in args.files ]
+
+# extract the list of runs and lumisections in the input files
+class FileInfo:
+    """Number of events in a lumisection, and set of the files where they are stored."""
+    def __init__(self):
+        self.events = 0
+        self.files = set()
+
+header = re.compile(r'^ +Run +Lumi +# Events$')
+empty = re.compile(r'^ *$')
+content = {}
+
+for name in files:
+
+    # run edmFileUtil --eventsInLumis ...
+    output = subprocess.run(['edmFileUtil', '--eventsInLumis', name], capture_output=True, text=True)
+    check_status('edmFileUtil', output)
+
+    # parse the output of edmFileUtil
+    parsing = False
+    for line in output.stdout.splitlines():
+        if not parsing and header.match(line):
+            # start parsing
+            parsing = True
+            continue
+
+        if parsing and empty.match(line):
+            # stop parsing
+            parsing = False
+            continue
+
+        if parsing:
+            run, lumi, events = tuple(map(int, line.split()))
+            if run not in content:
+                content[run] = {}
+            if lumi not in content[run]:
+                content[run][lumi] = FileInfo()
+            content[run][lumi].events += events
+            content[run][lumi].files.add(name)
+
+# drop empty lumisections
+for run in content:
+    empty_lumis = [ lumi for lumi in content[run] if content[run][lumi].events == 0 ]
+    for lumi in empty_lumis:
+        del content[run][lumi]
+
+# drop empty runs
+empty_runs = [ run for run in content if not content[run] ]
+for run in empty_runs:
+    del content[run]
+
+# locate the CMSSW configuration file, looking in the local area first and then in the release area
+config_name = 'HLTrigger/Tools/python/convertToRaw.py'
+config_py = None
+for variable in ('CMSSW_BASE', 'CMSSW_RELEASE_BASE'):
+    area = os.environ.get(variable)
+    if area and os.path.exists(os.path.join(area, 'src', config_name)):
+        config_py = os.path.join(area, 'src', config_name)
+        break
+if config_py is None:
+    sys.stderr.write('error: cannot find the configuration file %s\n' % config_name)
+    sys.exit(1)
+
+# convert the input data to FED RAW data format
+
+# process each run
+for run in sorted(content):
+    for lumi in sorted(content[run]):
+        print("events: %d" % content[run][lumi].events)
+        print("file: %s" % ', '.join(content[run][lumi].files))
+
+for run in sorted(content):
+
+    # create the output directory structure
+    run_name = f'run{run:06d}'
+    run_path = os.path.join(args.output_directory, run_name)
+    shutil.rmtree(run_path, ignore_errors=True)
+    os.makedirs(run_path)
+
+    if args.one_file_per_lumi:
+        # process the whole run
+        lumis = sorted(content[run])
+        print('found run %d, lumis %d-%d, with %d events' % (run, min(lumis), max(lumis), sum(content[run][lumi].events for lumi in lumis)))
+        cmsRun(config_py, inputFiles = ','.join(files), runNumber = run, eventsPerFile = args.events_per_file, outputPath = args.output_directory)
+
+    else:
+        # process lumisections individually, then merge the output
+        summary = {
+            'data': [0, 0, 0, 0],   # [ 'events', 'files', 'lumisections', 'last lumisection' ]
+            'definition': os.path.join(run_path, 'jsd', 'EoR.jsd'),
+            'source': socket.getfqdn() + '_' + str(os.getpid())
+        }
+
+        for lumi in sorted(content[run]):
+
+            # process individual lumisections
+            info = content[run][lumi]
+            print('found run %d, lumi %d, with %d events' % (run, lumi, info.events))
+            lumi_base_path = os.path.join(args.output_directory, f'{run_name}_ls{lumi:04d}')
+            shutil.rmtree(lumi_base_path, ignore_errors=True)
+            os.makedirs(lumi_base_path)
+            # pass the input files in the order given on the command line, to make the job reproducible
+            lumi_inputs = [ name for name in files if name in info.files ]
+            cmsRun(config_py, inputFiles = ','.join(lumi_inputs), runNumber = run, lumiNumber = lumi, eventsPerLumi = args.events_per_lumi, eventsPerFile = args.events_per_file, outputPath = lumi_base_path)
+
+            # merge all lumisections data
+
+            # number of events expected to be processed
+            if args.events_per_lumi < 0:
+                expected_events = info.events
+            else:
+                expected_events = min(args.events_per_lumi, info.events)
+
+            # number of files expected to be created
+            expected_files = (expected_events + args.events_per_file - 1) // args.events_per_file
+
+            # find the files produced by the conversion job and move them to the per-run path
+            lumi_path = os.path.join(lumi_base_path, run_name)
+
+            # jsd files
+            jsd_path = os.path.join(lumi_path, 'jsd')
+            if not os.path.exists(os.path.join(run_path, 'jsd')):
+                shutil.move(jsd_path, run_path)
+            else:
+                shutil.rmtree(jsd_path)
+
+            # lumisection data and EoLS files
+            for lumi_file in glob.glob(os.path.join(lumi_path, f'{run_name}_ls{lumi:04d}_*')):
+                shutil.move(lumi_file, run_path)
+
+            # read the partial EoR file
+            eor_file = os.path.join(lumi_path, f'{run_name}_ls0000_EoR.jsn')
+            with open(eor_file) as eor_json:
+                eor = json.load(eor_json)
+
+            # check that the job produced what was expected; assert is not used because it is disabled by python -O
+            produced = [ int(value) for value in eor['data'][:4] ]
+            expected = [ expected_events, expected_files, 1, lumi ]
+            if produced != expected:
+                sys.stderr.write('error: unexpected content of %s: found %s, expected %s\n' % (eor_file, produced, expected))
+                sys.exit(1)
+            summary['data'][0] += expected_events
+            summary['data'][1] += expected_files
+            summary['data'][2] += 1
+            summary['data'][3] = lumi
+            os.remove(eor_file)
+
+            # remove the intermediate directory
+            shutil.rmtree(lumi_base_path, ignore_errors=True)
+
+        # write the final EoR file
+        # implemented by hand instead of using json.dump() to match the style used by the DAQ tools
+        eor_file = os.path.join(run_path, f'{run_name}_ls0000_EoR.jsn')
+        with open(eor_file, 'w') as eor_json:
+            eor_json.write('{\n "data" : [ "%d", "%d", "%d", "%d" ],\n "definition" : "%s",\n "source" : "%s"\n}\n' % (summary['data'][0], summary['data'][1], summary['data'][2], summary['data'][3], summary['definition'], summary['source']))