Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement skipBadFiles in RootEmbeddedFileSequence::readOneRandom #32821

Merged
merged 4 commits into from Feb 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
67 changes: 52 additions & 15 deletions IOPool/Input/src/RootEmbeddedFileSequence.cc
Expand Up @@ -15,10 +15,18 @@
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"

#include "CLHEP/Random/RandFlat.h"

#include <random>
#include <algorithm>
#include <atomic>

namespace {
std::atomic<unsigned int> badFilesSkipped_{0};
auto operator"" _uz(unsigned long long i) -> std::size_t { return std::size_t{i}; } // uz will be in C++23
} // namespace

namespace edm {
class EventPrincipal;
Expand All @@ -43,7 +51,8 @@ namespace edm {
initialNumberOfEventsToSkip_(pset.getUntrackedParameter<unsigned int>("skipEvents", 0U)),
treeCacheSize_(pset.getUntrackedParameter<unsigned int>("cacheSize", roottree::defaultCacheSize)),
enablePrefetching_(false),
enforceGUIDInFileName_(pset.getUntrackedParameter<bool>("enforceGUIDInFileName", false)) {
enforceGUIDInFileName_(pset.getUntrackedParameter<bool>("enforceGUIDInFileName", false)),
maxFileSkips_(pset.getUntrackedParameter<unsigned int>("maxFileSkips", std::min(3_uz, numberOfFiles()))) {
if (noFiles()) {
throw Exception(errors::NoSecondaryFiles)
<< "RootEmbeddedFileSequence no input files specified for secondary input source.\n";
Expand Down Expand Up @@ -92,17 +101,21 @@ namespace edm {
unsigned int seed;
f.read(reinterpret_cast<char*>(&seed), sizeof(seed));
std::default_random_engine dre(seed);
size_t count = numberOfFiles();
std::uniform_int_distribution<int> distribution(0, count - 1);
while (!rootFile() && count != 0) {
--count;
std::uniform_int_distribution<int> distribution(0, numberOfFiles() - 1);
while (!rootFile() && badFilesSkipped_ < maxFileSkips_) {
int offset = distribution(dre);
setAtFileSequenceNumber(offset);
initFile(input_.skipBadFiles());
if (not rootFile()) {
++badFilesSkipped_;
}
}
}
if (rootFile()) {
input_.productRegistryUpdate().updateFromInput(rootFile()->productRegistry()->productList());
} else {
throw Exception(errors::FileOpenError) << "RootEmbeddedFileSequence::RootEmbeddedFileSequence(): "
<< " input file retries exhausted.\n";
}
}

Expand Down Expand Up @@ -229,7 +242,7 @@ namespace edm {
if (!found) {
throw Exception(errors::NotFound) << "RootEmbeddedFileSequence::readOneSpecified(): Secondary Input files"
<< " do not contain specified event:\n"
<< id << "\n";
<< id << " in file id " << idx.fileNameHash() << "\n";
}
assert(rootFile());
found = rootFile()->readCurrentEvent(cache);
Expand All @@ -246,16 +259,36 @@ namespace edm {
assert(engine);
unsigned int currentSeqNumber = sequenceNumberOfFile();
while (eventsRemainingInFile_ == 0) {
unsigned int newSeqNumber = CLHEP::RandFlat::shootInt(engine, fileCatalogItems().size());
setAtFileSequenceNumber(newSeqNumber);
if (newSeqNumber != currentSeqNumber) {
initFile(false);
currentSeqNumber = newSeqNumber;
bool opened{false};
while (!opened && badFilesSkipped_ < maxFileSkips_) {
unsigned int newSeqNumber = CLHEP::RandFlat::shootInt(engine, fileCatalogItems().size());
setAtFileSequenceNumber(newSeqNumber);
if (newSeqNumber != currentSeqNumber) {
initFile(input_.skipBadFiles());
currentSeqNumber = newSeqNumber;
}
if (rootFile()) {
eventsRemainingInFile_ = rootFile()->eventTree().entries();
if (eventsRemainingInFile_ == 0) {
if (!input_.skipBadFiles()) {
throw Exception(errors::NotFound) << "RootEmbeddedFileSequence::readOneRandom(): Secondary Input file "
<< fileNames()[0] << " contains no events.\n";
}
LogWarning("RootEmbeddedFileSequence") << "RootEmbeddedFileSequence::readOneRandom(): Secondary Input file "
<< fileNames()[0] << " contains no events and will be skipped.\n";
++badFilesSkipped_;
} else {
opened = true;
}
} else {
if (newSeqNumber != currentSeqNumber) {
++badFilesSkipped_;
}
}
}
eventsRemainingInFile_ = rootFile()->eventTree().entries();
if (eventsRemainingInFile_ == 0) {
throw Exception(errors::NotFound) << "RootEmbeddedFileSequence::readOneRandom(): Secondary Input file "
<< fileNames()[0] << " contains no events.\n";
if (not opened) {
throw Exception(errors::FileOpenError) << "RootEmbeddedFileSequence::readOneRandom(): "
<< " input file retries exhausted.\n";
}
rootFile()->setAtEventEntry(CLHEP::RandFlat::shootInt(engine, eventsRemainingInFile_) - 1);
}
Expand Down Expand Up @@ -336,6 +369,10 @@ namespace edm {
desc.addUntracked<unsigned int>("skipEvents", 0U)
->setComment(
"Skip the first 'skipEvents' events. Used only if 'sequential' is True and 'sameLumiBlock' is False");
desc.addUntracked<unsigned int>("maxFileSkips")
->setComment(
"How many files to try if 'sequential' is False and 'skipBadFiles' is True.\n"
"Defaults to 3 (or # of files if smaller).");
desc.addUntracked<unsigned int>("cacheSize", roottree::defaultCacheSize)
->setComment("Size of ROOT TTree prefetch cache. Affects performance.");
desc.addUntracked<bool>("enforceGUIDInFileName", false)
Expand Down
1 change: 1 addition & 0 deletions IOPool/Input/src/RootEmbeddedFileSequence.h
Expand Up @@ -71,6 +71,7 @@ namespace edm {
unsigned int treeCacheSize_;
bool enablePrefetching_;
bool enforceGUIDInFileName_;
unsigned int maxFileSkips_;
}; // class RootEmbeddedFileSequence
} // namespace edm
#endif
2 changes: 1 addition & 1 deletion IOPool/Input/src/RootInputFileSequence.cc
Expand Up @@ -253,10 +253,10 @@ namespace edm {
if (filePtr) {
size_t currentIndexIntoFile = fileIter_ - fileIterBegin_;
rootFile_ = makeRootFile(filePtr);
assert(rootFile_);
if (input) {
rootFile_->setSignals(&(input->preEventReadFromSourceSignal_), &(input->postEventReadFromSourceSignal_));
}
assert(rootFile_);
fileIterLastOpened_ = fileIter_;
setIndexIntoFile(currentIndexIntoFile);
rootFile_->reportOpened(inputTypeName);
Expand Down
5 changes: 4 additions & 1 deletion IOPool/Input/src/RootInputFileSequence.h
Expand Up @@ -3,7 +3,10 @@

/*----------------------------------------------------------------------

RootInputFileSequence: This is an InputSource. initTheFile tries to open a file using a list of PFN names constructed from multiple data catalogs in site-local-config.xml. These are accessed via FileCatalogItem iterator fileIter_.
RootInputFileSequence: This is an InputSource. initTheFile tries to open
a file using a list of PFN names constructed from multiple data catalogs
in site-local-config.xml. These are accessed via FileCatalogItem iterator
fileIter_.

----------------------------------------------------------------------*/

Expand Down
42 changes: 42 additions & 0 deletions IOPool/SecondaryInput/test/SecondaryInputTestSkip_cfg.py
@@ -0,0 +1,42 @@
import FWCore.ParameterSet.Config as cms

process = cms.Process("PROD")
process.load("FWCore.Framework.test.cmsExceptionsFatal_cff")

process.maxEvents = cms.untracked.PSet(
input = cms.untracked.int32(75)
)
process.RandomNumberGeneratorService = cms.Service("RandomNumberGeneratorService",
Thing = cms.PSet(
initialSeed = cms.untracked.uint32(12345)
)
)

process.source = cms.Source("PoolSource",
skipBadFiles = cms.untracked.bool(True),
fileNames = cms.untracked.vstring(
'file:SecondaryInputTest.root',
'file:SecondaryInputTest.root',
'file:SecondaryInputTest.root'
)
)

process.Thing = cms.EDProducer("SecondaryProducer",
input = cms.SecSource("EmbeddedRootSource",
skipBadFiles = cms.untracked.bool(True),
maxFileSkips = cms.untracked.uint32(100),
fileNames = cms.untracked.vstring(
'file:SecondaryInputTest2.root',
'file:missing.root',
'file:SecondaryInputTest2.root'
)
)
)

process.Analysis = cms.EDAnalyzer("EventContentAnalyzer",
verbose = cms.untracked.bool(False)
)

process.p = cms.Path(process.Thing*process.Analysis)