Skip to content

Commit

Permalink
Merge pull request #22861 from drkovalskyi/split_read_mode_10_2_X
Browse files Browse the repository at this point in the history
[Tag-N-Probe] Split read mode
  • Loading branch information
cmsbuild committed Apr 20, 2018
2 parents 713132b + f09b280 commit 7ecedc2
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 44 deletions.
10 changes: 10 additions & 0 deletions PhysicsTools/TagAndProbe/interface/TagProbeFitter.h
Expand Up @@ -58,6 +58,11 @@ class TagProbeFitter {

/// suppress most of the output from RooFit and Minuit
void setQuiet(bool quiet_=true);

/// split mode - use it for very large input files (slower that non-split mode, which is the default)
/// 0 - import input TTree as a whole (non-split mode)
/// non-zero value - use split reading mode and read specified number of events for each iteration
void setSplitMode(unsigned int nevents);

protected:
///pointer to the input TTree Chain of data
Expand Down Expand Up @@ -116,6 +121,11 @@ class TagProbeFitter {
/// suppress most printout
bool quiet;

/// split mode - use it for very large input files (slower that non-split mode, which is the default)
/// 0 - import input TTree as a whole (non-split mode)
/// non-zero value - use split reading mode and read specified number of events for each iteration
unsigned int split_mode;

///fix or release variables selected by user
void varFixer(RooWorkspace* w, bool fix);
///store values in the vector
Expand Down
5 changes: 4 additions & 1 deletion PhysicsTools/TagAndProbe/plugins/TagProbeFitTreeAnalyzer.cc
Expand Up @@ -20,6 +20,7 @@ class TagProbeFitTreeAnalyzer : public edm::EDAnalyzer{
void calculateEfficiency(string name, const edm::ParameterSet& pset);
private:
TagProbeFitter fitter;
unsigned int split_mode; // number of events to read per cycle (slower, but memory efficient)
};

TagProbeFitTreeAnalyzer::TagProbeFitTreeAnalyzer(const edm::ParameterSet& pset):
Expand All @@ -31,9 +32,11 @@ TagProbeFitTreeAnalyzer::TagProbeFitTreeAnalyzer(const edm::ParameterSet& pset):
pset.existsAs<bool>("SaveWorkspace")?pset.getParameter<bool>("SaveWorkspace"):false,
pset.existsAs<bool>("floatShapeParameters")?pset.getParameter<bool>("floatShapeParameters"):true,
pset.existsAs<vector<string> >("fixVars")?pset.getParameter<vector<string> >("fixVars"):vector<string>()
)
),
split_mode( pset.existsAs<unsigned int>("SplitMode")?pset.getParameter<unsigned int>("SplitMode"):0 )
{
fitter.setQuiet(pset.getUntrackedParameter("Quiet",false));
fitter.setSplitMode( split_mode );

if (pset.existsAs<bool>("binnedFit")) {
bool binned = pset.getParameter<bool>("binnedFit");
Expand Down
195 changes: 152 additions & 43 deletions PhysicsTools/TagAndProbe/src/TagProbeFitter.cc
Expand Up @@ -44,6 +44,7 @@
#include "RooThresholdCategory.h"
#include "RooTrace.h"
#include "RooWorkspace.h"
#include "RooTreeDataStore.h"

using namespace std;
using namespace RooFit;
Expand Down Expand Up @@ -78,6 +79,8 @@ TagProbeFitter::TagProbeFitter(const std::vector<std::string>& inputFileNames, s
// make integration very precise
RooAbsReal::defaultIntegratorConfig()->setEpsAbs(1e-13);
RooAbsReal::defaultIntegratorConfig()->setEpsRel(1e-13);

split_mode = 0;
}

TagProbeFitter::~TagProbeFitter(){
Expand All @@ -95,6 +98,11 @@ void TagProbeFitter::setQuiet(bool quiet_) {
RooMsgService::instance().setGlobalKillBelow(RooFit::WARNING);
}
}

void TagProbeFitter::setSplitMode(unsigned int nevents) {
split_mode = nevents;
}

bool TagProbeFitter::addVariable(string name, string title, double low, double hi, string units){
RooRealVar temp(name.c_str(), title.c_str(), low, hi, units.c_str());
temp.setBins(5000,"cache");
Expand Down Expand Up @@ -158,25 +166,23 @@ string TagProbeFitter::calculateEfficiency(string dirName,const std::vector<stri
for(map<string, vector<double> >::iterator v=binnedReals.begin(); v!=binnedReals.end(); v++){
TString name = v->first;
if (variables.find(name) == nullptr) { cerr << "Binned variable '"<<name<<"' not found." << endl; return "Error"; }
binnedVariables.addClone(variables[name]);
binnedVariables.add(*dataVars.addClone(variables[name]));
((RooRealVar&)binnedVariables[name]).setBinning( RooBinning(v->second.size()-1, &v->second[0]) );
binCategories.addClone( RooBinningCategory(name+"_bins", name+"_bins", (RooRealVar&)binnedVariables[name]) );
}
dataVars.addClone(binnedVariables, true);

//collect the category variables and the corresponding mapped categories
RooArgSet categories;
RooArgSet mappedCategories;
for(map<string, vector<string> >::iterator v=binnedCategories.begin(); v!=binnedCategories.end(); v++){
TString name = v->first;
if (variables.find(name) == nullptr) { cerr << "Binned category '"<<name<<"' not found." << endl; return "Error"; }
categories.addClone(variables[name]);
categories.add(*dataVars.addClone(variables[name]));
mappedCategories.addClone(RooMappedCategory(name+"_bins", name+"_bins", (RooCategory&)categories[name]));
for(unsigned int i = 0; i<v->second.size(); i++){
((RooMappedCategory&)mappedCategories[name+"_bins"]).map(v->second[i].c_str(), name+"_"+TString(v->second[i].c_str()).ReplaceAll(",","_"));
}
}
dataVars.addClone(categories, true);

// add the efficiency category if it's not a dynamic one
for (vector<string>::const_iterator effCat = effCats.begin(); effCat != effCats.end(); ++effCat) {
Expand All @@ -199,41 +205,45 @@ string TagProbeFitter::calculateEfficiency(string dirName,const std::vector<stri


//now add the necessary mass and passing variables to make the unbinned RooDataSet
RooDataSet data("data", "data", inputTree,
dataVars,
/*selExpr=*/"", /*wgtVarName=*/(weightVar.empty() ? nullptr : weightVar.c_str()));

// Now add all expressions that are computed dynamically
for(vector<pair<pair<string,string>, pair<string, vector<string> > > >::const_iterator ev = expressionVars.begin(), eve = expressionVars.end(); ev != eve; ++ev){
RooArgList args;
for (vector<string>::const_iterator it = ev->second.second.begin(), ed = ev->second.second.end(); it != ed; ++it) {
args.add(dataVars[it->c_str()]);
}
RooFormulaVar expr(ev->first.first.c_str(), ev->first.second.c_str(), ev->second.first.c_str(), args);
RooRealVar *col = (RooRealVar *) data.addColumn(expr);
dataVars.addClone(*col);
RooDataSet* data(nullptr);
if (not split_mode){
data = new RooDataSet("data", "data", inputTree,
dataVars,
/*selExpr=*/"", /*wgtVarName=*/(weightVar.empty() ? nullptr : weightVar.c_str()));

// Now add all expressions that are computed dynamically
for(vector<pair<pair<string,string>, pair<string, vector<string> > > >::const_iterator ev = expressionVars.begin(), eve = expressionVars.end(); ev != eve; ++ev){
RooArgList args;
for (vector<string>::const_iterator it = ev->second.second.begin(), ed = ev->second.second.end(); it != ed; ++it) {
args.add(dataVars[it->c_str()]);
}
RooFormulaVar expr(ev->first.first.c_str(), ev->first.second.c_str(), ev->second.first.c_str(), args);
RooRealVar *col = (RooRealVar *) data->addColumn(expr);
dataVars.addClone(*col);
}

// And add all dynamic categories from thresholds
for(vector<pair<pair<string,string>, pair<string, double> > >::const_iterator tc = thresholdCategories.begin(), tce = thresholdCategories.end(); tc != tce; ++tc){
RooThresholdCategory tmp(tc->first.first.c_str(), tc->first.second.c_str(), (RooAbsReal &)dataVars[tc->second.first.c_str()], "above", 1);
tmp.addThreshold(tc->second.second, "below",0);
RooCategory *cat = (RooCategory *) data.addColumn(tmp);
RooCategory *cat = (RooCategory *) data->addColumn(tmp);
dataVars.addClone(*cat);
}

}

//merge the bin categories to a MultiCategory for convenience
RooMultiCategory allCats("allCats", "allCats", RooArgSet(binCategories, mappedCategories));
data.addColumn(allCats);
string effName;
//setup the efficiency category
if (effCats.size() == 1) {

if (not split_mode){
data->addColumn(allCats);
//setup the efficiency category
if (effCats.size() == 1) {
effName = effCats.front() + "::" + effStates.front();
RooMappedCategory efficiencyCategory("_efficiencyCategory_", "_efficiencyCategory_", (RooCategory&)dataVars[effCats.front().c_str()], "Failed");
efficiencyCategory.map(effStates.front().c_str(), "Passed");
data.addColumn( efficiencyCategory );
} else {
data->addColumn( efficiencyCategory );
} else {
RooArgSet rooEffCats;
string multiState = "{";
for (size_t i = 0; i < effCats.size(); ++i) {
Expand All @@ -246,50 +256,145 @@ string TagProbeFitter::calculateEfficiency(string dirName,const std::vector<stri
RooMultiCategory efficiencyMultiCategory("_efficiencyMultiCategory_", "_efficiencyMultiCategory_", rooEffCats);
RooMappedCategory efficiencyCategory("_efficiencyCategory_", "_efficiencyCategory_", efficiencyMultiCategory, "Failed");
efficiencyCategory.map(multiState.c_str(), "Passed");
data.addColumn( efficiencyCategory );
data->addColumn( efficiencyCategory );
}
}

//setup the pdf category
RooMappedCategory pdfCategory("_pdfCategory_", "_pdfCategory_", allCats, (!binToPDFmap.empty())?binToPDFmap[0].c_str():"all");
for(unsigned int i = 1; i<binToPDFmap.size(); i+=2){
pdfCategory.map(binToPDFmap[i].c_str(), binToPDFmap[i+1].c_str());
}
data.addColumn( pdfCategory );
if (not split_mode) data->addColumn( pdfCategory );

//create the empty efficiency datasets from the binned variables
RooRealVar efficiency("efficiency", "Efficiency", 0, 1);

RooDataSet fitEfficiency("fit_eff", "Efficiency from unbinned ML fit", RooArgSet(RooArgSet(binnedVariables, categories), efficiency), StoreAsymError(RooArgSet(binnedVariables, efficiency)));
// RooDataSet sbsEfficiency("sbs_eff", "Efficiency from side band substraction", RooArgSet(RooArgSet(binnedVariables, categories), efficiency), StoreAsymError(RooArgSet(binnedVariables, efficiency)));
RooDataSet cntEfficiency("cnt_eff", "Efficiency from counting", RooArgSet(RooArgSet(binnedVariables, categories), efficiency), StoreAsymError(RooArgSet(binnedVariables, efficiency)));

RooDataSet cntEfficiency("cnt_eff", "Efficiency from counting", RooArgSet(RooArgSet(binnedVariables, categories), efficiency), StoreAsymError(RooArgSet(binnedVariables, efficiency)));

if(!floatShapeParameters){
//fitting whole dataset to get initial values for some parameters
RooWorkspace* w = new RooWorkspace();
w->import(data);
efficiency.setVal(0);//reset
efficiency.setAsymError(0,0);
std::cout << "ALL dataset: calling doFitEfficiency with pdf: " << pdfCategory.getLabel() << std::endl;
doFitEfficiency(w, pdfCategory.getLabel(), efficiency);
delete w;
if (not split_mode){
if(!floatShapeParameters){
//fitting whole dataset to get initial values for some parameters
RooWorkspace* w = new RooWorkspace();
w->import(*data);
efficiency.setVal(0);//reset
efficiency.setAsymError(0,0);
std::cout << "ALL dataset: calling doFitEfficiency with pdf: " << pdfCategory.getLabel() << std::endl;
doFitEfficiency(w, pdfCategory.getLabel(), efficiency);
delete w;
}
} else {
// disactive not needed branches
inputTree->SetBranchStatus("*", false);
TIterator* iter = dataVars.createIterator();
TObject* obj(nullptr);
while ( (obj = iter->Next()) )
inputTree->SetBranchStatus(obj->GetName(),true);
}

//loop over all bins with the help of allCats
// loop over all bins with the help of allCats
// store index values in a separate vector to avoid issues
// with iteration over changing allCats object
std::vector<Int_t> allCatValues;
TIterator* it = allCats.typeIterator();
for(RooCatType* t = (RooCatType*)it->Next(); t!=nullptr; t = (RooCatType*)it->Next() ){
for(RooCatType* t = (RooCatType*)it->Next(); t!=nullptr; t = (RooCatType*)it->Next() )
allCatValues.push_back(t->getVal());
for (auto iCat : allCatValues){
const RooCatType* t = allCats.lookupType(iCat);
//name of the multicategory
TString catName = t->GetName();
//skip unmapped states
if(catName.Contains("NotMapped")) continue;
//create the dataset
RooAbsData* data_bin = (RooDataSet*) data.reduce(Cut(TString::Format("allCats==%d",t->getVal())));

RooDataSet* data_bin(nullptr);
RooArgSet tmpVars;

if (not split_mode){
//create the dataset
data_bin = (RooDataSet*) data->reduce(Cut(TString::Format("allCats==%d",iCat)));
} else {
data_bin = new RooDataSet("data", "data",
dataVars,
(weightVar.empty() ? nullptr : weightVar.c_str()));

TDirectory* tmp = gDirectory;
gROOT->cd();

// loop over input data and fill the dataset with events for
// current category
unsigned int n_entries = inputTree->GetEntries();
printf("Input number of events: %u\n", n_entries);
unsigned int first_entry = 0;
while (first_entry<n_entries){
TTree* copyTree = inputTree->CopyTree("","",split_mode,first_entry);
RooTreeDataStore store("reader","reader",dataVars,*copyTree,
/*selExpr=*/"", /*wgtVarName=*/(weightVar.empty() ? nullptr : weightVar.c_str()));
for (unsigned int i=0; i<store.GetEntries(); ++i){
store.get(i);
if (allCats.getIndex()==iCat){
data_bin->add(dataVars,
weightVar.empty() ? 1.0 : dataVars.getRealValue(weightVar.c_str()));
}
}
delete copyTree;
first_entry += split_mode;
data_bin->Print("V");
}
data_bin->Print("V");
tmp->cd();

// Now add all expressions that are computed dynamically
for(vector<pair<pair<string,string>, pair<string, vector<string> > > >::const_iterator ev = expressionVars.begin(), eve = expressionVars.end(); ev != eve; ++ev){
RooArgList args;
for (vector<string>::const_iterator it = ev->second.second.begin(), ed = ev->second.second.end(); it != ed; ++it) {
args.add(dataVars[it->c_str()]);
}
RooFormulaVar expr(ev->first.first.c_str(), ev->first.second.c_str(), ev->second.first.c_str(), args);
RooRealVar *col = (RooRealVar *) data_bin->addColumn(expr);
tmpVars.add(*dataVars.addClone(*col));
}

// And add all dynamic categories from thresholds
for(vector<pair<pair<string,string>, pair<string, double> > >::const_iterator tc = thresholdCategories.begin(),
tce = thresholdCategories.end(); tc != tce; ++tc){
RooThresholdCategory tmp(tc->first.first.c_str(), tc->first.second.c_str(), (RooAbsReal &)dataVars[tc->second.first.c_str()], "above", 1);
tmp.addThreshold(tc->second.second, "below",0);
RooCategory *cat = (RooCategory *) data_bin->addColumn(tmp);
tmpVars.add(*dataVars.addClone(*cat));
}

//setup the efficiency category
if (effCats.size() == 1) {
effName = effCats.front() + "::" + effStates.front();
RooMappedCategory efficiencyCategory("_efficiencyCategory_", "_efficiencyCategory_", (RooCategory&)dataVars[effCats.front().c_str()], "Failed");
efficiencyCategory.map(effStates.front().c_str(), "Passed");
data_bin->addColumn( efficiencyCategory );
} else {
RooArgSet rooEffCats;
string multiState = "{";
for (size_t i = 0; i < effCats.size(); ++i) {
if (i) { multiState += ";"; effName += " && "; }
rooEffCats.add((RooCategory &) dataVars[effCats[i].c_str()]);
multiState += effStates[i];
effName = effCats[i] + "::" + effStates[i];
}
multiState += "}";
RooMultiCategory efficiencyMultiCategory("_efficiencyMultiCategory_", "_efficiencyMultiCategory_", rooEffCats);
RooMappedCategory efficiencyCategory("_efficiencyCategory_", "_efficiencyCategory_", efficiencyMultiCategory, "Failed");
efficiencyCategory.map(multiState.c_str(), "Passed");
data_bin->addColumn( efficiencyCategory );
}
data_bin->addColumn( pdfCategory );
}

//set the category variables by reading the first event
const RooArgSet* row = data_bin->get();

//get PDF name
TString pdfName(((RooCategory*)row->find("_pdfCategory_"))->getLabel());


//make directory name
TString dirName = catName;
dirName.ReplaceAll("{","").ReplaceAll("}","").ReplaceAll(";","__");
Expand All @@ -307,7 +412,7 @@ string TagProbeFitter::calculateEfficiency(string dirName,const std::vector<stri
//import the data
w->import(*data_bin);
delete data_bin; // clean up earlier
data_bin = w->data("data"); // point to the data that's in the workspace now (saves memory)
data_bin = dynamic_cast<RooDataSet*>(w->data("data")); // point to the data that's in the workspace now (saves memory)

//save the distribution of variables
if (doSaveDistributionsPlot) saveDistributionsPlot(w);
Expand Down Expand Up @@ -349,6 +454,7 @@ string TagProbeFitter::calculateEfficiency(string dirName,const std::vector<stri
}
//clean up
delete w;
if (split_mode) dataVars.remove(tmpVars);
//get back to the initial directory
gDirectory->cd("..");
}
Expand All @@ -368,6 +474,9 @@ string TagProbeFitter::calculateEfficiency(string dirName,const std::vector<stri
gDirectory->mkdir("cnt_eff_plots")->cd();
saveEfficiencyPlots(cntEfficiency, effName, binnedVariables, mappedCategories);
gDirectory->cd("..");

if (not split_mode) delete data;

//empty string means no error
return "";
}
Expand Down

0 comments on commit 7ecedc2

Please sign in to comment.