## Benchmarking

In [2]:
# Definitions of the benchmark workload functions.
def dimuon_analysis(df, node_count):
    """Plot the dimuon invariant-mass spectrum and save it as a PDF.

    Keeps only events with exactly two muons of opposite charge, defines the
    ``Dimuon_mass`` column, books a 1D histogram of it, draws the histogram on
    a log-log canvas with resonance labels and writes the canvas to
    ``dimuon_spectrum_<node_count>.pdf``.

    Parameters
    ----------
    df
        Dataframe object exposing ``Filter``, ``Define`` and ``Histo1D``.
        The filter/define expressions reference the columns ``nMuon``,
        ``Muon_charge``, ``Muon_pt``, ``Muon_eta``, ``Muon_phi`` and
        ``Muon_mass``.
    node_count
        Value interpolated into the name of the output PDF.

    Returns
    -------
    None
    """
    # (x, y, text) of the resonance labels, in NDC coordinates, drawn at the
    # default text size.
    resonance_labels = (
        (0.175, 0.740, "#eta"),
        (0.205, 0.775, "#rho,#omega"),
        (0.270, 0.740, "#phi"),
        (0.400, 0.800, "J/#psi"),
        (0.415, 0.670, "#psi'"),
        (0.485, 0.700, "Y(1,2,3S)"),
        (0.755, 0.680, "Z"),
    )

    # Event selection: exactly two muons, with opposite charge.
    two_muons = df.Filter("nMuon == 2", "Events with exactly two muons")
    opposite_sign = two_muons.Filter("Muon_charge[0] != Muon_charge[1]", "Muons with opposite charge")

    # Invariant mass of the dimuon system.
    with_mass = opposite_sign.Define("Dimuon_mass", "InvariantMass(Muon_pt, Muon_eta, Muon_phi, Muon_mass)")

    # Histogram of the mass spectrum; title and axis labels are set in one go
    # through the ';'-separated title string.
    hist_model = ("Dimuon_mass", "Dimuon mass;m_{#mu#mu} (GeV);N_{Events}", 30000, 0.25, 300)
    mass_hist = with_mass.Histo1D(hist_model, "Dimuon_mass")

    # Global style and the canvas, with both axes logarithmic.
    style = ROOT.gStyle
    style.SetOptStat(0)
    style.SetTextFont(42)
    canvas = ROOT.TCanvas("c", "", 800, 700)
    canvas.SetLogx()
    canvas.SetLogy()

    # NOTE(review): the original had a commented-out TStopwatch around this
    # call, presumably because the first access to the histogram result is
    # what triggers the computation -- confirm.
    mass_hist.SetTitle("")
    for axis in (mass_hist.GetXaxis(), mass_hist.GetYaxis()):
        axis.SetTitleSize(0.04)
    mass_hist.Draw()

    # Resonance labels first, then the two header labels at explicit sizes.
    latex = ROOT.TLatex()
    latex.SetNDC(True)
    for x_ndc, y_ndc, text in resonance_labels:
        latex.DrawLatex(x_ndc, y_ndc, text)
    latex.SetTextSize(0.040)
    latex.DrawLatex(0.100, 0.920, "#bf{CMS Open Data}")
    latex.SetTextSize(0.030)
    latex.DrawLatex(0.630, 0.920, "#sqrt{s} = 8 TeV, L_{int} = 11.6 fb^{-1}")

    canvas.SaveAs(f"dimuon_spectrum_{node_count}.pdf")
    
def cpubound(df, nops_percol=10):
    """Run a CPU-bound benchmark on ``df`` using randomly generated columns.

    Defines three columns (``x``, ``y``, ``z``) filled from ``gRandom``
    distributions, books ``nops_percol`` ``Mean`` operations on each of them
    and then requests the value of the first booked operation.

    Parameters
    ----------
    df
        Dataframe object exposing ``Define`` and ``Mean``.
    nops_percol : int, optional
        Number of ``Mean`` operations booked per column (default 10).
        Increasing this increases the overall runtime.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``nops_percol`` is smaller than 1, since there would be no
        operation whose value could be requested.

    Notes
    -----
    Per the original note in this function, all file writing is done inside
    the backend, so nothing is timed or written to disk here.
    """
    if nops_percol < 1:
        raise ValueError("nops_percol must be at least 1")

    # Parameters of the random distributions of the RDF columns
    gaus_mean = 10
    gaus_sigma = 1
    exp_tau = 20
    poisson_mean = 30

    # Column name -> expression used to fill it, in definition order.
    column_expressions = {
        "x": f"gRandom->Gaus({gaus_mean},{gaus_sigma})",
        "y": f"gRandom->Exp({exp_tau})",
        "z": f"gRandom->PoissonD({poisson_mean})",
    }

    df_withcols = df
    for colname, expression in column_expressions.items():
        df_withcols = df_withcols.Define(colname, expression)

    # Book every operation before requesting any value, column by column.
    oplist = [
        df_withcols.Mean(colname)
        for colname in column_expressions
        for _ in range(nops_percol)
    ]

    # Asking for the first value in the list is enough to trigger everything
    # (per the original author's note); the value itself is not used.
    print("Starting the CPU bound benchmark.")
    oplist[0].GetValue()


In [3]:
import ROOT
import os
# Distributed RDataFrame constructor of the OSCAR backend.
RDataFrame = ROOT.RDF.Experimental.Distributed.OSCAR.RDataFrame

# Client configuration handed to RDataFrame. Every endpoint and credential is
# read from an environment variable of the same name; a missing variable
# raises KeyError here.
oscarclient = {
    name: os.environ[name]
    for name in ("minio_endpoint", "minio_access", "minio_secret")
}
oscarclient["bucket_name"] = 'root-oscar'
# Optional arguments
oscarclient["benchmarking"] = True
oscarclient.update({
    name: os.environ[name]
    for name in ("oscar_endpoint", "oscar_access", "oscar_secret")
})


# Locations of the input dataset used by the non-simulated experiments.
minio_data = 'https://158.42.106.12:30300/root-common/dimuon_data.root'
cern_data = "root://eospublic.cern.ch//eos/opendata/cms/derived-data/AOD2NanoAODOutreachTool/Run2012BC_DoubleMuParked_Muons.root"
#aws_data = 'https://test-cern-data.s3.amazonaws.com/dimuon_data.root'

Welcome to JupyROOT 6.27/01


In [5]:
# Constants
nentries = int(8e9)  # Number of entries of the simulated dataset ('Simulated' experiment)
treename = "Events"  # Tree name passed to RDataFrame for the file-based experiments

# Benchmark sweep: vCPU setting -> backend -> experiment -> number of nodes.
# The shared `oscarclient` dict is updated in place at every level before it
# is handed to RDataFrame.
for vcpu in ['1', '2']: # '1.5'
    oscarclient['cpu_val'] = vcpu

    for backend in ['binary_tree']: #, 'client_scheduler']: #, 'serverless_scheduler', 'client_reducer']:
        oscarclient['backend'] = backend

        for experiment in ['Simulated']: #, 'MINIO', 'CERN']:
            oscarclient['experiment'] = experiment

            # One folder per (vcpu, backend, experiment). makedirs creates the
            # missing parents and, with exist_ok=True, does not fail when the
            # notebook is re-run and the folders are already there.
            folder = f'benchmarks/{vcpu}/{backend}/{experiment}'
            os.makedirs(folder, exist_ok=True)

            # 'Generate' a larger data set (200 GB according to the original
            # note) by listing the same file 100 times.
            if experiment == 'MINIO':
                filenames = [minio_data] * 100
            elif experiment == 'CERN':
                filenames = [cern_data] * 100

            for node_count in [16, 8]: # 32, 64,
                oscarclient['mapper_count'] = node_count
                oscarclient['folder'] = folder

                if experiment == 'Simulated':
                    # Dataset with `nentries` entries, no input files.
                    df = RDataFrame(nentries, oscarclient=oscarclient, npartitions=node_count)
                    cpubound(df)
                else:
                    # A copy of the client configuration extended with
                    # 'node_count' is passed; the shared dict is not modified.
                    df = RDataFrame(treename, filenames,
                                    oscarclient=oscarclient | {'node_count': node_count},
                                    npartitions=node_count)
                    dimuon_analysis(df, node_count)

root-oscar-a58df378-2e2c-4cc3-b6f5-0685081df0a9-benchmark
Bucket does not exist. Trying to create it.
Creating bucket...
Bucket created!
Creating services...
Creating service mapper for root-oscar-a58df378-2e2c-4cc3-b6f5-0685081df0a9-benchmark
root-oscar-a58df378-2e2c-4cc3-b6f5-0685081df0a9-benchmark
CPU: 1
Creating service reducer for root-oscar-a58df378-2e2c-4cc3-b6f5-0685081df0a9-benchmark
root-oscar-a58df378-2e2c-4cc3-b6f5-0685081df0a9-benchmark
CPU: 1
Done creating services!
Client headnode
Starting the CPU bound benchmark.
<Response [201]>
<Response [201]>
Starting timer
Target Name: 0_15
File 10_10 written to partial-results folder.
File 14_14 written to partial-results folder.
File 2_2 written to partial-results folder.
File 8_8 written to partial-results folder.
File 4_4 written to partial-results folder.
File 1_1 written to partial-results folder.
File 7_7 written to partial-results folder.
File 11_11 written to partial-results folder.
File 10_11 written to partial-results fo

<Response [204]>
