In [None]:
import json
import os.path
import glob
import pandas as pd
import plotly.express as px
from IPython.display import display_markdown, Code, Markdown, display


In [None]:
%%latex
\hypersetup{linkcolor=black}
\tableofcontents
\pagebreak

In [None]:
def getSourceMetadata (filename):
    """Extract the ``// @tag: value`` metadata declared in a file's comments.

    Only lines starting with ``//`` in the first column are considered. An
    entry starts on a ``// @tag: value`` line and runs until the next ``// @``
    line or until a comment line made only of ``/`` characters. The comment
    lines in between are appended to the value, separated by single spaces.

    Args:
        filename: path of the source file to scan.

    Returns:
        dict mapping each tag to its whitespace-stripped value. An entry that
        is closed neither by another ``// @`` line nor by an all-slash line is
        not reported.
    """
    with open(filename) as f:
        comments = [l.rstrip("\n") for l in f if l.startswith("//")]

    # Collect one [first, last) index range per metadata entry.
    ranges = []
    start = -1
    for i, line in enumerate(comments):
        if line.startswith("// @"):
            if start != -1:
                ranges.append((start, i))
            start = i
        elif start != -1 and line == len(line) * "/":
            # A separator line (only slashes) closes the running entry.
            ranges.append((start, i))
            start = -1

    result = {}
    for first, last in ranges:
        # Split on the first colon only, so colons inside the value survive
        # (e.g. "a:b" or "http://...").
        tag, _, head = comments[first][4:].partition(":")
        parts = [head.strip()]
        # Drop only the leading "//" marker of continuation lines; a "//"
        # further inside the text belongs to the value.
        parts.extend(comments[i][2:].strip() for i in range(first + 1, last))
        result[tag] = " ".join(p for p in parts if p)
    return result

In [None]:
def getSourceString (filename, marker="template<class ARCH>"):
    """Return the source text of the first brace-delimited block following `marker`.

    Lines starting with ``//`` in the first column are ignored altogether.
    The returned text starts on the first remaining line containing `marker`
    and ends on the line where the braces opened after it are balanced again
    (or at the end of the file if they never are). Comment-only lines inside
    that span are removed.

    Args:
        filename: path of the source file to read.
        marker: text identifying the first line of the block to extract.

    Returns:
        The extracted lines joined with newlines, or "" when `marker` does not
        occur in the file.
    """
    with open(filename) as f:
        lines = [l.rstrip("\n") for l in f if not l.startswith("//")]

    start = next((i for i, l in enumerate(lines) if marker in l), None)
    if start is None:
        # No marker: there is nothing to show (do not fall back to line 0).
        return ""

    depth = 0
    opened = False          # becomes True once the first "{" has been seen
    end = len(lines) - 1    # unbalanced braces: keep everything up to EOF
    for i in range(start, len(lines)):
        opens = lines[i].count("{")
        depth += opens - lines[i].count("}")
        opened = opened or opens > 0
        # Only stop once a block was actually opened, so a declaration
        # spanning several lines before its "{" is not cut short.
        if opened and depth <= 0:
            end = i
            break

    body = [l for l in lines[start:end + 1] if not l.strip().startswith("//")]
    return "\n".join(body)

In [None]:
def compute_speedup(df):
    """Add a ``speedup`` column: first time of the group divided by each time.

    Rows are grouped by (``taskname``, ``arch``, ``input``). Within a group,
    the reference is the ``time`` of the first row in the frame's current
    order, so the group's first row always gets a speedup of 1.

    The groups are taken from the data itself, so the function does not depend
    on any notebook-level variable.

    Args:
        df: DataFrame with at least the columns ``taskname``, ``arch``,
            ``input`` and ``time``. It is not modified.

    Returns:
        A copy of `df` with a fresh 0..n-1 index and the extra ``speedup``
        column.
    """
    out = df.copy()
    if len(out) == 0:
        # Nothing to group: still return the expected column.
        out["speedup"] = pd.Series(dtype="float64")
        return out

    groups = out.groupby(["taskname", "arch", "input"], sort=False)["time"]
    out["speedup"] = groups.transform(lambda t: t.iloc[0] / t)
    return out.reset_index(drop=True)


In [None]:
def normalize(df, arch, changename=True, scaling=True):
    """Express the process-unit column of `df` in the unit used for display.

    For the "upmem" architecture the ``unit`` column is set to "rank" and,
    when `scaling` is true, ``unit/nbcomp`` is divided by 64
    (NOTE(review): presumably the number of units per rank — confirm).
    These two updates are applied to `df` itself.

    Args:
        df: DataFrame with the columns ``unit`` and ``unit/nbcomp``.
        arch: architecture name; only "upmem" triggers a conversion.
        changename: when true, the ``unit/nbcomp`` column of the returned
            frame is renamed to the unit label.
        scaling: when true (and `arch` is "upmem"), divide ``unit/nbcomp`` by 64.

    Returns:
        A two-item list ``[frame, unit_label]``; ``[df, ""]`` when `df` is empty.
    """
    if not len(df):
        return [df, ""]

    unit_label = df["unit"].iloc[0]

    if arch == "upmem":
        if scaling:
            df["unit/nbcomp"] = df["unit/nbcomp"] / 64
        df["unit"] = "rank"
        unit_label = df["unit"].iloc[0]

    normalized = df.rename(columns={"unit/nbcomp": unit_label}) if changename else df
    return [normalized, unit_label]


In [None]:
archs   = ["multicore", "upmem"]
tracesDir  = "./traces"
taskSrcDir = "../../unit/tasks"

# Trace file(s) to analyze. Use glob.glob("traces/*.txt") instead to process
# every available trace file.
files = ["traces/traces_a2ffb4a.txt"]

# Positions, within a whitespace-split trace line, of the values kept for each
# column of `schema` (same order).
# NOTE(review): the trace line layout is not visible here; confirm these
# positions against the code producing the traces.
traceFields = [1,3, 5,6,7, 9,11,12,13,14,16]

stats = []
for f in files:
    # `with` guarantees the trace file is closed once it has been parsed.
    with open(f) as trace:
        for line in trace:
            info = line.split()
            if not info: continue    # skip blank lines (e.g. a trailing empty line)
            stats.append ([info[i] for i in traceFields])

# Column names and dtypes of the benchmark DataFrame.
schema  = {
    "taskname":"str",
    "arch":"str",
    "unit":"str",
    "unit/nbcomp":"int",
    "unit/nbproc":"int",
    "time"        :"float",
    "time/launch" :"float",
    "time/pre"    :"float",
    "time/post"   :"float",
    "time/result" :"float",
    "input":"object"
}
df = pd.DataFrame(stats,columns=schema.keys()).astype(schema)

tasklist = sorted(list(df["taskname"].unique()))

# Keep only the task source files that declare a "description" metadata entry.
taskSrcFiles = ["{}/{}.hpp".format(taskSrcDir,t) for t in tasklist]
taskSrcFiles = [f for f in taskSrcFiles if  "description" in  getSourceMetadata(f)]

dfUpmem = df[(df["arch"]=="upmem")].copy()
dfMulti = df[(df["arch"]=="multicore")].copy()

# Column names are kept (changename=False); upmem unit counts are rescaled by normalize.
[dfUpmem,xaxisUpmem] = normalize(dfUpmem,"upmem",     False)
[dfMulti,xaxisMulti] = normalize(dfMulti,"multicore", False)

xaxisMap = { "multicore":"thread", "upmem":"rank" }

# From here on, `df` only holds the rows of the two architectures above.
df = pd.concat([dfUpmem,dfMulti])

df = compute_speedup(df)

# "time/unknown" is the part of the total time not covered by any "time/..." column.
timeDetails = [x for x in schema.keys() if x.startswith("time/")]
df["time/unknown"] = df["time"]
for col in timeDetails:  df["time/unknown"] = df["time/unknown"] - df[col]

# Map each task name to the source file it is documented from.
config = {}
for x in taskSrcFiles:
    name = x.split("/")[-1].split(".")[0]
    if name in tasklist:
        config[name] = {"source":x}

# Introduction

## BPL version
The Git commit hash of the repository used for this benchmark is:

In [None]:
# Print the commit hash that HEAD points to in the current Git checkout.
!git rev-parse HEAD

# Benchmark suite
We describe here the tests used for the benchmark. We chose tests that exercise different aspects, such as:

* pure computation involving little memory management
* intensive memory management, e.g. creating vectors and dynamically adding items to them
* intensive broadcast management in the case of the UPMEM arch, e.g. sending/returning a huge vector to/from the task

For each test, we will study the following:

* a comparison of Multicore vs Upmem, in order to quickly compare the behaviour of the task on the two architectures
* the scalability of each available architecture. For each task, we display, as a function of the number of process units, the ratio of the execution time for 1 process unit to the execution time for N process units. In a perfect world, this ratio should grow linearly with N. Note the following: for the `multicore` arch, the process units are threads, and for the `UPMEM` arch, the process units are tasklets; for the `UPMEM` arch, in order to have a more readable `x` axis, we display the number of ranks instead of the number of tasklets.
* an assessment of the execution times of the different parts of a task run on the `UPMEM` architecture.
  

The tests to be benchmarked are

In [None]:
# List the benchmarked tasks, one indented name per line.
for t in tasklist:
    print(f"   {t}")

In [None]:
# Size, in pixels, of one facet cell; figure width/height are multiples of it.
pixelsize = 500
# Number of facet columns used when one sub-plot is drawn per input.
nbcols = 3

def display_stacked(df,nbrows,nbcols,task=None):
    """Show, per input, how the upmem run time splits between its parts.

    Draws a stacked area chart normalized to percent: one facet per ``input``,
    the unit count on the x axis, and one area per ``time/...`` column plus
    ``time/unknown`` (the total ``time`` minus all the detailed parts).

    Args:
        df: benchmark rows; only those with ``arch == "upmem"`` are used.
        nbrows: number of facet rows, used to size the figure height.
        nbcols: number of facet columns (facet wrap and figure width).
        task: task name shown in the title. When omitted, it is taken from the
            ``taskname`` column of `df` instead of from a notebook global.
    """
    df = df[df["arch"]=="upmem"].copy()
    if len(df)==0:
        # No upmem measurement: normalize() would not create the unit column.
        return

    if task is None:
        task = ", ".join(str(t) for t in df["taskname"].unique()) if "taskname" in df.columns else ""

    # No rescaling here; only the unit column gets renamed to its unit label.
    [df,unitCol] = normalize(df,"upmem",scaling=False)

    df["time/unknown"] = df["time"]
    for col in timeDetails:  df["time/unknown"] -= df[col]
    timeCategory = [*timeDetails,"time/unknown"]

    dfStacked = df [[unitCol,"input",*timeCategory]].melt(id_vars=[unitCol,"input"],var_name="time",value_name="% time")
    fig = px.area(dfStacked, x=unitCol, y="% time", color="time", groupnorm="percent", facet_col="input", facet_col_wrap=nbcols)
    fig.update_layout(width=pixelsize*nbcols,height=pixelsize*nbrows, hovermode="x unified")
    fig.update_traces(hovertemplate="%{y:4.1f}%")
    fig.update_layout(title="Benchmark for test '{}'".format(task))
    fig.show()

def display_speeduparch(dftask, task=None):
    """Plot the multicore/upmem time ratio of one task, one line per input.

    Multicore and upmem rows are paired on (``unit/nbcomp``, ``input``); the
    plotted speedup is ``time(multicore) / time(upmem)`` for each pair.

    Args:
        dftask: benchmark rows of a single task.
        task: task name shown in the title. When omitted, it is taken from the
            ``taskname`` column of `dftask` instead of from a notebook global.
    """
    if task is None:
        task = ", ".join(str(t) for t in dftask["taskname"].unique()) if "taskname" in dftask.columns else ""

    multicore = dftask[dftask["arch"]=="multicore"]
    upmem     = dftask[dftask["arch"]=="upmem"]
    # After the merge, "_x" columns come from multicore and "_y" from upmem.
    paired = multicore.merge(upmem, on=["unit/nbcomp", "input"])
    paired["speedup"] = paired["time_x"] / paired["time_y"]

    fig = px.line(paired, x="unit/nbcomp", y="speedup", color="input")
    fig.update_layout(
        title="Benchmark for test '{}'".format(task),
        yaxis_title="Speedup: T(multicore) / T(upmem)",
        hovermode="x unified",
        width=1000, height=800)
    fig.update_traces(hovertemplate="%{y:4.1f}x")
    fig.update_xaxes(title_text="ranks/threads")
    fig.show()

def generate4task (df, task):
    """Render the whole report section of one benchmarked task.

    The section contains, in order: a title; the metadata and the C++ source
    extracted from the task's source file (when the task is registered in
    ``config``); the multicore-vs-upmem timing figure; the per-architecture
    scalability figure; the upmem/multicore speedup figure; and the stacked
    breakdown of the upmem run time.

    Args:
        df: benchmark DataFrame with at least the columns ``taskname``,
            ``arch``, ``input``, ``unit/nbcomp``, ``time`` and ``speedup``.
        task: name of the task to report.
    """
    def facet_rows(nbinputs):
        # Rows needed to lay out `nbinputs` facets on `nbcols` columns:
        # ceiling division, and never less than one row.
        return max(1, -(-nbinputs // nbcols))

    #######################################################################################
    display_markdown("## Test '{}'".format(task), raw=True)

    #######################################################################################
    # Tasks without a registered source file still get their figures.
    source = config.get(task, {}).get("source")
    if source is not None:
        for key,val in getSourceMetadata(source).items():
            display_markdown("### {}".format(key.capitalize()), raw=True)
            print (val)

        display_markdown("### Source code", raw=True)
        display(Code(getSourceString(source), language='cpp'))

    #######################################################################################
    display_markdown("### Multicore vs Upmem", raw=True)
    dftask = df[df["taskname"]==task].copy()
    nbrows = facet_rows(len(dftask["input"].unique()))

    fig = px.line(dftask, x="unit/nbcomp", y="time", color="arch", facet_col="input", facet_col_wrap=nbcols, facet_row_spacing=0.05, facet_col_spacing=0.05)
    fig.update_yaxes(title_text="log(time)", type="log")
    fig.update_xaxes(title_text="threads/ranks")
    fig.update_layout(title="Benchmark for test '{}'".format(task))
    fig.update_layout(width=pixelsize*nbcols,height=pixelsize*nbrows)
    fig.update_layout(hovermode="x unified")
    fig.update_traces(hovertemplate="%{y:4.6f}")
    fig.show()

    #######################################################################################
    display_markdown("### Scalability per architecture", raw=True)
    if len(dftask)!=0:
        fig = px.line(dftask, x="unit/nbcomp", y="speedup", color="input", facet_col="arch", facet_col_wrap=2)
        fig.update_layout(width=2*600,height=600, title="Speedup", yaxis_title="Speedup T(0)/T(x)")
        fig.update_layout(title="Benchmark for test '{}'".format(task))
        fig.update_layout(hovermode="x unified")
        fig.update_traces(hovertemplate="%{y:4.1f}x")
        fig.show()

    #######################################################################################
    display_markdown("### Speedup upmem vs multicore", raw=True)
    display_speeduparch (dftask.copy())

    #######################################################################################
    display_markdown("### BPL overheads", raw=True)
    dfUpmem = dftask[dftask["arch"]=="upmem"].copy()
    if len(dfUpmem)!=0:
        # The stacked chart needs at least one upmem measurement.
        display_stacked (dfUpmem, facet_rows(len(dfUpmem["input"].unique())), nbcols)

In [None]:
# Render the report section of every benchmarked task, in task-name order.
# NOTE(review): keep the loop variable named `task`; plotting code elsewhere in
# this notebook may read a module-level `task` when building figure titles.
for task in tasklist:
    generate4task (df, task);

In [None]:
def display_rawdata(df,task,arch):
    """Display, as a markdown table, the raw measurements of one task on one arch.

    The table keeps the architecture, input, unit and timing columns. It
    renames ``unit/nbcomp`` to ``nbcomp`` (cast to int) and strips the
    ``time/`` prefix from the detailed timing columns.

    Args:
        df: benchmark DataFrame, including the ``time/unknown`` column.
        task: task name to select.
        arch: architecture name to select.
    """
    shown = ["arch","input", "unit","unit/nbcomp","time","time/launch", "time/pre","time/post", "time/result","time/unknown"]

    selected = df.loc[(df["taskname"]==task)&(df["arch"]==arch)].copy()
    table = selected[shown].rename(columns={'unit/nbcomp': 'nbcomp'})
    table["nbcomp"] = table["nbcomp"].astype(int)

    # "time/launch" -> "launch", etc.; the plain "time" column is kept as is.
    short_names = {c: c[5:] for c in table.columns if c.startswith("time/")}
    table = table.rename(columns=short_names)

    display(Markdown(table.to_markdown(index=False, floatfmt="9.5f")))
    
# Appendix: raw measurement tables, upmem first then multicore, for every task.
display_markdown("## Raw data", raw=True)
for task in tasklist:
    display_markdown("### Test {}".format(task), raw=True)
    for rawArch in ("upmem", "multicore"):
        display_rawdata(df, task, rawArch)


In [None]:
# Export this notebook to PDF without the code inputs; cells tagged
# "remove_cell" are dropped and nbconvert's stderr output is discarded.
!jupyter nbconvert "1.Benchmark.ipynb"  \
--no-input  \
--to pdf    \
--LatexPreprocessor.title="Benchmark for the BPL library" \
--LatexPreprocessor.author_names="BioPim" \
--LatexPreprocessor.date="`date +"%d/%m/%Y"`" \
--TagRemovePreprocessor.enabled=True  \
--TagRemovePreprocessor.remove_cell_tags remove_cell  \
2> /dev/null
