# Experiment Results

## Preliminaries

### Imports

In [None]:
import sys
!{sys.executable} -m pip install plotly numpy pandas

In [None]:
# Library Imports
import os
import csv
import sys
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statistics
import glob

### Plot Functions

In [None]:
# Label names
# Axis titles shared by the plotting functions defined in this cell.

x_axis_jobs = "Mapper Count"  # axis title for plots indexed by number of mappers
y_axis_time = "Time (s)"  # axis title for time values, labelled in seconds

def plot_mapper_times(csvs):
    '''
    Print mapper-time statistics and show one box plot per job.

    For every entry of ``csvs`` only the rows whose ``function`` column is
    ``'mapper'`` are used.  The duration of a mapper is the difference between
    the last and the first ``time`` value recorded for its ``id``.  The mean
    and median duration of each job are printed, then one box plot per job is
    shown with the ``node`` of every mapper attached as point text.

    Parameters
    ----------
    csvs : dict
        Maps a job label to a DataFrame providing the columns ``function``,
        ``id``, ``time`` and ``node``.  The label text before the first ``_``
        names the job; if that name contains a ``-``, the segment after the
        first ``-`` is appended to the x-axis title.
    '''
    times = {}
    for item, frame in csvs.items():
        name = f'size_{item.split("_")[0]}'
        mappers = frame[frame['function'] == 'mapper']

        durations = []
        nodes = []
        # sort=False keeps the mappers in order of first appearance.
        for _, rows in mappers.groupby('id', sort=False):
            stamps = rows['time'].values
            durations.append(float(stamps[-1]) - float(stamps[0]))
            nodes.append(rows['node'].values[-1])

        if not durations:
            # Nothing to average or plot; avoids dividing by zero below.
            print(f'Job: {name}. No mapper rows found.')
            continue

        times[name] = (durations, nodes)
        print(f'Job: {name}. Average Time: {sum(durations)/len(durations)} - Median: {statistics.median(durations)}')

    for name, (durations, nodes) in times.items():
        segments = name.split('-')
        # Fall back to the whole name when it has no '-' separated suffix.
        mapper_count = segments[1] if len(segments) > 1 else name

        bp = go.Figure()
        bp.add_trace(go.Box(y=durations, boxpoints='all', text=nodes))
        bp.update_layout(
            title="Mapper times",
            xaxis_title=f"Total Mapper Count {mapper_count}",
            yaxis_title=y_axis_time,
            font=dict(size=22),
            legend=dict(
                x=.80,
                y=.95,
                traceorder="normal",
                font=dict(family="sans-serif", size=22, color="black"),
            ),
        )
        bp.show()
       
    
def eficiency(csvs_list, jobs=None):
    '''
    Show the time-to-plot figure and the speedup figure for all data sources.

    Parameters
    ----------
    csvs_list : list
        ``csvs_list[0:3]`` are dicts, one per data source, whose values are
        DataFrames; the cell at row 0, column 0 of each DataFrame is taken as
        the time to plot of one experiment, in dict order.
        ``csvs_list[3]`` holds the trace names and ``csvs_list[4]`` the marker
        dicts; both carry two entries per data source and only the first of
        each pair is used here.
    jobs : list of str, optional
        Mapper counts used as x values, one per experiment.  Defaults to
        ``['1', '2', '4', '8', '16', '32', '48', '64', '80']``.

    Notes
    -----
    Speedup is computed relative to the first experiment of each data source
    (``times[0] / t``).  The "Optimal Speedup" trace plots each mapper count
    against itself.
    '''
    if jobs is None:
        jobs = ['1', '2', '4', '8', '16', '32', '48', '64', '80']
    # Ideal line: speedup equal to the mapper count.
    ideal_speedup = [float(job) for job in jobs]

    names = csvs_list[3]
    colors = csvs_list[4]

    fig_time = go.Figure()
    fig_speedup = go.Figure()

    for position, csvs in enumerate(csvs_list[:3]):
        # names/colors store two consecutive entries per data source.
        slot = 2 * position
        times = [frame.values[0][0] for frame in csvs.values()]
        speedups = [times[0] / elapsed for elapsed in times]

        fig_time.add_trace(go.Scatter(x=jobs, y=times,
                                      name=names[slot],
                                      marker=colors[slot]))
        fig_speedup.add_trace(go.Scatter(x=jobs, y=speedups,
                                         name=names[slot],
                                         marker=colors[slot]))

    fig_time.update_layout(
        title="Time to Plot",
        xaxis_title=x_axis_jobs,
        yaxis_title=y_axis_time,
        legend_title="Data Source",
        font=dict(size=22),
        legend=dict(
            x=.80,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )
    fig_time.show()

    # The x values are strings; force a numeric axis for the speedup plot.
    fig_speedup.update_xaxes(type='linear')
    fig_speedup.add_trace(go.Scatter(x=jobs, y=ideal_speedup,
                                     line=dict(dash='dash'),
                                     name='Optimal Speedup',
                                     marker=dict(color="LightSeaGreen")))
    fig_speedup.update_layout(
        title="Speedup",
        xaxis_title=x_axis_jobs,
        yaxis_title="Speedup",
        legend_title="Data Source",
        font=dict(size=22),
        legend=dict(
            x=.02,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )
    fig_speedup.show()
    
        
    
    
def usage_timeline(csvs_list, key='32', mapper_id='31_31'):
    '''
    Show the CPU and memory usage over time of one mapper per data source.

    For each of the first three entries of ``csvs_list`` two figures are
    shown: ``cpu_percent`` against ``time`` and ``mem_percent`` against
    ``time`` for the selected mapper.

    Parameters
    ----------
    csvs_list : list
        ``csvs_list[0:3]`` are dicts mapping an experiment key to a DataFrame
        with the columns ``function``, ``id``, ``time``, ``cpu_percent`` and
        ``mem_percent``.  ``csvs_list[3]`` holds the data-source names and
        ``csvs_list[4]`` the marker dicts, two consecutive entries per data
        source (the first is used for CPU, the second for memory).
    key : str, optional
        Experiment to plot.  Defaults to ``'32'``.
    mapper_id : str, optional
        Value of the ``id`` column selecting the mapper.  Defaults to
        ``'31_31'``.
    '''
    names = csvs_list[3]
    colors = csvs_list[4]

    for position, csvs in enumerate(csvs_list[:3]):
        slot = 2 * position

        frame = csvs[key]
        mappers = frame[frame['function'] == 'mapper']
        mapper_data = mappers[mappers['id'] == mapper_id]

        fig_cpu = go.Figure()
        fig_cpu.add_trace(go.Scatter(x=mapper_data['time'],
                                     y=mapper_data['cpu_percent'],
                                     marker=colors[slot]))
        fig_cpu.update_layout(
            title=f'{names[slot]} Data - CPU Usage',
            xaxis_title=y_axis_time,
            yaxis_title="Usage (%)",
            legend_title="Data Source",
            font=dict(size=22),
        )

        fig_mem = go.Figure()
        fig_mem.add_trace(go.Scatter(x=mapper_data['time'],
                                     y=mapper_data['mem_percent'],
                                     marker=colors[slot + 1]))
        fig_mem.update_layout(
            title=f'{names[slot]} Data - Mem Usage',
            xaxis_title=y_axis_time,
            yaxis_title="Usage (%)",
            legend_title="Data Source",
            font=dict(size=22),
        )

        fig_cpu.show()
        fig_mem.show()
        
def cpu_mem_comparison(csvs_list, key='32'):
    """
    Box plot comparison of the CPU and memory usage of the data sources.

    For each of the first three entries of ``csvs_list`` the experiment
    ``key`` is selected and every mapper contributes one data point: the mean
    of its ``cpu_percent`` (respectively ``mem_percent``) samples.  One figure
    is shown for CPU and one for memory, each with one box per data source.

    Parameters
    ----------
    csvs_list : list
        ``csvs_list[0:3]`` are dicts mapping an experiment key to a DataFrame
        with the columns ``function``, ``id``, ``cpu_percent`` and
        ``mem_percent``.  ``csvs_list[3]`` holds the data-source names and
        ``csvs_list[4]`` the marker dicts, two consecutive entries per data
        source; only the first of each pair is used.
    key : str, optional
        Experiment to compare.  Defaults to ``'32'``.
    """
    names = csvs_list[3]
    colors = csvs_list[4]

    fig_cpu = go.Figure()
    fig_mem = go.Figure()

    for position, csvs in enumerate(csvs_list[:3]):
        slot = 2 * position

        frame = csvs[key]
        mappers = frame[frame['function'] == 'mapper']

        # One data point per mapper: the mean of its usage samples.
        cpu_use = []
        mem_use = []
        for _, rows in mappers.groupby('id', sort=False):
            cpu_use.append(rows['cpu_percent'].mean())
            mem_use.append(rows['mem_percent'].mean())

        fig_cpu.add_trace(go.Box(y=cpu_use, name=f'{names[slot]}', marker=colors[slot]))
        fig_mem.add_trace(go.Box(y=mem_use, name=f'{names[slot]}', marker=colors[slot]))

    fig_cpu.update_layout(
        title=f'CPU Usage',
        yaxis_title="Usage (%)",
        legend_title="Data Source",
        font=dict(size=22),
        legend=dict(
            x=.02,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )

    fig_mem.update_layout(
        title=f'Memory Usage',
        yaxis_title="Usage (%)",
        legend_title="Data Source",
        font=dict(size=22),
        legend=dict(
            x=.80,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )

    fig_cpu.show()
    fig_mem.show()
    
    
def rss_bp_mem(csvs_list):
    """
    Box plot of the Resident Set Size (in MB) per data source.

    Each box holds one point per experiment of a data source: the mean
    ``mem_rss`` of the mapper rows whose ``phase`` is ``'end'``, divided by
    1048576 to convert bytes to MB.  ``csvs_list[3]`` provides the box names
    and ``csvs_list[4]`` the marker dicts (every second entry is used).
    """
    bytes_per_mb = 1048576
    labels = csvs_list[3]
    markers = csvs_list[4]
    rss_fig = go.Figure()

    for position, experiments in enumerate(csvs_list[:3]):
        slot = position * 2

        mean_rss = []
        for frame in experiments.values():
            mapper_rows = frame.loc[frame['function'] == 'mapper']
            finished = mapper_rows.loc[mapper_rows['phase'] == 'end']
            mean_rss.append(finished['mem_rss'].mean())

        # Bytes -> MB
        rss_mb = [value / bytes_per_mb for value in mean_rss]

        rss_fig.add_trace(go.Box(y=rss_mb, name=f'{labels[slot]}', marker=markers[slot]))

    legend_font = dict(family="sans-serif", size=22, color="black")
    legend_style = dict(x=.80, y=.95, traceorder="normal", font=legend_font)
    rss_fig.update_layout(
        title=f'Memory Usage',
        yaxis_title="Usage (MB)",
        legend_title="Data Source",
        font=dict(size=22),
        legend=legend_style,
    )

    rss_fig.show()

def network_usage(csvs_list, key='80'):
    """
    Box plot of the data received per node by the mappers.

    For each of the first two entries of ``csvs_list`` the experiment ``key``
    is selected.  For every node, the ``net_bytes_recv`` values of the mapper
    rows with ``phase == 'start'`` are subtracted, row by row in table order,
    from those with ``phase == 'end'``; the differences are summed and
    converted from bytes to GB (divided by 1024**3).  Each node contributes
    one point to the box of its data source.

    Parameters
    ----------
    csvs_list : list
        ``csvs_list[0:2]`` are dicts mapping an experiment key to a DataFrame
        with the columns ``function``, ``node``, ``phase`` and
        ``net_bytes_recv``.  ``csvs_list[3]`` holds the data-source names and
        ``csvs_list[4]`` the marker dicts, two consecutive entries per data
        source; only the first of each pair is used.
    key : str, optional
        Experiment to plot.  Defaults to ``'80'``.
    """
    bytes_per_gb = 1048576 * 1024

    names = csvs_list[3]
    colors = csvs_list[4]
    network_fig = go.Figure()

    for position, csvs in enumerate(csvs_list[:2]):
        slot = 2 * position

        frame = csvs[key]
        mappers = frame[frame['function'] == 'mapper']

        net_bytes_recv = []
        for _, node_rows in mappers.groupby('node', sort=False):
            started = node_rows[node_rows['phase'] == 'start']
            ended = node_rows[node_rows['phase'] == 'end']
            # Start and end rows are paired by their position in the table.
            received = np.subtract(ended['net_bytes_recv'].tolist(),
                                   started['net_bytes_recv'].tolist())
            net_bytes_recv.append(sum(received) / bytes_per_gb)

        network_fig.add_trace(go.Box(y=net_bytes_recv,
                                     name=names[slot],
                                     marker=colors[slot],
                                     boxpoints='all'))

    network_fig.update_layout(
        title=f'Network Usage',
        yaxis_title="Received Data (GB)",
        legend_title="Data Source",
        font=dict(size=22),
        legend=dict(
            x=.80,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )

    network_fig.show()
    
def reducer_time(csvs_list):
    """
    Box plot of the times reported by the reducer benchmarks.

    Only ``csvs_list[2]`` (the simulated workload) is used.  For every
    experiment in it and every reducer ``id``, the last ``time`` value of the
    rows whose ``function`` is ``'reducer'`` becomes one point of the box
    plot.  Under the "Slowest reducers:" header, the ids of the reducers
    whose last ``time`` value equals 2 are printed.
    """
    csvs = csvs_list[2]  # Only the simulated workload is of interest here.
    points = []

    print("Slowest reducers:")
    for frame in csvs.values():
        reducers = frame[frame['function'] == 'reducer']

        for reducer, rows in reducers.groupby('id', sort=False):
            last_time = rows['time'].values[-1]
            points.append(float(last_time))

            # NOTE(review): reducers are reported only when their last time
            # equals exactly 2 - confirm this is the intended threshold.
            if last_time == 2:
                print(reducer)

    # Build the figure once, after all points have been collected.
    reducer_bp = go.Figure()
    reducer_bp.add_trace(go.Box(y=points, name=f"Reduction Service", boxpoints='all'))

    reducer_bp.update_layout(
        title=f'Reducer Service Time',
        yaxis_title=y_axis_time,
        font=dict(size=22),
        legend=dict(
            x=.80,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )

    reducer_bp.show()
    
def mapper_variability(csvs_list, mapper_counts=None):
    """
    Compare the slowest, average and fastest mapper of every experiment.

    For each of the first three entries of ``csvs_list`` one figure is shown
    with three lines (maximum, average and minimum) over the experiments of
    that data source.  The value taken for a mapper is the last ``time``
    entry recorded for its ``id``.  After each figure, the absolute and the
    relative (to the minimum) difference between the maximum and the minimum
    are printed per experiment.

    Parameters
    ----------
    csvs_list : list
        ``csvs_list[0:3]`` are dicts mapping an experiment key to a DataFrame
        with the columns ``function``, ``id`` and ``time``.
        ``csvs_list[3]`` holds the data-source names, two consecutive entries
        per data source; only the first of each pair is used.
    mapper_counts : list of int, optional
        X values, one per experiment in dict order.  Defaults to
        ``[1, 2, 4, 8, 16, 32, 48, 64, 80]``.
    """
    if mapper_counts is None:
        mapper_counts = [1, 2, 4, 8, 16, 32, 48, 64, 80]

    names = csvs_list[3]

    # Generate one plot per data source.
    for position, csvs in enumerate(csvs_list[:3]):
        slot = 2 * position

        min_times = []
        max_times = []
        avg_times = []

        # For each experiment of this data source.
        for frame in csvs.values():
            # We are only interested in the mappers.
            mappers = frame[frame['function'] == 'mapper']

            # For every mapper of this job, take its last recorded time.
            times_list = [rows['time'].values[-1]
                          for _, rows in mappers.groupby('id', sort=False)]

            max_times.append(max(times_list))
            min_times.append(min(times_list))
            avg_times.append(sum(times_list) / len(times_list))

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=mapper_counts, y=max_times, name="Mapper max times"))
        fig.add_trace(go.Scatter(x=mapper_counts, y=avg_times, name="Mapper avg times"))
        fig.add_trace(go.Scatter(x=mapper_counts, y=min_times, name="Mapper min times"))

        fig.update_layout(
            title=f'{names[slot]} Mapper Variability',
            xaxis_title=x_axis_jobs,
            yaxis_title=y_axis_time,
        )

        fig.show()

        # zip stops at the shortest sequence, so fewer experiments than
        # mapper counts no longer raises an IndexError.
        for count, slowest, fastest in zip(mapper_counts, max_times, min_times):
            spread = abs(slowest - fastest)
            print(f"Node Count: {count}: Abs time diff (s): {spread} | % Diff time {100 * spread / fastest}")
    
def compare_reduction(ttps_tree, mappers_tree, ttps_coord, mappers_coord):
    """
    Plot the time to plot of the coordinated and the tree reduction.

    For the jobs '16', '32', '48', '64' and '80' the time to plot (cell at
    row 0, column 0 of the corresponding table) is read from ``ttps_coord``
    and ``ttps_tree`` and reduced by 3 seconds, which correspond to the first
    invocation of the service.  ``mappers_tree`` and ``mappers_coord`` are
    accepted but not used.
    """
    startup_seconds = 3  # Initiation time removed from every measurement.
    jobs = ['16', '32', '48', '64', '80']

    coord_times = []
    tree_times = []
    for job in jobs:
        coord_times.append(ttps_coord[job].values[0][0] - startup_seconds)
        tree_times.append(ttps_tree[job].values[0][0] - startup_seconds)

    reduction_fig = go.Figure()
    series = (
        ("Coordinated Reduction", coord_times),
        ("Tree Reduction", tree_times),
    )
    for label, values in series:
        reduction_fig.add_trace(go.Scatter(x=jobs, y=values, name=label))

    legend_font = dict(family="sans-serif", size=22, color="black")
    reduction_fig.update_layout(
        title=f'Reduction model Time to Plot comparison',
        yaxis_title=y_axis_time,
        xaxis_title=x_axis_jobs,
        font=dict(size=22),
        legend=dict(x=.65, y=.95, traceorder="normal", font=legend_font),
    )

    reduction_fig.show()
    
def mapper_distribution(csvs, mapper_counts=('48', '64', '80')):
    """
    Box plot of the distribution of the mapper times of selected jobs.

    For every key in ``mapper_counts`` the mapper rows of ``csvs[key]`` are
    grouped by ``id``; the duration of a mapper is the difference between its
    last and first ``time`` values.  One box per job is added to a single
    figure, with the ``node`` of each mapper attached as point text.

    Parameters
    ----------
    csvs : dict
        Maps a job key to a DataFrame with the columns ``function``, ``id``,
        ``time`` and ``node``.
    mapper_counts : sequence of str, optional
        Job keys to plot.  Defaults to ``('48', '64', '80')``.
    """
    bp = go.Figure()

    for item in mapper_counts:
        frame = csvs[item]
        mappers = frame[frame['function'] == 'mapper']

        durations = []
        nodes = []
        # sort=False keeps the mappers in order of first appearance.
        for _, rows in mappers.groupby('id', sort=False):
            stamps = rows['time'].values
            durations.append(float(stamps[-1]) - float(stamps[0]))
            nodes.append(rows['node'].values[-1])

        bp.add_trace(go.Box(y=durations,
                            name=f'{item} Mappers',
                            boxpoints='all',
                            text=nodes))

    bp.update_layout(
        title=f'Mapper Time Distribution',
        yaxis_title=y_axis_time,
        xaxis_title=x_axis_jobs,
        font=dict(size=22),
        legend=dict(
            x=.75,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )
    bp.show()
    
    
def data_load(root_folder, experiment, backend):
    """
    Load the benchmark tables stored under one experiment folder.

    The files matching ``{root_folder}/{backend}/{experiment}/*.csv`` are
    read with ``|`` as delimiter.  The text of each file name before its
    first ``_`` must be an integer; it orders the files numerically and is
    used (as a string) as the dictionary key.

    Parameters
    ----------
    root_folder : str
        Folder holding the benchmark results; may be relative, absolute or
        nested.
    experiment : str
        Name of the innermost folder.
    backend : str
        Name of the folder directly below ``root_folder``.

    Returns
    -------
    tuple of dict
        ``(usage, process, ttp)``: the tables of the files ending in
        ``usage.csv``, ``process.csv`` and ``ttp.csv`` respectively.  Other
        CSV files are ignored.

    Raises
    ------
    ValueError
        If a CSV file name does not start with an integer prefix.
    """
    def file_key(path):
        # Use the file name only, so the depth of root_folder and the
        # platform path separator do not matter.
        return os.path.basename(path).split('_')[0]

    usage = {}
    process = {}
    ttp = {}

    csv_files = sorted(glob.glob(f'{root_folder}/{backend}/{experiment}/*.csv'),
                       key=lambda path: int(file_key(path)))

    for file_name in csv_files:
        key = file_key(file_name)

        if file_name.endswith('usage.csv'):
            usage[key] = pd.read_csv(file_name, delimiter='|')
        elif file_name.endswith('process.csv'):
            process[key] = pd.read_csv(file_name, delimiter='|')
        elif file_name.endswith('ttp.csv'):
            ttp[key] = pd.read_csv(file_name, delimiter='|')

    return usage, process, ttp

### Data Import

In [None]:
# Load the usage, process and time-to-plot tables of every experiment set.
csvs_minio_usage, csvs_minio_process, ttp_minio = data_load(
    'benchmarks', 'MINIOData', 'tree_reduce')
csvs_cern_usage, csvs_cern_process, ttp_cern = data_load(
    'benchmarks', 'CERNData', 'tree_reduce')
csvs_cpu_usage, csvs_cpu_process, ttp_cpu = data_load(
    'benchmarks', 'SimulatedData', 'tree_reduce')
coord_cpu_usage, coord_cpu_process, coord_ttp_cpu = data_load(
    'benchmarks', 'SimulatedData', 'coord_reduce')

# Trace names and marker colours: two consecutive entries per data source.
source_list = ['CERN', 'CERN', 'MINIO', 'MINIO', 'Simulated', 'Simulated']
color_list = [
    {'color': 'blue'}, {'color': 'lightblue'},
    {'color': 'crimson'}, {'color': 'lightcoral'},
    {'color': 'orange'}, {'color': 'aquamarine'},
]

# Each list: the three data sources (CERN, MINIO, Simulated) followed by the
# shared names and colours, as expected by the plotting functions.
usages = [csvs_cern_usage, csvs_minio_usage, csvs_cpu_usage,
          source_list, color_list]
processes = [csvs_cern_process, csvs_minio_process, csvs_cpu_process,
             source_list, color_list]
ttps = [ttp_cern, ttp_minio, ttp_cpu,
        source_list, color_list]

## Plot Generation

In [None]:
# Time-to-plot and speedup figures for the three data sources.
eficiency(ttps)

In [None]:
# CPU and memory usage over time of one mapper, per data source.
usage_timeline(usages)

In [None]:
# Box plots of the mean CPU and memory usage (%) per mapper.
cpu_mem_comparison(usages)

In [None]:
# Box plot of the mappers' Resident Set Size (MB), from the process tables.
rss_bp_mem(processes)

In [None]:
# Max / average / min mapper times per experiment, one figure per data source.
mapper_variability(usages)

In [None]:
# Box plot of the reducer times; only the simulated workload (usages[2]) is read.
reducer_time(usages)

In [None]:
# Mapper time distribution for the 48, 64 and 80 mapper jobs of each data source.
mapper_distribution(csvs_cpu_usage)
mapper_distribution(csvs_minio_usage)
mapper_distribution(csvs_cern_usage)

In [None]:
# Tree reduction vs. coordinated reduction on the simulated workload.
compare_reduction(ttp_cpu, csvs_cpu_usage, coord_ttp_cpu, coord_cpu_usage)

In [None]:
# Received data per node (GB) for the first two data sources (CERN, MINIO).
network_usage(processes)

## Exploratory Checks

In [None]:
# Display the process table of the simulated-data experiment keyed '1'.
df2 = csvs_cpu_process['1']
df2

In [None]:
# Network Usage
# Per-node received-bytes figures for the mapper rows of the experiments keyed '80'.
print('MINIO')
tmp = csvs_minio_process['80']
tmp = tmp[tmp['function'] == 'mapper']
for node in pd.unique(tmp['node']):
    tmp2 = tmp[tmp['node'] == node]
    # Largest net_bytes_recv value recorded on this node.
    print(f"{node}: {tmp2['net_bytes_recv'].max()}")

print('CERN')
import numpy as np
tmp = csvs_cern_process['80']
tmp = tmp[tmp['function'] == 'mapper']
for node in pd.unique(tmp['node']):
    tmp2 = tmp[tmp['node'] == node]
    tmp2_st = tmp2[tmp2['phase'] == 'start']
    tmp2_end = tmp2[tmp2['phase'] == 'end']
    # End minus start values, paired row by row in table order.
    mapper_network = np.subtract(tmp2_end['net_bytes_recv'].tolist(), tmp2_st['net_bytes_recv'].tolist())

    # Sum of the per-row differences on this node.
    print(f"{node}: {sum(mapper_network)}")

In [None]:
# Usage tables of the cpu_bound_1 benchmark, one file per run.
# NOTE(review): the numeric suffix (2, 4, 8, 16) presumably is the mapper
# count of the run - confirm against the benchmark setup.
#cpu1_1 = pd.read_csv('', delimiter='')
cpu1_2 = pd.read_csv('benchmarks_cpu/cpu_bound_1/28eb5163-faf9-485c-85e7-ed594cf7994f_usage.csv', delimiter='|')
cpu1_4 = pd.read_csv('benchmarks_cpu/cpu_bound_1/deea2089-6ee4-4a7b-a8a0-1f25b0083f8c_usage.csv', delimiter='|')
cpu1_8 = pd.read_csv('benchmarks_cpu/cpu_bound_1/c3ce453f-9869-4482-9a87-abc7ad1f9bc6_usage.csv', delimiter='|')
cpu1_16= pd.read_csv('benchmarks_cpu/cpu_bound_1/21bcd8bb-7d0b-4f32-b998-a71ddad2318b_usage.csv', delimiter='|')

# The text after '-' in each key is shown in the x-axis title of its plot.
csvs = { 'cpu1-16': cpu1_16, 'cpu1-8': cpu1_8, 'cpu1-4': cpu1_4, 'cpu1-2': cpu1_2}
plot_mapper_times(csvs)

In [None]:
# Usage tables of the cpu_bound_0.98 benchmark, one file per run.
# NOTE(review): the file-name prefix (2, 4, 8, 16) presumably is the mapper
# count of the run - confirm against the benchmark setup.
cpu0_98_2 = pd.read_csv('benchmarks_cpu/cpu_bound_0.98/2_usage.csv', delimiter='|')
cpu0_98_4 = pd.read_csv('benchmarks_cpu/cpu_bound_0.98/4_usage.csv', delimiter='|')
cpu0_98_8 = pd.read_csv('benchmarks_cpu/cpu_bound_0.98/8_usage.csv', delimiter='|')
cpu0_98_16= pd.read_csv('benchmarks_cpu/cpu_bound_0.98/16_usage.csv', delimiter='|')

# The text after '-' in each key is shown in the x-axis title of its plot.
csvs = { 'cpu0.98-16': cpu0_98_16, 'cpu0.98-8': cpu0_98_8, 'cpu0.98-4': cpu0_98_4, 'cpu0.98-2': cpu0_98_2}
plot_mapper_times(csvs)

## Other

In [None]:
# Time-to-plot comparison between the GRyCAP and the CERN cluster.
# The measurements are split over two figures: the runs from index 4 onwards
# first, then the first five runs (the run at index 4 appears in both).

# Mapper counts, kept as strings for the x axis.
mappers = [str(mapper) for mapper in [1, 2, 4, 6, 12, 24, 36, 48, 60, 72, 84]]

times_grycap = [5410.87811708, 2818.7710524665, 1419.30029473305, 966.532416772842,
                507.255814909935, 280.837961244583, 197.483443427086, 153.335338521004,
                142.95996222496, 180.472412848473, 166.029173898697]

times_batch = [5688.08864247799, 2940.99840228558, 1507.3983455658,
               1012.97974174023, 508.00926399231, 255.174373030662,
               171.5165848732, 130.024196314812, 104.598661708832,
               87.5948684215546, 75.9565841913223]

# Defined for reference only; these values are not plotted.
times_oscar = [9020, 4478, 2281, 1202]


def _plot_cluster_times(x_values, grycap_times, batch_times):
    """Show and return a "Time to Plot" figure with one line per cluster."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x_values, y=grycap_times, name="GRyCAP Cluster"))
    fig.add_trace(go.Scatter(x=x_values, y=batch_times, name="CERN Cluster"))
    fig.update_layout(
        title="Time to Plot",
        xaxis_title=x_axis_jobs,
        yaxis_title=y_axis_time,
        legend_title="Data Source",
        font=dict(size=22),
        legend=dict(
            x=.70,
            y=.95,
            traceorder="normal",
            font=dict(family="sans-serif", size=22, color="black"),
        ),
    )
    fig.show()
    return fig


# Runs from index 4 onwards (12 mappers and more).
cut = 4
fig_time = _plot_cluster_times(mappers[cut:], times_grycap[cut:], times_batch[cut:])

# First five runs (1 to 12 mappers).
cut = 5
fig_time = _plot_cluster_times(mappers[:cut], times_grycap[:cut], times_batch[:cut])