# Experiment Results

## Preliminaries

### Imports

In [None]:
# Library Imports
import os
import csv
import sys
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statistics
import glob

### Plot Functions

In [None]:
def plot_mapper_times(csvs):
    '''
    Print mapper timing statistics per job and show one box plot per job.

    For every entry of ``csvs`` the rows whose ``function`` column equals
    ``'mapper'`` are selected and grouped by ``id``. The duration of one
    mapper is the difference between the last and the first ``time`` value
    recorded for its ``id``. The average and the median of those durations
    are printed, and a box plot of the durations (with the last ``node``
    value of each mapper as hover text) is shown for the job.

    Jobs without any mapper rows are reported and skipped instead of
    failing with a division by zero.

    Parameters
    ----------
    csvs : dict
        Maps a job label to a pandas DataFrame that has at least the columns
        ``function``, ``id``, ``time`` and ``node``. The part of the label
        before the first underscore is used to build the job name
        (``size_<prefix>``).

    Returns
    -------
    None
    '''
    times = {}
    for item, frame in csvs.items():
        mappers = frame[frame['function'] == 'mapper']
        name = f'size_{item.split("_")[0]}'

        durations = []
        nodes = []
        # sort=False keeps the ids in order of first appearance.
        for _, samples in mappers.groupby('id', sort=False):
            first_time = float(samples['time'].iloc[0])
            last_time = float(samples['time'].iloc[-1])
            durations.append(last_time - first_time)
            nodes.append(samples['node'].iloc[-1])

        if not durations:
            # Nothing to average or plot for this job.
            print(f'Job: {name}. No mapper samples found.')
            continue

        times[name] = [durations, nodes]
        print(f'Job: {name}. Average Time: {sum(durations)/len(durations)} - Median: {statistics.median(durations)}')

    for job_name, (durations, nodes) in times.items():
        bp = go.Figure()
        bp.add_trace(go.Box(y=durations,
                            name=f'Job {job_name}',
                            boxpoints='all',
                            text=nodes))

        bp.show()
       
    
def eficiency(csvs):
    '''
    Plot, per job, the largest mapper end time and the resulting speedup.

    For every entry of ``csvs`` the rows whose ``function`` column equals
    ``'mapper'`` are grouped by ``id`` and the largest final ``time`` value
    among the mappers is taken as the time of that job. The left subplot
    shows those times; the right subplot shows the speedup relative to the
    first job (``times[0] / time``) together with a dashed ideal line.

    The x labels are taken from the keys of ``csvs`` (in iteration order) so
    that they always line up with the measured times. The ideal line is the
    numeric value of each key divided by the numeric value of the first key;
    it is omitted when the keys cannot be read as numbers.

    NOTE(review): the last ``time`` value is used as elapsed time, which
    presumes timestamps are relative to the job start - confirm.

    Parameters
    ----------
    csvs : dict
        Maps a job label to a pandas DataFrame with at least the columns
        ``function``, ``id`` and ``time``.

    Returns
    -------
    None
    '''
    jobs = list(csvs)
    times = []

    for frame in csvs.values():
        mappers = frame[frame['function'] == 'mapper']
        max_time = 0

        # For each mapper of this job, look for the maximum final time.
        for _, samples in mappers.groupby('id', sort=False):
            last_time = samples['time'].iloc[-1]
            if max_time < last_time:
                max_time = last_time

        times.append(max_time)

    # Measured speedup relative to the first job.
    efi_ts = [times[0] / elapsed for elapsed in times]

    # Ideal speedup relative to the first job; only available when the job
    # labels are numeric.
    try:
        sizes = [float(job) for job in jobs]
        efi_jobs = [size / sizes[0] for size in sizes]
    except (TypeError, ValueError, ZeroDivisionError):
        efi_jobs = None

    # Plot creation
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Time', 'Speedup'))

    fig.add_trace(go.Scatter(x=jobs, y=times), row=1, col=1)

    if efi_jobs is not None:
        fig.add_trace(go.Scatter(x=jobs, y=efi_jobs, line=dict(dash='dash')), row=1, col=2)
    fig.add_trace(go.Scatter(x=jobs, y=efi_ts), row=1, col=2)

    fig.update_xaxes(type='linear', row=1, col=2)

    fig.update_layout(showlegend=False)

    fig.show()
    
    
def usage_bp(csvs, key='96'):
    '''
    Show box plots of the mean CPU and memory usage per mapper for one job.

    The rows of ``csvs[key]`` whose ``function`` column equals ``'mapper'``
    are grouped by ``id``; the mean of ``cpu_percent`` and of ``mem_percent``
    of each mapper is one data point of the corresponding box plot.

    TODO: Add one data set per experiment type (Minio, CERN, CPU). For now
    the same data is drawn in the three titled subplots.

    Parameters
    ----------
    csvs : dict
        Maps a job label to a pandas DataFrame with at least the columns
        ``function``, ``id``, ``cpu_percent`` and ``mem_percent``.
    key : str, optional
        Label of the job to plot. Defaults to ``'96'``, the value that was
        previously hard-coded.

    Raises
    ------
    KeyError
        If ``key`` is not present in ``csvs``.
    '''
    # Select Required Data
    mappers = csvs[key]
    mappers = mappers[mappers['function'] == 'mapper']

    # One mean per mapper; sort=False keeps ids in order of first appearance.
    per_mapper = mappers.groupby('id', sort=False)[['cpu_percent', 'mem_percent']].mean()
    cpu_use = per_mapper['cpu_percent'].tolist()
    mem_use = per_mapper['mem_percent'].tolist()

    # Generate Plots
    fig = make_subplots(rows=2, cols=2,
                        subplot_titles=('CERN Data', 'Minio Data', 'CPU'))

    # The three titled cells of the 2x2 grid, in the order they are filled.
    for row, col in ((1, 1), (1, 2), (2, 1)):
        fig.add_trace(go.Box(y=cpu_use, name='CPU Usage'), col=col, row=row)
        fig.add_trace(go.Box(y=mem_use, name='Mem Usage'), col=col, row=row)

    fig.update_layout(showlegend=False)

    fig.show()
    
    
def usage_timeline(csvs, key='96', mapper_id='95_95'):
    '''
    Show the CPU and memory usage over time of a single mapper.

    The rows of ``csvs[key]`` whose ``function`` column equals ``'mapper'``
    and whose ``id`` equals ``mapper_id`` are plotted against ``time``:
    ``cpu_percent`` in the left column and ``mem_percent`` in the right
    column of a 3x2 grid.

    TODO: Add one data set per experiment (CERN, MINIO, CPU). For now the
    same mapper data is drawn in each of the three rows.

    Parameters
    ----------
    csvs : dict
        Maps a job label to a pandas DataFrame with at least the columns
        ``function``, ``id``, ``time``, ``cpu_percent`` and ``mem_percent``.
    key : str, optional
        Label of the job to plot. Defaults to ``'96'``, the value that was
        previously hard-coded.
    mapper_id : str, optional
        Value of the ``id`` column selecting the mapper. Defaults to
        ``'95_95'``, the value that was previously hard-coded.

    Raises
    ------
    KeyError
        If ``key`` is not present in ``csvs``.
    '''
    mappers = csvs[key]
    mappers = mappers[mappers['function'] == 'mapper']

    mapper_data = mappers[mappers['id'] == mapper_id]

    fig = make_subplots(rows=3, cols=2,
                        subplot_titles=['CERN CPU Usage', 'CERN Mem Usage',
                                        'MINIO CPU Usage', 'MINIO Mem Usage',
                                        'CPU CPU Usage', 'CPU Mem Usage'],
                        x_title='Time (s)',
                        y_title='Usage %')

    # Rows 1-3 are titled CERN, MINIO and CPU respectively.
    for row in (1, 2, 3):
        fig.add_trace(go.Scatter(x=mapper_data['time'],
                                 y=mapper_data['cpu_percent']),
                      row=row, col=1)
        fig.add_trace(go.Scatter(x=mapper_data['time'],
                                 y=mapper_data['mem_percent']),
                      row=row, col=2)

    fig.update_layout(showlegend=False)

    fig.show()

### Data Import

In [None]:
# Minio Data Load
# Files are expected to be named '<number>_..._usage.csv' or
# '<number>_..._process.csv'; the leading number is the dictionary key and
# also the (numeric) sort order. os.path.basename is used instead of
# splitting on '/' so the key is found whatever separator glob returns.
csvs_minio_usage = {}
csvs_minio_process = {}
csv_files = sorted(glob.glob('MinioData/*.csv'),
                   key=lambda path: int(os.path.basename(path).split('_')[0]))

for file_name in csv_files:
    key = os.path.basename(file_name).split('_')[0]

    if file_name.endswith('usage.csv'):
        csvs_minio_usage[key] = pd.read_csv(file_name, delimiter='|')
    elif file_name.endswith('process.csv'):
        csvs_minio_process[key] = pd.read_csv(file_name, delimiter='|')

In [None]:
# CERN Data Load
# Files are expected to be named '<number>_..._usage.csv' or
# '<number>_..._process.csv'; the leading number is the dictionary key and
# also the (numeric) sort order. os.path.basename is used instead of
# splitting on '/' so the key is found whatever separator glob returns.
csvs_cern_usage = {}
csvs_cern_process = {}
csv_files = sorted(glob.glob('CERNData/*.csv'),
                   key=lambda path: int(os.path.basename(path).split('_')[0]))

for file_name in csv_files:
    key = os.path.basename(file_name).split('_')[0]

    if file_name.endswith('usage.csv'):
        csvs_cern_usage[key] = pd.read_csv(file_name, delimiter='|')
    elif file_name.endswith('process.csv'):
        csvs_cern_process[key] = pd.read_csv(file_name, delimiter='|')

In [None]:
# CPU Data Load
# Files are expected to be named '<number>_..._usage.csv' or
# '<number>_..._process.csv'; the leading number is the dictionary key and
# also the (numeric) sort order. os.path.basename is used instead of
# splitting on '/' so the key is found whatever separator glob returns.
csvs_cpu_usage = {}
csvs_cpu_process = {}
csv_files = sorted(glob.glob('SimulatedData/*.csv'),
                   key=lambda path: int(os.path.basename(path).split('_')[0]))

for file_name in csv_files:
    key = os.path.basename(file_name).split('_')[0]

    if file_name.endswith('usage.csv'):
        csvs_cpu_usage[key] = pd.read_csv(file_name, delimiter='|')
    elif file_name.endswith('process.csv'):
        csvs_cpu_process[key] = pd.read_csv(file_name, delimiter='|')

## Plot Generation

In [None]:
# Draw every usage plot for the Minio data set, in this order:
# time/speedup, usage box plots, single-mapper usage timeline.
for draw_plot in (eficiency, usage_bp, usage_timeline):
    draw_plot(csvs_minio_usage)

## Process Data Insights

In [None]:
# Load one process log and keep only the rows logged by mappers.
df2 = pd.read_csv('264285c4-098b-433f-90ae-e6c91eb68716_process.csv' , delimiter='|')
df2 = df2.loc[df2['function'] == 'mapper']

# Identification columns followed by every 'ctx_' column of the log.
cols = ['function', 'id', 'phase', 'node']
cols += [column for column in df2.columns if column.startswith('ctx_')]

# Rows of the 'end' phase only, restricted to the columns selected above.
df_ctx = df2.loc[df2['phase'] == 'end', cols]

# Last expression of the cell: the table ordered by 'ctx_involuntary'.
df_ctx.sort_values(by='ctx_involuntary')

In [None]:



#cpu1_1 = pd.read_csv('', delimiter='')
# Usage logs stored under cpu_bound_1/, read in the order 2, 4, 8, 16.
cpu1_2, cpu1_4, cpu1_8, cpu1_16 = (
    pd.read_csv(path, delimiter='|')
    for path in (
        'cpu_bound_1/28eb5163-faf9-485c-85e7-ed594cf7994f_usage.csv',
        'cpu_bound_1/deea2089-6ee4-4a7b-a8a0-1f25b0083f8c_usage.csv',
        'cpu_bound_1/c3ce453f-9869-4482-9a87-abc7ad1f9bc6_usage.csv',
        'cpu_bound_1/21bcd8bb-7d0b-4f32-b998-a71ddad2318b_usage.csv',
    )
)

# Labels are inserted in the order 16, 8, 4, 2, which is the order in which
# plot_mapper_times reports and plots them.
csvs = dict(zip(
    ('cpu1-16', 'cpu1-8', 'cpu1-4', 'cpu1-2'),
    (cpu1_16, cpu1_8, cpu1_4, cpu1_2),
))
plot_mapper_times(csvs)

In [None]:
# Usage logs stored under cpu_bound_0.98/, read in the order 2, 4, 8, 16.
cpu0_98_2, cpu0_98_4, cpu0_98_8, cpu0_98_16 = (
    pd.read_csv(path, delimiter='|')
    for path in (
        'cpu_bound_0.98/2_usage.csv',
        'cpu_bound_0.98/4_usage.csv',
        'cpu_bound_0.98/8_usage.csv',
        'cpu_bound_0.98/16_usage.csv',
    )
)

# Labels are inserted in the order 16, 8, 4, 2, which is the order in which
# plot_mapper_times reports and plots them.
csvs = dict(zip(
    ('cpu0.98-16', 'cpu0.98-8', 'cpu0.98-4', 'cpu0.98-2'),
    (cpu0_98_16, cpu0_98_8, cpu0_98_4, cpu0_98_2),
))
plot_mapper_times(csvs)

In [None]:

# Print the samples of every reducer and the summed reducer time per job for
# the data frames currently held in `csvs`.
# `times` is initialised here so that this cell does not fail with a
# NameError when no other cell has defined it yet.
times = {}
for item in csvs:
    tmp = csvs[item]
    tmp = tmp[tmp['function'] == 'reducer']

    tmp_time = 0
    name = f'size_{item.split("_")[0]}'
    times[name] = [[], []]
    for elem in pd.unique(tmp['id']):
        tmp2 = tmp[tmp['id'] == elem]
        print(tmp2)
        # Duration of this reducer: last recorded time minus first one.
        reducer_time = float(tmp2['time'].iloc[-1]) - float(tmp2['time'].iloc[0])
        print('\n\n')
        tmp_time += reducer_time
        times[name][0].append(reducer_time)
        times[name][1].append(tmp2['node'].iloc[-1])

    # Sum of the durations of all reducers of this job.
    print(tmp_time)

In [None]:
# Plot the mapper times of the usage log stored under test1/.
csvs = {'cpu16': pd.read_csv('test1/16_usage.csv', delimiter='|')}
cpu1_16 = csvs['cpu16']
plot_mapper_times(csvs)

## Other

In [None]:
# Mapper timing statistics and box plots for every usage log in `prefix`.
prefix = 'SimulatedData'
csvs = [f'{prefix}/{file_name}' for file_name in os.listdir(prefix) if file_name.endswith('usage.csv')]
times = {}
print(csvs)
for item in csvs:
    tmp = pd.read_csv(item, delimiter='|')
    tmp = tmp[tmp['function'] == 'mapper']

    tmp_time = []
    # The job name comes from the file name part before the first underscore.
    name = f'size_{item.split("/")[1].split("_")[0]}'

    mapper_nodes = []
    for elem in pd.unique(tmp['id']):
        tmp2 = tmp[tmp['id'] == elem]
        # Duration of this mapper: last recorded time minus first one.
        mapper_time = float(tmp2['time'].iloc[-1]) - float(tmp2['time'].iloc[0])
        tmp_time.append(mapper_time)
        mapper_nodes.append(tmp2['node'].iloc[-1])

    if not tmp_time:
        # Nothing to average or plot for this job.
        print(f'Job: {name}. No mapper samples found.')
        continue

    times[name] = [tmp_time, mapper_nodes]
    print(f'Job: {name}. Average Time: {sum(tmp_time)/len(tmp_time)} - Median: {statistics.median(tmp_time)}')

# Box Plot
# Node names and colors are defined for per-node coloring; they are not used
# by the plots below yet.
nodes = ['wn1.localdomain', 'wn2.localdomain', 'wn3.localdomain',
         'wn4.localdomain', 'wn5.localdomain', 'wn6.localdomain']
colors = ['red', 'green', 'blue', 'black', 'orange', 'purple']


# One figure per job; the hover text of each point is the mapper's node.
for k in times:
    bp = go.Figure()
    bp.add_trace(go.Box(y=times[k][0],
                        name=f'Job {k}',
                        boxpoints='all',
                        text=times[k][1]))

    bp.show()