In [None]:
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt

#### Auxiliary functions used to generate plots

In [None]:
def plot_comparison_metric(competitors: dict[str, pd.DataFrame],
                           column : str,
                           title_plot : str,
                           y_label : str) :
    '''
    Draw a single figure comparing several scoring plugins on one metric.

    One line is plotted per entry of `competitors`: the x values are the index of the
    entry's DataFrame, the y values are its `column`, and the legend label is the entry's key.

    Parameters:
        competitors: mapping from scoring plugin name to the DataFrame holding its results.
        column:      name of the DataFrame column to plot.
        title_plot:  title of the figure.
        y_label:     label of the y axis.

    The figure is shown with plt.show(); nothing is returned.
    '''
    fig, axis = plt.subplots(figsize=(10, 6))

    # One curve per competitor.
    for plugin_name, results in competitors.items():
        axis.plot(results.index, results[column], label=plugin_name)

    # Axis labels, legend and title.
    axis.set_xlabel('% GPU cluster capacity requested by arrived pods')
    axis.set_ylabel(y_label)
    axis.legend()
    axis.set_title(title_plot)

    axis.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
def plot_energy_savings(competitors_pwr: dict[str, pd.DataFrame], 
                        reference_competitor : str, column_power : str, title_plot : str) :
    '''
    Draw a single figure with the relative power savings of each competitor w.r.t. a reference.

    For every entry of `competitors_pwr` other than the reference, the plotted quantity is
    (reference - competitor) / reference * 100, computed on `column_power`.
    The x values are the index of the competitor's DataFrame.

    Parameters:
        competitors_pwr:      mapping from scoring plugin name to its power DataFrame.
        reference_competitor: key of `competitors_pwr` used as the baseline.
        column_power:         name of the power column to compare.
        title_plot:           title of the figure.

    The figure is shown with plt.show(); nothing is returned.
    '''
    fig, axis = plt.subplots(figsize=(10, 6))

    baseline = competitors_pwr[reference_competitor]
    for plugin_name, results in competitors_pwr.items():
        # The reference is the baseline itself: there is nothing to compare it against.
        if plugin_name != reference_competitor:
            savings_pct = (baseline[column_power] - results[column_power]) / baseline[column_power] * 100
            axis.plot(results.index, savings_pct, label=plugin_name)

    # Axis labels, legend and title.
    axis.set_xlabel('% GPU cluster capacity requested by arrived pods')
    axis.set_ylabel('% power savings')
    axis.legend()
    axis.set_title(title_plot)

    axis.grid(True)
    fig.tight_layout()
    plt.show()

### Retrieve power consumption and failed pods data from the experiments

In [None]:
def _read_cluster_gpu_capacity(grep_path):
    '''
    Parse the total GPU cluster capacity (in millis) from a grep output file.

    The first occurrence of 'MilliGpu: ' is located; on the rest of that line, the integer
    between the first '/' and the following ')' is returned.

    Raises:
        ValueError: if 'MilliGpu: ' is not found, or the parsed capacity is not positive.
        (Any error raised while opening/reading the file or converting to int is propagated.)
    '''
    with open(grep_path, 'r') as file:
        content = file.read()

    start_idx = content.find('MilliGpu: ')
    if start_idx == -1:
        raise ValueError("MilliGpu cluster info not found. Error!\n")

    # If the matching line is the last one and has no trailing newline, find() returns -1;
    # slicing with -1 would silently drop the last character, so fall back to the end of the text.
    end_idx = content.find('\n', start_idx)
    if end_idx == -1:
        end_idx = len(content)

    selection = content[start_idx:end_idx]
    capacity = int(selection.split("/")[1].split(")")[0])
    if capacity <= 0:
        # The capacity is used as a divisor to normalize the arrived workload.
        raise ValueError(f"Invalid GPU cluster capacity: {capacity}\n")
    return capacity


def _resample_on_workload_grid(df, workload, grid):
    '''
    Re-index `df` on the cumulative workload and resample it on the regularly spaced `grid`.

    Steps:
      1. use `workload` (aligned with `df` on the row labels) as the index;
      2. drop rows without a workload value and rows with a duplicated index entry
         (keeping the first entry of each group of duplicates);
      3. add the entries of `grid` to the index via a union and interpolate the missing values;
      4. keep only the rows whose index entries are in `grid`.

    Raises:
        ValueError: if the resampled DataFrame still contains NaNs.
    '''
    df = df.assign(cumulative_workload=workload).set_index("cumulative_workload")

    # Rows without a workload value cannot be placed on the x axis (and index-based
    # interpolation does not accept NaNs in the index).
    df = df[df.index.notna()]
    df = df[~df.index.duplicated(keep='first')]

    # method='index' interpolates using the numerical values of the index. method='linear'
    # would ignore the index and treat the rows as equally spaced, which is wrong here because
    # the measured workload values are irregularly spaced.
    # ffill/bfill then cover the grid points lying outside the measured range.
    df = df.reindex(df.index.union(grid)).interpolate(method='index').ffill().bfill()
    df = df.loc[grid]
    if df.isna().any().any():
        raise ValueError("dataframe contains NaNs!\n")
    return df


# Dictionaries where the results will be stored:
# {experiment set name -> {policy name -> [one resampled DataFrame per run]}}.
df_pwr_dict = {}
df_sched_pod_dict = {}

DATADIR = "./2024_0606"
data = Path(DATADIR)

# Regular grid of GPU requests received by the cluster (as a fraction of the cluster capacity).
# It is the same for every run, so it is built only once.
new_index = np.arange(0, 1.205, 0.005)

fileDirs = sorted([x for x in data.iterdir() if x.is_dir()])
for fdir in fileDirs:
    df_pwr_dict[fdir.name] = {}
    df_sched_pod_dict[fdir.name] = {}

    policyDirs = sorted([x for x in fdir.iterdir() if x.is_dir()])
    for pdir in policyDirs:
        tuneDirs = sorted([x for x in pdir.iterdir() if x.is_dir()])
        for tdir in tuneDirs:
            seedDirs = sorted([x for x in tdir.iterdir() if x.is_dir()])
            for sdir in seedDirs:
                pwrfile = sdir / 'analysis_pwr.csv'
                grep = sdir / 'analysis_grep.out'
                alloc = sdir / 'analysis_allo.csv'
                schedfile = sdir / 'analysis_cdol.csv'

                # Retrieve the total GPU cluster capacity (in millis).
                # Without it the workload cannot be normalized, so the run is skipped: carrying on
                # would reuse the capacity of the previously processed run.
                try:
                    allocation_value = _read_cluster_gpu_capacity(grep)
                except Exception as e:
                    print(f"ERROR grep analysis ({sdir}): {e}\n")
                    continue

                ### Collect telemetry about the GPU workload (in millis) that the cluster has received. ###
                # Both the power and the scheduling telemetry are indexed by this series, so the run
                # is skipped when it cannot be built (instead of reusing the one of the previous run).
                try:
                    df_allo = pd.read_csv(alloc)
                    df_allo = df_allo.rename(columns = lambda x: x.split('-')[-1])
                    df_allo = df_allo.loc[:, 'arrived_gpu_milli'] / allocation_value
                except Exception as e:
                    print(f"ERROR alloc analysis ({sdir}): {e}\n")
                    continue

                ### Collect telemetry about power consumption. ###
                try:
                    df_pwr = pd.read_csv(pwrfile)
                    df_pwr = df_pwr.rename(columns = lambda x: x.split('-')[-1])
                    df_pwr = _resample_on_workload_grid(df_pwr, df_allo, new_index)

                    df_pwr_dict[fdir.name].setdefault(pdir.name, list()).append(df_pwr)
                except Exception as e:
                    print(f"ERROR power analysis ({sdir}): {e}\n")

                ### Collect telemetry about pods that the cluster failed to schedule. ###
                try:
                    df_sched_pod = pd.read_csv(schedfile)
                    df_sched_pod = df_sched_pod.rename(columns = lambda x: x.split('-')[-1])
                    # Explicit copy: the column is modified below, and assigning into a slice of
                    # another DataFrame triggers SettingWithCopyWarning.
                    df_sched_pod = df_sched_pod[['event']].copy()
                    # Running count of the events marked as 'failed'.
                    df_sched_pod['event'] = (1 * (df_sched_pod['event'] == 'failed')).cumsum()
                    df_sched_pod = _resample_on_workload_grid(df_sched_pod, df_allo, new_index)

                    df_sched_pod_dict[fdir.name].setdefault(pdir.name, list()).append(df_sched_pod)
                except Exception as e:
                    print(f"ERROR scheduling analysis ({sdir}): {e}\n")


# display(df_pwr_dict.keys())
# display(df_pwr_dict)
# display(df_sched_pod_dict.keys())
# display(df_sched_pod_dict)

#### Compute the average power consumption and the average number of pods that failed to be scheduled within each score plugin's set of runs.

In [None]:
# Average power consumption of each score plugin: the DataFrames collected for its runs
# are summed element-wise and divided by the number of runs.
dict_pwr_final_res = {}
for level, pwr_runs_by_plugin in df_pwr_dict.items():
    level_pwr_means = {}
    for plugin, pwr_runs in pwr_runs_by_plugin.items():
        n_reps = len(pwr_runs)
        print(f"Computing cluster power consumption mean for level ({level},{plugin}) ({n_reps} reps)")
        level_pwr_means[plugin] = sum(pwr_runs) / n_reps
    dict_pwr_final_res[level] = level_pwr_means


# Average number of pods that failed to be scheduled for each score plugin, computed the same way.
dict_sched_final_res = {}
for level, sched_runs_by_plugin in df_sched_pod_dict.items():
    level_sched_means = {}
    for plugin, sched_runs in sched_runs_by_plugin.items():
        n_reps = len(sched_runs)
        print(f"Computing mean of pods that have failed to schedule for level ({level},{plugin}) ({n_reps} reps)")
        level_sched_means[plugin] = sum(sched_runs) / n_reps
    dict_sched_final_res[level] = level_sched_means


# display(dict_pwr_final_res.keys())
# display(dict_pwr_final_res)
# display(dict_sched_final_res.keys())
# display(dict_sched_final_res)

### Generation of failed-pod and power consumption plots: overall and GPU only (the CPU-only plots are currently commented out)

In [None]:
# Score plugin used as the baseline of the power savings plots.
reference_competitor = '06-FGD'

for level, pwr_by_plugin in dict_pwr_final_res.items():
    # Number of pods that failed to be scheduled w.r.t. the arrived workloads
    # (in % of GPU resources available in the cluster).
    plot_comparison_metric(
        dict_sched_final_res[level],
        "event",
        f"# of pods that failed to be scheduled (set experiments: {level})",
        "# failed pods",
    )

    # Power views to plot: (column, title of the consumption plot, title of the savings plot).
    power_views = [
        # Overall cluster.
        ("power_cluster",
         f"Overall cluster energy consumption (set experiments: {level})",
         f"Overall power savings vs {reference_competitor} (set experiments: {level})"),
        # GPUs only.
        ("power_cluster_GPU",
         f'GPU cluster energy consumption (set experiments: {level})',
         f"GPU power savings vs {reference_competitor} (set experiments: {level})"),
        # CPUs only (disabled).
        #("power_cluster_CPU",
        # f'CPU cluster energy consumption (set experiments: {level})',
        # f"CPU cluster power savings vs {reference_competitor} (set experiments: {level})"),
    ]

    # For each view: the consumption of every plugin first, then the savings w.r.t. the reference.
    for column_power, consumption_title, savings_title in power_views:
        plot_comparison_metric(pwr_by_plugin, column_power, consumption_title, "Watts")
        plot_energy_savings(pwr_by_plugin, reference_competitor, column_power, savings_title)