In [307]:
import pandas as pd
import numpy as np

# Read the three input CSV files; the second one names its VM column 'mtype',
# so it is renamed to the 'VM_type' key used for the merge.
blink = pd.read_csv("blink-eval.csv")
crispy = pd.read_csv("crsipy-eval.csv").rename(columns={'mtype': 'VM_type'})
costs = pd.read_csv("arrow_cluster_jobs.csv")

# The outer merge keeps VM types that appear in only one of the two frames;
# the gaps this leaves are filled with 0.
all = pd.merge(blink, crispy, on="VM_type", how='outer').fillna(0)
#all.iloc[:, 1:] = all.iloc[:, 1:].astype(int)

all

Unnamed: 0,VM_type,Cluster_Size_Kmeans,Cluster_Size_PageRank,kmeans_scaleout,pagerank_scaleout
0,m4.l,7.0,0.0,24.0,32.0
1,m4.xl,3.0,0.0,10.0,16.0
2,m4.2xl,2.0,7.0,10.0,8.0
3,r4.l,3.0,0.0,10.0,32.0
4,r4.xl,2.0,7.0,4.0,16.0
5,r4.2xl,1.0,4.0,8.0,8.0
6,c4.l,0.0,0.0,0.0,0.0
7,c4.xl,7.0,0.0,20.0,0.0
8,c4.2xl,3.0,0.0,10.0,0.0


In [308]:
# since the arrow dataset doesn't contain the exact scale-outs that Blink proposes, we round each one up to the next even number (and to at least 4) to simulate the worst case

def add_one_to_uneven(x):
    """Adjust a single scale-out value.

    The rules are applied in this order:

    * ``0`` is returned unchanged,
    * any other value up to and including 4 becomes ``4``,
    * a remaining value whose remainder modulo 2 is non-zero is increased by one,
    * everything else is returned as it came in.
    """
    if x == 0:
        return x
    if x <= 4:  # 4 is a minimal amount of machines in arrow dataset
        return 4
    return x + 1 if x % 2 != 0 else x
# Run the adjustment over every column after the first one (VM_type),
# then display the first three columns of the result.
scaleout_columns = all.iloc[:, 1:]
all.iloc[:, 1:] = scaleout_columns.applymap(add_one_to_uneven)
all.iloc[:, :3]

Unnamed: 0,VM_type,Cluster_Size_Kmeans,Cluster_Size_PageRank
0,m4.l,8.0,0.0
1,m4.xl,4.0,0.0
2,m4.2xl,4.0,8.0
3,r4.l,4.0,0.0
4,r4.xl,4.0,8.0
5,r4.2xl,4.0,4.0
6,c4.l,0.0,0.0
7,c4.xl,8.0,0.0
8,c4.2xl,4.0,0.0


In [309]:
# Expand the abbreviated size suffix ('l' -> 'large') so that VM_type uses the same
# spelling as costs['mtype'] (e.g. 'm4.2xl' -> 'm4.2xlarge').
# Only a trailing 'l' is rewritten, which keeps the cell idempotent: a name that already
# ends in 'large' is left alone, whereas blindly appending 'arge' would produce
# 'm4.largearge' when the cell is executed a second time.
all['VM_type'] = all['VM_type'].str.replace(r'l$', 'large', regex=True)

# Label each column with the workload and the tool that proposed the scale-out.
all = all.rename(columns={
    'Cluster_Size_Kmeans': 'Kmeans_Blink',
    'Cluster_Size_PageRank': 'PageRank_Blink',
    'kmeans_scaleout': 'Kmeans_Crispy',
    'pagerank_scaleout': 'PageRank_Crispy',
})
all

Unnamed: 0,VM_type,Kmeans_Blink,PageRank_Blink,Kmeans_Crispy,PageRank_Crispy
0,m4.large,8.0,0.0,24.0,32.0
1,m4.xlarge,4.0,0.0,10.0,16.0
2,m4.2xlarge,4.0,8.0,10.0,8.0
3,r4.large,4.0,0.0,10.0,32.0
4,r4.xlarge,4.0,8.0,4.0,16.0
5,r4.2xlarge,4.0,4.0,8.0,8.0
6,c4.large,0.0,0.0,0.0,0.0
7,c4.xlarge,8.0,0.0,20.0,0.0
8,c4.2xlarge,4.0,0.0,10.0,0.0


In [310]:
# Look up the 'kmeans_spark_huge' runs recorded for m4.2xlarge at a scaleout of 2.
costs.loc[
    (costs['mtype'] == 'm4.2xlarge')
    & (costs['scaleout'] == 2.0)
    & (costs['job'] == 'kmeans_spark_huge')
]

Unnamed: 0,duration,input_size,cost,cost_norm,scaleout,mtype,cores,total_cores,memory,total_memory,...,agg_disk.%util,agg_network.rxpck/s,agg_network.txpck/s,agg_network.rxkB/s,agg_network.txkB/s,agg_network.rxcmp/s,agg_network.txcmp/s,agg_network.rxmcst/s,agg_network.%ifutil,bread


In [311]:
def get_cost_norm(row, cost_df, col_name, job):
    """Look up the ``cost_norm`` recorded for the configuration described by ``row``.

    A record of ``cost_df`` matches when its ``mtype`` equals ``row['VM_type']``,
    its ``scaleout`` equals ``row[col_name]`` and its ``job`` equals ``job``.

    Returns the ``cost_norm`` of the first matching record. When no record
    matches, ``row[col_name]`` is handed back unchanged -- note that this
    fallback is the value that was compared against ``scaleout``, not a cost.
    """
    same_vm = cost_df['mtype'] == row['VM_type']
    same_scaleout = cost_df['scaleout'] == row[col_name]
    same_job = cost_df['job'] == job
    candidates = cost_df[same_vm & same_scaleout & same_job]

    # Nothing recorded for this combination: keep the caller's value as it is.
    if candidates.empty:
        return row[col_name]
    return candidates.iloc[0]['cost_norm']
    
# Each column is looked up against the job it belongs to; get_cost_norm swaps the
# stored value for the matching cost_norm and leaves it untouched when nothing matches.
job_by_column = {
    'Kmeans_Blink': 'kmeans_spark_huge',
    'PageRank_Blink': 'pagerank_spark_huge',
    'Kmeans_Crispy': 'kmeans_spark_huge',
    'PageRank_Crispy': 'pagerank_spark_huge',
}
for column, job_name in job_by_column.items():
    all[column] = all.apply(get_cost_norm, axis=1, args=(costs, column, job_name))

all

Unnamed: 0,VM_type,Kmeans_Blink,PageRank_Blink,Kmeans_Crispy,PageRank_Crispy
0,m4.large,3.182346,0.0,3.408816,2.009014
1,m4.xlarge,3.174576,0.0,3.1209,1.669936
2,m4.2xlarge,3.149639,1.554029,1.175389,1.554029
3,r4.large,4.110828,0.0,4.417155,2.343946
4,r4.xlarge,4.211399,1.390374,4.211399,2.164199
5,r4.2xlarge,1.348891,1.354557,1.293583,1.808106
6,c4.large,0.0,0.0,0.0,0.0
7,c4.xlarge,4.433197,0.0,4.90443,0.0
8,c4.2xlarge,6.866215,0.0,5.59756,0.0


In [312]:
# Turn the 0 placeholders into NaN so that they are skipped when averaging,
# then print the mean of every numeric column.
all = all.replace(to_replace=0, value=np.nan)
mean_row = all.mean(skipna=True, numeric_only=True)
print(mean_row)


Kmeans_Blink       3.809636
PageRank_Blink     1.432987
Kmeans_Crispy      3.516154
PageRank_Crispy    1.924871
dtype: float64


In [313]:
# Turn the column means into a one-row frame labelled 'mean' in the VM_type column.
mean_df = pd.DataFrame(mean_row).T
mean_df['VM_type'] = 'mean'

# Drop a 'mean' row left over from an earlier execution of this cell, so that
# re-running it does not stack up duplicate summary rows.
all = all[all['VM_type'] != 'mean']

# Append the summary row and show missing entries as '-'.
# The '-' placeholders are strings, so `all` is a presentation table from here on
# and should not be used for further numeric work.
all = pd.concat([all, mean_df], ignore_index=True)
all = all.replace(np.nan, '-')
all

Unnamed: 0,VM_type,Kmeans_Blink,PageRank_Blink,Kmeans_Crispy,PageRank_Crispy
0,m4.large,3.182346,-,3.408816,2.009014
1,m4.xlarge,3.174576,-,3.1209,1.669936
2,m4.2xlarge,3.149639,1.554029,1.175389,1.554029
3,r4.large,4.110828,-,4.417155,2.343946
4,r4.xlarge,4.211399,1.390374,4.211399,2.164199
5,r4.2xlarge,1.348891,1.354557,1.293583,1.808106
6,c4.large,-,-,-,-
7,c4.xlarge,4.433197,-,4.90443,-
8,c4.2xlarge,6.866215,-,5.59756,-
9,mean,3.809636,1.432987,3.516154,1.924871


In [314]:
# compare the mean of the selected cluster costs for kmeans and pagerank separately
# mention that, due to possibly inaccurate memory extrapolation on the full dataset, Blink predicts memory requirements that are too high compared to Crispy, so it did not select the configurations with lower memory, while Crispy was able to