In [16]:
import pandas as pd

# PageRank
# Size of cached RDDs = 123723212149 bytes (retrieved from sizePredictor).
# Converted to MB so it is comparable with the per-VM memory columns below
# (~117,992 MB, i.e. ~123.7 GB in decimal units).
size_cached_RDD = 123723212149/1024/1024

# RAM per VM type is listed in GB and converted to MB further down.
# The two None-filled entries only reserve column positions so the final frame
# keeps the order VM_type, RAM, Unified Memory, Storage Memory, Available RAM.
data = {
    'VM_type': ['m4.l', 'm4.xl', 'm4.2xl',
                'r4.l', 'r4.xl', 'r4.2xl',
                'c4.l', 'c4.xl', 'c4.2xl'],
    'RAM (GB)': [8, 16, 32,
                 15.25, 30.5, 61,
                 3.75, 7.5, 15],
    'Unified Memory(M)': [None] * 9,  # Placeholder for Unified Memory
    'Storage Memory(R)': [None] * 9   # Placeholder for Storage Memory
}

df = pd.DataFrame(data)
os_overhead = 2048  # MB subtracted from every VM's RAM before sizing
df['Available RAM (MB)'] = df['RAM (GB)']*1024 - os_overhead
# 300 MB is subtracted and 60% of the remainder is treated as unified memory;
# half of the unified memory is counted as storage memory.
# NOTE(review): these look like Spark's defaults (300 MB reserved memory,
# spark.memory.fraction=0.6, spark.memory.storageFraction=0.5) -- confirm.
df['Unified Memory(M)'] = (df['Available RAM (MB)'] - 300) * 0.6
df['Storage Memory(R)'] = df['Unified Memory(M)'] * 0.5

def recommend_cluster_size(row, size_cached_RDD, max_machines=12):
    """Return the smallest cluster size whose pooled unified memory holds the cached RDDs.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            per-machine 'Unified Memory(M)' value.
        size_cached_RDD: Total size of the cached RDDs, expressed in the same
            unit as 'Unified Memory(M)'.
        max_machines: Largest cluster size to try (inclusive). Defaults to 12,
            the limit that used to be hard-coded.

    Returns:
        The first machine count in 1..max_machines for which
        ``size_cached_RDD <= unified memory * count`` holds, or None when no
        count in that range is large enough.
    """
    unified_per_machine = row['Unified Memory(M)']
    for num_machines in range(1, max_machines + 1):
        # All unified memory is assumed to be available for RDD caching.
        if size_cached_RDD <= unified_per_machine * num_machines:
            return num_machines
    return None  # No cluster size in the tested range is sufficient

# Evaluate every VM type row by row; the cached-RDD size is forwarded as the
# function's second positional argument. Rows with no fitting size become NaN.
df['Recommended_Cluster_Size'] = df.apply(
    recommend_cluster_size,
    axis=1,
    args=(size_cached_RDD,),
)

# Second name for the PageRank table so it stays reachable under its own name
df_pagerank = df
df_pagerank

Unnamed: 0,VM_type,RAM (MB),Unified Memory(M),Storage Memory(R),Available RAM (MB),Recommended_Cluster_Size
0,m4.l,8.0,3506.4,1753.2,6144.0,
1,m4.xl,16.0,8421.6,4210.8,14336.0,
2,m4.2xl,32.0,18252.0,9126.0,30720.0,7.0
3,r4.l,15.25,7960.8,3980.4,13568.0,
4,r4.xl,30.5,17330.4,8665.2,29184.0,7.0
5,r4.2xl,61.0,36069.6,18034.8,60416.0,4.0
6,c4.l,3.75,895.2,447.6,1792.0,
7,c4.xl,7.5,3199.2,1599.6,5632.0,
8,c4.2xl,15.0,7807.2,3903.6,13312.0,


In [17]:
import pandas as pd

# K-means
# Inputs measured in bytes:
#   execution memory = 85983559    (retrieved from executionMemoryParser)
#   cached RDD size  = 21978205388 (retrieved from sizeParser)
# Both are converted to MB to match the per-VM memory columns.
execution_memory = 85983559/1024/1024   # ~82 MB (0.08 GB)
size_cached_RDD = 21978205388/1024/1024   # ~20,960 MB (21 GB)
# Unified Memory (21.1 GB) + 0.3 GB + 21.1/0.5 + 2 = 65.6 GB (total required memory)

# Three instance families, three sizes each; the None columns reserve the
# positions of the memory columns that are filled in below.
data = {
    'VM_type': [f'{family}.{size}'
                for family in ('m4', 'r4', 'c4')
                for size in ('l', 'xl', '2xl')],
    'RAM (GB)': [8, 16, 32,
                 15.25, 30.5, 61,
                 3.75, 7.5, 15],
    'Unified Memory(M)': [None] * 9,  # filled in below
    'Storage Memory(R)': [None] * 9   # filled in below
}

df = pd.DataFrame(data)
os_overhead = 2048
df['Available RAM (MB)'] = df['RAM (GB)'].mul(1024).sub(os_overhead)
df['Unified Memory(M)'] = df['Available RAM (MB)'].sub(300).mul(0.6)
df['Storage Memory(R)'] = df['Unified Memory(M)'].mul(0.5)

# One column per candidate cluster size (1..12): the per-machine execution
# memory is the job's execution memory split evenly across the machines,
# capped at what is left of unified memory after the storage share.
for num_machines in range(1, 13):
    column_name = f'Machine_Memory_Exec_{num_machines}_Machines'
    df[column_name] = (
        df['Unified Memory(M)'] - df['Storage Memory(R)']
    ).clip(upper=execution_memory/num_machines)

def recommend_cluster_size(row, size_cached_RDD):
    """Find the smallest cluster (1 to 12 machines) able to hold the cached RDDs.

    For a candidate count n, the per-machine capacity is 'Unified Memory(M)'
    minus the precomputed 'Machine_Memory_Exec_{n}_Machines' value, and the
    cluster capacity is that figure multiplied by n.

    Returns the first n whose cluster capacity is at least size_cached_RDD,
    or None when no count from 1 to 12 is sufficient.
    """
    def capacity(count):
        # Memory left for caching on one machine, scaled to the whole cluster
        per_machine = row['Unified Memory(M)'] - row[f'Machine_Memory_Exec_{count}_Machines']
        return per_machine * count

    candidates = (count for count in range(1, 13) if size_cached_RDD <= capacity(count))
    return next(candidates, None)

# Evaluate every VM type row by row; the cached-RDD size is forwarded as the
# function's second positional argument. Rows with no fitting size become NaN.
df['Recommended_Cluster_Size'] = df.apply(
    recommend_cluster_size,
    axis=1,
    args=(size_cached_RDD,),
)

df


Unnamed: 0,VM_type,RAM (GB),Unified Memory(M),Storage Memory(R),Available RAM (MB),Machine_Memory_Exec_1_Machines,Machine_Memory_Exec_2_Machines,Machine_Memory_Exec_3_Machines,Machine_Memory_Exec_4_Machines,Machine_Memory_Exec_5_Machines,Machine_Memory_Exec_6_Machines,Machine_Memory_Exec_7_Machines,Machine_Memory_Exec_8_Machines,Machine_Memory_Exec_9_Machines,Machine_Memory_Exec_10_Machines,Machine_Memory_Exec_11_Machines,Machine_Memory_Exec_12_Machines,Recommended_Cluster_Size
0,m4.l,8.0,3506.4,1753.2,6144.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,7.0
1,m4.xl,16.0,8421.6,4210.8,14336.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,3.0
2,m4.2xl,32.0,18252.0,9126.0,30720.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,2.0
3,r4.l,15.25,7960.8,3980.4,13568.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,3.0
4,r4.xl,30.5,17330.4,8665.2,29184.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,2.0
5,r4.2xl,61.0,36069.6,18034.8,60416.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,1.0
6,c4.l,3.75,895.2,447.6,1792.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,
7,c4.xl,7.5,3199.2,1599.6,5632.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,7.0
8,c4.2xl,15.0,7807.2,3903.6,13312.0,82.000312,41.000156,27.333437,20.500078,16.400062,13.666719,11.71433,10.250039,9.111146,8.200031,7.454574,6.833359,3.0


In [18]:
# Summarize results for kmeans and pageRank
# NOTE: relies on `df` still holding the K-means table and `df_pagerank`
# holding the PageRank table when this cell runs.

# Keep only the VM type (first column) and the recommendation (last column)
df_selected_cluster_sizes_kmeans = df.iloc[:, [0, -1]]
df_selected_cluster_sizes_pagerank = df_pagerank.iloc[:, [0, -1]]

# Both inputs share the column name 'Recommended_Cluster_Size', so the merge
# suffixes them with _x (left, K-means) and _y (right, PageRank); rename them
# to self-explanatory labels.
df_selected_cluster_size = pd.merge(
    df_selected_cluster_sizes_kmeans,
    df_selected_cluster_sizes_pagerank,
    on='VM_type',
    how='inner',
).rename(columns={
    'Recommended_Cluster_Size_x': 'Cluster_Size_Kmeans',
    'Recommended_Cluster_Size_y': 'Cluster_Size_PageRank',
})

# Cast BOTH size columns to nullable Int64 so machine counts are exported as
# whole numbers ("7", not "7.0") while VM types without a feasible cluster stay
# empty. Going through float first turns any None into NaN before the cast.
df_selected_cluster_size = df_selected_cluster_size.astype({
    'Cluster_Size_Kmeans': float,
    'Cluster_Size_PageRank': float,
}).astype({
    'Cluster_Size_Kmeans': 'Int64',
    'Cluster_Size_PageRank': 'Int64',
})

df_selected_cluster_size.to_csv('blink-eval.csv', index=False)

# Last expression of the cell, so the summary table is actually rendered
df_selected_cluster_size
