#### Importing Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
import joblib
from yellowbrick.cluster import KElbowVisualizer
import json
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Replace every missing value of the dataframe X with the constant 1
def fill_NaN(X):
  """Fill the NaN entries of ``X`` with 1, one column at a time.

  The frame is modified in place and is also returned, so callers may
  either use the return value or keep using the object they passed in.
  """
  for column_label in X.columns:
    filled_column = X.loc[:, column_label].fillna(1)
    X.loc[:, column_label] = filled_column
  return X

In [None]:
# Draw the elbow-method chart (distortion as a function of k)
def plot_elbow(K, distortions):
    """Plot ``distortions`` against the candidate cluster counts ``K``.

    Args:
        K: Sequence of tested numbers of clusters (x axis).
        distortions: One distortion value per entry of ``K`` (y axis).
    """
    figure_size = (16, 8)
    # Blue line with 'x' markers.
    line_format = 'bx-'

    plt.figure(figsize=figure_size)
    plt.plot(K, distortions, line_format)

    # Axis labels and title are user-facing strings and stay in Italian.
    plt.xlabel('k')
    plt.ylabel('Distorsione')
    plt.title('Il Metodo del Gomito')

    plt.show()

#### Retrieving dataset

In [None]:
# Folder scanned for the input CSV files
PATH_TO_CSV = 'output/generator'

# Function names matched against the 'name' column when plotting
FUNCTION_NAMES = ['figlet', 'shasum', 'nmap', 'env', 'curl', 'eat-memory']
# Same list without its last entry ('eat-memory')
FUNCTION_NAMES_REDUCED = FUNCTION_NAMES[:-1]

# Column layout: full, without 'name', and without 'name' and 'overloaded'
COLUMNS_NAMES = ['name', 'rate', 'success_rate', 'cpu_usage', 'ram_usage', 'power', 'overloaded', 'medium_latency']
COLUMNS_REDUCED = COLUMNS_NAMES[1:]
COLUMNS_SCALED = [column for column in COLUMNS_REDUCED if column != 'overloaded']

# Alphabetically sorted names of every .csv file found in the output folder
file_csv = sorted(entry for entry in os.listdir(PATH_TO_CSV) if entry.endswith('.csv'))

#### Get all data from files

In [None]:
# Read every CSV file once and collect the resulting DataFrames.
# Concatenating a single time at the end avoids re-copying the data
# accumulated so far on every iteration of the loop.
csv_frames = []

for file in file_csv:
    file_path = os.path.join(PATH_TO_CSV, file)
    # Read CSV file
    df_file = pd.read_csv(file_path)
    csv_frames.append(df_file)

# pd.concat raises on an empty list, so fall back to an empty DataFrame
# when no CSV file was found.
df = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

# Metrics only: drop the 'name' column (errors='ignore' tolerates its absence)
df_no_name = df.drop(columns='name', errors='ignore')

##### Preprocessing

In [None]:
# Rescale every metric column with a MinMaxScaler fitted on the whole dataset
scaler = MinMaxScaler()
df_no_name_scaled = scaler.fit_transform(df_no_name)

# Wrap the scaled array in a DataFrame that reuses the original column labels,
# then re-attach the (unscaled) 'name' column taken from the raw DataFrame
df_one_function_scaled = pd.DataFrame(
    df_no_name_scaled,
    columns=df_no_name.columns,
).assign(name=df['name'])

##### Function to create plots

In [None]:
def print_line_plots(df):
    """Draw one line plot per metric column, with one curve per function.

    For every column of ``df`` other than 'rate' and 'name', a new figure
    is created that plots the column against 'rate' once for each entry of
    the module-level ``FUNCTION_NAMES`` list.

    Args:
        df: DataFrame holding at least the 'rate' and 'name' columns plus
            the metric columns to plot.
    """
    # Unit shown on the y axis for each known metric. The power column is
    # listed under both 'power' and 'power_usage' because this file refers
    # to it with both names. Columns without an entry get an empty unit
    # instead of raising KeyError.
    units = {
        'success_rate': '%',
        'cpu_usage': '%',
        'ram_usage': 'MB',
        'power': 'μW',
        'power_usage': 'μW',
        'overloaded': '',
        'medium_latency': 's',
    }

    for col in df.columns:
        if col in ('rate', 'name'):
            continue

        plt.figure(figsize=(10, 6))
        for name in FUNCTION_NAMES:
            # Work on a copy so the unit conversion never touches the caller's data.
            temp_df = df[df['name'] == name].copy()
            # Convert to the unit displayed on the axis.
            # NOTE(review): the divisors suggest latency is stored in ns and
            # RAM in bytes, and success_rate as a 0-1 fraction — confirm.
            if col == 'medium_latency':
                temp_df[col] = temp_df[col] / 1e9
            elif col == 'ram_usage':
                temp_df[col] = temp_df[col] / 1e6
            elif col == 'success_rate':
                temp_df[col] = temp_df[col] * 100
            plt.plot(temp_df['rate'], temp_df[col], label=name, marker='o', linestyle='-')

        unit = units.get(col, '')

        plt.title(f'{col.capitalize()} per differenti funzioni')
        plt.xlabel('Rate (req/s)')
        plt.ylabel(f'{col.capitalize()} ({unit})')

        # 'overloaded' is drawn with only the two ticks 0 and 1.
        if col == 'overloaded':
            plt.yticks([0, 1])

        plt.legend()
        plt.grid(True)
        plt.show()


In [None]:
# Plot the raw (unscaled) measurements loaded into df, one figure per metric column
print_line_plots(df)

#### Create vectors

In [None]:
# Split the scaled DataFrame into one DataFrame per value of the "name" column
data_per_function = {}
for name, group in df_one_function_scaled.groupby('name'):
    data_per_function[name] = group

# Replace each function's DataFrame with one row per rate that holds the
# average, maximum and minimum of every metric measured at that rate
for function_name, function_data in data_per_function.items():

    # Remove the "name" column, which is constant within each DataFrame
    function_data = function_data.drop(columns=['name'])

    # Add the prefix "avg_" to each column
    function_data.columns = 'avg_' + function_data.columns

    # This file names the power column both 'power' (COLUMNS_NAMES) and
    # 'power_usage'; use whichever one is actually present in the data
    # instead of failing with KeyError on the other spelling.
    if 'avg_power_usage' in function_data.columns:
        power_source = 'avg_power_usage'
    else:
        power_source = 'avg_power'

    # (suffix of the new column, source column), in the order the new
    # columns are appended; this order defines the layout of the vectors
    # built from these DataFrames.
    extreme_columns = [
        ('success_rate', 'avg_success_rate'),
        ('cpu_usage', 'avg_cpu_usage'),
        ('ram_usage', 'avg_ram_usage'),
        ('power', power_source),
        ('overloaded', 'avg_overloaded'),
        ('medium_latency', 'avg_medium_latency'),
    ]

    # Group the rows once by rate and derive the three statistics from it
    grouped_by_rate = function_data.groupby('avg_rate')

    # Average of the values for each group of rows sharing the same rate
    compressed_data = grouped_by_rate.mean().reset_index()

    # Maximum values for each group of rows sharing the same rate
    max_data = grouped_by_rate.max().reset_index()

    # Minimum values for each group of rows sharing the same rate
    min_data = grouped_by_rate.min().reset_index()

    # Append all "max_" columns first, then all "min_" columns
    for prefix, extreme_data in (('max_', max_data), ('min_', min_data)):
        for target_suffix, source_column in extreme_columns:
            compressed_data[prefix + target_suffix] = extreme_data[source_column]

    # The rate itself is not part of the feature vector
    compressed_data = compressed_data.drop(columns='avg_rate')

    data_per_function[function_name] = compressed_data

# Build one flat feature vector per function

# Flatten each function's DataFrame row by row into a single 1-D NumPy
# array, keyed by function name
vectors_per_function = {
    fn_name: fn_frame.to_numpy().flatten()
    for fn_name, fn_frame in data_per_function.items()
}

# Transform each NumPy array into a pandas DataFrame

# A DataFrame built from a 1-D vector has a single column; transposing it
# yields a single row per function
dataframes_per_function = {
    fn_name: pd.DataFrame(flat_vector).T
    for fn_name, flat_vector in vectors_per_function.items()
}

# Create the functions DataFrame

# Function names and their one-row DataFrames, both in dictionary order so
# that row i of the combined DataFrame belongs to function_order[i]
function_order = list(dataframes_per_function)
all_dataframes = list(dataframes_per_function.values())

# Stack the one-row DataFrames into a single DataFrame with a fresh index
combined_dataframe = pd.concat(all_dataframes, ignore_index=True)

# Replace any NaN left in the combined DataFrame with 1
# (fill_NaN works in place and returns the same DataFrame)

combined_dataframe = fill_NaN(combined_dataframe)

# Pairwise cosine similarity between the vectors representing the functions;
# columns are labelled with the function names, rows keep the default index
cosine_matrix = cosine_similarity(combined_dataframe)
df_cosine = pd.DataFrame(cosine_matrix, columns=function_order)

# Create a PCA model with default settings
pca = PCA()

# Fit the PCA on the combined function vectors and project them
pca_results = pca.fit_transform(combined_dataframe)

# Number of principal components actually produced. It is read from the
# projection itself: PCA keeps min(n_samples, n_features) components, which
# is not necessarily the number of rows.
pca_dimensions = pca_results.shape[1]

# Create a new DataFrame to store the PCA results (columns PC1, PC2, ...)
pca_df = pd.DataFrame(data=pca_results, columns=[f'PC{i}' for i in range(1, pca_dimensions + 1)])

# Get the explained variance ratios
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

# Explained variance ratio of each component
plt.plot(range(1, pca_dimensions + 1),
             explained_variance, marker='o', linestyle='--')
plt.xlabel('Numero di Componenti')
plt.ylabel('Varianza')
plt.show()

# Count the leading components needed for the cumulative explained variance
# to exceed 0.95; the bound on index prevents running past the last component
summed_variance = 0
index = 0
while index < len(explained_variance) and summed_variance <= 0.95:
    summed_variance += explained_variance[index]
    index += 1
# Label of the last retained component
after_column = "PC" + str(index)
# Keep the first `index` components by position. Label-based truncate() needs
# a lexicographically sorted axis and fails once there are 10 or more
# components ("PC10" sorts before "PC2"). copy() keeps later column
# assignments on pca_df independent of the full projection.
pca_df = pca_df.iloc[:, :index].copy()
# Save PCA model
# NOTE(review): the full model (all components) is saved, so code loading it
# presumably has to keep only the first `index` components itself — confirm.
joblib.dump(pca, 'pca_model.joblib')

# Pairwise cosine similarity between the reduced PCA vectors of the functions
df_cosine_pca=pd.DataFrame(cosine_similarity(pca_df), columns=function_order)

#### Search for the best K parameter

In [None]:
# Per-k results of the search
distortions = []
inertias = []
# Kept for compatibility; nothing in this cell fills it
silhouette_scores = {}

# Try k = 1..6, capped at the number of samples because KMeans cannot fit
# more clusters than there are rows
K = range(1, min(6, pca_df.shape[0]) + 1)
for k in K:
    model_kmeans_k = KMeans(n_clusters=k)
    model_kmeans_k.fit(pca_df)
    # Distortion: mean Euclidean distance of each sample to its nearest centroid
    nearest_center_distance = np.min(
        cdist(pca_df, model_kmeans_k.cluster_centers_, 'euclidean'), axis=1
    )
    distortions.append(nearest_center_distance.mean())
    inertias.append(model_kmeans_k.inertia_)

plot_elbow(K, distortions)

#### Apply K-Means and show the tabular results 

In [None]:
# Fit the final K-Means model with 3 clusters on the retained PCA components
kmeans_model = KMeans(n_clusters=3).fit(pca_df)

# Annotate every row with its function name and the cluster it was assigned to
pca_df['function_name'] = function_order
pca_df['cluster'] = kmeans_model.labels_
print(pca_df)

# Persist the fitted clustering model
joblib.dump(kmeans_model, "profiling-model.joblib")

# Collect the function names of each cluster into a list
functions_by_cluster = pca_df.groupby('cluster')['function_name'].apply(list)
grouped_df = functions_by_cluster.reset_index()
result_dict = {
    cluster_id: function_names
    for cluster_id, function_names in zip(grouped_df['cluster'], grouped_df['function_name'])
}

# Write the cluster -> function names mapping as JSON
file_path = 'group_list.json'
with open(file_path, 'w') as output_file:
    json.dump(result_dict, output_file)