In [1]:
from scipy.stats import wilcoxon, shapiro
import pandas as pd
import math
import random

In [2]:
def calculate_FTTSE(energy, time, is_training=True):
    """
    Calculate the FTTSE (Function of Time To Solution and Energy) metric.
    :param energy: Energy consumption in Joules
    :param time: Time to solution in seconds
    :param is_training: Whether the calculation is for training or inference
    :return: FTTSE value
    """
    if is_training:
        return energy * (time**2)
    else:
        return energy * math.exp(time)

def calculate_G_score_per_trial(data, number_of_inferences, alpha):
    """
    Calculate the G-score for each model within each trial in the data.
    :param data: DataFrame containing the data
    :param number_of_inferences: Number of inferences to consider
    :param alpha: Performance weight (0 <= alpha <= 1)
    """
    for trial_number in data['Trial'].unique():
        trial_data = data[data['Trial'] == trial_number].copy()  # Work with a copy for safe modification

        # Calculate combined and normalized FTTSE
        trial_data['FTTSE_combined'] = (1 - trial_data['FTTSE_Training']) + (1 - trial_data['FTTSE_Inference']) * number_of_inferences
        fttse_min, fttse_max = trial_data['FTTSE_combined'].min(), trial_data['FTTSE_combined'].max()
        trial_data['FTTSE_norm'] = (trial_data['FTTSE_combined'] - fttse_min) / (fttse_max - fttse_min)

        # Calculate G_Score for each model within the trial
        for model in trial_data['Model'].unique():
            model_data = trial_data[trial_data['Model'] == model]
            f1 = model_data['F1_Score'].values[0]
            energy = model_data['FTTSE_norm'].values[0]
            beta = 1 - alpha
            g_score = alpha * f1 + beta * energy

            # Assign the G_Score value to the original data
            data.loc[(data['Trial'] == trial_number) & (data['Model'] == model), 'G_Score'] = g_score


def test(x, y):
    """
    Perform a Wilcoxon signed-rank test on the given data.
    :param x: First data array
    :param y: Second data array
    :return: p-value
    """
    return wilcoxon(x, y, nan_policy='omit').pvalue

In [3]:
df = pd.read_csv("data/model_performance_results_20241029_050347.csv")

df['FTTSE_Training'] = df.apply(
    lambda row: calculate_FTTSE(row['Training_Energy_Joules'], row['Training_Time_Seconds'], is_training=True), axis=1)
df['FTTSE_Inference'] = df.apply(
    lambda row: calculate_FTTSE(row['Inference_Energy_Joules'], row['Inference_Time_Seconds'], is_training=False), axis=1)

nr_of_inferences = random.uniform(1e3, 1e9)
alpha = random.random()
calculate_G_score_per_trial(df, nr_of_inferences, alpha)

models = df['Model'].unique()

print(df)

    Trial                   Model  Training_Time_Seconds  \
0       1           Decision Tree                 259.62   
1       1     Logistic Regression                  23.04   
2       1             Naive Bayes                   8.28   
3       1  Support Vector Machine                   8.47   
4       2           Decision Tree                 258.72   
..    ...                     ...                    ...   
75     19  Support Vector Machine                   8.79   
76     20           Decision Tree                 259.32   
77     20     Logistic Regression                  22.43   
78     20             Naive Bayes                   8.28   
79     20  Support Vector Machine                   8.51   

    Training_Energy_Joules  Inference_Time_Seconds  Inference_Energy_Joules  \
0                  3479.99                    0.04                     0.61   
1                   735.11                    0.05                     1.38   
2                    89.44                

In [4]:
# Check if each column in the dataframe is normally distributed
for column in ['FTTSE_Training', 'FTTSE_Inference', 'G_Score', 'F1_Score']:
    for model in models:
        data = df[df['Model'] == model][column].values
        stat, p = shapiro(data, nan_policy='omit')
        normally_distributed = True if p > 0.05 else False
        print(f"{column}, {model}: stat={stat}, p={p}, normally_distributed={normally_distributed}")

FTTSE_Training, Decision Tree: stat=0.9248715046927829, p=0.13925935884419222, normally_distributed=True
FTTSE_Training, Logistic Regression: stat=0.9525510425096398, p=0.4074801551812648, normally_distributed=True
FTTSE_Training, Naive Bayes: stat=0.8395279929638169, p=0.0035765026841463013, normally_distributed=False
FTTSE_Training, Support Vector Machine: stat=0.8993012878362476, p=0.040014221090994714, normally_distributed=False
FTTSE_Inference, Decision Tree: stat=0.881804841639356, p=0.019068250647961457, normally_distributed=False
FTTSE_Inference, Logistic Regression: stat=0.9134244009520304, p=0.07409209328647123, normally_distributed=True
FTTSE_Inference, Naive Bayes: stat=0.8995917255412967, p=0.0405185502723816, normally_distributed=False
FTTSE_Inference, Support Vector Machine: stat=0.9274985330697426, p=0.13820184762937382, normally_distributed=True
G_Score, Decision Tree: stat=0.9893410175012145, p=0.997736071927819, normally_distributed=True
G_Score, Logistic Regression:

  res = hypotest_fun_out(*samples, **kwds)


In [5]:
"""
RQ: How can we define a measure for evaluating machine learning models that takes into account both energy and performance?

H0: There is no difference in G-scores among the classification models (and is therefore not a useful metric).
H1: There is a difference in G-scores among the classification models (and is therefore a useful metric).
"""

test_results_g_score = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_g_score.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:            
            data1 = df[df['Model'] == model1]['G_Score'].values
            data2 = df[df['Model'] == model2]['G_Score'].values
            
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"

            test_results_g_score.loc[model1, model2] = (p_value, accept)

print(test_results_g_score)

                                       Decision Tree  \
Decision Tree            (0, 0, 1.0, Fail to reject)   
Logistic Regression     (3.814697265625e-06, Reject)   
Naive Bayes             (3.814697265625e-06, Reject)   
Support Vector Machine  (3.814697265625e-06, Reject)   

                                  Logistic Regression  \
Decision Tree            (3.814697265625e-06, Reject)   
Logistic Regression       (0, 0, 1.0, Fail to reject)   
Naive Bayes             (1.9073486328125e-06, Reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                                          Naive Bayes  \
Decision Tree            (3.814697265625e-06, Reject)   
Logistic Regression     (1.9073486328125e-06, Reject)   
Naive Bayes               (0, 0, 1.0, Fail to reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                               Support Vector Machine  
Decision Tree            (3.814697265625e-06, Reject)  
Logistic Regression     (1.90734863

In [6]:
"""
RQ1: What is the energy efficiency, measured in FTTSE, of selected classification models (Naïve Bayes, Logistic Regression,
Decision Tree, and Support Vector Machine) during training when applied to the Microsoft's Cats vs Dogs dataset?

H0a: There is no difference in energy efficiency among the classification models during training.
H1a: There is a difference in energy efficiency among the classification models during training.
"""

test_results_training = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_training.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:
            data1 = df[df['Model'] == model1]['FTTSE_Training'].values
            data2 = df[df['Model'] == model2]['FTTSE_Training'].values
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"
            
            test_results_training.loc[model1, model2] = (p_value, accept)
            
            # print(f"{model1} vs {model2}: {n_pos} positive, {n_neg} negative, p-value = {p_value}, {accept} the null hypothesis H0a")
            
print(test_results_training)

                                       Decision Tree  \
Decision Tree            (0, 0, 1.0, Fail to reject)   
Logistic Regression     (3.814697265625e-06, Reject)   
Naive Bayes             (3.814697265625e-06, Reject)   
Support Vector Machine  (3.814697265625e-06, Reject)   

                                  Logistic Regression  \
Decision Tree            (3.814697265625e-06, Reject)   
Logistic Regression       (0, 0, 1.0, Fail to reject)   
Naive Bayes             (1.9073486328125e-06, Reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                                          Naive Bayes  \
Decision Tree            (3.814697265625e-06, Reject)   
Logistic Regression     (1.9073486328125e-06, Reject)   
Naive Bayes               (0, 0, 1.0, Fail to reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                               Support Vector Machine  
Decision Tree            (3.814697265625e-06, Reject)  
Logistic Regression     (1.90734863

In [7]:
"""
RQ2: What is the energy efficiency, measured in FTTSE, of selected classification models (Naïve Bayes, Logistic Regression,
Decision Tree, and Support Vector Machine) during inference when applied to the Microsoft's Cats vs Dogs dataset?

H0b: There is no difference in energy efficiency among the classification models during inference.
H1b: There is a difference in energy efficiency among the classification models during inference.
"""

test_results_inference = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_inference.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:
            data1 = df[df['Model'] == model1]['FTTSE_Inference'].values
            data2 = df[df['Model'] == model2]['FTTSE_Inference'].values
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"

            test_results_inference.loc[model1, model2] = (p_value, accept)

            # print(f"{model1} vs {model2}: {n_pos} positive, {n_neg} negative, p-value = {p_value}, {accept} the null hypothesis H0b")

print(test_results_inference)

                                        Decision Tree  \
Decision Tree             (0, 0, 1.0, Fail to reject)   
Logistic Regression     (1.9073486328125e-06, Reject)   
Naive Bayes             (1.9073486328125e-06, Reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                                  Logistic Regression  \
Decision Tree           (1.9073486328125e-06, Reject)   
Logistic Regression       (0, 0, 1.0, Fail to reject)   
Naive Bayes             (1.9073486328125e-06, Reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                                          Naive Bayes  \
Decision Tree           (1.9073486328125e-06, Reject)   
Logistic Regression     (1.9073486328125e-06, Reject)   
Naive Bayes               (0, 0, 1.0, Fail to reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                               Support Vector Machine  
Decision Tree           (1.9073486328125e-06, Reject)  
Logistic Regression     (1.907

In [8]:
test_results_f1_score = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_f1_score.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:
            data1 = df[df['Model'] == model1]['F1_Score'].values
            data2 = df[df['Model'] == model2]['F1_Score'].values
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"

            test_results_f1_score.loc[model1, model2] = (p_value, accept)

print(test_results_f1_score)

                                        Decision Tree  \
Decision Tree             (0, 0, 1.0, Fail to reject)   
Logistic Regression     (1.9073486328125e-06, Reject)   
Naive Bayes             (1.9073486328125e-06, Reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                                  Logistic Regression  \
Decision Tree           (1.9073486328125e-06, Reject)   
Logistic Regression       (0, 0, 1.0, Fail to reject)   
Naive Bayes             (1.9073486328125e-06, Reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                                          Naive Bayes  \
Decision Tree           (1.9073486328125e-06, Reject)   
Logistic Regression     (1.9073486328125e-06, Reject)   
Naive Bayes               (0, 0, 1.0, Fail to reject)   
Support Vector Machine  (1.9073486328125e-06, Reject)   

                               Support Vector Machine  
Decision Tree           (1.9073486328125e-06, Reject)  
Logistic Regression     (1.907