In [1]:
from scipy.stats import ttest_rel
import pandas as pd
import math

In [2]:
def calculate_FTTSE(energy, time, is_training=True):
    """
    Calculate the FTTSE (Function of Time To Solution and Energy) metric.
    :param energy: Energy consumption in Joules
    :param time: Time to solution in seconds
    :param is_training: Whether the calculation is for training or inference
    :return: FTTSE value
    """
    if is_training:
        return energy * (time**2)
    else:
        return energy * math.exp(time)

def calculate_G_score_per_trial(data, number_of_inferences, alpha):
    """
    Calculate the G-score for each model within each trial in the data.
    :param data: DataFrame containing the data
    :param number_of_inferences: Number of inferences to consider
    :param alpha: Performance weight (0 <= alpha <= 1)
    """
    for trial_number in data['Trial'].unique():
        trial_data = data[data['Trial'] == trial_number].copy()  # Work with a copy for safe modification

        # Calculate combined and normalized FTTSE
        trial_data['FTTSE_combined'] = (1 - trial_data['FTTSE_Training']) + (1 - trial_data['FTTSE_Inference']) * number_of_inferences
        fttse_min, fttse_max = trial_data['FTTSE_combined'].min(), trial_data['FTTSE_combined'].max()
        trial_data['FTTSE_norm'] = (trial_data['FTTSE_combined'] - fttse_min) / (fttse_max - fttse_min)

        # Calculate G_Score for each model within the trial
        for model in trial_data['Model'].unique():
            model_data = trial_data[trial_data['Model'] == model]
            f1 = model_data['F1_Score'].values[0]
            energy = model_data['FTTSE_norm'].values[0]
            beta = 1 - alpha
            g_score = alpha * f1 + beta * energy

            # Assign the G_Score value to the original data
            data.loc[(data['Trial'] == trial_number) & (data['Model'] == model), 'G_Score'] = g_score


def test(x, y):
    """
    Perform a sign test on the given data.
    :param x: First data array
    :param y: Second data array
    :return: p-value of the paired two-tailed t-test test
    """
    return ttest_rel(x, y).pvalue

In [9]:
df = pd.read_csv("data/model_performance_results_20241029_050347.csv")

df['FTTSE_Training'] = df.apply(
    lambda row: calculate_FTTSE(row['Training_Energy_Joules'], row['Training_Time_Seconds'], is_training=True), axis=1)
df['FTTSE_Inference'] = df.apply(
    lambda row: calculate_FTTSE(row['Inference_Energy_Joules'], row['Inference_Time_Seconds'], is_training=False), axis=1)
calculate_G_score_per_trial(df, 1e7, 0.5)

models = df['Model'].unique()

print(df)

    Trial                   Model  Training_Time_Seconds  \
0       1           Decision Tree                 259.62   
1       1     Logistic Regression                  23.04   
2       1             Naive Bayes                   8.28   
3       1  Support Vector Machine                   8.47   
4       2           Decision Tree                 258.72   
..    ...                     ...                    ...   
75     19  Support Vector Machine                   8.79   
76     20           Decision Tree                 259.32   
77     20     Logistic Regression                  22.43   
78     20             Naive Bayes                   8.28   
79     20  Support Vector Machine                   8.51   

    Training_Energy_Joules  Inference_Time_Seconds  Inference_Energy_Joules  \
0                  3479.99                    0.04                     0.61   
1                   735.11                    0.05                     1.38   
2                    89.44                

In [10]:
"""
RQ: How can we define a measure for evaluating machine learning models that takes into account both energy and performance?

H0: There is no difference in G-scores among the classification models (and is therefore not a useful metric).
H1: There is a difference in G-scores among the classification models (and is therefore a useful metric).
"""

test_results_g_score = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_g_score.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:
            data1 = df[df['Model'] == model1]['G_Score'].values
            data2 = df[df['Model'] == model2]['G_Score'].values
            
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"

            test_results_g_score.loc[model1, model2] = (p_value, accept)

print(test_results_g_score)

                                      Decision Tree  \
Decision Tree           (0, 0, 1.0, Fail to reject)   
Logistic Regression                   (nan, Reject)   
Naive Bayes                           (nan, Reject)   
Support Vector Machine                (nan, Reject)   

                                     Logistic Regression  \
Decision Tree                              (nan, Reject)   
Logistic Regression          (0, 0, 1.0, Fail to reject)   
Naive Bayes             (1.7538849843773226e-24, Reject)   
Support Vector Machine   (3.537434816805542e-46, Reject)   

                                             Naive Bayes  \
Decision Tree                              (nan, Reject)   
Logistic Regression     (1.7538849843773226e-24, Reject)   
Naive Bayes                  (0, 0, 1.0, Fail to reject)   
Support Vector Machine   (2.831034345513787e-34, Reject)   

                                 Support Vector Machine  
Decision Tree                             (nan, Reject)  
Logist

In [11]:
"""
RQ1: What is the energy efficiency, measured in FTTSE, of selected classification models (Naïve Bayes, Logistic Regression,
Decision Tree, and Support Vector Machine) during training when applied to the Microsoft's Cats vs Dogs dataset?

H0a: There is no difference in energy efficiency among the classification models during training.
H1a: There is a difference in energy efficiency among the classification models during training.
"""

test_results_training = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_training.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:
            data1 = df[df['Model'] == model1]['FTTSE_Training'].values
            data2 = df[df['Model'] == model2]['FTTSE_Training'].values
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"
            
            test_results_training.loc[model1, model2] = (p_value, accept)
            
            # print(f"{model1} vs {model2}: {n_pos} positive, {n_neg} negative, p-value = {p_value}, {accept} the null hypothesis H0a")
            
print(test_results_training)

                                      Decision Tree  \
Decision Tree           (0, 0, 1.0, Fail to reject)   
Logistic Regression                   (nan, Reject)   
Naive Bayes                           (nan, Reject)   
Support Vector Machine                (nan, Reject)   

                                     Logistic Regression  \
Decision Tree                              (nan, Reject)   
Logistic Regression          (0, 0, 1.0, Fail to reject)   
Naive Bayes             (2.3729092572275148e-27, Reject)   
Support Vector Machine   (2.834156300905853e-27, Reject)   

                                             Naive Bayes  \
Decision Tree                              (nan, Reject)   
Logistic Regression     (2.3729092572275148e-27, Reject)   
Naive Bayes                  (0, 0, 1.0, Fail to reject)   
Support Vector Machine   (4.622624226444514e-15, Reject)   

                                 Support Vector Machine  
Decision Tree                             (nan, Reject)  
Logist

In [12]:
"""
RQ2: What is the energy efficiency, measured in FTTSE, of selected classification models (Naïve Bayes, Logistic Regression,
Decision Tree, and Support Vector Machine) during inference when applied to the Microsoft's Cats vs Dogs dataset?

H0b: There is no difference in energy efficiency among the classification models during inference.
H1b: There is a difference in energy efficiency among the classification models during inference.
"""

test_results_inference = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_inference.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:
            data1 = df[df['Model'] == model1]['FTTSE_Inference'].values
            data2 = df[df['Model'] == model2]['FTTSE_Inference'].values
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"

            test_results_inference.loc[model1, model2] = (p_value, accept)

            # print(f"{model1} vs {model2}: {n_pos} positive, {n_neg} negative, p-value = {p_value}, {accept} the null hypothesis H0b")

print(test_results_inference)

                                           Decision Tree  \
Decision Tree                (0, 0, 1.0, Fail to reject)   
Logistic Regression      (1.934443613508342e-11, Reject)   
Naive Bayes             (1.9775360246491267e-24, Reject)   
Support Vector Machine   (6.245241657541452e-38, Reject)   

                                    Logistic Regression  \
Decision Tree           (1.934443613508342e-11, Reject)   
Logistic Regression         (0, 0, 1.0, Fail to reject)   
Naive Bayes             (3.757496533247398e-24, Reject)   
Support Vector Machine  (4.488581059348191e-38, Reject)   

                                             Naive Bayes  \
Decision Tree           (1.9775360246491267e-24, Reject)   
Logistic Regression      (3.757496533247398e-24, Reject)   
Naive Bayes                  (0, 0, 1.0, Fail to reject)   
Support Vector Machine   (4.903270229677079e-33, Reject)   

                                 Support Vector Machine  
Decision Tree           (6.245241657541452e-

In [13]:
test_results_f1_score = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            test_results_f1_score.loc[model1, model2] = (0, 0, 1.0, "Fail to reject")
        else:
            data1 = df[df['Model'] == model1]['F1_Score'].values
            data2 = df[df['Model'] == model2]['F1_Score'].values
            p_value = test(data1, data2)
            accept = "Fail to reject" if p_value > 0.05 else "Reject"

            test_results_f1_score.loc[model1, model2] = (p_value, accept)

print(test_results_f1_score)

                                           Decision Tree  \
Decision Tree                (0, 0, 1.0, Fail to reject)   
Logistic Regression     (1.3245636530748174e-15, Reject)   
Naive Bayes              (9.425076072631499e-13, Reject)   
Support Vector Machine   (1.923974461908341e-16, Reject)   

                                     Logistic Regression  \
Decision Tree           (1.3245636530748174e-15, Reject)   
Logistic Regression          (0, 0, 1.0, Fail to reject)   
Naive Bayes                                (0.0, Reject)   
Support Vector Machine   (5.234625450512132e-10, Reject)   

                                             Naive Bayes  \
Decision Tree            (9.425076072631499e-13, Reject)   
Logistic Regression                        (0.0, Reject)   
Naive Bayes                  (0, 0, 1.0, Fail to reject)   
Support Vector Machine  (3.6187726418714596e-14, Reject)   

                                  Support Vector Machine  
Decision Tree            (1.9239744619

  res = hypotest_fun_out(*samples, **kwds)
