In [1]:
from scipy.stats import ttest_rel
import pandas as pd
import math

In [2]:
def calculate_FTTSE(energy, time, is_training=True):
    """
    Compute the FTTSE (Function of Time To Solution and Energy) score of one run.

    Training runs are scored as ``energy * time**2``; inference runs are
    scored as ``energy * exp(time)``.

    :param energy: Energy consumption in Joules
    :param time: Time to solution in seconds
    :param is_training: Truthy selects the training formula, falsy the inference one
    :return: FTTSE value
    """
    # Only the time-dependent factor differs between the two phases.
    time_factor = time**2 if is_training else math.exp(time)
    return energy * time_factor

def calculate_G_score_per_trial(data, number_of_inferences, alpha):
    """
    Calculate the G-score for each model within each trial and store it in ``data``.

    For every trial the training and inference FTTSE values are combined as
    ``(1 - FTTSE_Training) + (1 - FTTSE_Inference) * number_of_inferences``,
    min-max normalised across the models of that trial, and blended with the
    F1 score: ``G = alpha * F1 + (1 - alpha) * FTTSE_norm``.

    The function modifies ``data`` in place by writing a ``G_Score`` column and
    returns nothing. When a model has several rows in one trial, the values of
    its first row are used for all of them.

    :param data: DataFrame with the columns ``Trial``, ``Model``, ``F1_Score``,
        ``FTTSE_Training`` and ``FTTSE_Inference``
    :param number_of_inferences: Number of inferences to consider
    :param alpha: Performance weight (0 <= alpha <= 1)
    """
    beta = 1 - alpha  # weight of the energy term; constant for the whole call

    for trial_number in data['Trial'].unique():
        in_trial = data['Trial'] == trial_number
        trial_data = data[in_trial].copy()  # Work with a copy for safe modification

        # Calculate combined and normalized FTTSE
        trial_data['FTTSE_combined'] = (
            (1 - trial_data['FTTSE_Training'])
            + (1 - trial_data['FTTSE_Inference']) * number_of_inferences
        )
        fttse_min = trial_data['FTTSE_combined'].min()
        fttse_max = trial_data['FTTSE_combined'].max()
        fttse_range = fttse_max - fttse_min

        if fttse_range == 0:
            # A single model, or models that all tie, would give 0/0 = NaN.
            # The energy term cannot separate the models in this case, so it
            # is set to a constant 0.0 and the G-score depends on F1 alone.
            trial_data['FTTSE_norm'] = 0.0
        else:
            trial_data['FTTSE_norm'] = (trial_data['FTTSE_combined'] - fttse_min) / fttse_range

        # Calculate G_Score for each model within the trial
        for model in trial_data['Model'].unique():
            model_data = trial_data[trial_data['Model'] == model]
            f1 = model_data['F1_Score'].values[0]
            energy = model_data['FTTSE_norm'].values[0]
            g_score = alpha * f1 + beta * energy

            # Assign the G_Score value to the original data
            data.loc[in_trial & (data['Model'] == model), 'G_Score'] = g_score


def sign_test(x, y):
    """
    Return a capped, doubled p-value from a paired t-test of ``x`` against ``y``.

    The p-value reported by ``scipy.stats.ttest_rel`` is limited to at most
    0.5 and then multiplied by two, so the result never exceeds 1.0.

    NOTE(review): despite the name, no sign test is run here, and ``ttest_rel``
    is two-sided by default, so the doubling looks like a deliberately
    conservative choice -- confirm intent.

    :param x: First sample of paired observations
    :param y: Second sample, paired element-wise with ``x``
    :return: ``2 * min(p, 0.5)`` where ``p`` is the paired t-test p-value
    """
    paired_p_value = ttest_rel(x, y).pvalue
    capped_p_value = min(paired_p_value, 0.5)
    return 2 * capped_p_value

In [3]:
# Parameters of the G-score: how many inferences one training run is
# weighed against, and the weight given to the F1 score.
NUMBER_OF_INFERENCES = 1000
ALPHA = 0.5


def _training_fttse(row):
    """Return the training-phase FTTSE of a single measurement row."""
    return calculate_FTTSE(
        row['Training_Energy_Joules'],
        row['Training_Time_Seconds'],
        is_training=True,
    )


def _inference_fttse(row):
    """Return the inference-phase FTTSE of a single measurement row."""
    return calculate_FTTSE(
        row['Inference_Energy_Joules'],
        row['Inference_Time_Seconds'],
        is_training=False,
    )


df = pd.read_csv("data/presentation.csv")

# Derive both FTTSE columns row by row, then let the G-score helper add
# its 'G_Score' column to the same frame.
df['FTTSE_Training'] = df.apply(_training_fttse, axis=1)
df['FTTSE_Inference'] = df.apply(_inference_fttse, axis=1)
calculate_G_score_per_trial(df, NUMBER_OF_INFERENCES, ALPHA)

# Distinct model names, in order of first appearance; used by the pairwise tables below.
models = df['Model'].unique()

print(df)

   Trial                Model  Training_Time_Seconds  Training_Energy_Joules  \
0      1        Decision Tree                 260.61                 4085.22   
1      1          Naive Bayes                   8.26                  102.34   
2      2        Decision Tree                 262.69                 4258.20   
3      2          Naive Bayes                   8.19                  103.03   
4      3        Decision Tree                 261.91                 4219.94   
5      3          Naive Bayes                   8.26                  105.70   
6      1  Logistic Regression                  24.36                  772.60   
7      2  Logistic Regression                  23.97                  728.63   
8      3  Logistic Regression                  24.25                  678.65   

   Inference_Time_Seconds  Inference_Energy_Joules  F1_Score  FTTSE_Training  \
0                    0.04                     0.69    0.5544    2.774582e+08   
1                    1.25              

In [4]:
"""
RQ: How can we define a measure for evaluating machine learning models that takes into account both energy and performance?

H0: There is no difference in G-scores among the classification models (and is therefore not a useful metric).
H1: There is a difference in G-scores among the classification models (and is therefore a useful metric).
"""

# Threshold applied to each p-value to decide the verdict stored in the table.
SIGNIFICANCE_LEVEL = 0.05

# The paired test matches observations by position, so order the rows by
# trial before slicing per model. A stable sort keeps already-ordered data
# exactly as it was.
df_by_trial = df.sort_values('Trial', kind='stable')

# Model-by-model table; every cell holds a (p_value, verdict) tuple.
sign_test_results_g_score = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            # A model cannot differ from itself; use the same
            # (p_value, verdict) shape as every other cell.
            sign_test_results_g_score.loc[model1, model2] = (1.0, "Fail to reject")
        else:
            data1 = df_by_trial.loc[df_by_trial['Model'] == model1, 'G_Score'].values
            data2 = df_by_trial.loc[df_by_trial['Model'] == model2, 'G_Score'].values

            p_value = sign_test(data1, data2)
            accept = "Fail to reject" if p_value > SIGNIFICANCE_LEVEL else "Reject"

            sign_test_results_g_score.loc[model1, model2] = (p_value, accept)

print(sign_test_results_g_score)

                                       Decision Tree  \
Decision Tree            (0, 0, 1.0, Fail to reject)   
Naive Bayes          (4.901613574096284e-06, Reject)   
Logistic Regression  (4.159125753224131e-06, Reject)   

                                          Naive Bayes  \
Decision Tree         (4.901613574096284e-06, Reject)   
Naive Bayes               (0, 0, 1.0, Fail to reject)   
Logistic Regression  (1.4580099291385602e-05, Reject)   

                                  Logistic Regression  
Decision Tree         (4.159125753224131e-06, Reject)  
Naive Bayes          (1.4580099291385602e-05, Reject)  
Logistic Regression       (0, 0, 1.0, Fail to reject)  


In [5]:
"""
RQ1: What is the energy efficiency, measured in FTTSE, of selected classification models (Naïve Bayes, Logistic Regression,
Decision Tree, and Support Vector Machine) during training when applied to Microsoft's Cats vs Dogs dataset?

H0a: There is no difference in energy efficiency among the classification models during training.
H1a: There is a difference in energy efficiency among the classification models during training.
"""

# Threshold applied to each p-value to decide the verdict stored in the table.
SIGNIFICANCE_LEVEL = 0.05

# The paired test matches observations by position, so order the rows by
# trial before slicing per model. A stable sort keeps already-ordered data
# exactly as it was.
df_by_trial = df.sort_values('Trial', kind='stable')

# Model-by-model table; every cell holds a (p_value, verdict) tuple.
sign_test_results_training = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            # A model cannot differ from itself; use the same
            # (p_value, verdict) shape as every other cell.
            sign_test_results_training.loc[model1, model2] = (1.0, "Fail to reject")
        else:
            data1 = df_by_trial.loc[df_by_trial['Model'] == model1, 'FTTSE_Training'].values
            data2 = df_by_trial.loc[df_by_trial['Model'] == model2, 'FTTSE_Training'].values
            p_value = sign_test(data1, data2)
            accept = "Fail to reject" if p_value > SIGNIFICANCE_LEVEL else "Reject"

            sign_test_results_training.loc[model1, model2] = (p_value, accept)

print(sign_test_results_training)

                                       Decision Tree  \
Decision Tree            (0, 0, 1.0, Fail to reject)   
Naive Bayes          (0.0005826531803877175, Reject)   
Logistic Regression  (0.0005878274873670021, Reject)   

                                         Naive Bayes  \
Decision Tree        (0.0005826531803877175, Reject)   
Naive Bayes              (0, 0, 1.0, Fail to reject)   
Logistic Regression   (0.003499942864720424, Reject)   

                                 Logistic Regression  
Decision Tree        (0.0005878274873670021, Reject)  
Naive Bayes           (0.003499942864720424, Reject)  
Logistic Regression      (0, 0, 1.0, Fail to reject)  


In [6]:
"""
RQ2: What is the energy efficiency, measured in FTTSE, of selected classification models (Naïve Bayes, Logistic Regression,
Decision Tree, and Support Vector Machine) during inference when applied to Microsoft's Cats vs Dogs dataset?

H0b: There is no difference in energy efficiency among the classification models during inference.
H1b: There is a difference in energy efficiency among the classification models during inference.
"""

# Threshold applied to each p-value to decide the verdict stored in the table.
SIGNIFICANCE_LEVEL = 0.05

# The paired test matches observations by position, so order the rows by
# trial before slicing per model. A stable sort keeps already-ordered data
# exactly as it was.
df_by_trial = df.sort_values('Trial', kind='stable')

# Model-by-model table; every cell holds a (p_value, verdict) tuple.
sign_test_results_inference = pd.DataFrame(index=models, columns=models)
for model1 in models:
    for model2 in models:
        if model1 == model2:
            # A model cannot differ from itself; use the same
            # (p_value, verdict) shape as every other cell.
            sign_test_results_inference.loc[model1, model2] = (1.0, "Fail to reject")
        else:
            data1 = df_by_trial.loc[df_by_trial['Model'] == model1, 'FTTSE_Inference'].values
            data2 = df_by_trial.loc[df_by_trial['Model'] == model2, 'FTTSE_Inference'].values
            p_value = sign_test(data1, data2)
            accept = "Fail to reject" if p_value > SIGNIFICANCE_LEVEL else "Reject"

            sign_test_results_inference.loc[model1, model2] = (p_value, accept)

print(sign_test_results_inference)

                                       Decision Tree  \
Decision Tree            (0, 0, 1.0, Fail to reject)   
Naive Bayes          (0.0005781806676582863, Reject)   
Logistic Regression   (0.013792409162570152, Reject)   

                                         Naive Bayes  \
Decision Tree        (0.0005781806676582863, Reject)   
Naive Bayes              (0, 0, 1.0, Fail to reject)   
Logistic Regression   (0.000554567004837298, Reject)   

                                Logistic Regression  
Decision Tree        (0.013792409162570152, Reject)  
Naive Bayes          (0.000554567004837298, Reject)  
Logistic Regression     (0, 0, 1.0, Fail to reject)  
