# Study the percentages and their relationship with the metrics 


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")
import os 
os.environ['MPLCONFIGDIR'] = '/myhome'
from utils.plots import plot_label_distribution_datasets
from utils.helpers_config import set_up_config, save_config
from utils.helpers_testing import get_test_stats_from_model, load_test_dataset, save_test_stats
from utils.plots_test import plots_all_figs_at_test
import matplotlib.pyplot as plt
from utils.test.load_model import get_all_runs, get_loaded_model_and_criterion, get_model_and_model_path
from utils.helpers_mu import get_mus_from_config
from utils.results_analysis.extract_video import download_images, add_files_to_images, save_video
import argparse


In [3]:
local_dataroot = os.path.join(os.environ.get("HOME", os.environ.get("USERPROFILE")),"elects_data")
entity, project = "aurenore", "MasterThesis"
sweep = "piecewise_lin_regr_with_wrong_pred_penalties_4"


load the runs of the sweep 

In [4]:
print("Local dataroot: ", local_dataroot)

# ## Download the model from wandb 
runs_df, runs = get_all_runs(entity, project)

# get the runs from the sweep 
df = runs_df[runs_df.sweep == sweep]
print("number of runs: ", len(df))  
df.head()

Local dataroot:  C:\Users\anyam\elects_data
number of runs:  115


Unnamed: 0,summary,config,name,sweep,start_date
0,"{'elects_earliness': 0.6484803387303276, 'time...","{'mu': 150, 'mus': [171, 102, 103, 144, 150, 1...",amber-sweep-115,piecewise_lin_regr_with_wrong_pred_penalties_4,2024-06-20T11:41:47
1,"{'loss': {'testloss': 5.931120872497559, 'trai...","{'mu': 150, 'mus': [102, 98, 25, 102, 150, 150...",crisp-sweep-114,piecewise_lin_regr_with_wrong_pred_penalties_4,2024-06-20T11:34:59
2,"{'_wandb': {'runtime': 3933}, 'alphas': [0.5, ...","{'mu': 150, 'mus': [173, 113, 105, 142, 150, 1...",pious-sweep-113,piecewise_lin_regr_with_wrong_pred_penalties_4,2024-06-20T11:28:07
3,"{'_runtime': 3200.3417558670044, 'precision': ...","{'mu': 150, 'mus': [102, 96, 26, 104, 150, 150...",dark-sweep-112,piecewise_lin_regr_with_wrong_pred_penalties_4,2024-06-20T11:23:55
4,"{'_wandb': {'runtime': 3985}, 'alphas': [0.899...","{'mu': 150, 'mus': [102, 92, 25, 104, 150, 150...",rose-sweep-111,piecewise_lin_regr_with_wrong_pred_penalties_4,2024-06-20T11:15:36


The loss is written as:

$$ \alpha_1 C_m - \alpha_2 C_d + \alpha_3 C_{penalty} + \alpha_4 C_{lr} $$
where 
- $C_m$ is the misclassification cost 
- $C_d$ is the earliness reward
- $C_{penalty}$ is the early wrong prediction penalty
- $C_{lr}$ is the piecewise linear regression cost
- $\alpha_i$ are the weights for $i=1,2,3,4$

$\alpha_1$ is fixed at 1. at the beginning of the training and decays linearly through the training. The other alphas are given by a their corresponding percentage $p_{\alpha_i}$ and the current value of $\alpha_1$ as follows:
$$ \alpha_i = p_{\alpha_i} (1 - \alpha_1)$$

That way the sum of the weights is always 1.

In [18]:
df.loc[:, "percentages_alphas"] = df.loc[:, "config"].apply(lambda x: x["percentages_other_alphas"])
df.loc[:, "percentage_alpha_1"] = df.loc[:, "percentages_alphas"].apply(lambda x: x[0])
df.loc[:, "percentage_alpha_2"] = df.loc[:, "percentages_alphas"].apply(lambda x: x[1])
df.loc[:, "percentage_alpha_3"] = df.loc[:, "percentages_alphas"].apply(lambda x: x[2])
df.loc[:, "alphas"] = df.loc[:, "summary"].apply(lambda x: x["alphas"])

df.loc[:, "earliness"] = df.loc[:, "summary"].apply(lambda x: x["elects_earliness"])
df.loc[:, "accuracy"] = df.loc[:, "summary"].apply(lambda x: x["accuracy"])
df.loc[:, "harmonic_mean"] = df.loc[:, "summary"].apply(lambda x: x["harmonic_mean"])

In [16]:
df["alphas"]

0      [0.699999988079071, 0.07762829214334488, 0.037...
1      [0.699999988079071, 0.1269819587469101, 0.0211...
2      [0.5, 0.05641905218362808, 0.3937248885631562,...
3      [0.9237499833106996, 0.011222507804632189, 0.0...
4      [0.8999999761581421, 0.01832260750234127, 0.00...
                             ...                        
110    [0.699999988079071, 0.09069057554006577, 0.091...
111    [0.9277777671813964, 0.0018344582058489325, 0....
112    [0.7699999809265137, 0.1678219735622406, 0.039...
113    [0.7551020383834839, 0.10898122936487198, 0.04...
114    [0.9144444465637208, 0.0437154658138752, 0.027...
Name: alphas, Length: 115, dtype: object

# Correlations between the percentages and the metrics
metrics : 
- accuracy 
- earliness
- harmonic mean of accuracy and earliness

In [19]:
import numpy as np 

def compute_correlation(metric_1, metric_2):
    """ Compute the correlation between two metrics """
    return np.corrcoef(metric_1, metric_2)[0, 1]

compute_correlation(df["percentage_alpha_1"], df["earliness"])

0.25158199574749956

In [24]:
df_correlations = df[["percentage_alpha_1", "percentage_alpha_2", "percentage_alpha_3", "earliness", "accuracy", "harmonic_mean"]].corr()
df_correlations.loc[["percentage_alpha_1", "percentage_alpha_2", "percentage_alpha_3"],["earliness", "accuracy", "harmonic_mean"]]

Unnamed: 0,earliness,accuracy,harmonic_mean
percentage_alpha_1,0.251582,-0.228036,-0.159191
percentage_alpha_2,-0.235076,0.25757,0.229097
percentage_alpha_3,-0.002188,-0.038647,-0.071988


with alphas: 

In [31]:
df_alphas = df[["alphas", "earliness", "accuracy", "harmonic_mean"]]
df_alphas.loc[:, "alpha_1"] = df_alphas.loc[:, "alphas"].apply(lambda x: x[0])
df_alphas.loc[:, "alpha_2"] = df_alphas.loc[:, "alphas"].apply(lambda x: x[1])
df_alphas.loc[:, "alpha_3"] = df_alphas.loc[:, "alphas"].apply(lambda x: x[2])
df_alphas.loc[:, "alpha_4"] = df_alphas.loc[:, "alphas"].apply(lambda x: x[3])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alphas.loc[:, "alpha_1"] = df_alphas.loc[:, "alphas"].apply(lambda x: x[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alphas.loc[:, "alpha_2"] = df_alphas.loc[:, "alphas"].apply(lambda x: x[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alphas.loc[:, "alpha_3"] = df_alphas.loc[:, 

In [32]:
df_corr_alphas = df_alphas[["alpha_1", "alpha_2", "alpha_3", "alpha_4", "earliness", "accuracy", "harmonic_mean"]].corr()
df_corr_alphas.loc[["alpha_1", "alpha_2", "alpha_3", "alpha_4"],["earliness", "accuracy", "harmonic_mean"]]

Unnamed: 0,earliness,accuracy,harmonic_mean
alpha_1,0.144029,-0.265138,-0.212594
alpha_2,0.112238,0.000178,0.073185
alpha_3,-0.239713,0.335556,0.286007
alpha_4,-0.120868,0.162248,0.067077
