In [52]:
#|default_exp metrics

#|export
import sys
sys.path.append('..')
from abc import ABC, abstractmethod
import torch
import numpy as np
import pandas as pd
from tsai.basics import *
from swdf.losses import wMAELoss, MSELoss, WeightedLoss, ClassificationLoss
from sklearn.metrics import precision_recall_curve, auc
from optuna.study import StudyDirection




# Metrics
---
## Index
#### [1 Loss Metrics](#loss-metrics)
  - [1.1 Regression Metrics](#regression-metrics)
    - [1.1.1 Solar Indices FSMY 10.7 Metrics](#solar-indices-fsmy-107-metrics)
    - [1.1.2 Geomagnetic Indices DST and AP Metrics](#geomagnetic-indices-dst-and-ap-metrics)
  - [1.2 Classification Metrics](#classification-metrics)
    - [1.2.1 Solar Indices FSMY 10.7 Metrics](#solar-indices-fsmy-107-metrics)
    - [1.2.2 Geomagnetic Indices DST and AP Metrics](#geomagnetic-indices-dst-and-ap-metrics)
  - [1.3 Loss Metrics Retrieval](#loss-metrics-retrieval)
#### [2 Validation Metrics](#validation-metrics)
  - [2.1 Outliers Evaluation Metrics](#outliers-evaluation-metrics)
    - [2.1.1 F1 Score Metric](#f1-score-metric)
    - [2.1.2 Area Under the Precision-Recall Curve (AUPRC)](#area-under-the-precision-recall-curve-auprc)
    - [2.1.3 Kurtois and Skewness Difference (KSD)](#kurtois-and-skewness-difference-ksd)
  - [2.2 Association Metrics](#association-metrics)
    - [2.2.1 Pearson Linear Correlation Coefficient ($R$)](#pearson-linear-correlation-coefficient-r)
    - [2.2.2 Coefficient of Determination Metric ($R^2$)](#coefficient-of-determination-metric-r2)
  - [2.3 Accuracy Metrics](#accuracy-metrics)
    - [2.3.1 Symmetric Mean Absolute Percentage Error Metric (sMAPE)](#symmetric-mean-absolute-percentage-error-metric-smape)
    - [2.3.1 Median Symmetric Accuracy Metric (MSA)](#median-symmetric-accuracy-metric-msa)
  - [2.4 Bias Metrics](#bias-metrics)
    - [2.4.1 Symmetric Signed Percentage Bias Metric (SSPB)](#symmetric-signed-percentage-bias-metric-sspb)
  - [2.5 Validation Metrics Handler](#validation-metrics-handler)
#### [3 Tests](#tests)

---

In this notebook, we have implemented metrics to be used within the `TSForecaster()` to provide relevant insights into the training process. Additionally, we have compiled a selection of metrics that can be used to compare the performance of each model in the forecasting task. All the classes follow the pattern of the `Metrics()` class, which is implemented below.


In [53]:
#|export

class Metrics(ABC):
    def __init__(self):
        super().__init__()

    @abstractmethod
    def get_metrics(self) -> list:
        return NotImplementedError

## Loss Metrics

We have implemented a class that generates relevant metrics to better evaluate the performance of the weighted loss function. This class calculates how much of the loss is associated with each activity level, providing **deeper insights into the model's behavior**. The number of methods is elevated because each condition requires its own function to be coded, as dynamically generating these functions can lead to errors.

### Regression Metrics

Here we have implemented metrics to determine the portion of the loss represented by each of the categories in the weighted losses.


In [54]:
#|export

class RegressiveMetrics(Metrics):
    def __init__(self, loss_func):
        super().__init__()
        self.loss_func = loss_func

    def _apply_weighted_loss_by_level(self, input, target, weight_idx):
        loss_copy = deepcopy(self.loss_func)
        
        for idx1 in range(len(loss_copy.weights)):
            if is_iter(loss_copy.weights[0]):
                for idx2 in range(len(loss_copy.weights[idx1])):
                    if (idx1 != weight_idx[0]) | (idx2 != weight_idx[1]):
                        loss_copy.weights[idx1][idx2] = 0
            else:
                if idx1 != weight_idx[1]:
                    loss_copy.weights[idx1] = 0
                
        return loss_copy(input, target)

    @abstractmethod
    def get_metrics(self) -> list:
        return NotImplementedError

#### Solar Indices FSMY 10.7 Metrics

Here we calculate how each solar activity category contributes to the overall loss.


In [55]:
#|export

class SOLFMYMetrics(RegressiveMetrics):
    def __init__(self, loss_func):
        super().__init__(loss_func)
        self.loss_func = loss_func



    # Metrics
    def Loss_Low(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [0,0])
    
    def Loss_Moderate(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [0,1])
    
    def Loss_Elevated(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [0,2])
    
    def Loss_High(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [0,3])
    
    
    # Metrics retrieval function
    def get_metrics(self) -> list:
        return [
                self.Loss_Low, 
                self.Loss_Moderate, 
                self.Loss_Elevated, 
                self.Loss_High
            ]

#### Geomagnetic Indices DST and AP Metrics

Below, we compile the performance of each category within the geomagnetic indices studied.


In [56]:
#|export

class GEODSTAPMetrics(RegressiveMetrics):
    def __init__(self, loss_func, indices:str='geodstap'):
        super().__init__(loss_func)
        self.indices = indices
        
        
    # Metrics
    def Loss_Low(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [0,0])
    
    def Loss_Medium(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [0,1])
    
    def Loss_Active(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [0,2])
    
    def Loss_G0(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [1,0])
    
    def Loss_G1(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [1,1])
    
    def Loss_G2(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [1,2])
        
    def Loss_G3(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [1,3])
    
    def Loss_G4(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [1,4])
    
    def Loss_G5(self, input, target):
        return self._apply_weighted_loss_by_level(input, target, [1,5])
    

    # Metrics retrieval function
    def get_metrics(self) -> list:
        if self.indices == 'geodst':
            return [
                self.Loss_G0, 
                self.Loss_G1, 
                self.Loss_G2, 
                self.Loss_G3, 
                self.Loss_G4, 
                self.Loss_G5
            ]
        
        elif self.indices == 'geoap':
            return [
                    self.Loss_Low, 
                    self.Loss_Medium, 
                    self.Loss_Active
                ]
        
        return [
                self.Loss_Low, 
                self.Loss_Medium, 
                self.Loss_Active,
                self.Loss_G0, 
                self.Loss_G1, 
                self.Loss_G2, 
                self.Loss_G3, 
                self.Loss_G4, 
                self.Loss_G5
            ]

### Classification Metrics

These metrics are used to assess the average misclassifications detected by the `ClassificationLoss()`, providing insights into how the model is improving when the solar or geomagnetic categories of the predictions impact the loss function.


In [57]:
#|export

class ClassificationMetrics(Metrics):
    def __init__(self, loss_func):
        super().__init__()
        self.loss_func = loss_func



    def _compute_misclassifications(self, predictions, targets):
        # Use the weighted loss tensor from the provided loss function
        classifier = self.loss_func.weighted_loss_tensor
        
        # Get the true and predicted labels using the classifier
        true_labels = classifier(targets)
        predicted_labels = classifier(predictions)

        # Misclassifications are those where the predicted label does not match the true label
        misclassified_labels = (true_labels != predicted_labels).int() * predicted_labels

        return misclassified_labels

    def _count_misclassifications_by_position(self, predictions, targets, row, col):
        # Calculate misclassifications for a specific (row, column) pair
        misclassified_labels = self._compute_misclassifications(predictions, targets)
        
        # Extract the specific misclassification at the (row, column) position and sum across the time dimension
        if row < misclassified_labels.shape[1] and col < misclassified_labels.shape[2]:
            misclassification_count = misclassified_labels[:, row, col].sum().item()
        else:
            misclassification_count = 0  # Out of bounds, assume no misclassification
        
        return misclassification_count
  
    
    @abstractmethod
    def get_metrics(self) -> list:
        return NotImplementedError

#### Solar Indices FSMY 10.7 Metrics

This metrics calculate the average of missclasifications for each of the solar activity levels.

In [58]:
#|export

class SOLFMYClassificationMetrics(ClassificationMetrics):
    def __init__(self, loss_func):
        super().__init__(loss_func)


    # Metrics
    def Missclassifications_Low(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 0, 1)

    def Missclassifications_Moderate(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 0, 2)

    def Missclassifications_Elevated(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 0, 3)

    def Missclassifications_High(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 0, 4)


    # Metrics retrieval function
    def get_metrics(self) -> list:
        return [
                self.Missclassifications_Low,
                self.Missclassifications_Moderate, 
                self.Missclassifications_Elevated, 
                self.Missclassifications_High
            ]

#### Geomagnetic Indices DST and AP Metrics

This metrics calculate the average of missclasifications for each of DST and AP activity levels.

In [59]:
#|export

class GEODSTAPClassificationMetrics(ClassificationMetrics):
    def __init__(self, loss_func, indices:str='geodstap'):
        super().__init__(loss_func)
        self.indices = indices


    # Metrics
    def Missclassifications_Low(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 0, 1)

    def Missclassifications_Medium(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 0, 2)

    def Missclassifications_Active(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 0, 3)

    def Missclassifications_G0(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 1, 1)

    def Missclassifications_G1(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 1, 2)

    def Missclassifications_G2(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 1, 3)

    def Missclassifications_G3(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 1, 4)

    def Missclassifications_G4(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 1, 5)

    def Missclassifications_G5(self, predictions, targets):
        return self._count_misclassifications_by_position(predictions, targets, 1, 6)

    # Metrics retrieval function
    def get_metrics(self) -> list:
        if self.indices == 'geodst':
            return [
                    self.Missclassifications_G0,
                    self.Missclassifications_G1, 
                    self.Missclassifications_G2, 
                    self.Missclassifications_G3, 
                    self.Missclassifications_G4, 
                    self.Missclassifications_G5
                ]
        
        elif self.indices == 'geoap':
            return [
                    self.Missclassifications_Low, 
                    self.Missclassifications_Medium, 
                    self.Missclassifications_Active
                ]
        
        return [
                self.Missclassifications_Low, 
                self.Missclassifications_Medium, 
                self.Missclassifications_Active, 
                self.Missclassifications_G0, 
                self.Missclassifications_G1,
                self.Missclassifications_G2,
                self.Missclassifications_G3,
                self.Missclassifications_G4,
                self.Missclassifications_G5
            ]

### Loss Metrics Retrieval

This class consolidates all the logic needed to retrieve the appropriate metrics, depending on the type of data used in the training and the specific loss function applied.


In [60]:
#|export

class LossMetrics(Metrics):
    def __init__(self, loss_func, indices:str = ''):
        super().__init__()
        self.loss_func = loss_func
        self.indices = indices

    ## Metrics Not Available
    def Metrics_Not_Available(self, input, target): return np.nan 
    
    # Metrics retrieval
    def get_metrics(self):
        if isinstance(self.loss_func, ClassificationLoss):
            if self.indices.lower() == 'solfsmy':
                return SOLFMYClassificationMetrics(self.loss_func).get_metrics()
            if self.indices.lower() in ['geodstap', 'geoap', 'geodst']:
                return GEODSTAPClassificationMetrics(self.loss_func, self.indices).get_metrics()
        
        if isinstance(self.loss_func, WeightedLoss):
            if self.indices.lower() == 'solfsmy':
                return SOLFMYMetrics(self.loss_func).get_metrics()
            
            if self.indices.lower() in ['geodstap', 'geoap', 'geodst']:
                return GEODSTAPMetrics(self.loss_func, self.indices).get_metrics()
        
        return [self.Metrics_Not_Available]

## Validation Metrics

Here we have implemented several metrics to assess various aspects of our models. These metrics are crucial for comparing model performance, enabling us to select the most appropriate loss functions and hyperparameters. This evaluation approach is especially valuable during the Optuna study to identify the best-performing models.

<details>
<summary><u>References</u></summary>
<ul>
    <li>M. Steurer, R. J. Hill, and N. Pfeifer, “Metrics for evaluating the performance of machine learning based automated valuation models,” Journal of Property Research, vol. 38, Art. no. 2, Apr. 2021. doi: <a href="https://doi.org/10.1080/09599916.2020.1858937">https://doi.org/10.1080/09599916.2020.1858937</a></li>
    <li>S. K. Morley, T. V. Brito, and D. T. Welling, “Measures of model performance based on the log accuracy ratio,” Space Weather, vol. 16, Art. no. 1, 2018. doi: <a href="https://doi.org/10.1002/2017SW001669">https://doi.org/10.1002/2017SW001669</a></li>
    <li>M. W. Liemohn, A. D. Shane, A. R. Azari, A. K. Petersen, B. M. Swiger, and A. Mukhopadhyay, “RMSE is not enough: Guidelines to robust data-model comparisons for magnetospheric physics,” Journal of Atmospheric and Solar-Terrestrial Physics, vol. 218, p. 105624, Jul. 2021. doi: <a href="https://doi.org/10.1016/j.jastp.2021.105624">https://doi.org/10.1016/j.jastp.2021.105624</a></li>
    <li>Matthew, L. H. Hansen, H. Zhang, G. Angelotti, and J. Gallifant, “A Closer Look at AUROC and AUPRC under Class Imbalance,” arXiv.org, 2024. <a href="https://arxiv.org/abs/2401.06091v1">https://arxiv.org/abs/2401.06091v1</a> (accessed Jul. 2024)</li>
</ul>
</details>

> **Note:** While many of these functions are available in ML libraries, we have reimplemented them to better suit our specific use cases.


### Outliers Evaluation Metrics

Metrics in this category focus on outliers and are particularly helpful in studies where outlier detection is crucial. This is especially important in our situation, as outliers are associated with solar storms, a significant phenomenon for our predictions.

Firstly, we have implemented a general class for Z-score calculation, which is useful for the detection of outliers.

> **Note:** The choice of 3.5 as a threshold comes from empirical observations that in normally distributed data, approximately 99.7% of data points should fall within a Z-Score of 3. If a point has a Z-Score greater than 3.5, it is considered significantly deviant.


In [61]:
#|export

class OutlierDetectionMetrics(Metrics):
    def __init__(self, threshold=3.5):
        super().__init__()
        self.threshold = threshold

    @staticmethod
    def _modified_z_score(x):
        """
        Calculate the Modified Z-Score for each variable in the tensor.
        
        Parameters:
        tensor (torch.Tensor): Input tensor of shape (batch_size, variables, horizon)
        
        Returns:
        torch.Tensor: Modified Z-Score tensor of the same shape as input
        """
        median = torch.median(x, dim=2, keepdim=True).values
        
        mad = torch.median(torch.abs(x - median), dim=2, keepdim=True).values
        mad = torch.where(mad == 0, torch.tensor(1.0, device=x.device), mad)
        
        modified_z_scores = 0.6745 * (x - median) / mad
        
        return modified_z_scores

    def _detect_outliers(self, values):
        """
        Detect outliers based on Modified Z-Scores.
        
        Parameters:
        z_scores (torch.Tensor): Modified Z-Scores tensor
        
        Returns:
        torch.Tensor: Boolean tensor indicating outliers
        """
        z_scores = self._modified_z_score(values)
        return torch.abs(z_scores) > self.threshold
    
    def _evaluate_outlier_predicted(self, y_true, y_pred):
        """
        Evaluate the performance of outlier detection.
        
        Parameters:
        y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)
        y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true
        
        Returns:
        AttrDict: Dictionary with true/false positives, false negatives, indices of true/predicted outliers
        """    
        # Detect outliers based on the threshold
        true_outliers = self._detect_outliers(y_true)
        pred_outliers = self._detect_outliers(y_pred)
        
        # Evaluate the detection by comparing true outliers and predicted outliers
        tp = torch.sum((pred_outliers & true_outliers).float())  # True Positives
        fp = torch.sum((pred_outliers & ~true_outliers).float()) # False Positives
        fn = torch.sum((~pred_outliers & true_outliers).float()) # False Negatives
        tn = torch.sum((~pred_outliers & ~true_outliers).float()) # True Negatives

        return AttrDict({
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn,
            "true_outliers": true_outliers,
            "predicted_outliers": pred_outliers
        })
    
    
    @abstractmethod
    def get_metrics(self) -> list:
        return NotImplementedError

#### F1 Score Metric

$$
\text{F1\_Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} \qquad \text{where: }
\begin{cases}
    \text{Precision} = \frac{TP}{TP + FP} \\
    \text{Recall} = \frac{TP}{TP + FN}
\end{cases}
$$

The F1 Score is a classification metric that we will use to evaluate how well the model is forecasting outliers in the dataset. We have expanded this implementation to include additional metrics related to the F1 Score, using true positives (TP), false positives (FP), true negatives (TN), and false negatives (FN). These metrics include:
$$
\text{} \\
\text{Accuracy} = \frac{TP + TN}{TP + FP + FN + TN} \\ 
\text{} \\
\text{Negative Predictive Value (NPV)} = \frac{TN}{TN + FN} \\
\text{} \\
\text{Specificity} = \frac{TN}{TN + FP} \\
\text{} \\
\Delta_\text{Detected Outliers} = |y_{outliers} - \hat{y}_{outliers}|
$$

In [62]:
#| export

class F1ScoreMetrics(OutlierDetectionMetrics):
    def __init__(self, threshold=3.5, metrics='F1_Score'):
        super().__init__(threshold)
        self.metrics = metrics
        self.epsilon = torch.finfo(torch.float32).eps # Used to avoid division per 0

    

    # Metrics
    def Precision(self, y_true, y_pred):
        """
        <p>Calculate the precision metric, which measures the ratio of correctly predicted positive observations to the total predicted positives.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Precision score<p>
        """
        stats = self._evaluate_outlier_predicted(y_true, y_pred)

        # To avoid divide by 0
        precision = (stats.tp + self.epsilon) / ((stats.tp + stats.fp) + self.epsilon)

        return precision

    
    def Recall(self, y_true, y_pred):
        """
        <p>Calculate the recall metric, which measures the ratio of correctly predicted positive observations to all observations in the actual class.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Recall score<p>
        """
        stats = self._evaluate_outlier_predicted(y_true, y_pred)

        recall = (stats.tp + self.epsilon) / ((stats.tp + stats.fn) + self.epsilon)
    

        return recall

    
    def F1_Score(self, y_true, y_pred):
        """
        <p>Calculate the F1 score, which is the harmonic mean of precision and recall. It is used as a measure of a model’s accuracy on a dataset.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: F1 score<p>
        """
        precision = self.Precision(y_true, y_pred)
        recall = self.Recall(y_true, y_pred)

        f1_score = ((2 * (precision * recall)) + self.epsilon) / ((precision + recall) + self.epsilon)
 

        return f1_score

    
    def Accuracy_Score(self, y_true, y_pred):
        """
        <p>Calculate the accuracy score, which is the ratio of correctly predicted observations to the total observations.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Accuracy score<p>
        """
        stats = self._evaluate_outlier_predicted(y_true, y_pred)
            
        ((stats.tp + stats.tn) + self.epsilon) / ((stats.tp + stats.fp + stats.fn + stats.tn) + self.epsilon)


    
    def Specificity(self, y_true, y_pred):
        """
        <p>Calculate the specificity metric, which measures the proportion of true negatives that are correctly identified.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Specificity score<p>
        """
        stats = self._evaluate_outlier_predicted(y_true, y_pred)
        
        return (stats.tn+ self.epsilon) / ((stats.tn + stats.fp) + self.epsilon)



    def Negative_Predictive_Value(self, y_true, y_pred):
        """
        <p>Calculate the Negative Predictive Value (NPV), which measures the proportion of true negatives among all negative predictions.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Negative Predictive Value score<p>
        """
        stats = self._evaluate_outlier_predicted(y_true, y_pred)

        return (stats.tn + self.epsilon) / ((stats.tn + stats.fn) + self.epsilon)
  

    
    def Detected_Outliers_Difference (self, y_true, y_pred):
        """
        <p>Calculate the change in detected outliers (Δ Detected Outliers), representing the number of true outliers not predicted as outliers.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Count of undetected outliers<p>
        """    
        stats = self._evaluate_outlier_predicted(y_true, y_pred)
        
        return torch.sum(stats.true_outliers & ~stats.predicted_outliers)


    # Metrics retrieval function
    def get_metrics(self) -> list:
        if self.metrics == 'F1_Score':
            return [self.F1_Score]
        elif self.metrics == 'All':
            return [self.Precision, self.Recall, self.F1_Score, self.Accuracy_Score, self.Specificity, self.Negative_Predictive_Value, self.Detected_Outliers_Difference ]
        else:
            return [self.Precision, self.Recall, self.F1_Score, self.Detected_Outliers_Difference ]

#### Area Under the Precision-Recall Curve (AUPRC)

$$
\text{AUPRC} = \int_{0}^{1} \text{Precision}(\text{Recall}) \, d(\text{Recall})
$$

The Area Under the Precision-Recall Curve (AUPRC) is a metric used to evaluate the effectiveness of a model in identifying rare, important events (outliers) in imbalanced datasets. In time series forecasting, especially when detecting outliers like solar storms, AUPRC is particularly useful. Since solar storms are rare compared to normal solar activity, traditional metrics like accuracy may not provide a clear picture of the model’s performance. However, AUPRC focuses on how well the model balances identifying true solar storms (high recall) while minimizing false positives (high precision).

A higher AUPRC value suggests that the model is effective in detecting solar storms without generating too many false alarms, making it a crucial metric for evaluating the model’s ability to correctly identify these rare but significant events in your forecasting tasks.


In [63]:
#|export

class AUPRCMetric(OutlierDetectionMetrics):
    def __init__(self, threshold=3.5):
        super().__init__(threshold)


    # Metrics
    def AURPC(self, y_true, y_pred):
        """
        <p>Calculate the Area Under the Precision-Recall Curve (AUPRC), a 
        metric used to evaluate the effectiveness of a model in identifying rare, important events (outliers)</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        
        <h3>Returns:</h3>
        <p>torch.Tensor: AUPRC score<p>
        """
        pred_z_scores = self._modified_z_score(y_pred)
        
        pred_z_scores_flat = pred_z_scores.view(-1).cpu().numpy()
        true_outliers_flat = self._detect_outliers(y_true).view(-1).cpu().numpy()
        
        # Use precision_recall_curve to get precision and recall for different thresholds
        precision, recall, _ = precision_recall_curve(true_outliers_flat, pred_z_scores_flat)
        
        auprc_value = auc(recall, precision)
        
        return torch.tensor(auprc_value, device=y_true.device)
    

    # Metrics retrieval function
    def get_metrics(self) -> list:
        return [self.AURPC]

#### Kurtois and Skewness Difference (KSD)

$$
\text{Skewness}(S) = \frac{\frac{1}{n} \sum_{i=1}^{n} (X_i - \bar{X})^3}{\left(\frac{1}{n} \sum_{i=1}^{n} (X_i - \bar{X})^2\right)^{3/2}} \qquad
\text{Kurtois}(K) = \frac{\frac{1}{n} \sum_{i=1}^{n} (X_i - \bar{X})^4}{\left(\frac{1}{n} \sum_{i=1}^{n} (X_i - \bar{X})^2\right)^2}
$$


Differences in skewness (S), which measures the "tailedness" of a distribution, and kurtosis (K), which quantifies the asymmetry of a distribution, are valuable metrics for evaluating the ability of the model to detect outliers because they assess the tails of distributions. Traditional metrics often focus on central tendencies, such as the mean or median, and might not adequately capture the presence or absence of outliers.

In [64]:
#| export

class KSDifferenceMetric(Metrics):
    def __init__(self, threshold=3.5):
        super().__init__()
        self.threshold = threshold

    @staticmethod
    def skewness(x):
        # As batches are randomly generated and each variable is independent
        mean = torch.mean(x, dim=2, keepdim=True)
        std_dev = torch.std(x, dim=2, unbiased=True, keepdim=True)
        
        skewness = torch.mean(((x - mean) / std_dev) ** 3, dim=2)
        return skewness

    @staticmethod
    def kurtosis(x):
        mean = torch.mean(x, dim=2, keepdim=True)
        std_dev = torch.std(x, dim=2, unbiased=True, keepdim=True)
        
        kurtosis = torch.mean(((x - mean) / std_dev) ** 4, dim=2)
        return kurtosis
    

    # Metrics
    def Skewness_Difference(self, y_true, y_pred):
        """
        <p>Calculate the absolute difference in skewness between the actual and predicted values, which measures the asymmetry of the data distribution.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Absolute difference in skewness between y_true and y_pred<p>
        """
        true_skewness = KSDifferenceMetric.skewness(y_true)
        pred_skewness = KSDifferenceMetric.skewness(y_pred)
        
        return torch.mean(torch.abs(true_skewness - pred_skewness), dim=[0, 1])

    
    def Kurtosis_Difference(self, y_true, y_pred):
        """
        <p>Calculate the absolute difference in kurtosis between the actual and predicted values, which measures the tailedness of the data distribution.</p>

        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>

        <h3>Returns:</h3>
        <p>torch.Tensor: Absolute difference in kurtosis between y_true and y_pred<p>
        """
        true_kurtosis = KSDifferenceMetric.kurtosis(y_true)
        pred_kurtosis = KSDifferenceMetric.kurtosis(y_pred)
        
        return torch.mean(torch.abs(true_kurtosis - pred_kurtosis), dim=[0, 1])


    # Metrics retrieval function
    def get_metrics(self) -> list:
        return [self.Skewness_Difference, self.Kurtosis_Difference]

### Association Metrics
Metrics in this category quantify how well a model captures the trends in observed data. We will use two from this category:

#### Pearson Linear Correlation Coefficient ($R$)

$$
r = \frac{\sum_{i=1}^{n} (X_i - \bar{X})(Y_i - \bar{Y})}{\sqrt{\sum_{i=1}^{n} (X_i - \bar{X})^2} \cdot \sqrt{\sum_{i=1}^{n} (Y_i - \bar{Y})^2}}
$$

The most commonly used metric for correlation is the **Pearson Linear Correlation Coefficient ($R$)**. It is important to note that a good $R$-value should be both statistically significant and above an appropriate threshold for the study.

#### Coefficient of Determination Metric ($R^2$)

$$
R^2 = 1 - \frac{\text{RSS}}{\text{TSS}} \qquad \text{where: }
\begin{cases}
    \text{Residual Sum of Squares (RSS)} = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 \\
    \text{Total Sum of Squares (TSS)} = \sum_{i=1}^{n} (y_i - \bar{y})^2
\end{cases}
$$

The **Coefficient of Determination Metric ($R^2$)** quantifies the proportion of variance in the observed solar indices that is explained by the model's predictions. This measure is particularly useful because it provides a direct indication of how well the model captures the underlying patterns in the data.


In [65]:
#|export

class AssociationMetrics(Metrics):
    def __init__(self):
        super().__init__()
        self.epsilon = torch.finfo(torch.float32).eps # Used to avoid division per 0


    # Metrics
    def R_Correlation(self, y_true, y_pred):
        """
        <p>Calculate the Pearson Correlation Coefficient (R Correlation) between true and predicted values.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>
        
        <h3>Returns:</h3>
        <p>torch.Tensor: R Correlation coefficient<p>
        """
        y_true_flat = y_true.reshape(-1)
        y_pred_flat = y_pred.reshape(-1)
        
        # To be able to use torch.corrcoef, we need to stack the tensors
        stacked = torch.stack([y_true_flat, y_pred_flat])
        
        corr_matrix = torch.corrcoef(stacked)
        
        r_value = corr_matrix[0, 1]
        return r_value


    def R2_Score(self, y_true, y_pred):
        """
        <p>Calculate the R^2 score, which measures the proportion of the variance in the dependent variable that is predictable from the independent variable(s).</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>
        
        <h3>Returns:</h3>
        <p>torch.Tensor: R^2 score<p>
        """
        y_true_mean = torch.mean(y_true, dim=2, keepdim=True)
        
        # Total Sum of Squares
        ss_tot = torch.sum((y_true - y_true_mean) ** 2)
        
        # Residual Sum of Squares
        ss_res = torch.sum((y_true - y_pred) ** 2)

        
        r2 = 1 - ((ss_res + self.epsilon) / (ss_tot + self.epsilon))
        
        return r2

    
    # Metrics retrieval function
    def get_metrics(self) -> list:
        return [self.R_Correlation, self.R2_Score]
    
    

### Extra: Values scalation when they go under the log asymptote

In [66]:
#| export
def inverse_scale_values_below_threshold(tensor, threshold, lower_bound, upper_bound):
    mask = tensor < threshold

    if mask.sum() == 0:
        # If no values are below the threshold, return the original tensor
        return tensor
    
    values_to_scale = tensor[mask]
    min_orig = values_to_scale.min()
    max_orig = values_to_scale.max()
    
    if min_orig == max_orig:
        scaled_values = torch.full_like(tensor, upper_bound)
    else:
        scaled_values = upper_bound - (tensor - min_orig) * (upper_bound - lower_bound
    ) / (max_orig - min_orig)
    
    result_tensor = torch.where(mask, scaled_values, tensor)
    
    return result_tensor

### Accuracy Metrics

The metrics in this category are used to quantify the closeness of model predictions to actual observations, providing a measure of how well a model reproduces real-world data.

#### Symmetric Mean Absolute Percentage Error Metric (sMAPE)

$$
\text{sMAPE} = \frac{1}{N} \sum_{i=1}^{N} \frac{|y_i - \hat{y}_i|}{\frac{|y_i| + |\hat{y}_i|}{2}} \times 100
$$

The Symmetric Mean Absolute Percentage Error (sMAPE) offers an alternative to traditional metrics like MAE (Mean Absolute Error), which are sensitive to larger values. By scaling the absolute error to the average magnitude of the data and model pair, sMAPE mitigates the undue influence of outliers. However, as an average-based metric, it remains susceptible to extreme values within the dataset. 

Since sMAPE is not directly aligned with either of the losses used, it may not fully capture the strengths or weaknesses of models trained with these loss functions. For example, a model optimized for MAE might perform poorly on sMAPE if the predictions are close in absolute terms but relatively inaccurate for small actual values.

#### Median Symmetric Accuracy Metric (MSA)

$$
\text{MSA} = 100 \times (e^{\tilde{x}}-1) \qquad \text{where: } x = \left|\ln\left(\frac{\hat{y_i}}{y_i}\right)\right|
$$

The Median Symmetric Accuracy (MSA) provides a more robust accuracy assessment, particularly for datasets prone to outliers. It leverages the median of the log-transformed ratios between model and observed values, minimizing the impact of extreme deviations. This characteristic makes MSA particularly suitable for characterizing model performance in capturing the overall distribution and minimizing the effect of outliers inherent in observational data.


In [67]:
#| export

class AccuracyMetrics(Metrics):
    def __init__(self):
        super().__init__()
        self.epsilon = torch.finfo(torch.float32).eps # Used to avoid division per 0



    # Metrics
    def sMAPE(self, y_true, y_pred):
        """
        Calculate the Symmetric Mean Absolute Percentage Error (sMAPE).
        
        Parameters:
        y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)
        y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true
        
        Returns:
        torch.Tensor: sMAPE value
        """
        abs_error = torch.abs(y_true - y_pred)
        symetric_error = ((torch.abs(y_true) + torch.abs(y_pred)) / 2.0) 
        
        smape = torch.mean((abs_error + self.epsilon)/ (symetric_error + self.epsilon)) * 100
        return smape
    
    def MSA(self, y_true, y_pred):
        """
        Calculate the Median Symmetric Accuracy (MSA).
        
        Parameters:
        y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)
        y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true
        
        Returns:
        torch.Tensor: MSA value
        """
        q = (y_pred + self.epsilon) / (y_true + self.epsilon)

        log_ratio = torch.abs(torch.log(inverse_scale_values_below_threshold(q, 0, 0.9, self.epsilon)))

        msa = (torch.exp(torch.median(log_ratio)) - 1) * 100
        
        return msa
    

    # Metrics retrieval function
    def get_metrics(self) -> list:
        return [self.sMAPE, self.MSA]


### Bias Metrics

This category examines the systematic overestimation or underestimation of observations by a model.

#### Symmetric Signed Percentage Bias Metric (SSPB)

$$
\text{SSPB} = \text{sign}(\tilde{x}) \times \left|e^{\tilde{x}} - 1\right| \times 100 \qquad \text{where: }
\begin{cases}
    x = \ln\left(\frac{\hat{y_i}}{y_i}\right) \\
    \\
    \text{sign}(x) = 
    \begin{cases} 
        -1 & \text{if } x < 0, \\
        0 & \text{if } x = 0, \\
        1 & \text{if } x > 0.
    \end{cases}
\end{cases}
$$

The Symmetric Signed Percentage Bias (SSPB) is particularly well-suited for assessing bias in time series data characterized by high variability spanning several orders of magnitude. Unlike conventional bias metrics like Mean Error (ME), which are sensitive to extreme values, SSPB leverages the logarithm of the model-to-observation ratio. This logarithmic transformation effectively mitigates the disproportionate influence of outliers, providing a more balanced assessment of systematic overestimation or underestimation by the model.

This characteristic is especially valuable when evaluating models dealing with data such as radiation belt electron fluxes, which exhibit significant fluctuations. By using the median of these logarithmic ratios, SSPB further enhances its robustness, offering a stable measure of central tendency even in the presence of non-normal error distributions.


In [68]:
#| export

class BiasMetrics(Metrics):
    def __init__(self):
        super().__init__()
        self.epsilon = torch.finfo(torch.float32).eps # Used to avoid division per 0


    # Metrics
    def SSPB(self, y_true, y_pred):
        """
        <p>Calculate the Symmetric Signed Percentage Bias (SSPB), which measures the percentage bias with consideration for the direction of the bias.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>y_true (torch.Tensor): Actual values tensor of shape (batch_size, variables, horizon)</li>
            <li>y_pred (torch.Tensor): Predicted values tensor of the same shape as y_true</li>
        </ul>
        
        <h3>Returns:</h3>
        <p>torch.Tensor: SSPB value<p>
        """
        q = (y_pred + self.epsilon) / (y_true + self.epsilon)

        log_ratio = torch.abs(torch.log(inverse_scale_values_below_threshold(q, 0, 0.9, self.epsilon)))
        median_log_ratio = torch.median(log_ratio)

        sign = torch.sign(median_log_ratio)
        
        return sign * (torch.exp(torch.abs(median_log_ratio)) - 1) * 100



    # Metrics retrieval function
    def get_metrics(self) -> list:
        return [self.SSPB]

### Validation Metrics Handler

In [69]:
#| export

class ValidationMetricsHandler:
    """
    <p>A class to manage validation metrics for model evaluation. It allows listing available metrics, uploading requested metrics, and retrieving study directions and objective values.</p>
    
    <h3>Attributes:</h3>
    <ul>
        <li>available_metrics (list)[<i>Static</i>]: A list of available metrics provided by different metric classes.</li>
        <li>study_directions (dict)[<i>Static</i>]: A dictionary mapping metrics to their respective optimization directions (maximize or minimize).</li>
        <li>requested_metrics (dict): A dictionary storing metrics that have been requested for evaluation.</li>
    </ul>
    """
    
    available_metrics = [
        *F1ScoreMetrics(metrics='All').get_metrics(),
        *AUPRCMetric().get_metrics(),
        *KSDifferenceMetric().get_metrics(),
        *AssociationMetrics().get_metrics(),
        *AccuracyMetrics().get_metrics(),
        *BiasMetrics().get_metrics()
    ]

    study_directions = {
        'precision': StudyDirection.MAXIMIZE,                    # Higher precision is better (Range: [0, 1])
        'recall': StudyDirection.MAXIMIZE,                       # Higher recall is better (Range: [0, 1])
        'f1_score': StudyDirection.MAXIMIZE,                     # Higher F1 score is better (Range: [0, 1])
        'accuracy_score': StudyDirection.MAXIMIZE,               # Higher accuracy is better (Range: [0, 1])
        'specificity': StudyDirection.MAXIMIZE,                  # Higher specificity is better (Range: [0, 1])
        'negative_predictive_value': StudyDirection.MAXIMIZE,    # Higher NPV is better (Range: [0, 1])
        'detected_outliers_difference': StudyDirection.MINIMIZE, # Minimize the difference in detected outliers (Range: [0, ∞))
        'aurpc': StudyDirection.MAXIMIZE,                        # Higher AUPRC is better (Range: [0, 1])
        'skewness_difference': StudyDirection.MINIMIZE,          # Minimize skewness difference to target (Range: [−∞, ∞])
        'kurtosis_difference': StudyDirection.MINIMIZE,          # Minimize kurtosis difference to target (Range: [−∞, ∞])
        'r_correlation': StudyDirection.MAXIMIZE,                # Higher Pearson correlation is better (Range: [−1, 1])
        'r2_score': StudyDirection.MAXIMIZE,                     # Higher R² is better (Range: [−∞, 1])
        'smape': StudyDirection.MINIMIZE,                        # Lower SMAPE is better (Range: [0, ∞))
        'msa': StudyDirection.MAXIMIZE,                          # Higher MSA is better (Range: [0, 1])
        'sspb': StudyDirection.MINIMIZE                          # Minimize absolute SSPB (optimize for bias close to zero) (Range: [−100%, 100%])
    }


    def __init__(self, metrics:list=None, main_metric:str=''):
        self.requested_metrics = {}
        if metrics is not None:
            self.add(metrics)

        self.main_metric = main_metric.lower()


    # Visualization functions
    def list(self):
        """
        <p>Display a list of available metrics along with their descriptions in a table format.</p>
        """
        table_rows = []

        if not bool(self.requested_metrics):
            for metric in ValidationMetricsHandler.available_metrics:
                doc_html = metric.__doc__.strip().replace("\n", " ")
                table_rows.append(f"<tr><td style='text-align: left;'><strong>{metric.__name__}</strong></td><td style='text-align: left;'>{doc_html}</td></tr>")
        else:
            for metric in self.requested_metrics.values():
                doc_html = metric.__doc__.strip().replace("\n", " ")
                metric_name = metric.__name__.lower()
                if metric_name in self.requested_metrics.values():
                    table_rows.append(f"<tr><td style='text-align: left;'><strong>{metric.__name__}</strong></td><td style='text-align: left;'>{doc_html}</td></tr>")
                else:
                    table_rows.append(f"<tr><td style='text-align: left;'>{metric.__name__}</td><td style='text-align: left;'>{doc_html}</td></tr>")
        
        
        table_html = f"""
        <table>
            <thead>
                <tr>
                    <th style='text-align: left;'>Metric Name</th>
                    <th style='text-align: left;'>Description</th>
                </tr>
            </thead>
            <tbody>
                {''.join(table_rows)}
            </tbody>
        </table>
        """
        
        display(HTML(table_html))

    def _show_metrics(self, metrics:List[AvgMetric]) -> None:
        metric_column_width = '250px'  # Fixed width to accommodate 'detected_outliers_difference'
        value_column_width = '100px'   # Fixed width for values up to 12 characters
        table_rows = []
        for metric in metrics:
            value = f"{metric.value:.4f}"
            table_rows.append(f"<tr><td style='padding: 4px; text-align: left; width: {metric_column_width}; white-space: nowrap; overflow: hidden; text-overflow: ellipsis;'><strong>{metric.name}</strong></td><td style='padding: 4px; text-align: right; width: {value_column_width};'>{value}</td></tr>")
        
        table_html = f"""
        <table style='border-collapse: collapse; table-layout: fixed; width: {int(metric_column_width[:-2]) + int(value_column_width[:-2]) + 20}px;'>
            <thead>
                <tr>
                    <th style='padding: 4px; text-align: left; border: 1px solid black; width: {metric_column_width};'>Metric Name</th>
                    <th style='padding: 4px; text-align: right; border: 1px solid black; width: {value_column_width};'>Value</th>
                </tr>
            </thead>
            <tbody>
                {''.join(table_rows)}
            </tbody>
        </table>
        """
        
        display(HTML(table_html))



    # Metrics management functions
    def add(self, metrics:list):
        """
        <p>Upload a list of metrics to the factory for evaluation. The metrics are converted to lowercase for consistency.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>metrics (list): A list of metric names to be uploaded for evaluation.</li>
        </ul>
        
        <h3>Raises:</h3>
        <p>ValueError: If any metric in the provided list is not found in the available metrics.</p>
        """
        metrics = [metric.lower() for metric in metrics]

        for metric in ValidationMetricsHandler.available_metrics:
            metric_name = metric.__name__.lower()
            if metric_name in metrics:
                self.requested_metrics[metric_name] = metric
                metrics.remove(metric.__name__.lower())
        
        if len(metrics) > 0:
            raise ValueError(f"Metrics not found: {metrics}. Please use ValidationMetricsFactory.list() to see available metrics.")
        
    def remove(self, metrics:list):
        """
        <p>Remove a list of metrics from the factory that were previously uploaded for evaluation. The metrics are converted to lowercase for consistency.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>metrics (list): A list of metric names to be removed from evaluation.</li>
        </ul>
        
        <h3>Raises:</h3>
        <p>ValueError: If any metric in the provided list is not found in the requested metrics.</p>
        """
        metrics = [metric.lower() for metric in metrics]

        for metric in metrics:
            if metric in self.requested_metrics:
                self.requested_metrics.pop(metric)
            else:
                raise ValueError(f"Metric not found: {metric}. Please use ValidationMetricsFactory.get_metrics() to see requested metrics.")
        


    # Metrics utility functions
    def get_metrics(self) -> list:
        """
        <p>Retrieve the list of requested metrics for evaluation.</p>
        
        <h3>Returns:</h3>
        <p>list: A list of requested metric objects.</p>
        """
        return list(self.requested_metrics.values())

    def get_study_directions(self) -> list:    
        """
        <p>Retrieve the study directions (maximize or minimize) for the requested metrics.</p>
        
        <h3>Returns:</h3>
        <p>list: A list of study directions corresponding to the requested metrics.</p>
        """
        return [self.study_directions[metric] for metric in self.requested_metrics.keys()]
    
    def get_objective_values(self, metrics_results:List[AvgMetric], show_metrics=False) -> list:
        """
        <p>Extract the objective values from the results of the requested metrics.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>metrics_results (List[AvgMetric]): A list of metric result objects from which to extract values.</li>
        </ul>
        
        <h3>Returns:</h3>
        <p>list: A list of metric values extracted from the provided results.</p>
        """
        if show_metrics:
            self._show_metrics(metrics_results)

        object_values = []
        for metric, requested_metric in zip(metrics_results, self.requested_metrics.keys()):
            metric_name = metric.name.lower()
            if metric_name == requested_metric:
                # As SSPB could be positive or negative, but the better is to be closer to 0
                if metric_name == 'sspb':
                    object_values.append(np.abs(metric.value)) 
                else:
                    object_values.append(metric.value)
            else:
                raise ValueError(f"Unexpected metric found: {metric_name}. Expected: {requested_metric}")
            
            
        return (metric_result.value for metric_result in metrics_results)
    
    def save(self, path:str='tmp/') -> str:
        """
        <p>Save the requested metrics to a file to share with the training notebook.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li>path (str): The path to save the requested metrics. | <i>Default</i>: tmp</li>
        </ul>
        """
        path = f"{path}metrics.pkl"
        save_object(self, path)

        return path
    

    # Metrics evaluation functions
    @staticmethod
    def _has_improved(trial_value, best_value, direction):
        """
        <p>Determine if the trial value has improved over the best value based on the optimization direction.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li><b>trial_value</b> (float or int): The value from the current trial to evaluate.</li>
            <li><b>best_value</b> (float or int): The current best value for comparison.</li>
            <li><b>direction</b> (StudyDirection): The direction of optimization, either maximizing or minimizing.</li>
        </ul>
        
        <h3>Returns:</h3>
        <p>bool: True if the trial value represents an improvement over the best value in the given direction, False otherwise.</p>
        """
        if direction == StudyDirection.MAXIMIZE:
            return trial_value > best_value
        elif direction == StudyDirection.MINIMIZE:
            return trial_value < best_value
        return False

    def are_best_values(self, best_values, trial_values) -> bool:
        """
        <p>Determine if the trial values are better than the current best values for a given metric.</p>
        
        <h3>Parameters:</h3>
        <ul>
            <li><b>main_metric</b> (str): The name of the main metric used for comparison.</li>
            <li><b>best_values</b> (list): A list of current best values for various metrics.</li>
            <li><b>trial_values</b> (list): A list of new trial values to compare against the best values.</li>
        </ul>
        
        <h3>Returns:</h3>
        <p>bool: True if the trial values are overall better than the best values, False otherwise.</p>
        """
        cls = ValidationMetricsHandler

        if best_values is None:
            return True

        if len(best_values) == 1:
            return cls._has_improved(
                    trial_value=trial_values[0].value, 
                    best_value=best_values[0].value, 
                    direction=cls.study_directions.get(self.main_metric)
                )

        improvement_count = 0
        comparison_threshold = len(best_values) // 2

        for best_metric, trial_metric in zip(best_values, trial_values):
            metric_name = best_metric.name.lower()
            direction = cls.study_directions.get(metric_name)

            if cls._has_improved(trial_metric.value, best_metric.value, direction):
                if self.main_metric == metric_name:
                    improvement_count += comparison_threshold
                else:
                    improvement_count += 1

        return improvement_count > comparison_threshold
            


## Tests

In [70]:
# Test
device = 'cpu'
ranges = {'A': np.array([[0, 1], [1, 2], [2, 3], [3, 4]]),
          'B': np.array([[0, 1], [1, 2], [2, 3], [3, 4]]),
          'C': np.array([[0, 1], [1, 2], [2, 3], [3, 4]]),
          'D': np.array([[0, 1], [1, 2], [2, 3], [3, 4]])}

weights = {'A': np.array([1, 2, 3, 4])}

target = torch.tensor([[[0.5, 1.5, 2.5, 3.5, 4.5, 5.5],
                        [0.5, 1.5, 2.5, 3.5, 4.5, 5.5],
                        [0.5, 1.5, 2.5, 3.5, 4.5, 5.5],
                        [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]]], device=device, dtype=torch.float32)

input = target + 1

In [71]:
# Test

def test_LossMetrics():
    loss = wMAELoss(ranges, weights).to(device)
    metrics = LossMetrics(loss, 'SolFSMY').get_metrics()

    loss_value = loss(input, target)
    metrics_values = [metric(input, target) for metric in metrics]

    assert torch.isclose(loss_value, sum(metrics_values)), f"Expected {loss_value}, but got {sum(metrics_values)} ({metrics_values})"
    print("LossMetrics test passed!")

def test_LossMetrics_for_classification():
    loss = ClassificationLoss(ranges, MSELoss()).to(device)
    metrics = SOLFMYClassificationMetrics(loss)

    # Compute the total misclassifications manually for all specific positions
    total_counts = 0
    total_counts += metrics.Missclassifications_Low(input, target)
    total_counts += metrics.Missclassifications_Moderate(input, target)
    total_counts += metrics.Missclassifications_Elevated(input, target)
    total_counts += metrics.Missclassifications_High(input, target)

    # Use the generate_metrics method to retrieve and calculate all defined metrics
    metrics_functions = LossMetrics(loss, 'SolFSMY').get_metrics()
    metrics_values = [metric(input, target) for metric in metrics_functions]

    # Assert that the total manually calculated matches the sum of individual metrics
    assert np.isclose(total_counts, sum(metrics_values)), f"Expected {total_counts}, but got {sum(metrics_values)} ({metrics_values})"
    print("LossMetrics for classification loss test passed!")

In [72]:
y_true = torch.tensor([
    [[10, 12, 12, 13, 12, 12, 12, 14, 12, 100], [20, 22, 23, 20, 22, 20, 21, 22, 23, 200]],
    [[-11, -12, -13, -13, -14, -14, -12, -13, -14, -105], [-22, -23, -25, -23, -22, -24, -23, -22, -25, -210]]
], dtype=torch.float)

y_pred = torch.tensor([
    [[11, 12, 12, 13, 12, 12, 13, 14, 13, 90], [21, 22, 23, 21, 22, 21, 22, 22, 23, 195]],
    [[-12, -13, -14, -14, -15, -14, -13, -14, -14, -10], [-23, -24, -26, -24, -23, -25, -24, -23, -26, -205]]
], dtype=torch.float)

In [73]:
# Test
def test_OutlierDetectionMetrics():
    metrics = F1ScoreMetrics(metrics=' ').get_metrics()
    metrics_precision = metrics[0](y_true, y_pred) 
    metrics_recall = metrics[1](y_true, y_pred)
    metrics_f1_score = metrics[2](y_true, y_pred)
    metrics_outliers_difference = metrics[3](y_true, y_pred)

    f1 = 2 * (metrics_precision * metrics_recall) / (metrics_precision + metrics_recall)

    assert metrics_precision == 1.0, f"Expected 1.0, but got {metrics_precision}"
    assert metrics_recall == 0.75, f"Expected 0.75, but got {metrics_recall}"
    assert metrics_f1_score == f1, f"Expected {f1}, but got {metrics_f1_score}"
    assert metrics_outliers_difference == 1, f"Expected 2, but got {metrics_outliers_difference}"

    print("OutlierDetectionMetrics test passed!")

In [74]:
# Test
def evaluate_get_metrics (metrics:list):
    for metric in metrics:
        print(f"{metric.__name__}: {metric(y_true, y_pred)}")

def test_F1ScoreMetrics():
    metrics = F1ScoreMetrics(metrics='All').get_metrics()
    evaluate_get_metrics(metrics)
    print("F1ScoreMetrics test passed!")

def test_AUPRCMetric():
    metrics = AUPRCMetric().get_metrics()
    evaluate_get_metrics(metrics)
    print("AUPRCMetric test passed!")

def test_KSDifferenceMetric():
    metrics = KSDifferenceMetric().get_metrics()
    evaluate_get_metrics(metrics)
    print("KSDifferenceMetric test passed!")

def test_AssociationMetrics():
    metrics = AssociationMetrics().get_metrics()
    evaluate_get_metrics(metrics)
    print("AssociationMetrics test passed!")

def test_AccuracyMetrics():
    metrics = AccuracyMetrics().get_metrics()
    evaluate_get_metrics(metrics)
    print("AccuracyMetrics test passed!")

def test_BiasMetrics():
    metrics = BiasMetrics().get_metrics()
    evaluate_get_metrics(metrics)
    print("BiasMetrics test passed!")

In [75]:
def test_upload_valid_metrics():
    valid_metrics = ['precision', 'recall']
    factory = ValidationMetricsHandler(valid_metrics)
    try:
        factory.add(valid_metrics)
        assert len(factory.requested_metrics) == len(valid_metrics)
        print("test_upload_valid_metrics passed!")
    except Exception as e:
        print(f"test_upload_valid_metrics failed: {e}")

def test_upload_invalid_metric():
    try:
        invalid_metric = ['non_existing_metric']
        factory = ValidationMetricsHandler(invalid_metric)    
    except ValueError:
        print("test_upload_invalid_metric passed!")
    except Exception as e:
        print(f"test_upload_invalid_metric failed: {e}")


def test_get_study_directions():
    valid_metrics = ['precision', 'recall']
    factory = ValidationMetricsHandler(valid_metrics)
    try:
        factory.upload(valid_metrics)
        directions = factory.get_study_directions()
        expected_directions = [ValidationMetricsHandler.study_directions[m] for m in valid_metrics]
        assert directions == expected_directions
        print("test_get_study_directions passed!")
    except Exception as e:
        print(f"test_get_study_directions failed: {e}")

def test_get_objective_values():
    from collections import namedtuple
    AvgMetric = namedtuple('AvgMetric', ['value'])
    
    valid_metrics = ['precision', 'recall']
    factory = ValidationMetricsHandler(valid_metrics)
    try:
        factory.upload(valid_metrics)
        mock_results = [AvgMetric(value=i) for i in range(len(valid_metrics))]
        values = factory.get_objective_values(mock_results)
        assert values == list(range(len(valid_metrics)))
        print("test_get_objective_values passed!")
    except Exception as e:
        print(f"test_get_objective_values failed: {e}")


In [76]:
# Test
test_LossMetrics()
test_LossMetrics_for_classification()
test_OutlierDetectionMetrics()
test_F1ScoreMetrics()
test_AUPRCMetric()
test_KSDifferenceMetric()
test_AssociationMetrics()
test_AccuracyMetrics()
test_BiasMetrics()

# Factory tests
test_upload_valid_metrics()
test_upload_invalid_metric()
test_get_study_directions()
test_get_objective_values()

LossMetrics test passed!
LossMetrics for classification loss test passed!
OutlierDetectionMetrics test passed!
Precision: 1.0
Recall: 0.75
F1_Score: 0.8571428656578064
Accuracy_Score: None
Specificity: 1.0
Negative_Predictive_Value: 0.9729729890823364
Detected_Outliers_Difference: 1
F1ScoreMetrics test passed!
AURPC: 0.7721153846153846
AUPRCMetric test passed!
Skewness_Difference: 0.8450393676757812
Kurtosis_Difference: 0.8302615284919739
KSDifferenceMetric test passed!
R_Correlation: 0.9610453844070435
R2_Score: 0.8769017457962036
AssociationMetrics test passed!
sMAPE: 7.933315753936768
MSA: 4.347825050354004
AccuracyMetrics test passed!
SSPB: 4.347825050354004
BiasMetrics test passed!
test_upload_valid_metrics passed!
test_upload_invalid_metric passed!
test_get_study_directions failed: 'ValidationMetricsHandler' object has no attribute 'upload'
test_get_objective_values failed: 'ValidationMetricsHandler' object has no attribute 'upload'


In [77]:
#|eval: false
#|hide
from nbdev import *
nbdev_export()