In [1]:
import os

import pandas as pd
import numpy as np

def calculate_model_metric(models_path, model_names, metric):
    """
    Summarise one metric for each model from its logged CSV files.

    For every model, ``val_metrics.csv`` and ``train_metrics.csv`` are read from
    ``<models_path>/<model_name>/``. The validation file must contain a
    ``val_<metric>`` column; the training file must contain ``epoch`` and
    ``<metric>`` columns.

    :param models_path (str): Base path to the models directory
    :param model_names (list): Names of the model sub-directories to analyze
    :param metric (str): Name of the metric column to summarise (e.g. 'recall')
    :returns pd.DataFrame: One row per model with the columns 'Model', 'Metric',
        'Best Val', 'Avg Val', 'Training' and 'Epochs to Best'. The three value
        columns are strings formatted to four decimal places.
    """
    val_column = f"val_{metric}"

    def summarise(name):
        # Both files are loaded up front, validation first.
        folder = os.path.join(models_path, name)
        val_frame = pd.read_csv(os.path.join(folder, "val_metrics.csv"))
        train_frame = pd.read_csv(os.path.join(folder, "train_metrics.csv"))

        val_best = val_frame[val_column].max()
        val_mean = val_frame[val_column].mean()
        # idxmax() returns the row label of the best value; +1 turns the
        # zero-based row label into a one-based count.
        best_row_number = val_frame[val_column].idxmax() + 1

        # Training rows are averaged per epoch first, then the best epoch wins.
        train_best = train_frame.groupby('epoch')[metric].mean().max()

        return {
            'Model': name,
            'Metric': metric,
            'Best Val': format(val_best, ".4f"),
            'Avg Val': format(val_mean, ".4f"),
            'Training': format(train_best, ".4f"),
            'Epochs to Best': best_row_number,
        }

    return pd.DataFrame([summarise(name) for name in model_names])


In [9]:
from src.utils.consts import MODELS_PATH
# Recall summary for the DenseNet121 variants listed below; the returned
# DataFrame is the cell's displayed output.
model_names = [
    'DenseNet121_v1',
    'DenseNet121_v2',
    'DenseNet121_v2_1',
    'DenseNet121_v3',
    'DenseNet121_v3_1',
    'DenseNet121_v3_2',
]
calculate_model_metric(MODELS_PATH, model_names, 'recall')

Unnamed: 0,Model,Metric,Best Val,Avg Val,Training,Epochs to Best
0,DenseNet121_v1,recall,0.096,0.0392,0.1179,26
1,DenseNet121_v2,recall,0.1177,0.0399,0.1132,30
2,DenseNet121_v2_1,recall,0.081,0.0323,0.0911,30
3,DenseNet121_v3,recall,0.3314,0.1871,0.2295,8
4,DenseNet121_v3_1,recall,0.3139,0.2109,0.2603,13
5,DenseNet121_v3_2,recall,0.3436,0.2409,0.358,23


In [11]:
import pandas as pd
import numpy as np

from src.model.tensorflow_utils import calculate_class_weights, load_dataset
from src.utils.consts import TF_RECORD_DATASET, TF_BUFFER_SIZE, NUM_CLASSES

# Selection thresholds used by analyze_class_performance.
_TOP_F1_CLASSES = 3            # how many best-F1 classes to list per model
_TOP_PROBLEMATIC_CLASSES = 3   # how many "problematic" classes to list per model
_TOP_WEIGHTED_CLASSES = 2      # how many highest-weight classes to list per model
_PROBLEMATIC_MIN_AUC = 0.65    # "problematic" = AUC above this ...
_PROBLEMATIC_MAX_F1 = 0.05     # ... while F1 stays below this
_CORRELATION_CUTOFF = 0.3      # |corr| above this counts as a relationship


def _format_class_rows(rows, first_index_by_name, class_weights):
    """Render metric rows as 'name (AUC=.., F1=..[, wt=..])' joined by ', '."""
    parts = []
    for _, row in rows.iterrows():
        class_name = row['class_name']
        summary = f"{class_name} (AUC={row['auc']:.2f}, F1={row['f1_score']:.2f}"
        class_idx = first_index_by_name.get(class_name)
        # The weight is only appended when the class appears in the label mapping.
        if class_idx is not None:
            summary += f", wt={float(class_weights[class_idx]):.1f}"
        parts.append(summary + ")")
    return ", ".join(parts)


def _describe_weight_impact(model_metrics, class_names, class_weights):
    """Describe how class weights correlate with F1 and name the top-weighted classes."""
    merged_data = []
    for idx, name in enumerate(class_names):
        metrics_row = model_metrics[model_metrics['class_name'] == name]
        if not metrics_row.empty:
            # Only the first metrics row for a class is used.
            merged_data.append({
                'class_name': name,
                'weight': float(class_weights[idx]),
                'f1_score': metrics_row['f1_score'].values[0],
                'auc': metrics_row['auc'].values[0]
            })

    if not merged_data:
        return "N/A"

    merged_df = pd.DataFrame(merged_data)
    weight_f1_corr = merged_df['weight'].corr(merged_df['f1_score'])

    if pd.isna(weight_f1_corr):
        # corr() yields NaN with fewer than two usable rows or zero variance;
        # without this branch NaN would be reported as "Limited correlation".
        impact = "Correlation undefined: too few matched classes or no variance in weights/F1 scores"
    elif weight_f1_corr < -_CORRELATION_CUTOFF:
        impact = "Inverse relationship: higher weights correlate with lower F1 scores"
    elif weight_f1_corr > _CORRELATION_CUTOFF:
        impact = "Positive relationship: weights effectively boosting rare class performance"
    else:
        impact = "Limited correlation: class weights not significantly impacting F1 scores"

    top_weighted = merged_df.sort_values('weight', ascending=False).head(_TOP_WEIGHTED_CLASSES)
    top_weighted_strs = [
        f"{name} (wt={weight:.1f})"
        for name, weight in zip(top_weighted['class_name'], top_weighted['weight'])
    ]
    return f"{impact}. Top weighted: {', '.join(top_weighted_strs)}"


def analyze_class_performance(models_path, model_names):
    """
    Build a per-model summary of class-level performance and class-weight impact.

    Class weights are computed once from ``<TF_RECORD_DATASET>/test.tfrecord`` and
    class names are read from ``<TF_RECORD_DATASET>/label_mappings.csv`` (columns
    'Index' and 'Label'). For every model, ``model_metrics.csv`` is read from
    ``<models_path>/<model_name>/`` and must provide the columns 'class_name',
    'f1_score' and 'auc'.

    NOTE(review): weights are looked up by a class's position in the label list
    sorted by 'Index'; this assumes 'Index' runs 0..N-1 without gaps and matches
    the ordering of ``calculate_class_weights`` -- TODO confirm.

    :param models_path (str): Base path to the models directory
    :param model_names (list): Names of the model sub-directories to analyze
    :returns pd.DataFrame: One row per model with the columns
        'Model',
        'Most Improved Classes' (the three classes with the highest F1 score),
        'Problematic Classes' (up to three classes with AUC > 0.65 and
        F1 < 0.05, highest AUC first) and
        'Class Balance Impact' (weight/F1 correlation verdict plus the two
        highest-weighted classes, or "N/A" when no class could be matched).
    """
    # Get test dataset for calculating weights
    test_ds = load_dataset(f"{TF_RECORD_DATASET}/test.tfrecord", TF_BUFFER_SIZE)
    class_weights = calculate_class_weights(test_ds, NUM_CLASSES)

    # Load label mappings
    labels_df = pd.read_csv(f"{TF_RECORD_DATASET}/label_mappings.csv").sort_values('Index')
    class_names = labels_df['Label'].tolist()

    # Name -> first position in class_names, so each lookup is a dict hit
    # instead of a scan over every label.
    first_index_by_name = {}
    for idx, name in enumerate(class_names):
        first_index_by_name.setdefault(name, idx)

    results = []
    for model_name in model_names:
        model_metrics = pd.read_csv(os.path.join(models_path, model_name, "model_metrics.csv"))

        top_by_f1 = model_metrics.sort_values('f1_score', ascending=False).head(_TOP_F1_CLASSES)
        most_improved_str = _format_class_rows(top_by_f1, first_index_by_name, class_weights)

        # High AUC with near-zero F1: the model ranks the class well but
        # almost never predicts it at the decision threshold.
        is_problematic = (
            (model_metrics['auc'] > _PROBLEMATIC_MIN_AUC)
            & (model_metrics['f1_score'] < _PROBLEMATIC_MAX_F1)
        )
        problematic = (
            model_metrics[is_problematic]
            .sort_values('auc', ascending=False)
            .head(_TOP_PROBLEMATIC_CLASSES)
        )
        problematic_str = _format_class_rows(problematic, first_index_by_name, class_weights)

        results.append({
            "Model": model_name,
            "Most Improved Classes": most_improved_str or "None above threshold",
            "Problematic Classes": problematic_str or "None identified",
            "Class Balance Impact": _describe_weight_impact(model_metrics, class_names, class_weights)
        })

    return pd.DataFrame(results)

In [12]:
from src.utils.consts import MODELS_PATH
from IPython.display import display, HTML
# Widen the notebook container so the wide summary table is not squeezed.
display(HTML("<style>.container { width:100% !important; }</style>"))

model_names = [
    'DenseNet121_v1',
    'DenseNet121_v2',
    'DenseNet121_v2_1',
    'DenseNet121_v3',
    'DenseNet121_v3_1',
    'DenseNet121_v3_2',
]
result_df = analyze_class_performance(MODELS_PATH, model_names)

# Show the long per-class summary strings in full instead of truncating them.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
display(result_df)



Unnamed: 0,Model,Most Improved Classes,Problematic Classes,Class Balance Impact
0,DenseNet121_v1,"Effusion (AUC=0.84, F1=0.41, wt=0.5), Cardiomegaly (AUC=0.86, F1=0.32, wt=2.5), Hernia (AUC=0.87, F1=0.30, wt=30.2)","Consolidation (AUC=0.75, F1=0.00, wt=1.5), Pleural_Thickening (AUC=0.74, F1=0.01, wt=2.0), No Finding (AUC=0.74, F1=0.03, wt=0.1)","Limited correlation: class weights not significantly impacting F1 scores. Top weighted: Hernia (wt=30.2), Pneumonia (wt=4.9)"
1,DenseNet121_v2,"Effusion (AUC=0.82, F1=0.33, wt=0.5), No Finding (AUC=0.71, F1=0.28, wt=0.1), Hernia (AUC=0.79, F1=0.25, wt=30.2)","Consolidation (AUC=0.73, F1=0.00, wt=1.5), Mass (AUC=0.73, F1=0.02, wt=1.2), Pleural_Thickening (AUC=0.73, F1=0.00, wt=2.0)","Limited correlation: class weights not significantly impacting F1 scores. Top weighted: Hernia (wt=30.2), Pneumonia (wt=4.9)"
2,DenseNet121_v2_1,"Cardiomegaly (AUC=0.87, F1=0.30, wt=2.5), Emphysema (AUC=0.77, F1=0.22, wt=2.7), Effusion (AUC=0.81, F1=0.20, wt=0.5)","Hernia (AUC=0.80, F1=0.00, wt=30.2), Consolidation (AUC=0.75, F1=0.00, wt=1.5), Atelectasis (AUC=0.73, F1=0.01, wt=0.6)","Limited correlation: class weights not significantly impacting F1 scores. Top weighted: Hernia (wt=30.2), Pneumonia (wt=4.9)"
3,DenseNet121_v3,"No Finding (AUC=0.76, F1=0.50, wt=0.1), Effusion (AUC=0.86, F1=0.43, wt=0.5), Cardiomegaly (AUC=0.90, F1=0.35, wt=2.5)","Edema (AUC=0.86, F1=0.00, wt=3.0), Emphysema (AUC=0.81, F1=0.01, wt=2.7), Consolidation (AUC=0.77, F1=0.00, wt=1.5)","Limited correlation: class weights not significantly impacting F1 scores. Top weighted: Hernia (wt=30.2), Pneumonia (wt=4.9)"
4,DenseNet121_v3_1,"No Finding (AUC=0.76, F1=0.63, wt=0.1), Effusion (AUC=0.86, F1=0.37, wt=0.5), Cardiomegaly (AUC=0.88, F1=0.32, wt=2.5)","Edema (AUC=0.86, F1=0.02, wt=3.0), Consolidation (AUC=0.77, F1=0.00, wt=1.5), Pleural_Thickening (AUC=0.74, F1=0.00, wt=2.0)","Limited correlation: class weights not significantly impacting F1 scores. Top weighted: Hernia (wt=30.2), Pneumonia (wt=4.9)"
5,DenseNet121_v3_2,"No Finding (AUC=0.76, F1=0.64, wt=0.1), Effusion (AUC=0.86, F1=0.45, wt=0.5), Pneumothorax (AUC=0.83, F1=0.32, wt=1.3)","Edema (AUC=0.87, F1=0.01, wt=3.0), Consolidation (AUC=0.76, F1=0.00, wt=1.5), Pleural_Thickening (AUC=0.74, F1=0.03, wt=2.0)","Limited correlation: class weights not significantly impacting F1 scores. Top weighted: Hernia (wt=30.2), Pneumonia (wt=4.9)"
