In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd 
import os
import datetime
import numpy as np
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def uncertainty_analysis(prediction_df, confidence_threshold=0.7):
    """
    Performs uncertainty analysis per site.

    A prediction is relabelled as 'uncertain' when its highest class score is
    below ``confidence_threshold``. The Shannon entropy (base 2) of the three
    class scores is computed for every row. A short summary (share of
    uncertain predictions, mean entropy, median entropy per site) is printed.

    The input frame is not modified.

    Args:
        prediction_df: df with probabilistic predictions. Must contain the
            columns 'id', 'Site', 'predicciones', 'prediction_score_CT',
            'prediction_score_PCM' and 'prediction_score_PDLC'. Score columns
            may be numeric or strings using ',' as the decimal separator.
        confidence_threshold: threshold to mark the prediction as uncertain.

    Returns:
        DataFrame with one row per input row holding the id, site, original
        prediction, the three numeric scores, the thresholded prediction and
        the entropy.
    """
    score_cols = ['prediction_score_CT', 'prediction_score_PCM', 'prediction_score_PDLC']

    # Convert score columns to numbers on a private copy, so the caller's
    # frame stays untouched and the function can be re-run on the same input.
    scores = prediction_df[score_cols].copy()
    for col in score_cols:
        if pd.api.types.is_numeric_dtype(scores[col]):
            scores[col] = scores[col].astype(float)
        else:
            # Text scores use a decimal comma (e.g. "0,93").
            scores[col] = scores[col].str.replace(',', '.', regex=False).astype(float)

    # Obtaining probabilities
    probas = scores.to_numpy(dtype=float)

    # Obtain prediction labels and confidence.
    # Object dtype so the 'uncertain' label can always be written into the copy.
    predictions = prediction_df['predicciones'].to_numpy(dtype=object)
    confidences = probas.max(axis=1)

    # Marking predictions below the threshold as uncertain.
    # Rows whose confidence is NaN compare False and keep their original label.
    uncertain_mask = confidences < confidence_threshold
    predictions_with_uncertainty = predictions.copy()
    predictions_with_uncertainty[uncertain_mask] = 'uncertain'

    # Calculate entropy row-wise in a single vectorised call
    entropies = entropy(probas, base=2, axis=1)

    # Create DataFrame with results
    results_df = pd.DataFrame({
        'id': prediction_df['id'],
        'Site': prediction_df['Site'],
        'Original_predictions': predictions,
        'prediction_score_CT': scores['prediction_score_CT'],
        'prediction_score_PCM': scores['prediction_score_PCM'],
        'prediction_score_PDLC': scores['prediction_score_PDLC'],
        'Uncertainty_treshold_predictions': predictions_with_uncertainty,
        'entropy': entropies
    })

    # Print basic metrics (guarded so an empty frame does not divide by zero)
    total = len(prediction_df)
    n_uncertain = int(uncertain_mask.sum())
    pct_uncertain = n_uncertain / total * 100 if total else 0.0
    mean_entropy = entropies.mean() if total else float('nan')
    print(f"Uncertain predictions: {n_uncertain}/{total} ({pct_uncertain:.1f}%)")
    print(f"Mean dataset entropy: {mean_entropy:.3f}")

    # Calculate and display median entropy per site
    print("\nMedian entropy per site:")
    entropy_median_by_site = results_df.groupby('Site')['entropy'].median()
    for site, median in entropy_median_by_site.items():
        print(f"{site}: {median:.3f}")

    return results_df

In [None]:
# Load and process the data
# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable data directory.
PREDICTIONS_CSV = '/home/dsg/VORTEX_FINAL/PRODUCTION/DATA/20241118_PrediccionesVortexTodas.csv'
df = pd.read_csv(PREDICTIONS_CSV, encoding='latin-1')
# The analysis function defined above is named `uncertainty_analysis`.
results = uncertainty_analysis(df, confidence_threshold=0.7)

# Show the first rows of the results
print("\nPrimeras 5 filas de resultados:")
results.head()