In [None]:
import pandas as pd
import datetime
import numpy as np
from scipy.stats import entropy
import os

In [None]:
def provenance_determination(self, df=None, *, threshold=0.70):
    """
    Determine a consensus provenance per site and build a summary table.

    For every distinct value of ``Site`` the samples are split by their
    highest class probability: samples below ``threshold`` count as
    uncertain, the remaining ones vote (majority vote = mode of
    ``Uncertainty_treshold_predictions``) for the site's consensus.

    The function may be called either as ``provenance_determination(df)``
    or as ``provenance_determination(obj, df)``; in the two-argument form
    the first argument is ignored.

    Args:
        self: unused placeholder when ``df`` is given; otherwise treated as
            the dataframe itself.
        df: dataframe with probabilistic predictions. Columns read:
            ``Site``, ``prediction_score_CT``, ``prediction_score_PCM``,
            ``prediction_score_PDLC``, ``entropy``, ``Original_predictions``
            and ``Uncertainty_treshold_predictions``. The score columns are
            assumed to be numeric already (convert comma-decimal strings
            before calling).
        threshold: minimum ``max_prob`` for a sample to take part in the
            consensus vote (keyword-only, default 0.70).

    Returns:
        pandas.DataFrame with one row per site and the columns ``Site``,
        ``Samples_analyzed``, ``Gavá`` / ``Encinasola`` / ``Aliste`` (number
        of rows whose ``Original_predictions`` is ``'CT'`` / ``'PCM'`` /
        ``'PDLC'``), ``Uncertain(%)``, ``Samples_for_provenance``,
        ``Median_entropy``, ``Consensus`` and ``Homogeneity`` (share of
        voting samples that agree with the consensus).

    Raises:
        TypeError: if no dataframe is supplied in either position.

    Note:
        A ``max_prob`` column is added to (or overwritten in) the dataframe
        that was passed in.
    """
    # Support the single-argument call form: the lone positional argument
    # lands in `self`, so use it as the dataframe.
    if df is None:
        df = self
    if df is None:
        raise TypeError("provenance_determination() requires a dataframe")

    score_columns = ['prediction_score_CT', 'prediction_score_PCM', 'prediction_score_PDLC']

    # Highest class probability per sample; written onto the caller's frame.
    df['max_prob'] = df[score_columns].max(axis=1)

    results = []
    # sort=False keeps sites in order of first appearance (same order as
    # unique()); rows with a missing Site are skipped instead of producing an
    # empty subset and a division by zero.
    for site, site_data in df.groupby('Site', sort=False):
        n_samples = len(site_data)
        max_prob = site_data['max_prob']

        uncertain_pct = (max_prob < threshold).sum() / n_samples * 100
        # `>=` so that a sample exactly at the threshold is not dropped from
        # both the uncertain and the high-confidence group.
        high_conf = site_data[max_prob >= threshold]
        median_entropy = site_data['entropy'].median()

        votes = high_conf['Uncertainty_treshold_predictions']
        modes = votes.mode()
        if modes.empty:
            # No high-confidence samples, or none of them carries a label.
            consensus = 'No consensus'
            consistency = 0
            n_consensus_pred = 0
        else:
            # Majority vote: first mode of the high-confidence predictions.
            consensus = modes.iloc[0]
            consistency = (votes == consensus).sum() / len(high_conf)
            n_consensus_pred = len(high_conf)  # number of predictions used

        original = site_data['Original_predictions']
        results.append({
            'Site': site,
            'Samples_analyzed': n_samples,
            'Gavá': int((original == 'CT').sum()),
            'Encinasola': int((original == 'PCM').sum()),
            'Aliste': int((original == 'PDLC').sum()),
            'Uncertain(%)': round(uncertain_pct),
            'Samples_for_provenance': n_consensus_pred,
            'Median_entropy': round(median_entropy, 2),
            'Consensus': consensus,
            'Homogeneity': round(consistency, 2),
        })

    return pd.DataFrame(results)

In [None]:
# The function's first parameter (`self`) is only a placeholder here; the
# predictions table is passed explicitly as `df`.
summary_df = provenance_determination(None, df=results)
summary_df