# Evaluation of the transcripts: Metrics & time spent on verification

- The first part of this notebook is designed to assess the performance of predictions after manual verification. *(Also providing enhanced visual HTML files to help interpret metrics clearly.)*

    - After manually verifying transcripts, you can use the verified transcripts as reference (ground truth) data to test the accuracy of various models.
    
    - **However**, it's essential to note that if the reference file (ground truth) was created by using a model’s predictions as a template, the ground truth may carry a bias towards that initial model. *For instance, if a model omitted interruptions or adjusted sentence structure and the verifier found these changes accurate in context, these modifications might remain, unintentionally favoring that model.*

- The second part of the notebook focuses on tracking the time spent verifying each transcript. This data allows you to quantify time savings, providing insights into the efficiency of each model based on verification time.

In [1]:
import pandas as pd
import numpy as np
import shutil
import os

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set()

from utils.evaluation_helpers import process_folder, load_data_time, compute_der
from utils.format_helpers import get_files

## Predictions vs References: Performance evaluation (ASR & Diarization)

### Preparing evaluation folder (Predictions vs References)

Useful if you want to test/compare different models.

In [2]:
directories = ['../results/Compassion', '../results/OBE1', '../results/OBE2'] # Directories holding the CSV files that you want to compare
pred_folder = '../evaluation/predictions' # Destination folder for the copies; use a different name when comparing different ASR/Diarization models.

def copy_csv_files(directories, pred_folder):
    """
    Gather the CSV files found in several directories into a single folder.

    Args:
        directories (list): Directories handed one by one to `get_files(..., 'csv')`.
        pred_folder (str): Destination folder; it is created when missing.
    """
    # Resolve the full list of files before touching the destination folder
    csv_paths = [
        csv_path
        for source_dir in directories
        for csv_path in get_files(source_dir, 'csv')
    ]

    os.makedirs(pred_folder, exist_ok=True)
    for csv_path in csv_paths:
        # NOTE(review): two sources sharing a file name end up as one file here
        # (the later copy replaces the earlier one) - confirm names are unique.
        shutil.copy(csv_path, pred_folder)

copy_csv_files(directories, pred_folder)

### ASR and Diarization metrics + Visual Tool to easily understand mismatches

In [None]:
# Folders compared against each other: model predictions vs. reference transcripts
prediction_folder = "../evaluation/predictions"
reference_folder = "../evaluation/references"

# Provide the original duration of the audio files.
# This ensures the diarization error rate (DER) metric accounts for the total audio duration, not just the speech segments detected by the model.
# If this information is missing, DER calculations may be inaccurate, as they consider only the detected speech intervals, which worsens the DER metric.
orignal_info = pd.read_csv("./outputs/audio_data.csv")

# Compute the metrics for the two folders and save them without the index column;
# the visualisation section reads this CSV back in.
metric = process_folder(prediction_folder, reference_folder, info=orignal_info)
metric.to_csv('../evaluation/metrics.csv', index=False)

### (Optional) Dialogue DER Analysis in CSV instead of HTML

In [None]:
def dialogue_DER(reference_file, prediction_file, output_file):
    """
    Print the DER breakdown for one reference/prediction pair and save the dialogue table.

    Args:
        reference_file (str): Path to the reference CSV.
        prediction_file (str): Path to the prediction CSV.
        output_file (str): Path of the CSV the dialogue DataFrame is written to.
    """
    references, predictions = load_data_time(reference_file, prediction_file)
    dialogue_table, durations = compute_der(references, predictions)

    print("\nError Durations and DER:")
    for name, amount in durations.items():
        # The 'DER' entry is shown as a percentage, every other entry as seconds
        line = f"{name}: {amount:.2%}" if name == 'DER' else f"{name}: {amount:.3f} seconds"
        print(line)

    # The table is always written (index column left out)
    dialogue_table.to_csv(output_file, index=False)

In [None]:
# Reference/prediction pair to analyse (same file name in both evaluation folders)
reference_file = "../evaluation/references/S301final.csv"
prediction_file = "../evaluation/predictions/S301final.csv"
# Bare file name: the CSV is written relative to the current working directory
output_file = 'dia_S301final_new.csv'

# Prints the error durations / DER and writes the dialogue-level CSV
dialogue_DER(reference_file, prediction_file, output_file)

## Visualize performance metrics

In [None]:
# Look-up table: audio file name -> experiment it belongs to
df_audio = pd.read_csv('./outputs/audio_data.csv')
filename_to_experiment = {
    file_name: experiment
    for file_name, experiment in zip(df_audio['File Name'], df_audio['Experiment'])
}

# Attach the experiment label to each row of the metrics table
# (file names without an entry in the look-up table get NaN)
df = pd.read_csv('../evaluation/metrics.csv')
df = df.assign(Experiment=df['Filename'].map(filename_to_experiment))
df

In [4]:
def plot_histograms(df):
    """
    Draw side-by-side histograms of the WER and DER columns.

    Bins are 0.05 wide and span from 0 to the column maximum, x-ticks sit at
    every 0.1, and each non-empty bar is labelled with its share of the rows
    in `df`.

    Args:
        df (pd.DataFrame): Metrics table with 'WER' and 'DER' columns.
    """
    plt.figure(figsize=(12, 6))

    # (column, x-axis label, bar colour) for the left and the right panel
    panels = [
        ('WER', 'Word Error Rate (WER)', 'skyblue'),
        ('DER', 'Diarization Error Rate (DER)', 'salmon'),
    ]

    # Axis extent follows the largest value observed in each column
    upper = {column: df[column].max() for column, _, _ in panels}
    edges = {column: np.arange(0, upper[column] + 0.05, 0.05) for column in upper}

    for position, (column, axis_label, colour) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        axis = sns.histplot(df[column], bins=edges[column], kde=True, color=colour)
        plt.xlabel(axis_label)
        plt.xticks(np.arange(0, upper[column] + 0.1, 0.1))  # X-ticks at 10% intervals

        # Label every non-empty bar with its percentage of all rows
        total_rows = df[column].shape[0]
        for bar in axis.patches:
            bar_height = bar.get_height()
            if not bar_height > 0:
                continue
            share = (bar_height / total_rows) * 100
            axis.annotate(
                f'{share:.1f}%',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='bottom', fontsize=10, color='black',
            )

    plt.tight_layout()
    plt.show()

def plot_boxplots(df):
    """
    Draw one box plot for WER and one for DER on a shared axis.

    Args:
        df (pd.DataFrame): Metrics table with 'Filename', 'WER' and 'DER' columns.
    """
    plt.figure(figsize=(8, 4))

    # Long format: one row per (file, metric) pair, as Seaborn expects
    long_form = df.melt(
        id_vars='Filename',
        value_vars=['WER', 'DER'],
        var_name='Metric',
        value_name='Value',
    )
    sns.boxplot(
        data=long_form,
        x='Metric',
        y='Value',
        hue="Metric",
        palette=['skyblue', 'salmon'],
    )
    plt.ylabel('Error Rate')
    plt.xlabel('')

    plt.tight_layout()
    plt.show()

def plot_wer_vs_der_scatter(df):
    """
    Plots an interactive scatter plot of WER vs. DER using Plotly.

    Each file is one point, coloured by file name; the legend is hidden and
    the file name is available on hover instead.

    Args:
        df (pd.DataFrame): DataFrame containing the metrics, must include
            'WER', 'DER', and 'Filename' columns. An 'Experiment' column is
            added to the hover box when present.
    """
    # 'Experiment' is not one of the documented required columns, so only
    # request it on hover when the DataFrame actually has it.
    hover_columns = [column for column in ('Filename', 'Experiment') if column in df.columns]

    fig = px.scatter(
        df,
        x='WER',
        y='DER',
        color='Filename',
        hover_data=hover_columns,  # Show filenames on hover
        labels={
            'WER': 'Word Error Rate (WER)',
            'DER': 'Diarization Error Rate (DER)'
        },
        title='Interactive Scatter Plot of WER vs. DER'
    )
    fig.update_traces(marker=dict(size=10))
    fig.update_layout(
        xaxis_title='Word Error Rate (WER)',
        yaxis_title='Diarization Error Rate (DER)',
        showlegend=False  # one legend entry per file would be unreadable
    )
    fig.show()

In [None]:
# Overview of the metrics in `df`: box plots, histograms, then the interactive WER/DER scatter
plot_boxplots(df)
plot_histograms(df)
plot_wer_vs_der_scatter(df)

In [None]:
def plot_boxplot_experiment(df):
    """
    Draw WER and DER box plots split by experiment, with the individual points overlaid.

    Args:
        df (pd.DataFrame): Metrics table with 'Filename', 'Experiment', 'WER' and 'DER' columns.
    """
    plt.figure(figsize=(10, 6))

    # Long format: one row per (file, experiment, metric)
    long_form = df.melt(
        id_vars=['Filename', 'Experiment'],
        value_vars=['WER', 'DER'],
        var_name='Metric',
        value_name='Value',
    )

    # Both layers share the same axes mapping so the points line up with their boxes
    shared = dict(x='Metric', y='Value', hue='Experiment', data=long_form)
    sns.boxplot(palette='pastel', **shared)
    # legend=False: the strip layer would otherwise repeat the legend entries
    sns.stripplot(size=4, linewidth=1, dodge=True, jitter=True, legend=False, edgecolor="k", **shared)

    plt.ylabel('Error Rate')
    plt.xlabel('')
    plt.legend(title='Experiment')
    plt.tight_layout()
    plt.show()

plot_boxplot_experiment(df)

In [None]:
# Correlate the numeric metric columns only. 'Filename' is an identifier, and
# any remaining label column (such as 'Experiment') must be excluded as well:
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
df_c = df.drop(columns=['Filename']).select_dtypes(include='number')
correlation_matrix = df_c.corr()

# Create a heatmap for the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Matrix of Performance Metrics")
plt.tight_layout()
plt.show()

## Estimation of time spent verifying transcriptions

In [5]:
# Verification log; this section reads its 'Verification_time', 'Duration_sec' and 'Experiment' columns
df_time = pd.read_csv("outputs/time_data.csv")

def time_to_seconds(time_str):
    """
    Convert an 'HH:MM:SS' string to a total number of seconds.

    Args:
        time_str: Time formatted as 'HH:MM:SS'.

    Returns:
        int: Total number of seconds, or np.nan when `time_str` is missing
        or not in the expected format.
    """
    try:
        h, m, s = map(int, time_str.split(':'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: not a string (e.g. NaN coming from an empty CSV cell).
        # ValueError: wrong number of fields or a field that is not an integer.
        # Anything else (KeyboardInterrupt, SystemExit, ...) is left to propagate.
        return np.nan
    return h * 3600 + m * 60 + s

# Parse the verification times, drop the rows whose time could not be parsed,
# and express the verification time as a multiple of the recording duration.
# Each step returns a new frame instead of writing a column into the filtered
# frame, which is the pattern behind pandas' SettingWithCopyWarning.
df_time = (
    df_time
    .assign(Verification_sec=df_time['Verification_time'].apply(time_to_seconds))
    .dropna(subset=['Verification_sec'])
    .assign(Ratio=lambda frame: frame['Verification_sec'] / frame['Duration_sec'])
)

In [None]:
# Duplicate every row under an extra "All" label so the pooled distribution
# shows up as its own category next to the individual experiments
df_all = df_time.assign(Experiment='All')

# "All" rows first, followed by the per-experiment rows
df_combined = pd.concat([df_all, df_time])

# One shared colour for the individual experiments, a contrasting one for "All"
default_palette = sns.color_palette("deep")
palette = {
    experiment: (default_palette[3] if experiment == 'All' else default_palette[0])
    for experiment in df_combined['Experiment'].unique()
}

# Box plot per category with the individual recordings overlaid as points
plt.figure(figsize=(12, 6))
sns.boxplot(x='Experiment', y='Ratio', hue='Experiment', palette=palette, data=df_combined)
sns.stripplot(
    x='Experiment',
    y='Ratio',
    data=df_combined,
    size=4,
    linewidth=1,
    dodge=True,
    jitter=True,
    edgecolor='k',
    color='gray',
)
plt.title('Ratio of Verification Time to Recording Duration')
plt.xlabel('Experiment')
plt.ylabel('Time Spent / Recording Duration')
plt.show()

In [None]:
# Histogram (20 bins, with a KDE curve) of the verification-time ratio over all recordings
plt.figure(figsize=(10, 6))
sns.histplot(data=df_time['Ratio'].dropna(), bins=20, kde=True).set(
    title='Distribution of Time Spent as ratio of Recording Duration',
    xlabel='Time Spent Verifying / Recording Duration (Ratio)',
    ylabel='Frequency',
)
plt.show()