In [2]:
import re
from pathlib import Path
import werpy
import jiwer
import pandas as pd

def clean_transcript(text):
    """
    Clean a transcript by removing linguistic annotations, formatting, and non-verbal markers.
    
    Args:
        text (str): The input transcript text
        
    Returns:
        str: The cleaned transcript text
    """
    # Replace speaker identifiers
    text = text.replace("INV1: ", "").replace("INV2: ", "").replace("PAR: ", "").replace("INV: ", "")
    
    # Replace basic formatting
    text = text.replace("‡", "")
    
    # Remove annotations like [+ gram], [+ exc], etc.
    text = re.sub(r'\[\+\s*[^\]]*\]', '', text)
    
    # Remove timing markers like (..), (.), (...)
    text = re.sub(r'\(\.\.\.\)|\(\.\.\)|\(\.\)', ' ', text)
    text = re.sub(r'\(\.+\)', '', text)  # More general pattern for (.), (..), (...)
    
    # Remove phonetic transcriptions like [: word]
    text = re.sub(r'\[:\s*[^\]]*\]', '', text)
    
    # Remove error codes like [* p:w], [* m:=ed], etc.
    text = re.sub(r'\[\*\s*[^\]]*\]', '', text)
    
    # Remove repetition markers like [/], [//]
    text = re.sub(r'\[\/\/?\]', '', text)
    
    # Remove pointing notes like &+points:picture
    text = re.sub(r'&\+points:[^\s]*', '', text)
    
    # Remove speech markers like &-, &+, +<, +/, +...
    text = re.sub(r'&[\+\-][a-z]*|\+[<\/]\s*|\+\.\.\.', '', text)
    
    # Remove non-speech markers like www.
    text = re.sub(r'www\.', '', text)
    
    # Remove special markers for retracings
    text = re.sub(r'\[\*\s*s:r:gc:pro-ret\]', '', text)
    
    # Remove 0det markers
    text = re.sub(r'0det\s*', '', text)
    
    # Clean up special characters
    text = re.sub(r'@q', '', text)
    text = re.sub(r'@l', '', text)
    
    # Remove sections in angle brackets that are immediately followed by corrections
    text = re.sub(r'<[^>]*>\s*\[\/{1,2}\]', '', text)
    
    # Fix contractions that were separated like (a)n(d)
    text = re.sub(r'\(a\)n\(d\)', 'and', text)
    text = re.sub(r'out_o\(f\)', 'out of', text)
    
    # Fix other common linguistic annotations
    text = re.sub(r'„', ',', text)
    
    # Clean up turn-taking markers
    text = re.sub(r'\*+', '', text)
    
    # Further clean-up from additional patterns
    text = text.replace("&-", "").replace("*", "").replace("<", "").replace(">", "")
    text = text.replace("[", "").replace("]", "").replace("/", "")
    text = text.replace("+", "").replace("...", "").replace("..", "").replace("()", "")
    text = text.replace("xxx", "").replace("(", "").replace(")", "").replace("_", " ")
    text = text.replace("mhm", "").replace("hm.", "").replace(" hm ", " ").replace(" uh"," ").replace(" huh"," ")
    
    # Remove filler words
    text = re.sub(r'\b(um|uh)\b', '', text)
    
    # Remove gestures and non-verbal cues
    text = re.sub(r'&=[a-z]+(:[a-z]+)?', '', text)
    
    # Clean up multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Convert to lowercase and trim
    text = text.lower()
    text = text.strip()
    
    return text

def process_transcript_file(file_path):
    """
    Process a transcript file, clean it, and return the cleaned text.
    
    Args:
        file_path (str): Path to the transcript file
        
    Returns:
        str: The cleaned transcript text
    """
    try:
        with open(file_path, "r", encoding="utf-8") as gt_text:
            text = gt_text.read()
            return clean_transcript(text)
    except Exception as e:
        return f"Error processing {file_path}: {str(e)}"
    
def read_text(GT_path, check_path):
    check_path = Path(check_path)
    GT_path = Path(GT_path)
    if not check_path.exists() or not GT_path.exists():
        print("Error: File not found.")
    else:    
        with open(GT_path, "r", encoding="utf-8") as file:
            GT = file.read().strip()
        with open(check_path, "r", encoding="utf-8") as file:
            check = file.read().strip()
    return GT, check


def process_wer_data(num_list, models_list, result_type):
    """
    Processes WER data and updates the specified result type.

    Args:
        num_list (list): List of numerical identifiers.
        models_list (list): List of models to evaluate.
        result_type (str): One of ['wer', 'insertions', 'deletions', 'substitutions'].
    
    Returns:
        dict: A dictionary with WER results based on the selected result type.
    """
    # Initialize result storage
    results = {num: [] for num in num_list}

    for num in num_list:
        GT_path = f"Dataset/Ground_Truth/GT_{num}.txt"
        for model in models_list:
            check_path = f"output/{num}/{model}_{num}.txt"
            
            ref = process_transcript_file(GT_path)
            hyp = process_transcript_file(check_path)
            # Read ground truth and hypothesis
            #ref, hyp = read_text(GT_path, check_path)
            reference = werpy.normalize(ref)
            hypothesis = werpy.normalize(hyp)

            # Compute WER results
            result = jiwer.process_words(reference, hypothesis)

            # Append based on the result type
            if result_type == 'wer':
                results[num].append(result.wer)
            elif result_type == 'insertions':
                results[num].append(result.insertions)
            elif result_type == 'deletions':
                results[num].append(result.deletions)
            elif result_type == 'substitutions':
                results[num].append(result.substitutions)
            else:
                raise ValueError("Invalid result type. Choose from 'wer', 'insertions', 'deletions', 'substitutions'.")

    return results

def dict_to_dataframe(result_dict, models_list):
    """
    Converts a dictionary of WER-related results into a Pandas DataFrame.
    
    Args:
        result_dict (dict): Dictionary where keys are numbers and values are lists of results.
        models_list (list): List of models used in evaluation.
    
    Returns:
        pd.DataFrame: DataFrame with numbers as index and models as columns.
    """
    # Create a structured dictionary to hold the results
    structured_data = {num: {model: None for model in models_list} for num in result_dict.keys()}

    # Populate structured data with actual values
    for num, values in result_dict.items():
        for idx, model in enumerate(models_list):
            if idx < len(values):  # Ensure values exist for the given index
                structured_data[num][model] = values[idx]
    
    # Convert dictionary to DataFrame
    df = pd.DataFrame.from_dict(structured_data, orient='index')
    df.index.name = "Number"
    return df



# WER

In [5]:
a = process_transcript_file("Dataset/Ground_Truth/GT_1554.txt")
print(a)

so i'm just gonna be asking you to do some talking . okay . so how do you think your speech is these days ? it's good . but will be better . i can little i can read a little bit. i have trouble with an and the and and all that . . do you remember when you had your stroke ? yeah . could you tell me about it ? oh gosh . well i got i got up to check the laundry . and i got about there in the doorway of the kitchen . and i don't remember . i go to vanderbilt . and that's all i can remember . hm what about your first memories after the stroke ? hm i was scared . yeah . okay . so what about your recovery ? what kinds of things have you done to try to get better since your stroke ? i exercise five times a a week . and i have speech spring christmas break starting up next tuesday wednesday the following week . and i don't i don't know . that's it . i don't know . doctor's appointments and all that jazz . . so outside of exercising , are there any other changes in your day to day life ? oh i i 

In [117]:
num_list = ["1554","1713","1731","1738","1833","1944"]
models_list = ["largeV3","largeV3C","largeV3F","base","largeV3FC"] #TODO: W2V, LLM, PLLM
wer_results = {num: [] for num in num_list}
insert_results = {num: [] for num in num_list}
delete_results = {num: [] for num in num_list}
substitution_results = {num: [] for num in num_list}

# Process WER results
wer_results = process_wer_data(num_list, models_list, 'wer')
insert_results = process_wer_data(num_list, models_list, 'insertions')
delete_results = process_wer_data(num_list, models_list, 'deletions')
substitution_results = process_wer_data(num_list, models_list, 'substitutions')

wer_df = dict_to_dataframe(wer_results, models_list)
insert_df = dict_to_dataframe(insert_results, models_list)
delete_df = dict_to_dataframe(delete_results, models_list)
substitution_df = dict_to_dataframe(substitution_results, models_list)
wer_df

Unnamed: 0_level_0,largeV3,largeV3C,largeV3F,base,largeV3FC
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1554,0.248238,0.247154,0.243902,0.294309,0.227642
1713,0.143687,0.129424,0.143687,0.158479,0.129424
1731,0.615819,0.647834,0.657878,0.789705,0.671689
1738,0.242627,0.216472,0.211464,0.254869,0.20256
1833,0.21256,0.214573,0.209742,0.288647,0.194042
1944,0.342105,0.342105,0.344329,0.397702,0.329874


Showing results below matters because it shows that cleaning wasn't chaotic.

In [103]:
insert_df

Unnamed: 0_level_0,largeV3,largeV3C,largeV3F,base,largeV3FC
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1554,203,223,208,204,203
1713,80,100,80,84,87
1731,668,714,740,848,762
1738,79,91,78,99,88
1833,136,149,134,238,135
1944,385,385,389,467,389


In [104]:
delete_df

Unnamed: 0_level_0,largeV3,largeV3C,largeV3F,base,largeV3FC
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1554,231,191,217,274,194
1713,198,156,198,199,173
1731,205,187,226,219,223
1738,252,188,194,194,173
1833,360,329,342,328,310
1944,461,461,461,376,410


In [105]:
substitution_df

Unnamed: 0_level_0,largeV3,largeV3C,largeV3F,base,largeV3FC
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1554,67,86,69,111,68
1713,53,48,53,75,44
1731,180,202,155,264,156
1738,159,163,162,219,157
1833,103,125,114,216,108
1944,122,122,125,276,137


In [118]:
num="1554"
model="largeV3FC"

ref = process_transcript_file(f"Dataset/Ground_Truth/GT_1554.txt")
#hyp.replace(" uh"," ").replace(" huh"," ")
hyp = process_transcript_file( f"output/{num}/{model}_{num}.txt")
reference = werpy.normalize(ref) #ref
hypothesis = werpy.normalize(hyp)
result = jiwer.process_words(reference, hypothesis)


alignments = result.alignments[0]

ref_words = reference.split()
hyp_words = hypothesis.split()

deletions = []
insertions = []
substitutions = []
correct = []

for alignment in alignments:
    if alignment.type == 'delete':
        deletions.extend(ref_words[alignment.ref_start_idx:alignment.ref_end_idx])
    elif alignment.type == 'insert':
        insertions.extend(hyp_words[alignment.hyp_start_idx:alignment.hyp_end_idx])
    elif alignment.type == 'substitute':
        substitutions.append((ref_words[alignment.ref_start_idx], hyp_words[alignment.hyp_start_idx]))
    elif alignment.type == 'equal':
        correct.extend(ref_words[alignment.ref_start_idx:alignment.ref_end_idx])

print("Deletions:", len(deletions))
print("Insertions:", len(insertions))
print("Substitutions:", len(substitutions))
print("Correct words:", len(correct))


print("Deletions:", deletions)
print("Insertions:", insertions)
print("Substitutions:", substitutions)
print("Correct words:", correct)


Deletions: 150
Insertions: 208
Substitutions: 53
Correct words: 1633
Deletions: ['and', 'hm', 'hm', 'a', 'and', 'i', 'dont', 'i', 'dont', 'know', 'i', 'dont', 'know', 'jazz', 'i', 'i', 'and', 'and', 'so', 'hm', 'i', 'i', 'hm', 'oh', 'oh', 'fifth', 'hm', 'once', 'i', 'dont', 'know', 'uhhuh', 'so', 'its', 'hm', 'one', 'two', 'best', 'friends', 'with', 'it', 'i', 'bet', 'yeah', 'hm', 'kingdom', 'oh', 'gosh', 'oh', 'oh', 'any', 'i', 'wow', 'hm', 'and', 'you', 'back', 'go', 'for', 'it', 'i', 'dont', 'know', 'i', 'can', 'yeah', 'hm', 'what', 'ugh', 'yeah', 'hm', 'yeah', 'hm', 'well', 'his', 'i', 'i', 'yeah', 'yeah', 'yeah', 'cool', 'okay', 'live', 'firstnames', 'but', 'he', 'hm', 'sorry', 'and', 'hm', 'yeah', 'hm', 'oh', 'okay', 'for', 'fun', 'i', 'a', 'is', 'that', 'right', 'right', 'hm', 'hm', 'hm', 'and', 'i', 'dont', 'hm', 'hm', 'i', 'hm', 'or', 'mostly', 'yeah', 'or', 'i', 'i', 'yeah', 'and', 'okay', 'yeah', 'yeah', 'im', 'sorry', 'yeah', 'oh', 'no', 'r', 'oh', 'no', 'i', 'two', 'and', 