# Notes
- Ground truth transcription is provided in a .log file
- When evaluating accuracy for each S2T or Sentiment model, we convert the .log file into a .csv for simplicity
- When calculating WER, compute_wer.py wants the file in .txt, so we format it again

In [14]:
import csv
import re
import os

## Formats IEMOCAP ground truths into TXT file
- Removes logging output
- Keeps filename
- Converts .log to .txt
The input .log file can be taken from each fold folder in ./S2T-Sentiment

In [51]:
# Input and output file paths
# Input: raw ground-truth log; the utterance ID and transcript are extracted from each line
input_file = "./InputFiles/groundtruth_IEMOCAP.log"
# Output: plain-text file with one "<utterance_id> <transcript>" line per utterance
output_file = "./OutputFiles/groundtruth_IEMOCAP.txt"

In [52]:
# Convert the raw log into "<utterance_id> <transcript>" lines, one per utterance.
# Both patterns are compiled once instead of being re-parsed for every log line.
transcript_pattern = re.compile(r"(\S+): (.+)")   # "<id>: <non-empty transcript>"
blank_pattern = re.compile(r"\[INFO\]\s+(\S+):")  # "[INFO] <id>:" with no transcript

with open(input_file, mode="r", encoding="utf-8") as infile, \
        open(output_file, mode="w", encoding="utf-8") as outfile:
    for line in infile:
        match = transcript_pattern.search(line)
        if match:
            utterance_id = match.group(1)  # The ID (e.g., Ses01F_script01_3_F000)
            transcript = match.group(2).strip()  # The transcript
            outfile.write(f"{utterance_id} {transcript}\n")
            continue

        # Audio has no transcript: still emit the ID so the utterance keeps its line
        blank_match = blank_pattern.search(line)
        if blank_match is None:
            # Neither pattern matched (e.g., an empty or non-utterance log line).
            # Skip it instead of failing on `None.group(...)`.
            if line.strip():
                print(f"Warning: Skipping unrecognized line: {line}")
            continue

        print(f"Note: Blank transcript for: {line}")
        # The capture group excludes the trailing ":" of the ID token
        outfile.write(f"{blank_match.group(1)} \n")

print(f"Formatted ground truth saved to {output_file}")

Note: Blank transcript for: [2022-02-15 11:28:44,855][text2pickle.py][line:64][INFO] Ses01F_script01_3_F010: 

Note: Blank transcript for: [2022-02-15 11:28:44,898][text2pickle.py][line:64][INFO] Ses01F_script01_3_F011: 

Note: Blank transcript for: [2022-02-15 11:28:44,937][text2pickle.py][line:64][INFO] Ses01F_script01_3_F012: 

Note: Blank transcript for: [2022-02-15 11:28:50,279][text2pickle.py][line:64][INFO] Ses01F_script02_2_M043: 

Note: Blank transcript for: [2022-02-15 11:28:50,347][text2pickle.py][line:64][INFO] Ses01F_script02_2_M046: 

Note: Blank transcript for: [2022-02-15 11:28:51,306][text2pickle.py][line:64][INFO] Ses01M_impro04_F023: 

Note: Blank transcript for: [2022-02-15 11:29:13,178][text2pickle.py][line:64][INFO] Ses01M_script03_1_F007: 

Note: Blank transcript for: [2022-02-15 11:29:13,286][text2pickle.py][line:64][INFO] Ses01M_script03_1_F010: 

Note: Blank transcript for: [2022-02-15 11:29:17,257][text2pickle.py][line:64][INFO] Ses01F_script02_1_F009: 

Note

## Formats IEMOCAP ground truths to CSV file
- Removes logging output
- Keeps filename
- Converts .log to .csv
The input .log file can be taken from each fold folder in ./S2T-Sentiment

In [53]:
# Input and output file paths
# Input: raw ground-truth log; the utterance ID and transcript are extracted from each line
input_file = "./InputFiles/groundtruth_IEMOCAP.log"
# Output: CSV file with the columns ID and Transcription
output_csv = "./OutputFiles/groundtruth_IEMOCAP.csv"

In [54]:
# Convert the raw log into a two-column CSV (ID, Transcription), one row per utterance.
# Both patterns are compiled once instead of being re-parsed for every log line.
transcript_pattern = re.compile(r"(\S+): (.+)")   # "<id>: <non-empty transcript>"
blank_pattern = re.compile(r"\[INFO\]\s+(\S+):")  # "[INFO] <id>:" with no transcript

with open(input_file, mode="r", encoding="utf-8") as infile, \
        open(output_csv, mode="w", encoding="utf-8") as outfile:
    # csv.writer quotes transcripts that contain commas or double quotes, so every
    # row parses back into exactly two fields. Fields never contain newlines (each
    # comes from a single log line), so "\n" plus text-mode newline handling is safe.
    writer = csv.writer(outfile, lineterminator="\n")
    writer.writerow(["ID", "Transcription"])

    for line in infile:
        match = transcript_pattern.search(line)
        if match:
            utterance_id = match.group(1)  # The ID (e.g., Ses01F_script01_3_F000)
            transcript = match.group(2).strip()  # The transcript
            writer.writerow([utterance_id, transcript])
            continue

        # Audio has no transcript: still emit the ID so the utterance keeps its row
        blank_match = blank_pattern.search(line)
        if blank_match is None:
            # Neither pattern matched (e.g., an empty or non-utterance log line).
            # Skip it instead of failing on `None.group(...)`.
            if line.strip():
                print(f"Warning: Skipping unrecognized line: {line}")
            continue

        print(f"Note: Blank transcript for: {line}")
        # A single space is written as the placeholder transcript for blank utterances
        writer.writerow([blank_match.group(1), " "])

print(f"Formatted ground truth saved to {output_csv}")

Note: Blank transcript for: [2022-02-15 11:28:44,855][text2pickle.py][line:64][INFO] Ses01F_script01_3_F010: 

Note: Blank transcript for: [2022-02-15 11:28:44,898][text2pickle.py][line:64][INFO] Ses01F_script01_3_F011: 

Note: Blank transcript for: [2022-02-15 11:28:44,937][text2pickle.py][line:64][INFO] Ses01F_script01_3_F012: 

Note: Blank transcript for: [2022-02-15 11:28:50,279][text2pickle.py][line:64][INFO] Ses01F_script02_2_M043: 

Note: Blank transcript for: [2022-02-15 11:28:50,347][text2pickle.py][line:64][INFO] Ses01F_script02_2_M046: 

Note: Blank transcript for: [2022-02-15 11:28:51,306][text2pickle.py][line:64][INFO] Ses01M_impro04_F023: 

Note: Blank transcript for: [2022-02-15 11:29:13,178][text2pickle.py][line:64][INFO] Ses01M_script03_1_F007: 

Note: Blank transcript for: [2022-02-15 11:29:13,286][text2pickle.py][line:64][INFO] Ses01M_script03_1_F010: 

Note: Blank transcript for: [2022-02-15 11:29:17,257][text2pickle.py][line:64][INFO] Ses01F_script02_1_F009: 

Note

## Formats ONE .csv file to .txt file for use in compute_wer.py

In [2]:
# Input and output file paths
# Source CSV; its first row is skipped as a header and columns 0 and 1 are used
input_csv = "predictions.csv"  # Replace with your CSV file path
# Destination text file, written as "<column 0> <column 1>" per row
output_txt = "predictions.txt"  # Replace with your desired TXT file path

In [3]:
def convert_csv_to_txt(input_csv, output_txt):
    """Convert a CSV of (ID, transcript) rows into "<ID> <transcript>" text lines.

    The first row of the CSV is always treated as a header and skipped.
    Blank rows are ignored, and a row that has only an ID is written as the
    ID followed by a single space (an empty transcript).

    Args:
        input_csv: Path of the CSV file to read.
        output_txt: Path of the text file to write; overwritten if it exists.
    """
    # newline="" leaves newline handling to the csv module, as its docs recommend
    with open(input_csv, mode="r", encoding="utf-8", newline="") as csv_file, \
            open(output_txt, mode="w", encoding="utf-8") as txt_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader, None)  # Skip the header row (no-op on an empty file)
        for row in csv_reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            utterance_id = row[0]
            # Tolerate rows without a transcript column instead of raising IndexError
            transcript = row[1] if len(row) > 1 else ""
            txt_file.write(f"{utterance_id} {transcript}\n")
    
# Convert the single CSV at input_csv into output_txt, then report completion
convert_csv_to_txt(input_csv, output_txt)
print(f"Converted {input_csv} to {output_txt}")

Converted predictions.csv to predictions.txt


## Formats ALL .csv files to .txt files for use in compute_wer.py, one per fold folder

In [16]:
# Directory whose "fold_*" subfolders are scanned for predictions.csv
base_directory = "./whisper-tiny-en+Twitter-roBERTa-base"

In [17]:
# Convert predictions.csv -> predictions.txt inside every fold folder.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# processing (and log) order deterministic.
for folder_name in sorted(os.listdir(base_directory)):
    folder_path = os.path.join(base_directory, folder_name)
    # Only directories whose name starts with "fold_" are treated as folds
    if not (os.path.isdir(folder_path) and folder_name.startswith("fold_")):
        continue

    input_csv = os.path.join(folder_path, "predictions.csv")
    output_txt = os.path.join(folder_path, "predictions.txt")
    # isfile() (rather than exists()) also rejects a directory with that name
    if os.path.isfile(input_csv):
        convert_csv_to_txt(input_csv, output_txt)
        print(f"Done: {input_csv}")
    else:
        print(f"No predictions.csv found in {folder_path}")

Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_0\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_1\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_2\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_3\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_4\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_5\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_6\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_7\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_8\predictions.csv
Done: ./whisper-tiny-en+Twitter-roBERTa-base\fold_9\predictions.csv


## 