In [1]:
import csv
import difflib
import pandas as pd

# Run this once if not already installed
#nltk.download('punkt') 
#nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

In [4]:
def flatten_csv_content(file_path):
    """
    Flatten the 'Content' column of a CSV file into a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        # Join all the content from the 'Content' column into one plain text string
        return ' '.join(row['Content'].strip() for row in reader)

def compare_and_highlight_changes(reference, prediction):
    """
    Compare reference and prediction texts and highlight changes required to match the prediction to the reference.
    Track insertions (I), deletions (D), and replacements (R).
    """
    ref_words = word_tokenize(reference)
    pred_words = word_tokenize(prediction)
    
    # Use difflib's SequenceMatcher to identify changes between the two tokenized texts
    matcher = difflib.SequenceMatcher(None, ref_words, pred_words)
    diff = matcher.get_opcodes()
    
    highlighted_diff = []
    total_D, total_I, total_R = 0, 0, 0  # Initialize counters for deletions, insertions, and replacements
    
    for tag, i1, i2, j1, j2 in diff:
        if tag == 'equal':
            highlighted_diff.extend(ref_words[i1:i2])
        elif tag == 'replace':
            # Show what needs to be replaced in the prediction to match the reference
            highlighted_diff.append(f"<span style='background-color: yellow;'>[REPLACE: {' '.join(pred_words[j1:j2])} -> {' '.join(ref_words[i1:i2])}]</span>")
            total_R += max(i2 - i1, j2 - j1)
        elif tag == 'delete':
            # Show what needs to be inserted into the prediction to match the reference
            highlighted_diff.append(f"<span style='background-color: green;'>[INSERT: {' '.join(ref_words[i1:i2])}]</span>")
            total_D += i2 - i1
        elif tag == 'insert':
            # Show what needs to be deleted from the prediction
            highlighted_diff.append(f"<span style='background-color: red;'>[DELETE: {' '.join(pred_words[j1:j2])}]</span>")
            total_I += j2 - j1

    return ' '.join(highlighted_diff), total_D, total_I, total_R

def calculate_wer_and_generate_html(prediction_file, reference_file, output_file):
    """
    Flatten prediction and reference CSVs, calculate WER, and generate an HTML output highlighting the changes needed 
    to match the prediction to the reference.
    """
    # Step 1: Flatten both prediction and reference CSVs into plain text
    reference_text = flatten_csv_content(reference_file)
    prediction_text = flatten_csv_content(prediction_file)
    
    # Step 2: Compare the plain texts and highlight what changes are needed
    highlighted_diff, total_D, total_I, total_R = compare_and_highlight_changes(reference_text, prediction_text)
    
    # Step 3: Calculate WER
    total_words = len(word_tokenize(reference_text))
    wer = (total_D + total_I + total_R) / total_words if total_words > 0 else 0
    
    # Step 4: Save results to an HTML file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"<html><body><div>{highlighted_diff}</div></body></html>")

    # Step 5: Print summary
    print(f"Visual differences saved to {output_file}")
    print("\n=== Summary ===")
    print(f"Total Words (Reference): {total_words}")
    print(f"Total Insertions (D, from Reference to Prediction): {total_D}")
    print(f"Total Deletions (I, removed from Prediction): {total_I}")
    print(f"Total Replacements (R, in Prediction): {total_R}")
    print(f"WER: {wer:.4f}")


In [3]:
# Example usage:
reference_file= "../S302con_C.csv"
prediction_file  = "../results/Compassion/S302con.csv"
output_file = "prediction_vs_reference.html"

# Run WER calculation and generate HTML output
calculate_wer_and_generate_html(prediction_file, reference_file, output_file)

Visual differences saved to prediction_vs_reference.html

=== Summary ===
Total Words (Reference): 579
Total Insertions (D, from Reference to Prediction): 102
Total Deletions (I, removed from Prediction): 1
Total Replacements (R, in Prediction): 36
WER: 0.2401


In [10]:
import os
import csv
import difflib
from nltk.tokenize import word_tokenize

def flatten_csv_content(file_path):
    """
    Flatten the 'Content' column of a CSV file into a single string, including speaker information.
    """
    with open(file_path, 'r', encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        content = []
        for row in reader:
            speaker = row.get('Speaker', '').strip()
            text = row.get('Content', '').strip()
            if speaker:
                content.append(f"Speaker {speaker}: {text}")
            else:
                content.append(text)
        return ' '.join(content)
    
def compare_and_highlight_changes(reference, prediction):
    """
    Compare reference and prediction texts and highlight changes required to match the prediction to the reference.
    Track insertions (I), deletions (D), and replacements (R).
    """
    ref_words = word_tokenize(reference)
    pred_words = word_tokenize(prediction)
    
    # Use difflib's SequenceMatcher to identify changes between the two tokenized texts
    matcher = difflib.SequenceMatcher(None, ref_words, pred_words)
    diff = matcher.get_opcodes()
    
    highlighted_diff = []
    total_D, total_I, total_R = 0, 0, 0  # Initialize counters for deletions, insertions, and replacements
    
    for tag, i1, i2, j1, j2 in diff:
        if tag == 'equal':
            highlighted_diff.extend(ref_words[i1:i2])
        elif tag == 'replace':
            highlighted_diff.append(f"<span style='background-color: yellow;'>[REPLACE: {' '.join(pred_words[j1:j2])} -> {' '.join(ref_words[i1:i2])}]</span>")
            total_R += max(i2 - i1, j2 - j1)
        elif tag == 'delete':
            highlighted_diff.append(f"<span style='background-color: green;'>[INSERT: {' '.join(ref_words[i1:i2])}]</span>")
            total_D += i2 - i1
        elif tag == 'insert':
            highlighted_diff.append(f"<span style='background-color: red;'>[DELETE: {' '.join(pred_words[j1:j2])}]</span>")
            total_I += j2 - j1
    
    return ' '.join(highlighted_diff), total_D, total_I, total_R

def calculate_wer_and_generate_html(prediction_file, reference_file, output_file):
    """
    Flatten prediction and reference CSVs, calculate WER, and generate an HTML output highlighting the changes needed 
    to match the prediction to the reference.
    """
    # Step 1: Flatten both prediction and reference CSVs into plain text
    reference_text = flatten_csv_content(reference_file)
    prediction_text = flatten_csv_content(prediction_file)
    
    # Step 2: Compare the plain texts and highlight what changes are needed
    highlighted_diff, total_D, total_I, total_R = compare_and_highlight_changes(reference_text, prediction_text)
    
    # Step 3: Calculate WER
    total_words = len(word_tokenize(reference_text))
    wer = (total_D + total_I + total_R) / total_words if total_words > 0 else 0
    
    # Step 4: Save results to an HTML file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"<html><body><div>{highlighted_diff}</div></body></html>")
    
    # Step 5: Print summary
    print(f"Visual differences saved to {output_file}")
    print("\n=== Summary ===")
    print(f"Total Words (Reference): {total_words}")
    print(f"Total Insertions (D, from Reference to Prediction): {total_D}")
    print(f"Total Deletions (I, removed from Prediction): {total_I}")
    print(f"Total Replacements (R, in Prediction): {total_R}")
    print(f"WER: {wer:.4f}")

def process_multiple_files(prediction_dir, reference_dir, output_dir):
    """
    Process multiple CSV files in the specified directories.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Get list of prediction files
    prediction_files = [f for f in os.listdir(prediction_dir) if f.endswith('.csv')]
    
    for pred_file in prediction_files:
        pred_path = os.path.join(prediction_dir, pred_file)
        ref_path = os.path.join(reference_dir, pred_file)
        
        if os.path.exists(ref_path):
            output_file = os.path.join(output_dir, pred_file.replace('.csv', '_diff.html'))
            print(f"Processing {pred_file}...")
            calculate_wer_and_generate_html(pred_path, ref_path, output_file)
        else:
            print(f"Reference file {pred_file} not found in {reference_dir}. Skipping.")

# Example usage:
prediction_dir = "../Zprediction"
reference_dir = "../Ztrue"
output_dir = "./comparison_results"

# Run WER calculation and generate HTML outputs for all files
process_multiple_files(prediction_dir, reference_dir, output_dir)

Processing S301final.csv...
Visual differences saved to ./comparison_results\S301final_diff.html

=== Summary ===
Total Words (Reference): 1071
Total Insertions (D, from Reference to Prediction): 18
Total Deletions (I, removed from Prediction): 3
Total Replacements (R, in Prediction): 31
WER: 0.0486
Processing S302con.csv...
Visual differences saved to ./comparison_results\S302con_diff.html

=== Summary ===
Total Words (Reference): 732
Total Insertions (D, from Reference to Prediction): 103
Total Deletions (I, removed from Prediction): 4
Total Replacements (R, in Prediction): 93
WER: 0.2732


In [29]:
def time_to_seconds(t):
    """
    Convert time format 'HH:MM:SS' to seconds.
    """
    h, m, s = t.split(':')
    return int(h) * 3600 + int(m) * 60 + float(s)

def align_rows_by_time(df_ref, df_pred):
    """
    Align rows from reference and prediction DataFrames based on overlapping time intervals.
    """
    aligned_pairs = []
    ref_intervals = list(zip(df_ref['Start'], df_ref['End'], df_ref.index))
    pred_intervals = list(zip(df_pred['Start'], df_pred['End'], df_pred.index))

    ref_idx = 0
    pred_idx = 0

    while ref_idx < len(ref_intervals) and pred_idx < len(pred_intervals):
        ref_start, ref_end, ref_i = ref_intervals[ref_idx]
        pred_start, pred_end, pred_i = pred_intervals[pred_idx]

        # Skip if start or end times are None
        if ref_start is None or ref_end is None:
            ref_idx += 1
            continue
        if pred_start is None or pred_end is None:
            pred_idx += 1
            continue

        # Ensure intervals have a minimum duration
        if ref_end <= ref_start:
            ref_end = ref_start + 0.001  # Add small epsilon
        if pred_end <= pred_start:
            pred_end = pred_start + 0.001  # Add small epsilon

        # Check for overlap with a small tolerance
        overlap_start = max(ref_start, pred_start)
        overlap_end = min(ref_end, pred_end)

        if overlap_start < overlap_end + 0.001:
            # Intervals overlap
            aligned_pairs.append((df_ref.loc[ref_i], df_pred.loc[pred_i]))
            ref_idx += 1
            pred_idx += 1
        elif ref_end <= pred_start:
            # Reference interval ends before prediction interval starts
            aligned_pairs.append((df_ref.loc[ref_i], None))  # Missing in prediction
            ref_idx += 1
        else:
            # Prediction interval ends before reference interval starts
            aligned_pairs.append((None, df_pred.loc[pred_i]))  # Extra in prediction
            pred_idx += 1

    # Handle any remaining intervals
    while ref_idx < len(ref_intervals):
        ref_start, ref_end, ref_i = ref_intervals[ref_idx]
        aligned_pairs.append((df_ref.loc[ref_i], None))
        ref_idx += 1

    while pred_idx < len(pred_intervals):
        pred_start, pred_end, pred_i = pred_intervals[pred_idx]
        aligned_pairs.append((None, df_pred.loc[pred_i]))
        pred_idx += 1

    return aligned_pairs

def format_diff(ref_text, hyp_text):
    """
    Format differences between reference and hypothesis texts for HTML display.
    """
    ref_words = ref_text.split()
    hyp_words = hyp_text.split()
    sm = difflib.SequenceMatcher(None, hyp_words, ref_words) # Highlight the changes needed in the prediction to match the reference
    opcodes = sm.get_opcodes()
    diff_html = ''
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == 'equal':
            diff_html += ' ' + ' '.join(hyp_words[i1:i2])
        elif tag == 'replace':
            diff_html += ' <span style="background-color: yellow;">[<s>{}</s> → {}]</span>'.format(
                ' '.join(hyp_words[i1:i2]), ' '.join(ref_words[j1:j2]))
        elif tag == 'insert':
            diff_html += ' <span style="background-color: lightgreen;">[{}]</span>'.format(
                ' '.join(ref_words[j1:j2]))
        elif tag == 'delete':
            diff_html += ' <span style="background-color: lightcoral;">[<s>{}</s>]</span>'.format(
                ' '.join(hyp_words[i1:i2]))
    return diff_html.strip()

def format_cell_diff(ref_value, pred_value):
    """
    Format cell differences for columns other than 'Content'.
    """
    if str(ref_value) != str(pred_value):
        if str(pred_value) != '':
        # Modified (e.g., speaker misattribution)
            return f"<td style='background-color: mediumpurple;'>{pred_value} → {ref_value}</td>"
        else:  
            # Missing in prediction (missed speech)
            return f"<td style='background-color: lightcoral;'> → {ref_value}</td>"
    else:
        # No change
        return f"<td>{ref_value}</td>"  # Leave empty

def compare_csv_files(reference_file, prediction_file, output_file):
    """
    Compare two CSV files and generate an HTML output highlighting the differences.
    """
    # Read CSV files
    df_ref = pd.read_csv(reference_file, encoding='utf-8')
    df_pred = pd.read_csv(prediction_file, encoding='utf-8')

    # Convert times to seconds
    df_ref['Start'] = df_ref['Start Time'].apply(time_to_seconds)
    df_ref['End'] = df_ref['End Time'].apply(time_to_seconds)
    # Adjust zero-length intervals
    df_ref.loc[df_ref['End'] <= df_ref['Start'], 'End'] = df_ref['Start'] + 0.001

    df_pred['Start'] = df_pred['Start Time'].apply(time_to_seconds)
    df_pred['End'] = df_pred['End Time'].apply(time_to_seconds)
    # Adjust zero-length intervals
    df_pred.loc[df_pred['End'] <= df_pred['Start'], 'End'] = df_pred['Start'] + 0.001

    # Align rows
    aligned_pairs = align_rows_by_time(df_ref, df_pred)

    # Prepare HTML output
    html_output = "<html><head><style>"
    html_output += "table {border-collapse: collapse; width: 100%;}"
    html_output += "th, td {border: 1px solid black; padding: 5px; text-align: left;}"
    html_output += "</style></head><body>"
    html_output += "<table>"
    html_output += "<tr><th>Index</th><th>Start Time</th><th>End Time</th><th>Speaker</th><th>Content</th></tr>"

    for idx, (ref_row, pred_row) in enumerate(aligned_pairs):
        html_output += "<tr>"
        # Index
        html_output += f"<td>{idx + 1}</td>"

        # Start Time
        ref_start_time = ref_row['Start Time'] if ref_row is not None else ''
        pred_start_time = pred_row['Start Time'] if pred_row is not None else ''
        html_output += format_cell_diff(ref_start_time, pred_start_time)

        # End Time
        ref_end_time = ref_row['End Time'] if ref_row is not None else ''
        pred_end_time = pred_row['End Time'] if pred_row is not None else ''
        html_output += format_cell_diff(ref_end_time, pred_end_time)

        # Speaker
        ref_speaker = ref_row['Speaker'] if ref_row is not None else ''
        pred_speaker = pred_row['Speaker'] if pred_row is not None else ''
        html_output += format_cell_diff(ref_speaker, pred_speaker)

        # Content
        ref_content = ref_row['Content'] if ref_row is not None else ''
        pred_content = pred_row['Content'] if pred_row is not None else ''
        content_diff_html = format_diff(ref_content, pred_content)
        html_output += f"<td>{content_diff_html}</td>"

        html_output += "</tr>"

    html_output += "</table>"

    html_output += "</body></html>"

    # Save HTML output
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_output)

    print(f"Comparison HTML file saved as {output_file}")

# Example usage:
reference_file = '../S302con_C.csv'
prediction_file = '../S302con.csv'
output_file = 'comparison_output.html'

compare_csv_files(reference_file, prediction_file, output_file)


Comparison HTML file saved as comparison_output.html


# others

In [5]:
from evaluate import load

wer_metric = load("wer")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def calculate_wer_for_csv_a(hypothesis_file, reference_file):
    """
    Calculate the WER for the entire content of the 'Content' column in a CSV file.
    """
    with open(hypothesis_file, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        hyp = ' '.join(row['Content'] for row in reader)

    with open(reference_file, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        ref = ' '.join(row['Content'] for row in reader)

    return wer_metric.compute(references=[ref], predictions=[hyp])

In [7]:
reference_file= "../S301final_C.csv"
hypothesis_file  = "../results/Compassion/S301final.csv"

calculate_wer_for_csv_a(hypothesis_file, reference_file)

0.03026634382566586

In [8]:
hypothesis_file = "../results/Compassion/S302con.csv"
reference_file = "../S302con_C.csv"

calculate_wer_for_csv_a(hypothesis_file, reference_file)

0.23446893787575152

# Problems with excel format

In [16]:
def convert_csv_semicolon_to_comma(input_file, output_file):
    """
    Converts a CSV file with semicolons as delimiters to a comma-separated CSV.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile, delimiter=';')
        writer = csv.writer(outfile, delimiter=',')
        
        for row in reader:
            writer.writerow(row)

# Example usage
input_file = "../S302con_C.csv"
output_file = 'test.csv'

convert_csv_semicolon_to_comma(input_file, output_file)

In [7]:
def find_problematic_line(input_file):
    """
    Reads a file line by line to find the line that causes a UnicodeDecodeError.
    """
    with open(input_file, 'r', encoding='utf-8', errors='replace') as file:
        for line_number, line in enumerate(file, start=1):
            # If the line contains the problematic character, print the line number and content
            if '\ufffd' in line:  # '\ufffd' is the replacement character for decoding errors
                print(f"Problematic line at line {line_number}: {line}")
                break
        else:
            print("No problematic lines found.")

# Example usage
input_file = "../S302con_C.csv"
find_problematic_line(input_file)


Problematic line at line 22: 20;Compassion;S302con;302;00:02:01;00:02:09;3;but then in the second part the time you had the eyes closed was longer in the first part or�? 



In [16]:
def remove_bom_from_file(file_path):
    """
    Removes the Byte Order Mark (BOM) from a file if it exists.
    """
    with open(file_path, 'rb') as file:
        content = file.read()

    # Check for BOM (UTF-8 BOM is \xef\xbb\xbf)
    if content.startswith(b'\xef\xbb\xbf'):
        print(f"BOM found in {file_path}, removing it...")
        content = content[3:]  # Remove the first three bytes (BOM)
        with open(file_path, 'wb') as file:
            file.write(content)
        print(f"BOM successfully removed from {file_path}.")
    else:
        print(f"No BOM found in {file_path}.")

# Example usage
file_path = "../S302con_C.csv"
remove_bom_from_file(file_path)

No BOM found in ../S302con_C.csv.
