In [6]:
import os
import pandas as pd
import csv

from analysis_helpers import organize_csv_files_by_dir, clean_files

## Organize the reference files by experiment (if they are mixed together in another folder)

In [7]:
# Folder containing the reference CSV files (presumably from several
# experiments mixed together -- see the section heading).
source_directory = "../../performance_evaluation/references"
# Root folder that receives the copies; the printed output shows files landing
# in a per-experiment sub-folder such as "OBE1".
destination_directory = "../../interviews_corrected/raw"

# Project helper from analysis_helpers; prints each source path and a
# "Copied <file> to <folder>" line for every CSV it copies.
organize_csv_files_by_dir(source_directory, destination_directory)

../../performance_evaluation/references\ID 05.csv
Copied ID 05.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 08.csv
Copied Id 08.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 13.csv
Copied Id 13.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 13b.csv
Copied Id 13b.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 14.csv
Copied Id 14.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 15.csv
Copied Id 15.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 16.csv
Copied Id 16.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 17.csv
Copied Id 17.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 18.csv
Copied Id 18.csv to ../../interviews_corrected/raw\OBE1
../../performance_evaluation/references\Id 1

## 1. Remove fillers and repetitions

In [5]:
# Input: the per-experiment files under "raw"; output goes to "clean1".
# NOTE: these names overwrite the variables of the same name set in the
# previous cell.
source_directory = "../../interviews_corrected/raw"
destination_directory = "../../interviews_corrected/clean1"

# List of common filler words to remove
filler_words = ["uh", "um", "hmm"]

# Project helper from analysis_helpers; presumably reads every file under
# source_directory and writes a cleaned copy under destination_directory
# -- TODO confirm against its implementation.
clean_files(source_directory, destination_directory, filler_words)


In [14]:
def convert_csv_to_dialogue_merge_speakers(input_csv):
    """
    Convert a CSV transcript into a dialogue-style text file.

    Only the 'Speaker' and 'Content' columns are used. Consecutive rows from
    the same speaker are merged into one "Speaker <name>: <text>" paragraph,
    and every paragraph is followed by a blank line. The output is written
    next to the input, with the extension replaced by '.txt'.

    Rows whose speaker or content is empty -- including rows that are shorter
    than the header, for which csv.DictReader supplies None -- are skipped.
    If the 'Speaker' column is absent altogether, the speaker is 'Unknown'.

    Errors are reported with print() instead of being raised, so the function
    always returns None.

    Args:
        input_csv (str): Path to the input CSV file.
    """
    output_txt = os.path.splitext(input_csv)[0] + '.txt'
    try:
        # newline='' is what the csv module expects for its input files, so
        # that newlines embedded in quoted fields are parsed correctly.
        with open(input_csv, mode='r', encoding='utf-8', newline='') as csvfile, \
             open(output_txt, mode='w', encoding='utf-8') as txtfile:

            reader = csv.DictReader(csvfile)

            previous_speaker = None
            dialogue_parts = []

            for row in reader:
                # A short row yields None for the missing columns; `or ''`
                # turns that into an empty string instead of crashing on
                # None.strip() and aborting the whole conversion.
                speaker = (row.get('Speaker', 'Unknown') or '').strip()
                content = (row.get('Content', '') or '').strip()

                if not speaker or not content:
                    continue  # Skip rows with missing speaker or content

                if speaker == previous_speaker:
                    # Same speaker keeps talking: extend the current turn.
                    dialogue_parts.append(content)
                    continue

                # Speaker changed: flush the finished turn, if there is one.
                if previous_speaker is not None:
                    txtfile.write(
                        f"Speaker {previous_speaker}: {' '.join(dialogue_parts)}\n\n"
                    )

                previous_speaker = speaker
                dialogue_parts = [content]

            # Flush the last turn, which no speaker change has written yet.
            if previous_speaker is not None and dialogue_parts:
                txtfile.write(
                    f"Speaker {previous_speaker}: {' '.join(dialogue_parts)}\n\n"
                )

        print(f"Successfully converted and merged '{input_csv}' to '{output_txt}'.")

    except FileNotFoundError:
        print(f"Error: The file '{input_csv}' does not exist.")
    except Exception as e:
        # Deliberately best-effort: report the problem without stopping the
        # notebook.
        print(f"An error occurred: {e}")


In [15]:
# Single interview to convert; the .txt output is written next to this file.
# NOTE(review): this path starts with "../" whereas the earlier cells use
# "../../" for the same interviews_corrected tree -- confirm which working
# directory the notebook is meant to run from.
input_csv_path = "../interviews_corrected/raw/Compassion/S301final.csv"

convert_csv_to_dialogue_merge_speakers(input_csv_path)

Successfully converted and merged '../interviews_corrected/raw/Compassion/S301final.csv' to '../interviews_corrected/raw/Compassion/S301final.txt'.


# Boff

In [8]:
import spacy
#!python -m spacy download en_core_web_sm

In [21]:
# Load spaCy's small English model once at module level; remove_filler_words()
# reads this global `nlp`. The en_core_web_sm model must already be installed
# (see the commented-out download command next to the spacy import).
nlp = spacy.load("en_core_web_sm")

def remove_filler_words(text, FILLER_WORDS):
    """
    Remove filler words/phrases from `text` and return the cleaned string.

    The text is tokenised with the module-level spaCy pipeline `nlp`. At each
    token position the longest filler phrase that matches (case-insensitively,
    token by token) is dropped; every other token is kept together with its
    original trailing whitespace, so punctuation spacing and line breaks of
    the input are preserved.

    Args:
        text (str): Text to clean.
        FILLER_WORDS (iterable of str): Filler words or multi-word phrases
            (words separated by whitespace). Case and repeated whitespace are
            ignored; empty or whitespace-only entries are skipped.

    Returns:
        str: `text` without the filler tokens.
    """
    # Normalise the fillers once, outside the token loop:
    #  * lower-case, so "Um" in the list still matches "um" in the text;
    #  * split on whitespace, so "you  know" equals "you know";
    #  * drop empty entries -- a zero-length phrase would "match" without
    #    advancing the cursor and loop forever.
    # Longest phrases come first so multi-word fillers win over their prefixes.
    filler_phrases = sorted(
        {tuple(filler.lower().split()) for filler in FILLER_WORDS} - {()},
        key=len,
        reverse=True,
    )

    doc = nlp(text)
    lowered = [token.text.lower() for token in doc]

    kept = []
    i = 0
    while i < len(doc):
        for phrase in filler_phrases:
            # A slice that runs past the end is shorter than `phrase` and
            # therefore never compares equal.
            if tuple(lowered[i:i + len(phrase)]) == phrase:
                i += len(phrase)
                break
        else:
            # text_with_ws keeps the token's trailing space, so joining with
            # "" reproduces the input spacing rather than inserting a space
            # before every punctuation mark.
            kept.append(doc[i].text_with_ws)
            i += 1

    return "".join(kept)

In [22]:
# Fillers to strip from the transcript.
FILLER_WORDS = ["um", "uh", "hmm"]

# Dialogue file to clean; the result is saved beside it with a "_cleaned" suffix.
input_txt_path = "../interviews_corrected/raw/Compassion/S301final.txt"
output_txt_path = os.path.splitext(input_txt_path)[0] + '_cleaned.txt'

# Read the whole transcript, then release the file before processing it.
with open(input_txt_path, mode='r', encoding='utf-8') as src:
    text = src.read()

cleaned_text = remove_filler_words(text, FILLER_WORDS)

with open(output_txt_path, mode='w', encoding='utf-8') as dst:
    dst.write(cleaned_text)