# Word (.docx) to CSV

In [6]:
import csv
import docx # pip install python-docx
import unicodedata

In [32]:
# Helper: turn one blank-line-delimited block of paragraphs into a dialogue row
def _parse_dialogue_block(lines, separator=" : "):
    """Join the paragraphs of one block and split them into speaker and text.

    Args:
        lines: Non-empty, already normalized paragraph texts of one block.
        separator: Marker between the speaker name and what was said.

    Returns:
        A ``{"Speaker": ..., "Text": ...}`` dict, or ``None`` when the joined
        block does not contain ``separator``.
    """
    combined_text = ' '.join(lines).strip()
    # partition() cuts at the FIRST separator only, so a later occurrence
    # inside the spoken text stays part of "Text".
    speaker, found, dialogue_text = combined_text.partition(separator)
    if not found:
        return None
    return {"Speaker": speaker.strip(), "Text": dialogue_text.strip()}


# Function to read the Word document and extract dialogue
def extract_dialogue_from_docx(docx_file, separator=" : "):
    """Extract speaker/text pairs from a Word transcript.

    Consecutive non-empty paragraphs are accumulated into one block; an empty
    (or whitespace-only) paragraph ends the block. The separator is searched
    only once a block is complete, so a speaker turn may span several
    paragraphs and may itself contain the separator again.

    Args:
        docx_file: Whatever ``docx.Document`` accepts for the Word file.
        separator: Marker between speaker and text. Defaults to " : ".

    Returns:
        A list of ``{"Speaker": str, "Text": str}`` dicts in document order.
        Blocks that do not contain ``separator`` are skipped.
    """
    # Load the Word document
    doc = docx.Document(docx_file)
    dialogues = []
    current_text = []

    def flush_block():
        # Close the block accumulated so far and keep it if it parses.
        if current_text:
            row = _parse_dialogue_block(current_text, separator)
            if row is not None:
                dialogues.append(row)
            current_text.clear()

    for paragraph in doc.paragraphs:
        # NFKC folds compatibility characters (e.g. a no-break space in front
        # of the colon becomes a plain space) while keeping accented letters
        # composed. NFKD would split "é" into "e" + a combining accent, which
        # then leaks into the CSV. Strip AFTER normalizing, because the
        # normalization itself can introduce leading/trailing spaces.
        text = unicodedata.normalize('NFKC', paragraph.text).strip()

        if text:
            # Accumulate text for the current speaker
            current_text.append(text)
        else:
            # An empty paragraph marks the end of the current block
            flush_block()

    # The last block has no trailing empty paragraph to close it
    flush_block()

    return dialogues

# Function to save the extracted dialogue to a CSV file
def save_to_csv(data, output_file):
    """Write dialogue rows to ``output_file`` as a UTF-8 CSV with a header line.

    Args:
        data: Iterable of dicts keyed by "Speaker" and "Text".
        output_file: Path of the CSV file; it is opened in write mode, so an
            existing file is overwritten.
    """
    columns = ["Speaker", "Text"]
    # newline='' leaves line-ending handling to the csv module.
    with open(output_file, mode='w', newline='', encoding='utf-8') as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(data)

In [33]:
# Output CSV and input Word transcript for this run; both paths are relative
# and therefore resolved against the kernel's current working directory.
csv_file = 'output-interview.csv'
word_file = '../../data/text_julian/script_interview_clinique_1.docx'

# Parse the transcript into a list of {"Speaker", "Text"} rows
dialogues = extract_dialogue_from_docx(word_file)

# Write the rows to the CSV file (overwrites any previous output)
save_to_csv(dialogues, csv_file)