In [1]:
import os
import docx
import re
import csv
from collections import defaultdict

<h3> Transcripts to CSVs </h3>

In [32]:


def clean_subtitle(text):
    # Remove asterisks and text between brackets
    text = re.sub(r'\*|\[.*?\]', '', text)
    text = re.sub(r'[\u2012\u2013\u2014\u2015]', '-', text)
    text = text.replace(";",":")
    return text.strip()

def convert_language_to_csv(language, input_file, subtitles):
    doc = docx.Document(input_file)
    subtitle_block = ""
    current_time_range = None
    current_highlight_color = None
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        text = clean_subtitle(text)  # Clean the subtitle text
        # Check if the paragraph contains a time range
        time_range_match = re.match(r'^(\d+[:;]\d+(?::\d+){0,2}\s*-\s*\d+[:;]\d+(?::\d+){0,2})\s*\**', text)
        if time_range_match:
            time_range = time_range_match.group(1)
            # If there's a previous subtitle block, add it to the dictionary
            if subtitle_block and current_time_range:
                subtitles[current_time_range][language] = {
                    "text": subtitle_block,
                    "highlight_color": current_highlight_color,
                    "comments": ""  # Initialize an empty "Comments" column
                }
            subtitle_block = ""
            current_time_range = time_range
            current_highlight_color = None  # Reset the highlight color for a new time range
        else:
            # Extract the highlight color for each run in the paragraph
            for run in paragraph.runs:
                if run.font.highlight_color is not None:
                    current_highlight_color = run.font.highlight_color
            subtitle_block += text + "\n"

    # Add the last subtitle block to the dictionary
    if subtitle_block and current_time_range:
        subtitles[current_time_range][language] = {
            "text": subtitle_block,
            "highlight_color": current_highlight_color,
            "comments": ""  # Initialize an empty "Comments" column
        }

def interview_to_csv(input_files, output_csv_file):
    # Initialize a dictionary to store subtitles for all languages
    all_subtitles = defaultdict(lambda: defaultdict(str))

    # Iterate through each language and convert to CSV
    for language, input_file in input_files.items():
        convert_language_to_csv(language, input_file, all_subtitles)

    # Write the combined subtitles to the CSV file
    with open(output_csv_file, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        # Write the header row
        header = ["Start Time", "End Time"] + list(input_files.keys()) + ["Highlight", "Comments"]
        csv_writer.writerow(header)
        # Write the data rows with all languages stacked
        for time_range, language_subtitles in all_subtitles.items():
            try:
                start_time, end_time = map(str.strip, time_range.split("-"))
                highlight_color = language_subtitles[list(input_files.keys())[0]]["highlight_color"]
                row = [start_time, end_time] + [clean_subtitle(language_subtitles.get(lang, {"text": ""})["text"]) for lang in input_files.keys()] + [highlight_color, ""]
                csv_writer.writerow(row)
            except:
                print("broken: ", language_subtitles)

    print(f"Conversion completed. Data saved to {output_csv_file}")

# # Test
# input_files = {
#     "English": "../02_Transcripts/Wayuu/Transcripts (EN)/dunas (EN).docx",
#     "Spanish": "../02_Transcripts/Wayuu/Transcripts (ES)/dunas (ES).docx"
# }
# output_csv_file = "../04_Interview CSV/dunas.csv"

# # Convert all languages to CSV using the pipeline function
# interview_to_csv(input_files, output_csv_file)


In [33]:
#run the processing pipeline overall interviews

transcripts_dir = "../02_Transcripts/Wayuu/"

def get_languages_and_interviews(transcripts_dir):
    interviews_dict = {}  # Dictionary to store interviews and their languages and file paths

    for language_folder in os.listdir(transcripts_dir):
        language_dir_path = os.path.join(transcripts_dir, language_folder)
        
        # Check if it's a valid language folder (e.g., "Transcripts (EN)")
        if os.path.isdir(language_dir_path) and language_folder.startswith("Transcripts ("):
            language = language_folder.split(" (")[1].split(")")[0]
            
            for interview_file in os.listdir(language_dir_path):
                if interview_file.endswith(".docx") and not os.path.basename(interview_file).startswith("~$"):
                    interview_name, ext = os.path.splitext(interview_file)
                    interview_code = interview_name.split("(")[0].strip()  # Extract the interview code (e.g., "dunas")
                    interview_path = os.path.join(language_dir_path, interview_file)
                    
                    # Create or update the entry for this interview code
                    if interview_code not in interviews_dict:
                        interviews_dict[interview_code] = {}
                    
                    # Append language and file path to the input_files dictionary
                    interviews_dict[interview_code][language] = interview_path

    return interviews_dict    

def process_all_interviews(transcripts_dir, output_csv_dir):
    # Get the combined dictionary of languages and interviews
    interviews_dict = get_languages_and_interviews(transcripts_dir)

    #print(interviews_dict)
    
    for interview_code in interviews_dict.keys():
        print(interview_code)
        # Process the interview using the input_files and interview_code
        output_csv_file = os.path.join(output_csv_dir, f"{interview_code}.csv")
        interview_to_csv(interviews_dict[interview_code], output_csv_file)
        

# Output directory for CSV files
output_csv_dir = "../04_Interview CSV/"

# Process all interviews in the transcripts directory
process_all_interviews(transcripts_dir, output_csv_dir)


weildler inside
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'externa.\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'marítimo.\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'transformativos\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'El\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
Conversion completed. Data saved to ../04_Interview CSV/weildler inside.csv
Neko urbana
Conversion completed. Data saved to ../04_Interview CSV/Neko urbana.csv
Abuelo pescador
Conversion completed. Data saved to ../04_Interview CSV/Abuelo pescador.csv
abuela pescadora
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'Si, como un intercambio de regalos\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
Conversion completed. Data saved to ../04_Interview CSV/abuela pescadora.csv
tejed

broken:  defaultdict(<class 'str'>, {'ES': {'text': 'Cuando esta agua se acabe vamos a sufrir mucho\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
Conversion completed. Data saved to ../04_Interview CSV/neko father.csv
Neko piedra
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'Y la de Magalys\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'que Wuimpumiun, la alta Guajira\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
broken:  defaultdict(<class 'str'>, {'ES': {'text': 'el ave era una persona y escucharon las diferentes aves y estaba\n\n', 'highlight_color': None, 'comments': ''}, 'EN': ''})
Conversion completed. Data saved to ../04_Interview CSV/Neko piedra.csv
palabrero luis
Conversion completed. Data saved to ../04_Interview CSV/palabrero luis.csv
weildler outside
Conversion completed. Data saved to ../04_Interview CSV/weildler outside.csv
Ana weaving
broken:  defaultdict(<class 'str'>, {'ES

<h3>CSVs to SRTs</h3>