In [209]:
import os
import docx
import re
import csv
import pandas as pd
from fuzzywuzzy import fuzz
from collections import defaultdict
from difflib import SequenceMatcher, get_close_matches


<h3> Transcripts to CSVs </h3>

In [319]:


def clean_subtitle(text):
    # Remove asterisks and text between brackets
    text = re.sub(r'\*|\[.*?\]', '', text)
    text = re.sub(r'[\u2012\u2013\u2014\u2015]', '-', text)
    text = text.replace(";",":")
    return text.strip()

def convert_language_to_csv(language, input_file, subtitles):
    doc = docx.Document(input_file)
    subtitle_block = ""
    current_time_range = None
    current_highlight_color = None
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        text = clean_subtitle(text)  # Clean the subtitle text
        # Check if the paragraph contains a time range
        time_range_match = re.match(r'^(\d+[:;]\d+(?::\d+){0,2}\s*-\s*\d+[:;]\d+(?::\d+){0,2})\s*\**', text)
        if time_range_match:
            time_range = time_range_match.group(1)
            # If there's a previous subtitle block, add it to the dictionary
            if subtitle_block and current_time_range:
                subtitles[current_time_range][language] = {
                    "text": subtitle_block,
                    "highlight_color": current_highlight_color,
                    "comments": ""  # Initialize an empty "Comments" column
                }
            subtitle_block = ""
            current_time_range = time_range
            current_highlight_color = None  # Reset the highlight color for a new time range
        else:
            # Extract the highlight color for each run in the paragraph
            for run in paragraph.runs:
                if run.font.highlight_color is not None:
                    current_highlight_color = run.font.highlight_color
            subtitle_block += text + "\n"

    # Add the last subtitle block to the dictionary
    if subtitle_block and current_time_range:
        subtitles[current_time_range][language] = {
            "text": subtitle_block,
            "highlight_color": current_highlight_color,
            "comments": ""  # Initialize an empty "Comments" column
        }

def interview_to_csv(input_files, output_csv_file):
    # Initialize a dictionary to store subtitles for all languages
    all_subtitles = defaultdict(lambda: defaultdict(str))

    # Iterate through each language and convert to CSV
    for language, input_file in input_files.items():
        convert_language_to_csv(language, input_file, all_subtitles)

    # Write the combined subtitles to the CSV file
    with open(output_csv_file, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        # Write the header row
        header = ["Start Time", "End Time"] + list(input_files.keys()) + ["Highlight", "Comments"]
        csv_writer.writerow(header)
        # Write the data rows with all languages stacked
        for time_range, language_subtitles in all_subtitles.items():
            try:
                start_time, end_time = map(str.strip, time_range.split("-"))
                highlight_color = language_subtitles[list(input_files.keys())[0]]["highlight_color"]
                row = [start_time, end_time] + [clean_subtitle(language_subtitles.get(lang, {"text": ""})["text"]) for lang in input_files.keys()] + [highlight_color, ""]
                csv_writer.writerow(row)
            except:
                print("broken: ", time_range)#,language_subtitles)

    print(f"Conversion completed. Data saved to {output_csv_file}")

# # Test
# input_files = {
#     "English": "../02_Transcripts/Wayuu/Transcripts (EN)/dunas (EN).docx",
#     "Spanish": "../02_Transcripts/Wayuu/Transcripts (ES)/dunas (ES).docx"
# }
# output_csv_file = "../04_Interview CSV/dunas.csv"

# # Convert all languages to CSV using the pipeline function
# interview_to_csv(input_files, output_csv_file)


In [320]:
#run the processing pipeline overall interviews

transcripts_dir = "../02_Transcripts/Wayuu/"

def get_languages_and_interviews(transcripts_dir):
    interviews_dict = {}  # Dictionary to store interviews and their languages and file paths

    for language_folder in os.listdir(transcripts_dir):
        language_dir_path = os.path.join(transcripts_dir, language_folder)
        
        # Check if it's a valid language folder (e.g., "Transcripts (EN)")
        if os.path.isdir(language_dir_path) and language_folder.startswith("Transcripts ("):
            language = language_folder.split(" (")[1].split(")")[0]
            
            for interview_file in os.listdir(language_dir_path):
                if interview_file.endswith(".docx") and not os.path.basename(interview_file).startswith("~$"):
                    interview_name, ext = os.path.splitext(interview_file)
                    interview_code = interview_name.split("(")[0].strip().lower()  # Extract the interview code (e.g., "dunas")
                    interview_path = os.path.join(language_dir_path, interview_file)
                    
                    # Create or update the entry for this interview code
                    if interview_code not in interviews_dict:
                        interviews_dict[interview_code] = {}
                    
                    # Append language and file path to the input_files dictionary
                    interviews_dict[interview_code][language] = interview_path

    return interviews_dict    

def process_all_interviews(transcripts_dir, output_csv_dir):
    # Get the combined dictionary of languages and interviews
    interviews_dict = get_languages_and_interviews(transcripts_dir)
    
    for interview_code in interviews_dict.keys():
        print(interview_code)
        # Process the interview using the input_files and interview_code
        output_csv_file = os.path.join(output_csv_dir, f"{interview_code}.csv")
        interview_to_csv(interviews_dict[interview_code], output_csv_file)
        

# Output directory for CSV files
output_csv_dir = "../04_Interview CSV/Wayuu/"

# Process all interviews in the transcripts directory
process_all_interviews(transcripts_dir, output_csv_dir)


weildler inside
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/weildler inside.csv
neko urbana
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/neko urbana.csv
abuelo pescador
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/abuelo pescador.csv
abuela pescadora
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/abuela pescadora.csv
tejedora abuela
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/tejedora abuela.csv
joaquin
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/joaquin.csv
healing woman
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/healing woman.csv
pinta abuela
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/pinta abuela.csv
salinero oscar
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/salinero oscar.csv
magalys hammock
Conversion completed. Data saved to ../04_Interview CSV/Wayuu/magalys hammock.csv
neko father
Conversion completed. Data saved to ../04_

<h3>CSVs to SRTs</h3>

In [337]:
import os
import pandas as pd

# Input and output directories
csv_dir = "../04_Interview CSV/Wayuu/"
subtitles_dir = "../05_Subtitles/Wayuu/"

# Function to convert a CSV file to SRT for multiple languages
def csv_to_srt_multiple_languages(csv_file):
    print(csv_file)
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Get the language columns dynamically (excluding specific columns)
    exclude_columns = ["Start Time", "End Time", "Highlight", "Comments"]
    language_columns = [col for col in df.columns if col not in exclude_columns]
    
    print(language_columns)

    # Initialize a dictionary to store lines for each language
    language_srt_lines = {col: [] for col in language_columns}
    combined_srt_lines = []
    counter = 1

    # Loop through each row in the CSV
    for index, row in df.iterrows():
        # Format the timestamps in SRT format
        start_time = f"{counter}\n{row['Start Time'].replace('.', ',')} --> {row['End Time'].replace('.', ',')}"

        # Append the text for each language to their respective lines
        for lang in language_columns:
            language_srt_lines[lang].append(start_time)
            language_srt_lines[lang].append(f"{row[lang]}\n")

        # Append the text to the combined SRT
        combined_srt_lines.append(start_time)
        combined_srt_lines.append('\n----\n'.join([str(row[lang]) for lang in language_columns]) + '\n')

        # Increment the counter
        counter += 1

    # Determine the output SRT file paths for combined and individual SRTs
    base_filename = os.path.splitext(os.path.basename(csv_file))[0]
    combined_langs = '-'.join(language_columns)
    combined_srt_filename = f"{base_filename} ({combined_langs}).srt"
    combined_srt_dir = os.path.join(subtitles_dir,"SRT Export ("+combined_langs+")")
    os.makedirs(combined_srt_dir, exist_ok=True)

    combined_srt_path= os.path.join(combined_srt_dir, combined_srt_filename)

    # Create the folder for combined SRT
    os.makedirs(subtitles_dir, exist_ok=True)

    # Write the combined SRT file
    with open(combined_srt_path, 'w', encoding='utf-8') as combined_srt_file:
        combined_srt_file.write('\n'.join(combined_srt_lines))

    # Create folders for each language and write the individual SRT files
    for lang in language_columns:
        lang_output_dir = os.path.join(subtitles_dir, f"SRT Export ({lang})")
        os.makedirs(lang_output_dir, exist_ok=True)
        lang_srt_filename = f"{base_filename} ({lang}).srt"
        lang_srt_path = os.path.join(lang_output_dir, lang_srt_filename)
        with open(lang_srt_path, 'w', encoding='utf-8') as lang_srt_file:
            lang_srt_file.write('\n'.join(language_srt_lines[lang]))

    #print(f'Combined SRT file "{combined_srt_path}" and individual SRT files have been created.')


# Process all CSV files in the input directory
for filename in os.listdir(csv_dir):
    if filename.endswith('.csv'):
        csv_file = os.path.join(csv_dir, filename)
        csv_to_srt_multiple_languages(csv_file)


../04_Interview CSV/Wayuu/salinero oscar.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/dunas.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Neko piedra.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/tejedora abuela.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Ana weaving.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Healing Woman.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Wolunka.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Magalys electrico.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/abuela pescadora.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/romelia.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Neko weaving.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/neko father.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Joaquin.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Neko urbana.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/salinero young.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Eliana.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/weildler pre.csv
['EN', 'ES']
../04_Interview CSV/Wayuu/Abuelo pescador.csv
['EN', 'ES']
../0

<h3> Check status of txts, csv, xmls, srts </h3>


In [332]:

def getCodesPresent(folderpath,ext):
    codes = []
    for filename in os.listdir(folderpath):
        if filename.endswith(ext) and os.path.getsize(os.path.join(folderpath,filename))>1000:
            code=filename.lower().split(ext.lower())[0].strip()
            codes.append(code)
    return codes

enTexts=getCodesPresent('../02_Transcripts/Wayuu/Transcripts (EN)/',' (EN).docx')
esTexts=getCodesPresent('../02_Transcripts/Wayuu/Transcripts (ES)/',' (ES).docx')
xmls=getCodesPresent('../03_Interview XML/Wayuu/','- synced.xml')
enSubs=getCodesPresent('../05_Subtitles/Wayuu/SRT Export (EN)/',' (EN).srt')
esSubs=getCodesPresent('../05_Subtitles/Wayuu/SRT Export (ES)/',' (ES).srt')

def getUndefinedCount(code):
    try:
        docx_file='../02_Transcripts/Wayuu/Transcripts (EN)/'+code+' (EN).docx'  
        search_string="xxx"
        count = 0
        # Load the DOCX document
        doc = docx.Document(docx_file)
        # Iterate through paragraphs and search for the string (case-insensitive)
        for paragraph in doc.paragraphs:
            if search_string.lower() in paragraph.text.lower():
                count += paragraph.text.lower().count(search_string.lower())
        return count
    except:
        return ""

lists_dict ={"EN Transcripts":enTexts,
            "ES Transcripts":esTexts,
            "EN Subs":enSubs,
            "ES Subs":esSubs,
            "XMLs":xmls}
    

# Create a DataFrame with unique codes
unique_codes = list(set(code for sublist in lists_dict.values() for code in sublist))
df = pd.DataFrame({'code': unique_codes})

# Add columns for each list with True/False values
for list_name, code_list in lists_dict.items():
    df[list_name] = df['code'].isin(code_list)

# Add column for count of XXX in document    
df["Undefined Count"]=df["code"].apply(getUndefinedCount)
    
# Set 'code' as the index
df.set_index('code', inplace=True)
 
# Print the DataFrame
print(df)


                   EN Transcripts  ES Transcripts  EN Subs  ES Subs   XMLs  \
code                                                                         
magalys hammock              True            True     True     True   True   
palabrero luis               True            True     True     True   True   
weildler pre                 True            True     True     True  False   
tejedora abuela              True            True     True     True   True   
pinta abuela                 True            True     True     True   True   
salinero oscar               True            True     True     True   True   
neko weaving                 True            True     True     True   True   
eliana                       True            True     True     True   True   
salinero young               True            True     True     True   True   
romelia                      True            True     True     True   True   
neko piedra                  True            True     True     T

<h3> Convert Script to Csv </h3>

In [None]:
def concatenate_transcript_csvs(folder_path):

    # Get a list of all CSV files in the folder
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

    # Initialize an empty DataFrame
    concatenated_df = pd.DataFrame()

    # Iterate through CSV files and concatenate them
    for csv_file in csv_files:
        interview_code = os.path.splitext(csv_file)[0]  # Extract interview code from file name
        csv_path = os.path.join(folder_path, csv_file)
        df = pd.read_csv(csv_path)

        # Add an "Interview" column with the interview code
        df['Interview'] = interview_code

        # Concatenate the dataframes
        concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    return concatenated_df

# Example usage:
folder_path = "../04_Interview CSV/Wayuu/"
concatenated_transcripts = concatenate_transcript_csvs(folder_path)
print(concatenated_transcripts)

In [207]:


# Function to convert different time formats to seconds (unchanged)
def convert_time_to_seconds(time_str):
    try:
        if ':' in time_str:
            parts = time_str.split(':')
            if len(parts) == 2:
                minutes, seconds = map(int, parts)
                return minutes * 60 + seconds

        parts = time_str.split(':')
        if len(parts) == 3 or len(parts) == 4:
            hours, minutes, seconds, *milliseconds = map(int, parts)
            total_seconds = hours * 3600 + minutes * 60 + seconds
            if milliseconds:
                total_seconds += milliseconds[0] / 100
            return total_seconds
    except ValueError:
        pass

    raise ValueError(f"Unrecognized time format: {time_str}")

# Function to format time in MM:SS (unchanged)
def format_time(seconds):
    minutes, seconds = divmod(seconds, 60)
    return f"{minutes:02d}:{seconds:02d}"

def find_start_character_index(string, subsequence, cutoff=0.6):

    # Initialize the start index.
    start_index = -1

    # Iterate through the string using a sliding window approach.
    for i in range(len(string) - len(subsequence) + 1):
        window = string[i:i + len(subsequence)]

        # Use difflib's SequenceMatcher to calculate similarity.
        similarity = SequenceMatcher(None, subsequence, window).ratio()

        # If the similarity exceeds the cutoff, consider it a match.
        if similarity >= cutoff:
            start_index = i
            break

    return start_index

# Function to match script text with transcript
def match_script_with_transcript(docx_file_path, transcript_csv_path):
    doc = docx.Document(docx_file_path)
    transcript_df = pd.read_csv(transcript_csv_path)
    print(transcript_df)

    speaker = ""
    transcript_lines = []

    for para in doc.paragraphs:
        text = para.text.strip()

        if text:
            match = re.match(r'^([A-Za-z]+)$', text)
            if match:
                speaker = match.group(1)
            else:
                closest_match = get_close_matches(text, transcript_df["EN"], n=1, cutoff=0.6)
                if closest_match:
                    matched_row = transcript_df[transcript_df["EN"] == closest_match[0]].iloc[0]
                    start_time_str = matched_row["Start Time"]
                    start_time = convert_time_to_seconds(start_time_str)
                    print(matched_row["EN"],text)
                    transcript_duration=convert_time_to_seconds(matched_row["End Time"]) - convert_time_to_seconds(matched_row["Start Time"])                    
                    offset = find_start_character_index(matched_row["EN"],text[:8], cutoff=.8)
                    seconds_per_letter =transcript_duration / len(matched_row["EN"])
                    start_time+=offset*seconds_per_letter
                    duration_seconds = len(text) * seconds_per_letter
                    end_time = start_time + duration_seconds
                    
                    print(start_time,end_time,seconds_per_letter,offset)

                    start_time_mmss = format_time(int(start_time))
                    end_time_mmss = format_time(int(end_time))
                    transcript_lines.append([speaker, start_time_mmss, end_time_mmss, duration_seconds, text])

    return transcript_lines

# Function to write matched transcript to CSV (unchanged)
def write_matched_transcript_to_csv(transcript_lines, output_csv_path):
    with open(output_csv_path, "w", newline="") as csvfile:
        fieldnames = ["Speaker", "Start Time", "End Time", "Duration (Seconds)", "Transcript"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for line in transcript_lines:
            speaker, start_time_mmss, end_time_mmss, duration_seconds, text = line
            writer.writerow({"Speaker": speaker, "Start Time": start_time_mmss, "End Time": end_time_mmss, "Duration (Seconds)": duration_seconds, "Transcript": text})

script_path = "../01_Scripts/Wayuu/Dunas Script.docx"
transcript_path = "../04_Interview CSV/Wayuu/dunas.csv"#should use the concatenated csv
output_csv_path = "../01_Scripts/Wayuu/output_timecodes.csv"

transcript_lines = match_script_with_transcript(script_path, transcript_path)
write_matched_transcript_to_csv(transcript_lines, output_csv_path)
print(f"CSV file with speaker, start time, end time, duration, and matched transcript created at {output_csv_path}.")


   Start Time End Time                                                 EN  \
0       00:00    00:02                           Where are you from Neko?   
1       00:03    00:06                I am from a territory called Akualü   
2       00:07    00:10                Although the elders call it Manaure   
3       00:11    00:16  Ah, you came from the wild west and now you're...   
4       00:18    00:21  In Nazareth or township of Nazareth as they ca...   
5       00:22    00:30  We are in the territory of Usijou´, in Alewalü...   
6       00:31    00:36  Yes, it is the only one with some characterist...   
7       00:39    00:45  Yes of course. My grandmother and my mother's ...   
8       00:47    00:51  Where is the sea? If over there, from here you...   
9       00:52    00:56               Yes it's true.\nYes, they lived here   
10      00:57    01:01  But is the sea far from here? Yes, it's far away.   
11      01:02    01:09  They said they moved, they had left their\nthi...   

<h3> Check status of txts, csv, xmls, srts </h3>


In [248]:

def getCodesPresent(folderpath,ext):
    codes = []
    for filename in os.listdir(folderpath):
        if filename.endswith(ext) and os.path.getsize(os.path.join(folderpath,filename))>1000:
            code=filename.lower().split(ext.lower())[0]
            codes.append(code)
    return codes

enTexts=getCodesPresent('../02_Transcripts/Wayuu/Transcripts (EN)/',' (EN).docx')
esTexts=getCodesPresent('../02_Transcripts/Wayuu/Transcripts (ES)/',' (ES).docx')
xmls=getCodesPresent('../03_Interview XML/Wayuu/','.xml')
enSubs=getCodesPresent('../05_Subtitles/Wayuu/SRT Export (EN)/',' (EN).srt')
esSubs=getCodesPresent('../05_Subtitles/Wayuu/SRT Export (ES)/',' (ES).srt')

lists_dict ={"EN Transcripts":enTexts,
            "ES Transcripts":esTexts,
            "EN Subs":enSubs,
            "ES Subs":esSubs,
            "XMLs":xmls}
    

# Create a DataFrame with unique codes
unique_codes = list(set(code for sublist in lists_dict.values() for code in sublist))
df = pd.DataFrame({'code': unique_codes})

# Add columns for each list with True/False values
for list_name, code_list in lists_dict.items():
    df[list_name] = df['code'].isin(code_list)

# Set 'code' as the index
df.set_index('code', inplace=True)

# Print the DataFrame
print(df)


                   EN Transcripts  ES Transcripts  EN Subs  ES Subs   XMLs
code                                                                      
magalys hammock              True            True     True     True  False
palabrero luis               True            True     True     True  False
weildler pre                 True            True     True     True  False
tejedora abuela              True            True     True     True  False
pinta abuela                 True            True     True     True  False
salinero oscar               True            True     True     True  False
neko weaving                 True            True     True     True  False
eliana                       True            True     True     True  False
salinero young               True            True    False     True  False
romelia                      True            True     True     True  False
neko piedra                  True            True     True     True  False
dunas                    