In [217]:
import os
import docx
import re
import csv
import pandas as pd
from fuzzywuzzy import fuzz
from collections import defaultdict



In [218]:
# Name of the project sub-folder used to build the input and output paths of this notebook.
project_id = "Varanasi"


In [219]:
# Per-project folders: transcripts are read from the first one,
# the per-interview CSV files are written to the second one.
transcripts_dir = f"../02_Transcripts/{project_id}"
output_csv_dir = f"../04_Interview CSV/{project_id}"

<h3> Transcripts to CSVs </h3>

In [220]:
def get_languages_and_interviews(transcripts_dir):
    """Index the transcript .docx files of a project by interview code and language.

    Scans ``transcripts_dir`` for sub-folders named like ``Transcripts (EN)``
    and, inside each of them, for ``.docx`` files.  Word lock files (names
    starting with ``~$``) are skipped.  The interview code is the part of the
    file name before the first ``(``, stripped and lower-cased
    (``"Sadhu 1 (EN).docx"`` -> ``"sadhu 1"``).

    Directory entries are visited in sorted order.  ``os.listdir`` makes no
    promise about ordering, and the insertion order of the languages in each
    inner dict is what later decides the CSV column order and which language
    is treated as the primary one.

    Args:
        transcripts_dir: Folder that contains the ``Transcripts (XX)`` folders.

    Returns:
        dict: ``{interview_code: {language: path_to_docx}}``.
    """
    interviews_dict = {}

    for language_folder in sorted(os.listdir(transcripts_dir)):
        language_dir_path = os.path.join(transcripts_dir, language_folder)

        # Only folders named like "Transcripts (EN)" hold transcripts.
        if not (os.path.isdir(language_dir_path) and language_folder.startswith("Transcripts (")):
            continue
        language = language_folder.split(" (")[1].split(")")[0]

        for interview_file in sorted(os.listdir(language_dir_path)):
            # Skip non-Word files and the "~$..." lock files Word leaves next to open documents.
            if not interview_file.endswith(".docx") or interview_file.startswith("~$"):
                continue

            interview_name, _ext = os.path.splitext(interview_file)
            interview_code = interview_name.split("(")[0].strip().lower()
            interview_path = os.path.join(language_dir_path, interview_file)

            # One entry per interview code, one path per language.
            interviews_dict.setdefault(interview_code, {})[language] = interview_path

    return interviews_dict

# Build the interview index once and print it for a quick visual check.
interviews_dict = get_languages_and_interviews(transcripts_dir=transcripts_dir)
print(interviews_dict)

{'sadhu 1': {'EN': '../02_Transcripts/Varanasi/Transcripts (EN)/Sadhu 1 (EN).docx', 'HI': '../02_Transcripts/Varanasi/Transcripts (HI)/Sadhu 1 (HI).docx'}, 'sadhu 2': {'EN': '../02_Transcripts/Varanasi/Transcripts (EN)/Sadhu 2 (EN).docx', 'HI': '../02_Transcripts/Varanasi/Transcripts (HI)/Sadhu 2 (HI).docx'}, 'kashi baba': {'EN': '../02_Transcripts/Varanasi/Transcripts (EN)/Kashi Baba (EN).docx', 'HI': '../02_Transcripts/Varanasi/Transcripts (HI)/Kashi Baba (HI).docx'}}


In [221]:

def clean_subtitle(text):
    """Normalise one piece of transcript text.

    Removes asterisks and any ``[...]`` span, turns the Unicode dashes
    U+2012..U+2015 into a plain ``-``, replaces every ``;`` with ``:`` and
    strips surrounding whitespace.
    """
    without_markup = re.sub(r'\*|\[.*?\]', '', text)
    with_plain_dashes = re.sub(r'[\u2012\u2013\u2014\u2015]', '-', without_markup)
    return with_plain_dashes.replace(";",":").strip()

def convert_language_to_csv(language, input_file, subtitles):
    """Read one transcript document and file its subtitle blocks into ``subtitles``.

    The document is walked paragraph by paragraph, each one passed through
    ``clean_subtitle`` first:

    * empty and purely numeric paragraphs are ignored;
    * a paragraph that starts with a time range (``start - end``) closes the
      block collected so far and opens a new one;
    * a paragraph that starts with ``#`` sets the current speaker: the text
      after the ``#``, lower-cased (whitespace after the ``#`` is kept);
    * any other paragraph is appended to the open block, one line per
      paragraph, and the last highlighted run seen becomes the block's
      highlight colour.

    A block is stored as ``subtitles[time_range][language] = {"text",
    "highlight_color", "comments", "speaker"}`` and only when it has both text
    and a time range, so text placed before the first time range is dropped.
    The speaker recorded is the one current at the moment the block is stored.

    Args:
        language: Key under which the blocks of this document are stored.
        input_file: Path handed to ``docx.Document``.
        subtitles: Mapping of mappings, mutated in place; indexing it with a new
            time range must yield a writable mapping (e.g. a ``defaultdict``).
    """
    time_range_re = re.compile(r'^(\d+[:;]\d+(?::\d+)?(?:[.,]\d+)?\s*-\s*\d+[:;]\d+(?::\d+)?(?:[.,]\d+)?)\s*\**')

    def store_block(time_range, lines, highlight, who):
        # Blocks without text or without a time range are not recorded.
        if lines and time_range:
            subtitles[time_range][language] = {
                "text": "".join(line + "\n" for line in lines),
                "highlight_color": highlight,
                "comments": "",  # placeholder for the "Comments" column
                "speaker": who,
            }

    pending_lines = []
    open_time_range = None
    open_highlight = None
    speaker = None

    for paragraph in docx.Document(input_file).paragraphs:
        text = clean_subtitle(paragraph.text.strip())
        if not text or text.isnumeric():
            continue

        found = time_range_re.match(text)
        if found:
            # A new time range closes the previous block and resets its state.
            store_block(open_time_range, pending_lines, open_highlight, speaker)
            pending_lines = []
            open_time_range = found.group(1)
            open_highlight = None
            continue

        if text[0] == "#":
            speaker = re.sub(r'\[.#?\]', '', text.lower()[1:])
            continue

        pending_lines.append(text)
        # The last run carrying a highlight decides the colour of the block.
        for run in paragraph.runs:
            run_highlight = run.font.highlight_color
            if run_highlight is not None:
                open_highlight = run_highlight

    # The final block has no following time range to trigger its storage.
    store_block(open_time_range, pending_lines, open_highlight, speaker)

def interview_to_csv(input_files, output_csv_file):
    """Combine the transcripts of one interview into a single CSV file.

    Every document in ``input_files`` is parsed with ``convert_language_to_csv``
    and the blocks are merged by time range.  The CSV has the columns
    ``Start Time, End Time, <one column per language>, Highlight, Comments,
    Speaker``.  Highlight and speaker are taken from the primary language,
    i.e. the first key of ``input_files``.

    A time range is reported as ``broken:`` on stdout and left out of the CSV
    when the primary language has no text for it or when it cannot be split
    into a start and an end.  Any other error is allowed to propagate instead
    of being hidden.

    Args:
        input_files: ``{language: path_to_docx}``; the key order is the column order.
        output_csv_file: Destination path.  Its folder is created if missing.
    """
    # time range -> {language: block}; a plain dict as the inner default keeps
    # lookups from inserting placeholder values.
    all_subtitles = defaultdict(dict)

    for language, input_file in input_files.items():
        convert_language_to_csv(language, input_file, all_subtitles)

    languages = list(input_files.keys())
    primary_language = languages[0] if languages else None

    output_dir = os.path.dirname(output_csv_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_csv_file, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["Start Time", "End Time"] + languages + ["Highlight", "Comments","Speaker"])

        for time_range, language_subtitles in all_subtitles.items():
            primary_block = language_subtitles.get(primary_language)
            if primary_block is None:
                # Usually a time code that differs between the language versions.
                print("broken: ", time_range, f"(no {primary_language} text for this time range)")
                continue

            try:
                start_time, end_time = map(str.strip, time_range.split("-"))
            except ValueError:
                print("broken: ", time_range, "(expected 'start - end')")
                continue

            texts = [
                clean_subtitle(language_subtitles.get(lang, {"text": ""})["text"])
                for lang in languages
            ]
            # The "Comments" column is written empty, to be filled in by hand.
            csv_writer.writerow(
                [start_time, end_time] + texts
                + [primary_block["highlight_color"], "", primary_block["speaker"]]
            )

    print(f"Conversion completed. Data saved to {output_csv_file}")
    
# # # Test
# test_interview_id = next(iter(interviews_dict))
# input_files=interviews_dict[test_interview_id]
# output_csv_file = "../04_Interview CSV/"+project_id+"/"+test_interview_id+".csv"
# # # Convert all languages to CSV using the pipeline function
# interview_to_csv(input_files, output_csv_file)

In [222]:
# Process all interviews in the transcripts directory

def process_all_interviews(interviews_dict, output_csv_dir):
    """Write one combined CSV per interview.

    Args:
        interviews_dict: ``{interview_code: {language: docx_path}}``, the shape
            returned by ``get_languages_and_interviews``.  Passing ``None``
            triggers a fresh scan of the notebook-level ``transcripts_dir``.
        output_csv_dir: Folder that receives ``<interview_code>.csv``.
    """
    # The mapping supplied by the caller used to be overwritten unconditionally
    # by a rescan of the global transcripts_dir, so the argument had no effect.
    # It is honoured now; the rescan remains only as the fallback for None.
    if interviews_dict is None:
        interviews_dict = get_languages_and_interviews(transcripts_dir)

    for interview_code, input_files in interviews_dict.items():
        print(interview_code)
        output_csv_file = os.path.join(output_csv_dir, f"{interview_code}.csv")
        interview_to_csv(input_files, output_csv_file)
        
# Convert every indexed interview into its CSV file.
process_all_interviews(interviews_dict=interviews_dict, output_csv_dir=output_csv_dir)


sadhu 1
Conversion completed. Data saved to ../04_Interview CSV/Varanasi/sadhu 1.csv
sadhu 2
Conversion completed. Data saved to ../04_Interview CSV/Varanasi/sadhu 2.csv
kashi baba
Conversion completed. Data saved to ../04_Interview CSV/Varanasi/kashi baba.csv


In [223]:
##these are used when converting scripts to csv files
def concatenate_transcript_csvs(folder_path):
    """Merge every per-interview CSV found in ``folder_path`` into one table.

    Each ``.csv`` file whose name does not contain ``-merged`` is read and
    given an ``Interview`` column holding its file name without extension.
    The merged table is saved as ``<project_id>-merged.csv`` inside
    ``folder_path`` (``project_id`` is the notebook-level setting) and returned.

    Args:
        folder_path: Folder holding the per-interview CSV files.

    Returns:
        pandas.DataFrame: All rows of all interviews; empty when the folder has
        no matching CSV file.
    """
    # Sorted so the row order of the merged file does not depend on the
    # arbitrary order in which os.listdir reports the files.
    csv_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith('.csv') and "-merged" not in f
    )

    frames = []
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(folder_path, csv_file))
        df['Interview'] = os.path.splitext(csv_file)[0]  # interview code = file name without extension
        frames.append(df)

    # A single concat avoids re-copying the accumulated table for every file;
    # pd.concat refuses an empty list, hence the explicit empty frame.
    concatenated_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Written next to the files that were read, which is the same location the
    # previous hard-coded path pointed to for the notebook's own call.
    concatenated_df.to_csv(os.path.join(folder_path, project_id + "-merged.csv"), index=False)

    return concatenated_df

# Merge the per-interview CSV files of the current project into one table.
folder_path = f"../04_Interview CSV/{project_id}"
concatenated_transcripts = concatenate_transcript_csvs(folder_path=folder_path)
#print(concatenated_transcripts)


<h3>CSVs to SRTs</h3>

In [224]:
def timecode_to_frames(text, fps=24):
    """Convert a timecode string to a whole number of frames.

    Accepted layouts (fields separated by ``:``):

    * ``mm:ss`` and ``hh:mm:ss`` -- the seconds may carry a fraction written
      with ``,`` or ``.`` (``"00:00:06,486"``); the result is truncated to a
      whole frame.
    * ``hh:mm:ss:ff`` -- the last field is a frame count added as is.

    Args:
        text: The timecode.
        fps: Frames per second used for the conversion (default 24).

    Returns:
        int: Number of frames.

    Raises:
        ValueError: If ``text`` has an unsupported number of fields or a field
            is not a number.
    """
    parts = text.strip().split(":")

    if len(parts) == 4:
        hours, minutes, seconds, extra_frames = (int(part) for part in parts)
        return int((hours * 3600 + minutes * 60 + seconds) * fps + extra_frames)

    if len(parts) == 3:
        hours, minutes = int(parts[0]), int(parts[1])
    elif len(parts) == 2:
        hours, minutes = 0, int(parts[0])
    else:
        # Previously this case only printed a message and then failed with an
        # UnboundLocalError on the return statement.
        raise ValueError(f"{text!r}: timecode parse error")

    # float() so that "mm:ss,mmm" is accepted as well as "hh:mm:ss,mmm".
    seconds = float(parts[-1].replace(",", "."))
    return int((hours * 3600 + minutes * 60 + seconds) * fps)

# Quick checks: one hh:mm:ss timecode and two timecodes with a millisecond fraction.
for sample_timecode in ("00:31:23", "00:00:06,486", "00:01:06,486"):
    print(timecode_to_frames(sample_timecode))  #test
print(float("06.486"))

45192
155
1595
6.486


In [225]:
def frames_to_srt_timecode(frames,fast=False):
    """Format a frame count, at 24 frames per second, as ``HH:MM:SS,mmm``.

    When ``fast`` equals True the count is first scaled by 24/23.976; this is
    the workaround the notebook uses for an error seen when importing the
    subtitles into Premiere.  The count is then truncated to a whole frame and
    the leftover frames are expressed as truncated milliseconds.
    """
    if fast == True:
        frames = frames * 24 / 23.976

    total_seconds, leftover_frames = divmod(int(frames), 24)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    millis = int(leftover_frames / 24 * 1000)

    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

frames_to_srt_timecode(45194)   # test: left as a bare expression so the notebook displays the value, '00:31:23,083'



'00:31:23,083'

In [226]:

# Input and output directories
# Input (per-interview CSV) and output (SRT) folders of the current project.
csv_dir = f"../04_Interview CSV/{project_id}"
subtitles_dir = f"../05_Subtitles/{project_id}"

# Interview codes whose timecodes get the 24/23.976 correction when the SRT is written.
# Temporary fix for frame-rate mismatches in exported videos; meant to be refactored away.
#fastSubs=["romelia","joaquin","magalys electrico","neko weaving","weildler inside","weildler outside","bailarinas"]
fastSubs = ["coca", "fuego", "arregoces", "tejiendo"]

# Function to convert a CSV file to SRT for multiple languages
def csv_to_srt_multiple_languages(csv_file):
    """Write SRT subtitle files for every language column of an interview CSV.

    Every column other than ``Start Time``, ``End Time``, ``Highlight``,
    ``Comments`` and ``Speaker`` is treated as a language.  Under the
    notebook-level ``subtitles_dir`` this creates:

    * ``SRT Export (<lang>)/<name> (<lang>).srt`` for each language, and
    * ``SRT Export (<l1>-<l2>...)/<name> (<l1>-<l2>...).srt`` with all
      languages stacked in one cue, separated by a ``----`` line.

    Languages are used in reverse column order.  Interviews whose lower-cased
    file name is listed in the notebook-level ``fastSubs`` get the frame-rate
    correction of ``frames_to_srt_timecode``.

    Args:
        csv_file: Path of a CSV with ``Start Time`` and ``End Time`` columns.
    """
    # Read every cell as text.  With the pandas defaults a blank cell, or text
    # such as "NA" or "null", becomes NaN and would be written to the
    # subtitles as the literal word "nan".
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    base_filename = os.path.splitext(os.path.basename(csv_file))[0]

    # Needed only because some videos were exported at 23.976 instead of 24 fps.
    # os.path.basename is used so the code is found whatever the path separator.
    code = base_filename.lower()
    print(code)
    fast = code in fastSubs

    # Get the language columns dynamically (excluding specific columns)
    exclude_columns = ["Start Time", "End Time", "Highlight", "Comments","Speaker"]
    language_columns = [col for col in df.columns if col not in exclude_columns]
    language_columns.reverse()  # this is to make english last
    print(language_columns)

    language_srt_lines = {col: [] for col in language_columns}
    combined_srt_lines = []

    for counter, (_, row) in enumerate(df.iterrows(), start=1):
        start = frames_to_srt_timecode(timecode_to_frames(row['Start Time']), fast)
        end = frames_to_srt_timecode(timecode_to_frames(row['End Time']), fast)
        srt_timecode = f"{counter}\n{start} --> {end}"

        texts = [str(row[lang]) for lang in language_columns]

        for lang, text in zip(language_columns, texts):
            language_srt_lines[lang].append(srt_timecode)
            language_srt_lines[lang].append(f"{text}\n")

        # A language without text is left out of the stacked cue: an empty
        # line inside a cue would end it early.
        combined_srt_lines.append(srt_timecode)
        combined_srt_lines.append('\n----\n'.join(text for text in texts if text) + '\n')

    combined_langs = '-'.join(language_columns)
    combined_srt_dir = os.path.join(subtitles_dir, "SRT Export ("+combined_langs+")")
    os.makedirs(combined_srt_dir, exist_ok=True)  # also creates subtitles_dir itself
    combined_srt_path = os.path.join(combined_srt_dir, f"{base_filename} ({combined_langs}).srt")

    with open(combined_srt_path, 'w', encoding='utf-8') as combined_srt_file:
        combined_srt_file.write('\n'.join(combined_srt_lines))

    # One folder and one SRT file per language.
    for lang in language_columns:
        lang_output_dir = os.path.join(subtitles_dir, f"SRT Export ({lang})")
        os.makedirs(lang_output_dir, exist_ok=True)
        lang_srt_path = os.path.join(lang_output_dir, f"{base_filename} ({lang}).srt")
        with open(lang_srt_path, 'w', encoding='utf-8') as lang_srt_file:
            lang_srt_file.write('\n'.join(language_srt_lines[lang]))

    #print(f'Combined SRT file "{combined_srt_path}" and individual SRT files have been created.')


# Process all CSV files in the input directory
for filename in os.listdir(csv_dir):
    # Only per-interview CSV files are converted; the "-merged" export is skipped.
    if not filename.endswith('.csv') or "-merged" in filename:
        continue
    csv_file = os.path.join(csv_dir, filename)
    csv_to_srt_multiple_languages(csv_file)


sadhu 1
['HI', 'EN']
sadhu 2
['HI', 'EN']
kashi baba
['HI', 'EN']


<h3> Check status of transcripts, CSVs, XMLs, and SRTs </h3>


In [227]:

def getCodesPresent(folderpath,ext):
    """List the interview codes that have a file ending in ``ext`` in ``folderpath``.

    Only files larger than 1000 bytes are counted.  The code is the
    lower-cased file name up to the first occurrence of the lower-cased
    ``ext``, with surrounding whitespace removed.
    """
    return [
        filename.lower().split(ext.lower())[0].strip()
        for filename in os.listdir(folderpath)
        if filename.endswith(ext) and os.path.getsize(os.path.join(folderpath,filename))>1000
    ]

# Codes found at each stage of the pipeline (the Spanish variants are currently disabled).
enTexts = getCodesPresent(f'../02_Transcripts/{project_id}/Transcripts (EN)/', ' (EN).docx')
# esTexts=getCodesPresent('../02_Transcripts/'+project_id+'/Transcripts (ES)/',' (ES).docx')
xmls = getCodesPresent(f'../03_Interview XML/{project_id}/', '- final.xml')
enSubs = getCodesPresent(f'../05_Subtitles/{project_id}/SRT Export (EN)/', ' (EN).srt')
# esSubs=getCodesPresent('../05_Subtitles/'+project_id+'/SRT Export (ES)/',' (ES).srt')

# Column title -> codes present; the insertion order is the column order of the status table.
lists_dict = {
    "EN Transcripts": enTexts,
#    "ES Transcripts":esTexts,
    "EN Subs": enSubs,
#    "ES Subs":esSubs,
    "XMLs": xmls,
}

In [228]:

# Create a DataFrame with unique codes
# One row per code that appears in at least one of the lists.
unique_codes = list({code for sublist in lists_dict.values() for code in sublist})
df = pd.DataFrame({'code': unique_codes})

# One True/False column per list, telling whether the code is present in it.
df = df.assign(**{
    list_name: df['code'].isin(code_list)
    for list_name, code_list in lists_dict.items()
})

def getUndefinedCount(code):
    """Count the occurrences of "xxx" in the English transcript of an interview.

    The transcript is looked up at
    ``../02_Transcripts/<project_id>/Transcripts (EN)/<code> (EN).docx``
    and searched case-insensitively, paragraph by paragraph.

    Args:
        code: Interview code used to build the file name.

    Returns:
        int: Number of (non-overlapping) occurrences, or the empty string ``""``
        when the transcript cannot be opened or read.
    """
    search_string = "xxx"
    try:
        docx_file = '../02_Transcripts/'+project_id+'/Transcripts (EN)/'+code+' (EN).docx'
        doc = docx.Document(docx_file)
        return sum(paragraph.text.lower().count(search_string) for paragraph in doc.paragraphs)
    except Exception:
        # Best effort: a code without a readable English transcript gets an
        # empty cell.  Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit through.
        return ""
    
# Add column for count of XXX in document
# (getUndefinedCount yields "" for a code whose English transcript cannot be read)
df["Undefined Count"]=df["code"].apply(getUndefinedCount)
    
# Set 'code' as the index
# NOTE: done in place, so 'code' stops being a column; running this cell a second
# time without re-running the cell that builds df fails on df["code"] above.
df.set_index('code', inplace=True)
 
# Print the DataFrame
print(df)


            EN Transcripts  EN Subs   XMLs  Undefined Count
code                                                       
kashi baba            True     True  False                0
sadhu 2               True     True  False                0
sadhu 1               True     True  False                0
