In [106]:
import os
import pandas as pd
import docx
import re
import csv
from difflib import SequenceMatcher, get_close_matches

In [107]:
# Project folder name; it is interpolated into the script, interview-CSV and output-CSV paths built later in this notebook.
project_id="Test"
# Base file name (without extension) shared by the DOCX script that is read and the CSV that is written.
script_title = "wolunka-test"

In [108]:
def exportNewScriptRow(matched_row,text):
    """Estimate when a script line is spoken inside a matched transcript row.

    The row's time span is spread evenly over the characters of its "EN"
    text. The script line is located inside that text by fuzzy-matching its
    first eight characters, and the line's start, end and duration are
    derived from the per-character time.

    Args:
        matched_row: mapping/Series with "EN", "Start Time" and "End Time".
        text: the script line being placed.

    Returns:
        (start "MM:SS", end "MM:SS", duration in seconds as a number).
    """
    transcript_text = matched_row["EN"]
    start_time = convert_time_to_seconds(matched_row["Start Time"])
    transcript_duration = convert_time_to_seconds(matched_row["End Time"]) - start_time
    seconds_per_letter = transcript_duration / len(transcript_text)

    offset = find_start_character_index(transcript_text, text[:8], cutoff=.8)
    # -1 is the "not found" sentinel, not a position: using it as an offset
    # would move the start *before* the row began (and, after truncation to
    # whole seconds, report it a second early). Fall back to the row start.
    if offset < 0:
        offset = 0

    start_time += offset * seconds_per_letter
    duration_seconds = len(text) * seconds_per_letter
    end_time = start_time + duration_seconds

    # format_time expects whole seconds, so fractions are truncated here.
    start_time_mmss = format_time(int(start_time))
    end_time_mmss = format_time(int(end_time))
    return start_time_mmss,end_time_mmss,duration_seconds

In [109]:


# Convert MM:SS, HH:MM:SS or HH:MM:SS:XX (XX = hundredths) time strings to seconds.
def convert_time_to_seconds(time_str):
    """Turn a colon-separated time string into a number of seconds.

    Accepted layouts (every field must parse as an integer):
      * ``MM:SS``       -> minutes * 60 + seconds
      * ``HH:MM:SS``    -> hours * 3600 + minutes * 60 + seconds
      * ``HH:MM:SS:XX`` -> as above, plus XX / 100

    Raises:
        ValueError: for any other field count or a non-integer field.
    """
    try:
        has_separator = ':' in time_str
        fields = time_str.split(':')
        field_count = len(fields)

        if has_separator and field_count == 2:
            mins, secs = (int(field) for field in fields)
            return 60 * mins + secs

        if field_count in (3, 4):
            numbers = [int(field) for field in fields]
            total = 3600 * numbers[0] + 60 * numbers[1] + numbers[2]
            if field_count == 4:
                # The trailing field is scaled by 1/100, which makes the
                # result a float whenever that field is present.
                total += numbers[3] / 100
            return total
    except ValueError:
        # A field that is not an integer is reported with the same
        # message as an unsupported layout.
        pass

    raise ValueError(f"Unrecognized time format: {time_str}")





In [110]:
def find_start_character_index(string, subsequence, cutoff=0.6):
    """Locate where `subsequence` (approximately) begins inside `string`.

    A window the length of `subsequence` slides over `string`, and each
    window is scored with difflib's SequenceMatcher ratio. The first window
    scoring at least `cutoff` marks the region of the match; from there the
    window keeps advancing for as long as the score strictly improves, so
    the returned index is the local best alignment rather than the first
    window that happens to overlap enough. (Stopping at the first passing
    window typically lands a character or more before the real start.)

    Args:
        string: text to search in.
        subsequence: text to look for.
        cutoff: minimum similarity ratio (0..1) that counts as a match.

    Returns:
        Index of the matching window's first character, or -1 if no window
        reaches `cutoff` (including when `subsequence` is longer than
        `string`).
    """
    width = len(subsequence)
    last_start = len(string) - width

    def score(position):
        window = string[position:position + width]
        return SequenceMatcher(None, subsequence, window).ratio()

    for position in range(last_start + 1):
        best = score(position)
        if best < cutoff:
            continue

        # Refine: slide forward while the alignment keeps getting better.
        while position < last_start:
            following = score(position + 1)
            if following <= best:
                break
            position += 1
            best = following
        return position

    return -1

In [111]:
    
# Format a whole number of seconds as MM:SS.
def format_time(seconds):
    """Render a whole number of seconds as a zero-padded ``MM:SS`` string.

    Minutes are not wrapped into hours, so 3725 seconds gives "62:05".
    The value must be an integer: the ``d`` format code rejects floats.
    """
    whole_minutes = seconds // 60
    leftover_seconds = seconds % 60
    return "{:02d}:{:02d}".format(whole_minutes, leftover_seconds)

In [112]:

# Function to match script text with transcript
def match_script_with_transcript(docx_file_path, transcript_df):
    """Pair every spoken line of a DOCX script with a transcript row.

    Paragraphs without any letter are skipped. A paragraph starting with
    ``#`` switches the current speaker; every other paragraph is looked up
    among the transcript rows whose "Speaker" contains the current speaker
    (lower-cased). The lookup tries, in order:

      1. the row following the previous match (the script usually
         continues where it left off),
      2. the previously matched row itself (one row split over several
         script lines),
      3. a fuzzy match (difflib, cutoff 0.6) over the speaker's "EN" text.

    Args:
        docx_file_path: path of the script document.
        transcript_df: DataFrame with "Speaker", "EN", "Interview",
            "Start Time" and "End Time" columns.

    Returns:
        A list of ``[speaker, interview, start, end, duration, text]``
        rows. Unmatched lines get empty timing fields and the interview
        value "can't find".
    """
    def _row_contains(row, fragment):
        # True only when a row has been matched and its "EN" cell is text
        # that includes `fragment` (NaN / numeric cells never match).
        if row is None:
            return False
        cell = row["EN"]
        return isinstance(cell, str) and fragment in cell

    doc = docx.Document(docx_file_path)
    speaker = ""
    # None until a real transcript row is known, so a placeholder can never
    # be handed to exportNewScriptRow.
    next_row = None
    matched_row = None
    transcript_lines = []
    for line in doc.paragraphs:
        text = str(line.text.strip())
        if not any(char.isalpha() for char in text):
            continue

        match = re.match(r'\#([^, ]+)', text)
        if match:
            # The captured run follows '#'; its first character is dropped
            # as well (presumably a marker such as '*' - confirm against
            # the script format).
            speaker = match.group(1)[1:]
            continue

        # na=False keeps rows with a missing speaker from breaking the
        # mask; regex=False matches the name literally.
        speaker_df = transcript_df[
            transcript_df["Speaker"].str.contains(speaker.lower(), na=False, regex=False)
        ]

        # Whether THIS line was matched. Tracked explicitly so the outcome
        # never depends on a fuzzy-match result left over from an earlier
        # line.
        found = False
        if _row_contains(next_row, text):
            matched_row = next_row
            found = True
        elif _row_contains(matched_row, text):
            found = True
        else:
            closest_match = get_close_matches(text, speaker_df["EN"].apply(float_to_str), n=1, cutoff=0.6)
            if closest_match:
                candidates = speaker_df[speaker_df["EN"] == closest_match[0]]
                # Empty when the best candidate was a stringified non-text
                # cell; treat that as "no match" instead of crashing.
                if not candidates.empty:
                    matched_row = candidates.iloc[0]
                    found = True

        if found:
            # Remember the row that follows the match (within this
            # speaker's rows) for the next script line. If the match is the
            # last row, next_row keeps its previous value.
            getNextRow = False
            for index, row in speaker_df.iterrows():
                if getNextRow:
                    next_row = row
                    getNextRow = False
                elif row["EN"] == matched_row["EN"]:
                    getNextRow = True

            start_time_mmss, end_time_mmss, duration_seconds = exportNewScriptRow(matched_row, text)
            interview = matched_row["Interview"]
        else:
            start_time_mmss = ""
            end_time_mmss = ""
            duration_seconds = ""
            interview = "can't find"

        transcript_lines.append([speaker, interview, start_time_mmss, end_time_mmss, duration_seconds, text])

    return transcript_lines


def float_to_str(value):
    """Return the ``str()`` form of any value (e.g. NaN becomes "nan")."""
    return f"{value!s}"
            

# Locate the script document and the merged interview transcript for this project.
script_path = f"../01_Scripts/{project_id}/DOCX/{script_title}.docx"
concat_transcripts = pd.read_csv(f"../04_Interview CSV/{project_id}/{project_id}-merged.csv")

# Align every script line with the transcript and show the resulting rows.
transcript_lines = match_script_with_transcript(script_path, concat_transcripts)
print(transcript_lines)



[['Magalys', 'wolunka', '01:00', '01:04', 4.3809523809523805, 'I am glad to know the place where Wolunka was.'], ['Magalys', 'wolunka', '01:04', '01:07', 3.5238095238095237, 'We can talk a little more about that.'], ['Magalys', 'wolunka', '01:09', '01:11', 1.9999999999999998, 'I am going to tell you then the story of Wolunka.'], ['Magalys', 'wolunka', '01:14', '01:15', 1.0, 'This here'], ['Magalys', 'wolunka', '01:16', '01:21', 5.0, 'It is the stone of Wolunka, where he used to sit'], ['Magalys', 'wolunka', '00:24', '00:27', 3.8028169014084505, 'TRUE. In fact, in our culture it is necessary'], ['Magalys', 'wolunka', '00:27', '00:29', 2.112676056338028, 'let us know these stories'], ['Magalys', 'wolunka', '00:31', '00:36', 5.0, 'So that we continue to value these wonders that we have now'], ['Magalys', 'wolunka', '00:37', '00:41', 4.0, 'Since this is what our ancestors have left'], ['Magalys', 'wolunka', '00:41', '00:42', 1.9047619047619047, 'and our grandparents'], ['Magalys', 'wolunka

In [113]:
# Write the matched transcript rows to a CSV file.
def write_matched_transcript_to_csv(transcript_lines, output_csv_path):
    """Write matched script rows to a CSV file with a header line.

    Args:
        transcript_lines: iterable of six-item rows in the order
            speaker, interview, start time, end time, duration, text.
            A row of any other length raises ValueError on unpacking.
        output_csv_path: destination file; overwritten if it exists.
    """
    fieldnames = ["Speaker", "Interview","Start Time", "End Time", "Duration (Seconds)", "Transcript"]
    # The encoding is pinned so accented / non-ASCII transcript text is
    # written the same way on every platform instead of depending on the
    # locale's default codec (which can raise UnicodeEncodeError).
    with open(output_csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for line in transcript_lines:
            speaker, interview, start_time_mmss, end_time_mmss, duration_seconds, text = line
            writer.writerow({
                "Speaker": speaker,
                "Interview": interview,
                "Start Time": start_time_mmss,
                "End Time": end_time_mmss,
                "Duration (Seconds)": duration_seconds,
                "Transcript": text,
            })

# The destination must be defined before it is used: on a fresh kernel the
# previous ordering raised NameError, and the file was written twice.
output_csv_path="../01_Scripts/"+project_id+"/CSV/"+script_title+".csv"
write_matched_transcript_to_csv(transcript_lines, output_csv_path)
print(f"CSV file with speaker, start time, end time, duration, and matched transcript created at {output_csv_path}.")

CSV file with speaker, start time, end time, duration, and matched transcript created at ../01_Scripts/Test/CSV/wolunka-test.csv.
