In [None]:
import os
import pandas as pd

folder_path = "../subtitles"  # Path to the folder containing XLSX files
file_extension = ".xlsx"  # File extension of the XLSX files
search_text = 'What I would like to do with Moselantja'  # Text to search for

# The two text columns that are normalised and searched below
sesotho_col = "TRANSCRIPTION (SESOTHO)"
english_col = "TRANSLATION (ENGLISH)"

# Read each workbook into its own DataFrame and tag every row with the file it
# came from. The frames are collected in a list and concatenated once, instead
# of re-copying a growing DataFrame on every iteration.
frames = []
for filename in sorted(os.listdir(folder_path)):  # sorted -> same row order on every run
    if filename.endswith(file_extension):
        file_path = os.path.join(folder_path, filename)

        # Read the XLSX file into a DataFrame
        df = pd.read_excel(file_path)

        # Add a new column with the file path
        df['filepath'] = file_path

        frames.append(df)

if frames:
    combined_df = pd.concat(frames, ignore_index=True)
else:
    # No workbook found: keep the expected columns so the search below yields
    # an empty result instead of failing with a KeyError.
    combined_df = pd.DataFrame(columns=[sesotho_col, english_col, 'filepath'])

# Strip the search text
search_text = search_text.strip()

# Strip whitespace in the DataFrame columns and convert to lowercase
combined_df[sesotho_col] = combined_df[sesotho_col].str.strip().str.lower()
combined_df[english_col] = combined_df[english_col].str.strip().str.lower()

# Find the rows that contain the specific text. regex=False makes the search a
# literal substring test, so punctuation in the search text is not interpreted
# as a regular-expression pattern; case=False already ignores case.
match_row = (
    combined_df[sesotho_col].str.contains(search_text, case=False, regex=False, na=False)
    | combined_df[english_col].str.contains(search_text, case=False, regex=False, na=False)
)
matching_rows = combined_df[match_row]

# Display the matching rows
# print(matching_rows)


In [None]:

import os
import pandas as pd

def getRowFromSub(df, substring):
    """Return the rows of ``df`` in which at least one cell contains ``substring``.

    Every cell is converted to text before searching, so non-string columns are
    searched by their string form. The search ignores case and treats
    ``substring`` as literal text, not as a regular expression, so characters
    such as ``?``, ``.`` or ``(`` match only themselves.

    Args:
        df: DataFrame to search.
        substring: Text to look for.

    Returns:
        The subset of ``df`` (original values and index) whose rows matched.
    """
    as_text = df.astype(str)
    # One vectorised pass per column instead of building a Series for every row.
    hits = as_text.apply(
        lambda column: column.str.contains(substring, case=False, regex=False, na=False)
    )
    return df[hits.any(axis=1)]

folder_path = "../subtitles"
file_extension = ".xlsx"

# Load every subtitle workbook, tagging each row with its source file. The
# frames are gathered in a list and concatenated once rather than re-copying a
# growing DataFrame inside the loop.
frames = []
for filename in sorted(os.listdir(folder_path)):  # sorted -> same row order on every run
    if filename.endswith(file_extension):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_excel(file_path)
        df['filepath'] = file_path
        frames.append(df)

combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Column order of the exported CSV
output_columns = ['Text', 'Narrator', 'Language', 'Timecode Range', 'FilePath']

matched_rows = []

with open('../scripts/hosman cut script.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# A line starting with '***' or '###' names the narrator for the lines that
# follow and selects the language recorded for them; lines seen before the
# first marker are skipped.
narrator = None
language = None
for line in lines:
    line = line.strip().lower()
    if line.startswith('***'):
        narrator = line[3:].strip()
        language = 'SESOTHO'
    elif line.startswith('###'):
        narrator = line[3:].strip()
        language = 'ENGLISH'
    elif line and narrator and language:
        matched_rows_df = getRowFromSub(combined_df, line)
        if matched_rows_df.empty:
            # If no match found, save line with no timecode range or filepath
            found = [(None, None)]
        else:
            found = [(row['TIME'], row['filepath']) for _, row in matched_rows_df.iterrows()]

        # One output row per subtitle row that contains the script line
        for timecode_range, source_path in found:
            matched_rows.append({
                'Text': line,
                'Narrator': narrator,
                'Language': language,
                'Timecode Range': timecode_range,
                'FilePath': source_path
            })

# Passing the columns explicitly keeps the CSV header even when nothing matched.
matched_df = pd.DataFrame(matched_rows, columns=output_columns)
print(matched_df)
matched_df.to_csv('../output.csv', index=False)


In [1]:
import xml.etree.ElementTree as ET
import copy



def create_xml_structure(project_name, matches):
    """Build an ``xmeml`` (version 4) tree holding one sequence filled from ``matches``.

    The sequence attributes, duration, rate, timecode, label and logging-info
    elements are fixed values; only the project name and the tracks vary.

    Args:
        project_name: Text placed in the project's ``<name>`` element.
        matches: Mapping from a track key to an iterable of
            ``(clip_item, clip_type, track_properties)`` tuples. For each key,
            one ``<track>`` is created under ``<video>`` and/or ``<audio>``
            the first time a clip of that type is seen, using that clip's
            ``track_properties`` as the track attributes. Clips whose
            ``clip_type`` is neither ``"video"`` nor ``"audio"`` are left out.

    Returns:
        An ``ET.ElementTree`` rooted at the ``<xmeml>`` element.
    """

    def add_text(parent, tag, text):
        """Append ``<tag>text</tag>`` to ``parent`` and return the new node."""
        node = ET.SubElement(parent, tag)
        node.text = text
        return node

    def add_rate(parent):
        """Append the fixed ``<rate>`` block (timebase 53, ntsc FALSE) to ``parent``."""
        rate_node = ET.SubElement(parent, "rate")
        add_text(rate_node, "timebase", "53")
        add_text(rate_node, "ntsc", "FALSE")

    # Document skeleton: xmeml > project > (name, children)
    root = ET.Element("xmeml", {"version": "4"})
    project = ET.SubElement(root, "project")
    add_text(project, "name", project_name)
    children = ET.SubElement(project, "children")

    # The single sequence and its fixed attributes
    sequence = ET.SubElement(children, "sequence", {
        "id": "sequence-1",
        "TL.SQAudioVisibleBase": "0",
        "TL.SQVideoVisibleBase": "0",
        "TL.SQVisibleBaseTime": "1954072810692088",
        "TL.SQAVDividerPosition": "0.556213021278",
        "MZ.Sequence.PreviewUseMaxRenderQuality": "false",
        "MZ.Sequence.PreviewUseMaxBitDepth": "false",
        "MZ.Sequence.VideoTimeDisplayFormat": "998",
        "MZ.WorkOutPoint": "8014566084840000",
        "MZ.WorkInPoint": "0",
        "explodedTracks": "true",
    })

    add_text(sequence, "duration", "424764")
    add_rate(sequence)
    add_text(sequence, "name", "output")

    # Media containers, each with its own key -> track lookup
    media = ET.SubElement(sequence, "media")
    video = ET.SubElement(media, "video")
    video_tracks = {}
    audio = ET.SubElement(media, "audio")
    audio_tracks = {}

    for track_key, clip_entries in matches.items():
        for clip_item, clip_type, track_properties in clip_entries:
            if clip_type == "video":
                container, known_tracks = video, video_tracks
            elif clip_type == "audio":
                container, known_tracks = audio, audio_tracks
            else:
                continue

            if track_key not in known_tracks:
                known_tracks[track_key] = ET.SubElement(container, "track", attrib=track_properties)
            # Append a deep copy so the element passed in is not attached to this tree.
            known_tracks[track_key].append(copy.deepcopy(clip_item))

    # Sequence start timecode
    timecode = ET.SubElement(sequence, "timecode")
    add_rate(timecode)
    add_text(timecode, "string", "00:00:00:00")
    add_text(timecode, "frame", "0")
    add_text(timecode, "displayformat", "NDF")

    # Label
    labels = ET.SubElement(sequence, "labels")
    add_text(labels, "label2", "Forest")

    # Logging info: the child elements are created empty
    logging_info = ET.SubElement(sequence, "logginginfo")
    for tag in ("description", "scene", "shottake", "lognote", "good",
                "originalvideofilename", "originalaudiofilename"):
        ET.SubElement(logging_info, tag)

    return ET.ElementTree(root)


# Example usage:

# Create sample video clips (you can customize this part based on your clip structure)
video_clips = [

]

# Create sample audio clips (you can customize this part based on your clip structure)
audio_clips = [
]

# create_xml_structure takes (project_name, matches), where matches maps a
# track key to a list of (clip_item, clip_type, track_properties) tuples.
# Wrap the two sample lists in that shape instead of passing them as separate
# positional arguments, which raises a TypeError.
sample_matches = {
    "sample-video-track": [(clip, "video", {}) for clip in video_clips],
    "sample-audio-track": [(clip, "audio", {}) for clip in audio_clips],
}

# Create the XML structure with provided clips
xml_tree = create_xml_structure("fortnite new", sample_matches)

# Save the XML to a file
xml_tree.write("../output.xml", encoding="utf-8", xml_declaration=True)


In [11]:
# Question: What about those records for which there is no timecode range?
# Question: Do we go through all the XML files to search for a timecode range? Currently this is how it works.
# Question: How do I test it, since the XML produced has links to .mov or audio files that are not available?
# Question: I am not sure about the format: the XML files seem to have numerous attributes specific to that project, so do I
# use a vanilla format like the current one?
# Question: What is the purpose of the SRT files, since they are currently not used in this iteration of the script?
# Question: When comparing timecodes, do we also match the narrator and language or other specifications?

import csv
import os
import xml.etree.ElementTree as ET
import copy


def convert_timecode(timecode):
    """Turn an ``'A:B-C:D'`` range string into two ``'XX:YY:00:00'`` strings.

    Each side of the dash is read as colon-separated integers, of which only
    the first two are used. They are combined as ``first * 60 + second`` and
    split again by 60, so an overflowing second field carries into the first
    (``'1:75'`` becomes ``'02:15:00:00'``). The last two output fields are
    always ``00``.

    NOTE(review): whether the two input fields mean hours:minutes or
    minutes:seconds is not established by this function - confirm against the
    data in the 'Timecode Range' column.

    Args:
        timecode: Range string containing exactly one ``'-'``.

    Returns:
        Tuple ``(start, end)`` of the converted strings.

    Raises:
        ValueError: If there is not exactly one ``'-'`` or a field is not an integer.
        IndexError: If a side has no ``':'``.
    """

    def normalise(side):
        """Rebuild one side of the range as ``'XX:YY:00:00'``."""
        fields = side.split(':')
        total = int(fields[0]) * 60 + int(fields[1])
        major, minor = divmod(total, 60)
        return '{:02d}:{:02d}:00:00'.format(major, minor)

    range_start, range_end = timecode.split('-')
    return normalise(range_start), normalise(range_end)

def is_time_within_range(time, start_time, end_time):
    """Return True when ``start_time <= time <= end_time`` (both ends inclusive).

    The values are compared with their own ``<=`` operator. For timecode
    strings that is a character-by-character comparison, which orders them
    chronologically only when every field is zero-padded to the same width.

    A missing value (``None``) for any argument gives ``False`` instead of
    raising ``TypeError``; an XML element with no text has ``text`` set to
    ``None``.

    Args:
        time: Value to test.
        start_time: Lower bound.
        end_time: Upper bound.

    Returns:
        bool: Whether ``time`` lies within the bounds.
    """
    if time is None or start_time is None or end_time is None:
        return False
    return start_time <= time <= end_time

def get_parent_map(root):
    """Map every element below ``root`` to its direct parent element.

    ``root`` itself has no entry because it is nobody's child within the tree
    that is walked.

    Args:
        root: Element whose whole subtree is traversed.

    Returns:
        dict: ``{child_element: parent_element}``.
    """
    child_to_parent = {}
    for parent in root.iter():
        for child in parent:
            child_to_parent[child] = parent
    return child_to_parent

def get_parent_element(parent_map, element):
    """Look up the parent recorded for ``element``.

    Args:
        parent_map: Mapping of child element to parent element.
        element: Element whose parent is wanted.

    Returns:
        The mapped parent, or ``None`` when ``element`` has no entry.
    """
    if element in parent_map:
        return parent_map[element]
    return None

def get_clip_type(parent_map, element):
    """Classify ``element`` by its nearest ``video`` or ``audio`` ancestor.

    Walks upward through ``parent_map`` starting at the element's parent.

    Args:
        parent_map: Mapping of child element to parent element.
        element: Element to classify.

    Returns:
        ``'video'`` or ``'audio'`` (the tag of the first such ancestor found),
        or ``'unknown'`` when the walk runs out of parents without finding one.
    """
    ancestor = get_parent_element(parent_map, element)
    while True:
        if ancestor is None:
            return 'unknown'
        if ancestor.tag in ('video', 'audio'):
            return ancestor.tag
        ancestor = get_parent_element(parent_map, ancestor)


def create_xml_structure(project_name, matches):
    """Build an ``xmeml`` (version 4) tree holding one sequence filled from ``matches``.

    The sequence attributes, duration, rate, timecode, label and logging-info
    elements are fixed values; only the project name and the tracks vary.

    Args:
        project_name: Text placed in the project's ``<name>`` element.
        matches: Mapping from a track key to an iterable of
            ``(clip_item, clip_type, track_properties)`` tuples. For each key,
            one ``<track>`` is created under ``<video>`` and/or ``<audio>``
            the first time a clip of that type is seen, using that clip's
            ``track_properties`` as the track attributes. Clips whose
            ``clip_type`` is neither ``"video"`` nor ``"audio"`` are left out.

    Returns:
        An ``ET.ElementTree`` rooted at the ``<xmeml>`` element.
    """

    def add_text(parent, tag, text):
        """Append ``<tag>text</tag>`` to ``parent`` and return the new node."""
        node = ET.SubElement(parent, tag)
        node.text = text
        return node

    def add_rate(parent):
        """Append the fixed ``<rate>`` block (timebase 53, ntsc FALSE) to ``parent``."""
        rate_node = ET.SubElement(parent, "rate")
        add_text(rate_node, "timebase", "53")
        add_text(rate_node, "ntsc", "FALSE")

    # Document skeleton: xmeml > project > (name, children)
    root = ET.Element("xmeml", {"version": "4"})
    project = ET.SubElement(root, "project")
    add_text(project, "name", project_name)
    children = ET.SubElement(project, "children")

    # The single sequence and its fixed attributes
    sequence = ET.SubElement(children, "sequence", {
        "id": "sequence-1",
        "TL.SQAudioVisibleBase": "0",
        "TL.SQVideoVisibleBase": "0",
        "TL.SQVisibleBaseTime": "1954072810692088",
        "TL.SQAVDividerPosition": "0.556213021278",
        "MZ.Sequence.PreviewUseMaxRenderQuality": "false",
        "MZ.Sequence.PreviewUseMaxBitDepth": "false",
        "MZ.Sequence.VideoTimeDisplayFormat": "998",
        "MZ.WorkOutPoint": "8014566084840000",
        "MZ.WorkInPoint": "0",
        "explodedTracks": "true",
    })

    add_text(sequence, "duration", "424764")
    add_rate(sequence)
    add_text(sequence, "name", "output")

    # Media containers, each with its own key -> track lookup
    media = ET.SubElement(sequence, "media")
    video = ET.SubElement(media, "video")
    video_tracks = {}
    audio = ET.SubElement(media, "audio")
    audio_tracks = {}

    for track_key, clip_entries in matches.items():
        for clip_item, clip_type, track_properties in clip_entries:
            if clip_type == "video":
                container, known_tracks = video, video_tracks
            elif clip_type == "audio":
                container, known_tracks = audio, audio_tracks
            else:
                continue

            if track_key not in known_tracks:
                known_tracks[track_key] = ET.SubElement(container, "track", attrib=track_properties)
            # Append a deep copy so the element passed in is not attached to this tree.
            known_tracks[track_key].append(copy.deepcopy(clip_item))

    # Sequence start timecode
    timecode = ET.SubElement(sequence, "timecode")
    add_rate(timecode)
    add_text(timecode, "string", "00:00:00:00")
    add_text(timecode, "frame", "0")
    add_text(timecode, "displayformat", "NDF")

    # Label
    labels = ET.SubElement(sequence, "labels")
    add_text(labels, "label2", "Forest")

    # Logging info: the child elements are created empty
    logging_info = ET.SubElement(sequence, "logginginfo")
    for tag in ("description", "scene", "shottake", "lognote", "good",
                "originalvideofilename", "originalaudiofilename"):
        ET.SubElement(logging_info, tag)

    return ET.ElementTree(root)


def _load_xml_clips(xml_folder):
    """Parse each XML file in ``xml_folder`` once and collect its clip items.

    Entries that are not regular files are skipped, and files that are not
    well-formed XML are reported and skipped.

    Returns:
        list of ``(xml_file, parent_map, clip_items)`` tuples, in sorted
        file-name order.
    """
    loaded = []
    for xml_file in sorted(os.listdir(xml_folder)):
        xml_path = os.path.join(xml_folder, xml_file)
        if not os.path.isfile(xml_path):
            continue
        try:
            root = ET.parse(xml_path).getroot()
        except ET.ParseError as error:
            print('Skipping {}: not well-formed XML ({})'.format(xml_file, error))
            continue
        loaded.append((xml_file, get_parent_map(root), root.findall(".//clipitem")))
    return loaded


def extract_clips(csv_file, xml_folder, output_file):
    """Collect the clip items whose timecode falls in a CSV row's range and export them.

    For every CSV row with a usable 'Timecode Range', each ``clipitem`` in each
    XML file of ``xml_folder`` is tested: the text of its first
    ``timecode/string`` descendant must lie within the converted range. Every
    match is printed and grouped by (CSV row, source track). If at least one
    match was found, the result of ``create_xml_structure('test', matches)`` is
    written to ``output_file``; otherwise nothing is written.

    Args:
        csv_file: Path of the CSV to read; it must have a 'Timecode Range' column.
        xml_folder: Folder holding the XML files to search.
        output_file: Path the resulting XML is written to.

    Returns:
        None.
    """
    matches = {}
    xml_sources = None  # parsed on first use, then reused for every CSV row

    # newline='' is the mode the csv module documents for files handed to a reader
    with open(csv_file, 'r', encoding='utf-8', newline='') as file:
        for row_number, row in enumerate(csv.DictReader(file)):
            timecode_range = row['Timecode Range']
            if not timecode_range or timecode_range == 'None':
                continue
            start_time, end_time = convert_timecode(timecode_range)

            if xml_sources is None:
                xml_sources = _load_xml_clips(xml_folder)

            for xml_file, parent_map, clip_items in xml_sources:
                for clip_item in clip_items:
                    timecode_element = clip_item.find(".//timecode/string")
                    # A clip without a timecode, or with an empty one, cannot match
                    if timecode_element is None or timecode_element.text is None:
                        continue
                    xml_timecode = timecode_element.text

                    if not is_time_within_range(xml_timecode, start_time, end_time):
                        continue

                    clip_type = get_clip_type(parent_map, clip_item)

                    # Carry the source track's attributes over when the clip sits directly in a <track>
                    track_properties = {}
                    track_element = get_parent_element(parent_map, clip_item)
                    if track_element is not None and track_element.tag == 'track':
                        track_properties = dict(track_element.attrib)

                    print('Match found in {}'.format(xml_file))
                    print('Timecode: {}'.format(xml_timecode))
                    print('Start time: {}'.format(start_time))
                    print('End time: {}'.format(end_time))
                    print('Clip Type: {}'.format(clip_type))
                    print('Track Properties: {}'.format(track_properties))

                    # The parsed trees are shared between rows, so the row number is
                    # part of the key: each CSV row still gets its own output track
                    # per source track, as it did when the files were re-parsed per row.
                    # NOTE(review): a clip matched by several rows is therefore exported
                    # once per row - confirm that this duplication is wanted.
                    track_key = (row_number, track_element)
                    matches.setdefault(track_key, []).append((clip_item, clip_type, track_properties))

    if matches:
        output_tree = create_xml_structure('test', matches)
        output_tree.write(output_file, encoding='utf-8', xml_declaration=True)

# Usage example
csv_file = '../output.csv'
xml_folder = '../interview xmls'
output_file = '../xml exports/matched_clips.xml'  # single separator; same location as before

# Create the export folder up front so the final write cannot fail on a
# missing directory after the whole scan has already run.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

extract_clips(csv_file, xml_folder, output_file)


Match found in ross - synced.xml
Timecode: 13:50:18:00
Start time: 13:33:00:00
End time: 13:54:00:00
Clip Type: video
Track Properties: {}
Match found in ross - synced.xml
Timecode: 13:53:06:00
Start time: 13:33:00:00
End time: 13:54:00:00
Clip Type: video
Track Properties: {}
Match found in ross - synced.xml
Timecode: 13:50:18:00
Start time: 13:33:00:00
End time: 13:54:00:00
Clip Type: video
Track Properties: {}
Match found in ross - synced.xml
Timecode: 13:53:06:00
Start time: 13:33:00:00
End time: 13:54:00:00
Clip Type: video
Track Properties: {}
Match found in chrissy_love - synced.xml
Timecode: 14:10:40:00
Start time: 14:04:00:00
End time: 14:20:00:00
Clip Type: video
Track Properties: {}
Match found in mats'ele - synced.xml
Timecode: 14:09:17:00
Start time: 14:04:00:00
End time: 14:20:00:00
Clip Type: video
Track Properties: {}
Match found in ross - synced.xml
Timecode: 14:04:34:00
Start time: 14:04:00:00
End time: 14:20:00:00
Clip Type: video
Track Properties: {}
Match found in 

NameError: name 'copy' is not defined