In [None]:
import os
import pandas as pd

folder_path = "../subtitles"  # Path to the folder containing XLSX files
file_extension = ".xlsx"  # File extension of the XLSX files
search_text = 'What I would like to do with Moselantja'  # Text to search for

# Columns that are normalised and searched.
text_columns = ["TRANSCRIPTION (SESOTHO)", "TRANSLATION (ENGLISH)"]

# Read every workbook once and collect the frames in a list. Concatenating a
# single time at the end avoids re-copying all accumulated rows on each
# iteration, which is what growing a DataFrame inside the loop does.
frames = []
for filename in sorted(os.listdir(folder_path)):  # sorted -> reproducible row order
    # Skip other file types and the "~$name.xlsx" lock files that Excel
    # creates next to a workbook while it is open (they are not readable).
    if not filename.endswith(file_extension) or filename.startswith("~$"):
        continue
    file_path = os.path.join(folder_path, filename)

    # Read the XLSX file and remember which file each row came from
    df = pd.read_excel(file_path)
    df['filepath'] = file_path
    frames.append(df)

# Fail with a clear message instead of an opaque KeyError on the column
# lookups below when the folder holds no workbooks.
if not frames:
    raise FileNotFoundError(
        f"No '{file_extension}' files found in '{folder_path}'"
    )

combined_df = pd.concat(frames, ignore_index=True)

# Strip the search text
search_text = search_text.strip()

# Strip whitespace in the text columns and convert them to lowercase
for column in text_columns:
    combined_df[column] = combined_df[column].str.strip().str.lower()

# Find the rows that contain the specific text. regex=False makes the search
# literal, so punctuation such as "?", "(" or "." in the search text is
# matched as written instead of being interpreted as a regular expression.
match_row = (
    combined_df["TRANSCRIPTION (SESOTHO)"].str.contains(search_text.lower(), case=False, na=False, regex=False)
    | combined_df["TRANSLATION (ENGLISH)"].str.contains(search_text.lower(), case=False, na=False, regex=False)
)
matching_rows = combined_df[match_row]

# Display the matching rows
# print(matching_rows)


In [None]:

import os
import pandas as pd

def getRowFromSub(df, substring, regex=False):
    """Return the rows of ``df`` in which at least one cell contains ``substring``.

    Every cell is compared through its string form (so numbers and missing
    values are searched as text), and the comparison ignores case.

    Args:
        df: DataFrame to search.
        substring: Text to look for.
        regex: When False (the default) ``substring`` is matched literally, so
            characters such as "?", "(" or "." carry no special meaning. Pass
            True to have it interpreted as a regular expression.

    Returns:
        A DataFrame holding the matching rows of ``df`` (empty if none match).
    """
    # Nothing to search: return a frame with the same columns and no rows.
    if df.empty:
        return df.iloc[0:0]

    # Search column by column (vectorised) rather than building a string
    # Series for every single row.
    cell_hits = df.astype(str).apply(
        lambda column: column.str.contains(str(substring), case=False, regex=regex)
    )
    return df[cell_hits.any(axis=1)]

folder_path = "../subtitles"
file_extension = ".xlsx"

# Read every workbook once and collect the frames in a list. Concatenating a
# single time at the end avoids re-copying all accumulated rows on each
# iteration, which is what growing a DataFrame inside the loop does.
frames = []
for filename in sorted(os.listdir(folder_path)):  # sorted -> reproducible row order
    # Skip other file types and the "~$name.xlsx" lock files that Excel
    # creates next to a workbook while it is open (they are not readable).
    if not filename.endswith(file_extension) or filename.startswith("~$"):
        continue
    file_path = os.path.join(folder_path, filename)
    df = pd.read_excel(file_path)
    df['filepath'] = file_path  # remember which file each row came from
    frames.append(df)

# pd.concat rejects an empty list, so keep the empty-frame result when the
# folder holds no workbooks.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Load the cut script; each of its text lines is looked up in the subtitle rows.
with open('../scripts/hosman cut script.txt', 'r',encoding='utf-8') as script_file:
    lines = script_file.readlines()

matched_rows = []
narrator = None
language = None

for raw_line in lines:
    line = raw_line.strip().lower()

    # Header lines name the narrator: '***' marks SESOTHO, '###' marks ENGLISH.
    if line.startswith('***'):
        narrator, language = line[3:].strip().lower(), 'SESOTHO'
        continue
    if line.startswith('###'):
        narrator, language = line[3:].strip().lower(), 'ENGLISH'
        continue

    # Blank lines, and text that appears before any usable header, are ignored.
    if not (line and narrator and language):
        continue

    hits = getRowFromSub(combined_df, line)
    if hits.empty:
        # No subtitle row matched: keep the line with no timecode range or filepath.
        # NOTE: a fallback that retried with only the first 30 characters of
        # the line was prototyped here and is currently disabled.
        sources = [(None, None)]
    else:
        sources = [(row['TIME'], row['filepath']) for _, row in hits.iterrows()]

    # One output record per matching subtitle row.
    matched_rows.extend(
        {
            'Text': line,
            'Narrator': narrator,
            'Language': language,
            'Timecode Range': timecode,
            'FilePath': filepath,
        }
        for timecode, filepath in sources
    )

matched_df = pd.DataFrame(matched_rows)
print(matched_df)
matched_df.to_csv('../output.csv', index=False)


In [None]:
# Question: What about those records for which there is no timecode range?
# Question: Do we go through all the XML files to search for a timecode range? Currently this is how it works.
# Question: How do I test it, since the XML produced links to .mov or audio files that are not available?
# Question: I am not sure about the format: the XML files seem to have numerous attributes specific to that project, so do I
# use a vanilla format, as is done now?
# Question: What is the purpose of the SRT files, since they are not used in this iteration of the script?
# When comparing timecodes, do we also match the narrator and language or other specifications?

import csv
import os
import xml.etree.ElementTree as ET

def convert_timecode(timecode):
    """Convert a 'MM:SS-MM:SS' range into a pair of 'HH:MM:SS:FF' timecodes.

    Each side of the '-' is read as minutes and seconds, matching the
    minutes * 60 + seconds arithmetic this function has always used.
    NOTE(review): confirm that the source 'TIME' values really are MM:SS.
    The frame field is always '00' because the input carries no frame count.

    Previously the minutes and seconds were written into the hours and minutes
    positions (1 min 23 s became '01:23:00:00'); they are now placed in the
    minutes and seconds positions, with any overflow carried into hours.

    Args:
        timecode: Range string such as '01:23-02:05'.

    Returns:
        Tuple ``(start, end)`` of 'HH:MM:SS:FF' strings.

    Raises:
        ValueError: If the range does not contain exactly one '-' or a field
            is not an integer.
        IndexError: If a side of the range has no ':' separated seconds field.
    """
    def _to_timecode(clock):
        # Only the first two ':' separated fields are used, as before.
        parts = clock.split(':')
        total_seconds = int(parts[0]) * 60 + int(parts[1])
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return '{:02d}:{:02d}:{:02d}:00'.format(hours, minutes, seconds)

    start_time, end_time = timecode.split('-')
    return _to_timecode(start_time), _to_timecode(end_time)

def _timecode_fields(value):
    """Return the numeric fields of a timecode string as a tuple of ints, or None."""
    if not isinstance(value, str):
        return None
    # Treat ';' like ':' so both separator styles split into the same fields.
    parts = value.strip().replace(';', ':').split(':')
    if not all(part.isdecimal() for part in parts):
        return None
    return tuple(int(part) for part in parts)


def is_time_within_range(time, start_time, end_time):
    """Tell whether ``time`` lies in the inclusive range ``[start_time, end_time]``.

    When all three values are timecode strings with the same number of
    numeric fields, the fields are compared as integers. Plain string
    comparison is only correct for zero-padded values that use identical
    separators: '9:00:00:00' sorts after '10:00:00:00', and ';' sorts after
    ':'. Any other input falls back to the direct ``<=`` comparison.

    Args:
        time: Value to test.
        start_time: Lower bound (inclusive).
        end_time: Upper bound (inclusive).

    Returns:
        True if ``time`` is within the range, otherwise False.
    """
    fields = [_timecode_fields(value) for value in (time, start_time, end_time)]
    comparable = (
        all(item is not None for item in fields)
        and len({len(item) for item in fields}) == 1
    )
    if comparable:
        current, lower, upper = fields
        return lower <= current <= upper
    return start_time <= time <= end_time

def _load_timed_clips(xml_folder):
    """Parse each '.xml' file in ``xml_folder`` once and list its timed clip items.

    Returns a list of ``(xml_file, [(timecode_text, clip_item), ...])`` pairs in
    sorted file-name order; clip items without a timecode string are left out.
    """
    clips_by_file = []
    for xml_file in sorted(os.listdir(xml_folder)):
        # Sub-directories and non-XML files would make ET.parse fail.
        if not xml_file.lower().endswith('.xml'):
            continue
        root = ET.parse(os.path.join(xml_folder, xml_file)).getroot()
        timed_clips = []
        for clip_item in root.findall(".//clipitem"):
            timecode_element = clip_item.find(".//timecode/string")
            if timecode_element is None:
                continue
            # An empty <string/> element has text None; skip it instead of
            # comparing None against the range bounds.
            xml_timecode = (timecode_element.text or '').strip()
            if xml_timecode:
                timed_clips.append((xml_timecode, clip_item))
        clips_by_file.append((xml_file, timed_clips))
    return clips_by_file


def extract_clips(csv_file, xml_folder, output_file):
    """Collect the XML clip items whose timecode falls inside a CSV timecode range.

    For every row of ``csv_file`` that has a 'Timecode Range' value, the range
    is converted with ``convert_timecode`` and every ``clipitem`` found in the
    XML files of ``xml_folder`` is tested with ``is_time_within_range``. Each
    match is reported on stdout. If anything matched, the clip items are
    written to ``output_file`` under ``<xmeml><sequence>``; otherwise no file
    is written.

    Args:
        csv_file: Path of the CSV file; it must have a 'Timecode Range' column.
        xml_folder: Folder holding the XML files to scan.
        output_file: Path of the XML file to create for the matched clips.

    Returns:
        None.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader itself.
    with open(csv_file, 'r', encoding='utf-8', newline='') as file:
        timecode_ranges = [row['Timecode Range'] for row in csv.DictReader(file)]

    matches = []
    clips_by_file = None  # parsed lazily, and only once, on the first usable row

    for timecode_range in timecode_ranges:
        if not timecode_range or timecode_range == 'None':
            continue
        start_time, end_time = convert_timecode(timecode_range)

        # The XML files do not change between rows, so parse them a single
        # time instead of re-reading the whole folder for every CSV row.
        if clips_by_file is None:
            clips_by_file = _load_timed_clips(xml_folder)

        for xml_file, timed_clips in clips_by_file:
            for xml_timecode, clip_item in timed_clips:
                if is_time_within_range(xml_timecode, start_time, end_time):
                    print('Match found in {}'.format(xml_file))
                    print('Timecode: {}'.format(xml_timecode))
                    print('Start time: {}'.format(start_time))
                    print('End time: {}'.format(end_time))
                    matches.append(clip_item)

    if matches:
        root = ET.Element('xmeml')
        sequence = ET.SubElement(root, 'sequence')
        sequence.extend(matches)

        # Make sure the destination folder exists before writing.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Create a new XML file with matched clips
        output_tree = ET.ElementTree(root)
        output_tree.write(output_file, encoding='utf-8', xml_declaration=True)

# Example run: read the timecode ranges from the CSV, scan the XML folder for
# clip items inside those ranges, and write the matches to a new XML file.
csv_file = '../output.csv'
xml_folder = '../interview xmls'
output_file = '../xml exports//matched_clips.xml'

extract_clips(
    csv_file=csv_file,
    xml_folder=xml_folder,
    output_file=output_file,
)
