# In [1]:
import pandas as pd
import re
import glob
import os

def extract_paragraphs(file_path, encoding='latin-1'):
    """Read a transcript file and return its timestamped paragraphs.

    A paragraph starts at a ``[HH:MM:SS]`` marker followed by whitespace and
    runs until the next line that begins with ``[`` (or the end of the file).
    Newlines inside a paragraph are replaced with single spaces.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``'latin-1'``.

    Returns:
        A list of ``(paragraph, start_time, end_time)`` tuples of strings.
        ``end_time`` is the start time of the next matched marker. The last
        marker has no successor, so its end time is the placeholder
        ``"00:00:00"``. Markers whose text is empty after stripping are
        omitted.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        content = file.read()

    regex = r"\[(\d{2}:\d{2}:\d{2})\]\s+(.*?)(?=\n\[|\Z)"
    matches = re.findall(regex, content, re.DOTALL)

    # Each paragraph ends where the next marker starts; the final one gets
    # the placeholder because there is no following timestamp to use.
    end_times = [next_start for next_start, _ in matches[1:]] + ["00:00:00"]

    paragraphs = []
    for (start_time, body), end_time in zip(matches, end_times):
        # Collapse the paragraph onto one line. This also joins a speaker
        # label ("Name:") with the text on the following line.
        paragraph = body.strip().replace('\n', ' ')
        if paragraph:
            paragraphs.append((paragraph, start_time, end_time))

    return paragraphs


def process_paragraph(paragraph, start_time, end_time):
    """Split a paragraph into sentences and give each a share of its time span.

    The paragraph is split on ``'. '``. The time between ``start_time`` and
    ``end_time`` is divided evenly per word, and each sentence receives a
    consecutive ``"HH:MM:SS - HH:MM:SS"`` range proportional to its word count.

    Args:
        paragraph: Paragraph text on a single line.
        start_time: Start timestamp wrapped in one character on each side,
            e.g. ``"[00:01:02]"``; characters 1-8 are parsed as ``HH:MM:SS``.
        end_time: End timestamp in the same format.

    Returns:
        A DataFrame with columns ``TIME``, ``TRANSLATION (ENGLISH)`` and
        ``TRANSCRIPTION (SESOTHO)`` (the last is left as empty strings), one
        row per sentence. A paragraph with no words yields an empty DataFrame
        with those columns.
    """
    columns = ['TIME', 'TRANSLATION (ENGLISH)', 'TRANSCRIPTION (SESOTHO)']

    # Nothing to timestamp; this also avoids dividing by zero below.
    total_words = len(paragraph.split())
    if total_words == 0:
        return pd.DataFrame(columns=columns)

    # Strip the surrounding brackets and parse the HH:MM:SS portion.
    start_datetime = pd.to_datetime(start_time[1:9], format='%H:%M:%S')
    end_datetime = pd.to_datetime(end_time[1:9], format='%H:%M:%S')

    # An end time earlier than the start (e.g. a "00:00:00" placeholder end)
    # would make the timestamps run backwards; treat it as zero duration.
    total_duration = max((end_datetime - start_datetime).total_seconds(), 0.0)
    time_per_word = total_duration / total_words

    data = []
    current_datetime = start_datetime
    for sentence in paragraph.split('. '):
        sentence_duration = len(sentence.split()) * time_per_word
        sentence_end = current_datetime + pd.Timedelta(seconds=sentence_duration)

        time_str = (
            f"{current_datetime.time().strftime('%H:%M:%S')} - "
            f"{sentence_end.time().strftime('%H:%M:%S')}"
        )
        data.append([time_str, sentence.strip(), ""])

        # The next sentence starts where this one ended.
        current_datetime = sentence_end

    return pd.DataFrame(data, columns=columns)



# The convert_txt_to_csv() function converts interview transcript text files (.txt) into Comma-Separated Values (CSV) files.

# It can be called with no arguments; by default it reads every .txt file in '../Interview TXT/' and writes one CSV per input file to '../Interview CSV/'. Here's an outline of what the function does:

# Find all the .txt files in the input folder.
# Extract the timestamped paragraphs from each file with extract_paragraphs().
# Split each paragraph into time-stamped sentence rows with process_paragraph() and record the source file path on every row.
# Write the combined rows to a CSV file named after the text file.
# No success message is displayed; the function returns nothing.


def convert_txt_to_csv(txt_folder_path='../Interview TXT/', output_folder='../Interview CSV/'):
    """Convert every ``.txt`` transcript in a folder into a sentence-level CSV.

    Each text file is parsed with ``extract_paragraphs``, every paragraph is
    expanded into sentence rows with ``process_paragraph``, and the rows are
    written to ``<output_folder>/<name>.csv`` without an index column.

    Args:
        txt_folder_path: Folder containing the ``.txt`` files. Defaults to
            ``'../Interview TXT/'``.
        output_folder: Folder the CSV files are written to; created if it
            does not exist. Defaults to ``'../Interview CSV/'``.

    Returns:
        None. The CSV files are the only output.
    """
    columns = ['TIME', 'TRANSLATION (ENGLISH)', 'TRANSCRIPTION (SESOTHO)', 'filepath']

    # DataFrame.to_csv does not create missing directories.
    os.makedirs(output_folder, exist_ok=True)

    for txt_file in glob.glob(os.path.join(txt_folder_path, '*.txt')):
        # Collect the per-paragraph frames and concatenate once per file,
        # rather than growing a DataFrame inside the loop.
        frames = []
        for paragraph, start_time, end_time in extract_paragraphs(txt_file):
            # process_paragraph expects the timestamps wrapped in brackets.
            df = process_paragraph(paragraph, '[' + start_time + ']', '[' + end_time + ']')

            # Record which text file each row came from.
            df['filepath'] = txt_file
            frames.append(df)

        if frames:
            df_all = pd.concat(frames, ignore_index=True)
        else:
            # A file with no paragraphs still produces a header-only CSV.
            df_all = pd.DataFrame(columns=columns)

        # Swap only the final extension, whatever its letter case.
        stem = os.path.splitext(os.path.basename(txt_file))[0]
        csv_file = os.path.join(output_folder, stem + '.csv')

        df_all.to_csv(csv_file, index=False)

# convert_txt_to_csv()