<a href="https://colab.research.google.com/github/Evanson12/Data-Analysis-for-OEWG-Transcripts/blob/main/New_way_to_get_list_of_all_columns_from_txt_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import re
import csv
import os
from google.colab import drive

# Mount Google Drive at /content/drive so that the transcript folder used
# below (under /content/drive/MyDrive) can be read and written like a local path.
drive.mount('/content/drive')

# Function to extract timestamp from the text
def extract_timestamp(text):
    """Return the first timestamp that occurs in ``text``, or ``None``.

    A timestamp is one or two digits, a colon and two digits, optionally
    followed by another colon and two digits (e.g. ``5:07``, ``12:34``,
    ``1:02:03``). The pattern is not anchored, so it can also match the
    tail of a longer digit run.
    """
    found = re.search(r'(\d{1,2}:\d{2}(:\d{2})?)', text)
    if found is None:
        return None
    return found.group(0)

# Function to extract speaker and timestamp from input text
def extract_speaker_speech(input_text):
    """Split a transcript into a list of per-speaker entries.

    Every line in which ``extract_timestamp`` finds a timestamp starts a new
    entry: the text before the timestamp becomes the speaker, the text after
    it becomes the first piece of speech. Lines without a timestamp are
    appended to the speech of the entry currently being collected.

    Returns a list of dicts with the keys ``speaker_timestamp``, ``speaker``,
    ``timestamp`` and ``speech``. An entry is only emitted when its speaker
    text is non-empty; text collected before the first header line, or under
    a header whose speaker part is empty, is not emitted.
    """
    records = []
    speaker = None
    stamp = None
    speech_parts = []

    def flush():
        # Emit the entry collected so far, if a speaker header was seen.
        if speaker and stamp:
            records.append({
                "speaker_timestamp": f"{speaker} {stamp}",
                "speaker": speaker,
                "timestamp": stamp,
                "speech": ' '.join(speech_parts).strip()
            })

    for raw_line in input_text.strip().split('\n'):
        stamp_in_line = extract_timestamp(raw_line)
        if not stamp_in_line:
            # Continuation line: belongs to the entry being collected.
            speech_parts.append(raw_line.strip())
            continue

        # Header line: close the previous entry, then start a new one.
        flush()
        # The timestamp was found in this line, so partition always splits
        # it into the text before and after the first occurrence.
        before, _, after = raw_line.partition(stamp_in_line)
        speaker = before.strip()
        stamp = stamp_in_line
        speech_parts = [after.strip()]

    # Emit whatever was still being collected when the input ended.
    flush()
    return records

# Function to process speaker, timestamp, and speech data
def process_speaker_speech(speaker_speech_data):
    """Return copies of the entries with timestamps removed from the speech.

    Each input entry must provide the keys ``speaker_timestamp``,
    ``speaker``, ``timestamp`` and ``speech``. The result is a new list of
    new dicts with exactly those four keys; only ``speech`` is altered, by
    deleting every word-bounded timestamp and stripping the surrounding
    whitespace. The input entries are not modified.
    """
    stamp_re = re.compile(r'\b(\d{1,2}:\d{2}(:\d{2})?)\b')

    def scrub(record):
        speech = stamp_re.sub('', record['speech']).strip()
        return {
            "speaker_timestamp": record['speaker_timestamp'],
            "speaker": record['speaker'],
            "timestamp": record['timestamp'],
            "speech": speech
        }

    return [scrub(record) for record in speaker_speech_data]

# Function to combine scripts and write to CSV
def combine_scripts(folder_path):
    """Parse every ``.txt`` transcript in ``folder_path`` into one CSV file.

    Each file whose name ends in ``.txt`` is read as UTF-8, split into
    speaker entries with ``extract_speaker_speech`` and cleaned with
    ``process_speaker_speech``. The meeting name is the file name without
    its trailing ``.txt``. All entries are written to
    ``combined_output_<n>.csv`` inside the same folder, where ``<n>`` is one
    more than the highest number already used by such a file (1 if there is
    none), so earlier outputs are never overwritten.

    Args:
        folder_path: Directory that holds the transcripts and receives the
            CSV file.

    Returns:
        None. The path of the written file is printed.
    """
    suffix = '.txt'

    # os.listdir gives names in arbitrary order; sorting makes the row order
    # of the CSV reproducible from run to run.
    transcript_names = sorted(
        f for f in os.listdir(folder_path) if f.endswith(suffix)
    )

    # Group data by meeting name
    meeting_data = {}
    for file_name in transcript_names:
        # Strip only the trailing extension; str.replace would also remove
        # any other ".txt" occurring inside the name.
        meeting_name = file_name[:-len(suffix)]

        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            input_text = file.read()

        extracted_data = extract_speaker_speech(input_text)
        meeting_data[meeting_name] = process_speaker_speech(extracted_data)

    # The CSV is written next to the transcripts.
    output_dir = folder_path

    # Find the next available file number among existing combined outputs.
    file_numbers = []
    for existing in os.listdir(output_dir):
        if existing.startswith("combined_output") and existing.endswith(".csv"):
            candidate = existing.split('_')[-1].replace('.csv', '')
            if candidate.isdigit():
                file_numbers.append(int(candidate))
    next_file_number = max(file_numbers, default=0) + 1

    output_csv_path = os.path.join(output_dir, f"combined_output_{next_file_number}.csv")

    # newline='' lets the csv module control line endings itself.
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Meeting', 'Speaker Timestamp', 'Speaker', 'Timestamp', 'Speech'])
        # Rows are grouped by meeting, in the order the files were read.
        for meeting_name, entries in meeting_data.items():
            for entry in entries:
                csvwriter.writerow([
                    meeting_name,
                    entry["speaker_timestamp"],
                    entry["speaker"],
                    str(entry["timestamp"]),
                    entry["speech"],
                ])

    print(f"Output written to {output_csv_path}")

# Specify the folder path in Google Drive.
# combine_scripts reads the .txt transcripts from this folder and writes the
# combined CSV back into the same folder.
folder_path = '/content/drive/MyDrive/OEWG Transcripts'
combine_scripts(folder_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Output written to /content/drive/MyDrive/OEWG Transcripts/combined_output_9.csv
