<a href="https://colab.research.google.com/github/Evanson12/Hello-world/blob/master/Combine_different_txt_tanscripts_to_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import csv
import os

# Function to extract speaker, timestamp, and speech from input text
def extract_speaker_speech(input_text):
    """Split a transcript into speaker turns.

    A line that contains a timestamp (``h:mm:ss`` or ``mm:ss``) starts a new
    turn: the text before the first timestamp on that line is the speaker and
    the text after it opens the speech. Lines without a timestamp are
    appended to the speech of the current turn.

    Args:
        input_text: Full transcript text, lines separated by ``\\n``.

    Returns:
        A list of dicts with keys ``"speaker"``, ``"timestamp"`` and
        ``"speech"``, in transcript order. Turns whose speaker text is empty
        (the line starts with the timestamp) are not emitted, and any lines
        preceding the first timestamp line are discarded.
    """
    # Matches h:mm:ss or mm:ss; the lookahead rejects mm:ss followed by a digit.
    timestamp_re = re.compile(r'(\d{1,2}:\d{2}:\d{2})|(\d{1,2}:\d{2})(?!\d)')

    turns = []
    speaker = None
    timestamp = None
    fragments = []

    for line in input_text.strip().split('\n'):
        found = timestamp_re.search(line)
        if found is None:
            # Continuation line; blank lines are skipped so they do not
            # introduce doubled spaces when the fragments are joined.
            text = line.strip()
            if text:
                fragments.append(text)
            continue

        # A new turn begins: emit the one collected so far.
        if speaker and timestamp:
            turns.append({
                "speaker": speaker,
                "timestamp": timestamp,
                "speech": ' '.join(fragments)
            })

        # Use the FIRST timestamp on the line and slice at the regex match
        # position, so later times mentioned in the speech stay in the speech
        # and a look-alike substring earlier in the line cannot shift the split.
        speaker = line[:found.start()].strip()
        timestamp = found.group(0)
        remainder = line[found.end():].strip()
        fragments = [remainder] if remainder else []

    # Emit the last collected turn.
    if speaker and timestamp:
        turns.append({
            "speaker": speaker,
            "timestamp": timestamp,
            "speech": ' '.join(fragments)
        })

    return turns

# Function to process speaker, timestamp, and speech data
def process_speaker_speech(speaker_speech_data):
    """Return copies of the entries with timestamp tokens removed from speech.

    Args:
        speaker_speech_data: Iterable of dicts that each provide the keys
            ``"speaker"``, ``"timestamp"`` and ``"speech"``.

    Returns:
        A new list of dicts holding only those three keys. ``"speech"`` has
        every word-bounded ``mm:ss`` / ``h:mm:ss`` token deleted and its
        outer whitespace stripped; the other two values are passed through.
    """
    timestamp_token = re.compile(r'\b(\d{1,2}:\d{2}(:\d{2})?)\b')

    # Pair each record with its scrubbed speech first, then build the rows.
    scrubbed = (
        (record, timestamp_token.sub('', record['speech']).strip())
        for record in speaker_speech_data
    )
    return [
        {
            "speaker": record['speaker'],
            "timestamp": record['timestamp'],
            "speech": speech,
        }
        for record, speech in scrubbed
    ]

# Function to combine scripts and write to CSV
def combine_scripts(file_paths):
    """Parse several transcript files and write them to one combined CSV.

    Each file is read as UTF-8, parsed with ``extract_speaker_speech`` and
    cleaned with ``process_speaker_speech``. Every resulting row is tagged
    with a meeting name derived from the file name, and all rows are written
    to ``combined_output.csv`` in the directory of the first input file.

    Args:
        file_paths: Sequence of paths to transcript text files.

    Returns:
        None. The output location (or the fact that nothing was written) is
        reported with ``print``.
    """
    if not file_paths:
        # Without at least one input there is no directory to write into.
        print("No input files provided; nothing written.")
        return

    rows = []
    for file_path in file_paths:
        # Meeting name is the file name minus a trailing ".txt" only; a
        # ".txt" occurring elsewhere in the name is kept as part of the name.
        meeting_name = os.path.basename(file_path)
        if meeting_name.endswith('.txt'):
            meeting_name = meeting_name[:-len('.txt')]

        # Read input text from file
        with open(file_path, 'r', encoding='utf-8') as file:
            input_text = file.read()

        # Extract, clean, and tag each turn with its meeting name
        processed_data = process_speaker_speech(extract_speaker_speech(input_text))
        rows.extend(
            [meeting_name, entry["speaker"], str(entry["timestamp"]), entry["speech"]]
            for entry in processed_data
        )

    # Define output CSV file path in the same directory as the first input
    output_csv_path = os.path.join(os.path.dirname(file_paths[0]), "combined_output.csv")

    # newline='' lets the csv module control line endings itself
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Meeting', 'Speaker', 'Timestamp', 'Speech'])
        csvwriter.writerows(rows)

    print(f"Output written to {output_csv_path}")

# Example usage with multiple files.
# NOTE: this runs as soon as the cell/module is executed, and the paths below
# are hard-coded absolute Windows paths — edit them before running elsewhere.
file_paths = [
    r'C:\Users\e2 hp\Downloads\OEWG2 7-8_otter_ai (1).txt',
    r'C:\Users\e2 hp\Downloads\OEWG2 7-9_otter_ai (1).txt'
]
# Writes combined_output.csv into the directory of the first path listed above.
combine_scripts(file_paths)
