In [30]:
import pandas as pd
import re

In [31]:
# Read the raw dataset (file name: master_task2_datset.csv) into a DataFrame.
file_path = 'master_task2_datset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [32]:
# --- Define Transcript Cleaning Function ---
def clean_transcript(text):
    """Normalize a raw transcript to lowercase a-z / 0-9 tokens separated by single spaces.

    Steps, in order:
      1. Missing values (None, NaN, pd.NA, NaT) become ''.
      2. Newlines become spaces.
      3. Bracketed annotations such as [Music] or [Applause] are dropped.
      4. Clock-style timestamps (M:SS, MM:SS, H:MM:SS, HH:MM:SS) are dropped.
      5. Text is lowercased.
      6. Every character other than a-z, 0-9 or whitespace becomes a space.
      7. Whitespace runs collapse to one space and both ends are trimmed.

    Args:
        text: A transcript value. Non-string values are converted with str().

    Returns:
        The cleaned string, or '' when the input is missing.
    """
    # pd.isna already returns True for None, so no separate None check is needed.
    if pd.isna(text):
        return ''
    text = str(text)

    # Flatten newlines first so that a bracketed annotation spanning two
    # lines is still matched by the (non-DOTALL) bracket pattern below.
    text = text.replace('\n', ' ')

    # Replace annotations with a space (not ''), otherwise the words on either
    # side are glued together: "hello[Music]world" must not become "helloworld".
    text = re.sub(r'\[.*?\]', ' ', text)

    # Drop timestamps such as 00:01, 1:23:45, 01:00:00. The look-arounds stop
    # the pattern from matching inside longer digit runs ("100:200"), which
    # would otherwise be chopped into stray digits.
    text = re.sub(r'(?<!\d)\d{1,2}:\d{2}(?::\d{2})?(?!\d)', ' ', text)

    text = text.lower()

    # Keep only lowercase ASCII letters, digits and whitespace; everything
    # else (punctuation, symbols, non-ASCII characters) becomes a space.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # Collapse the whitespace introduced by the replacements above.
    return re.sub(r'\s+', ' ', text).strip()


In [28]:
# Run clean_transcript over every value in 'transcript' and store the result
# in a new 'cleaned_transcript' column.
# NOTE(review): this cell's execution count is lower than that of the cell
# defining clean_transcript, so re-run it after the definition to make sure
# the column reflects the current function.
df['cleaned_transcript'] = df['transcript'].map(clean_transcript)

In [29]:
# Write the DataFrame, including the cleaned column, to CSV without the index.
# NOTE(review): empty cleaned transcripts are written as empty fields, which
# pd.read_csv parses back as NaN under its default settings; confirm that
# downstream readers handle this (e.g. keep_default_na=False or fillna('')).
df.to_csv(
    path_or_buf='master_task2_clean_transcript_dataset.csv',
    index=False,
)