# In [1]:
import pandas as pd
import re
import numpy as np

# In [2]:

def clean_transcript(text):
    if pd.isnull(text) or str(text).strip() == "":
        return np.nan
    # Remove [Music], [Applause], and similar tags
    text = re.sub(r'\[(music|applause|laughter|silence|noise|cheering|clapping)\]', '', text, flags=re.IGNORECASE)
    # Remove timestamps like 00:01, 1:23:45, etc.
    text = re.sub(r'\b\d{1,2}:\d{2}(?::\d{2})?\b', '', text)
    # Remove special characters and non-UTF symbols
    text = re.sub(r'[^\w\s]', '', text)
    # Lowercase
    text = text.lower()
    # Replace newlines with spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text if text else np.nan

# Load CSV
input_file = 'master_task2_dataset.csv'  # Change to your file name
df = pd.read_csv(input_file)

# Clean transcript column (replace 'transcript' with your actual column name)
if 'transcript' in df.columns:
    df['transcript'] = df['transcript'].apply(clean_transcript)

# Save cleaned CSV
output_file = 'cleaned_transcripts.csv'
df.to_csv(output_file, index=False)