In [None]:
import pandas as pd
import re
import isodate

# ===== Function to clean text =====
def clean_text(text):
    """Normalize a single cell value into plain lowercase text.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed; the result is lowercased and stripped of surrounding whitespace.

    Args:
        text: The value to clean. Missing values (``None``, ``NaN``,
            ``pd.NA``) produce an empty string. Non-string values such as
            numbers are converted with ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    if pd.isna(text):
        return ""
    # pandas infers numeric dtypes for columns that contain only numbers, so a
    # cell may arrive as int/float; re.sub would raise TypeError on those.
    if not isinstance(text, str):
        text = str(text)
    # Remove special characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()

# ===== Function to convert ISO 8601 duration to seconds =====
def duration_to_seconds(duration_str):
    """Convert an ISO 8601 duration value into a whole number of seconds.

    Args:
        duration_str: The duration text handed to ``isodate.parse_duration``.

    Returns:
        The parsed duration's ``total_seconds()`` truncated to ``int``, or
        ``None`` when parsing or conversion raises any exception.
    """
    try:
        # Parsing and conversion both stay inside the try so that any failure
        # (bad format, missing value, wrong type) maps to None.
        return int(isodate.parse_duration(duration_str).total_seconds())
    except Exception:
        return None

# ===== MAIN PROCESS =====
# Input and output locations; change these to match your environment.
INPUT_CSV = "/content/master_dataset_updated.csv"
OUTPUT_CSV = "master data cleaned_output.csv"

df = pd.read_csv(INPUT_CSV)

# Step 1: Standardize column names (lowercase, no surrounding whitespace)
df.columns = df.columns.str.lower().str.strip()

# Step 2: Clean and lowercase each free-text column that is present.
# Columns missing from the dataset are skipped rather than treated as errors.
for column in ("title", "description", "transcript"):
    if column in df.columns:
        df[column] = df[column].apply(clean_text)

# Step 3: Convert duration to seconds (unparseable values become None)
if 'duration' in df.columns:
    df['duration'] = df['duration'].apply(duration_to_seconds)

# Step 4: Save the cleaned data
df.to_csv(OUTPUT_CSV, index=False)

# The check mark was previously stored as mojibake (UTF-8 bytes decoded as cp1252).
print(f"✅ Cleaning complete! Saved as '{OUTPUT_CSV}'.")
