# In [1]:
import pandas as pd
import re

# -----------------------------
# Function to clean text columns
# -----------------------------
def clean_text(text):
    """Normalise a single text cell.

    Missing values (anything ``pd.isna`` reports as null) become an empty
    string. Every other value is converted to ``str``, stripped of all
    characters other than ASCII letters, digits, whitespace and the
    punctuation ``, . ! ?``, lower-cased, and trimmed of surrounding
    whitespace.

    Note that the character filter is ASCII-only: accented and non-Latin
    letters are removed, not transliterated.

    Args:
        text: The cell value to clean; typically a string, but any scalar
            is accepted.

    Returns:
        The cleaned, lower-cased string ("" for missing values).
    """
    if pd.isna(text):
        return ""
    # A text column can still contain numbers or other scalars; re.sub
    # raises TypeError on non-strings, so coerce before filtering.
    text = str(text)
    # Remove special characters except spaces and basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s,.!?]', '', text)
    # Convert to lowercase
    return text.lower().strip()

# -----------------------------
# Function to convert duration into seconds
# -----------------------------
def duration_to_seconds(duration):
    """Convert a duration value to a whole number of seconds.

    Recognised inputs, checked in this order:

    * Clock strings ``HH:MM:SS`` or ``MM:SS`` (e.g. ``"01:02:03"``,
      ``"12:34"``).
    * ISO 8601 durations with an optional day part, e.g. ``"PT15M"``,
      ``"PT1H2M3S"`` or ``"P1DT2H"``.
    * Plain non-negative numbers, either integers or decimals
      (``"90"``, ``215.0``); decimals are truncated toward zero.

    Args:
        duration: The raw duration cell; any scalar is accepted and is
            converted with ``str`` before parsing.

    Returns:
        The duration in seconds as an ``int``. Missing values and
        unrecognised formats yield 0.
    """
    if pd.isna(duration):
        return 0

    duration = str(duration).strip()

    # Clock format: the hours field is optional so MM:SS is accepted too.
    clock = re.fullmatch(r'(?:(\d+):)?(\d{1,2}):(\d{2})', duration)
    if clock:
        h, m, s = (int(part or 0) for part in clock.groups())
        return h * 3600 + m * 60 + s

    # ISO 8601 duration: P[#D][T[#H][#M][#S]]. The day component is needed
    # for durations of 24 hours or more. re.match anchors only at the start,
    # so any text after the parsed prefix is ignored.
    match = re.match(
        r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?', duration
    )
    if match:
        days, hours, minutes, seconds = (int(g or 0) for g in match.groups())
        return days * 86400 + hours * 3600 + minutes * 60 + seconds

    # Plain number. Accept a decimal part because a numeric column read by
    # pandas as float yields values such as 215.0, whose string form fails
    # str.isdigit().
    number = re.fullmatch(r'(\d+)(?:\.\d+)?', duration)
    if number:
        # Use only the integer part so large values are not rounded
        # through float.
        return int(number.group(1))

    return 0  # default fallback

# -----------------------------
# Main Script
# -----------------------------
def clean_csv(input_path, output_path):
    """Load a CSV, clean its known columns, and write the result.

    The ``title`` and ``transcript`` columns, when present, are passed
    through ``clean_text``. When a ``duration`` column is present, a new
    ``duration_seconds`` column is added by applying
    ``duration_to_seconds`` to it. All other columns are written back
    unchanged.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the cleaned CSV is written to (UTF-8, no index
            column).
    """
    frame = pd.read_csv(input_path)

    # Text columns share the same cleaning step; skip any that are absent.
    for text_column in ('title', 'transcript'):
        if text_column in frame.columns:
            frame[text_column] = frame[text_column].apply(clean_text)

    # The original duration column is kept; the parsed value goes alongside it.
    if 'duration' in frame.columns:
        parsed = frame['duration'].apply(duration_to_seconds)
        frame['duration_seconds'] = parsed

    frame.to_csv(output_path, index=False, encoding='utf-8')
    print(f"✅ Cleaned CSV saved to: {output_path}")


# -----------------------------
# Example usage
# -----------------------------
if __name__ == "__main__":
    # Source and destination files for the cleaning run; change the paths
    # as needed for the local environment.
    input_csv, output_csv = (
        r"G:\infosys_internship\ytcleaneddata\master_dataset_updated.csv",
        r"G:\infosys_internship\ytcleaneddata\cleaned_youtube_details.csv",
    )
    clean_csv(input_path=input_csv, output_path=output_csv)


# Output:
# ✅ Cleaned CSV saved to: G:\infosys_internship\ytcleaneddata\cleaned_youtube_details.csv
