In [6]:
import pandas as pd
import re
import isodate

In [None]:

# Function to remove emojis
# The pattern lists emoji blocks explicitly. A single wide range such as
# U+24C2..U+1F251 would also swallow CJK ideographs, kana, Hangul and other
# ordinary letters that happen to sit between the emoji blocks.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F2FF"  # enclosed alphanumerics/ideographs (incl. flags)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000025AA\U000025AB\U000025B6\U000025C0"  # small squares, play/reverse
    "\U000025FB-\U000025FE"  # medium squares
    "\U00002934\U00002935"  # curved arrows
    "\U000024C2"  # circled M
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part mark, circled ideographs
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emojis(text):
    """Return *text* with emoji and pictographic symbols removed.

    Only code points in the emoji-related ranges listed in ``_EMOJI_PATTERN``
    are stripped; letters of any script (Latin, CJK, kana, Hangul, ...) are
    left untouched.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the matched emoji characters.
    """
    return _EMOJI_PATTERN.sub('', text)

# Strip a fixed set of symbol characters from a string
def remove_special_chars(text):
    """Return *text* with every '#', '@', '|', '[', ']', '{' and '}' deleted."""
    special_chars = re.compile(r'[#@|\[\]{}]')
    return special_chars.sub('', text)

# Drop anything that looks like an HTML tag
def remove_html_tags(text):
    """Return *text* without ``<...>`` spans.

    The match is non-greedy and does not cross line breaks, so a '<' whose
    closing '>' is on a later line is left in place.
    """
    tag = re.compile(r'<.*?>')
    return tag.sub('', text)

def remove_bad_strings(text):
    """Return *text* with runs of the listed mojibake-style characters removed."""
    # One or more consecutive characters from the corrupted-character set
    corrupted_run = re.compile(r'[ÃÂ°Ÿ¤¯‰]+')
    return corrupted_run.sub('', text)

# Full cleaning pipeline for a single text value
def clean_text(text):
    """Normalise one text cell.

    Null values (as judged by ``pd.isnull``) become an empty string. Other
    values go through emoji removal, special-character removal, HTML-tag
    removal and bad-string removal, in that order, and are then lower-cased
    with whitespace runs collapsed to a single space and the ends trimmed.
    """
    if pd.isnull(text):
        return ""

    # Order matters: each step receives the output of the previous one.
    for step in (remove_emojis, remove_special_chars,
                 remove_html_tags, remove_bad_strings):
        text = step(text)

    lowered = text.lower()
    return re.sub(r'\s+', ' ', lowered).strip()

def parse_duration_to_seconds(duration_str):
    """Parse an ISO 8601 duration string and return the total seconds.

    Numeric input is treated as a duration that is already expressed in
    seconds and is returned as an ``int``. This keeps the conversion
    idempotent when it is applied to a column that was converted earlier
    (for example after the result was written back to the same CSV).

    Args:
        duration_str: An ISO 8601 duration such as ``"PT1H2M3S"``, or a
            number of seconds.

    Returns:
        The duration in whole seconds, or ``None`` when the value is NaN,
        infinite, or cannot be parsed.
    """
    # bool is a subclass of int but is not a meaningful duration.
    if isinstance(duration_str, (int, float)) and not isinstance(duration_str, bool):
        try:
            return int(duration_str)
        except (ValueError, OverflowError):
            return None  # NaN or infinity

    try:
        duration = isodate.parse_duration(duration_str)
        return int(duration.total_seconds())
    except Exception:
        # Deliberately best-effort: any unparseable value maps to None.
        return None
    
# Convert the 'duration' column to seconds and write the result back.
# NOTE: the CSV that is read is also the CSV that is overwritten.
details_csv = 'Dhruvin_cleaned_video_details.csv'
channel_details_df = pd.read_csv(details_csv)

# Replace each duration value with its length in seconds
duration_seconds = channel_details_df['duration'].apply(parse_duration_to_seconds)
channel_details_df['duration'] = duration_seconds

# Persist the updated frame without the index column
channel_details_df.to_csv(details_csv, index=False)

In [None]:

# Read the video details into a DataFrame
input_file = 'Dhruvin_cleaned_video_details.csv'  # Point this at your own file
df = pd.read_csv(filepath_or_buffer=input_file)

In [None]:

# Run the text cleaner over each free-text column that exists in the frame
text_columns = ('title', 'description', 'channel_title', 'channel_description')
for col in text_columns:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(clean_text)

# Drop rows that repeat the same (title, video_id) pair
if {'video_id', 'title'}.issubset(df.columns):
    df = df.drop_duplicates(subset=['title', 'video_id'])

# Write the cleaned frame out and report where it went
output_file = 'cleaned_video_details.csv'
df.to_csv(output_file, index=False)
print(f"Cleaned data saved to {output_file}")

Cleaned data saved to dhruvin_cleaned_video_details.csv
