In [15]:
# load Data

import pandas as pd
import numpy as np
import re
from datetime import timedelta

# Read the raw export. A missing file is reported instead of raised, in which
# case `df_raw` is left undefined for the cells that follow.
try:
    df_raw = pd.read_csv('Main.csv')
except FileNotFoundError:
    print("Error no file found")
else:
    # Only reached when the read succeeded.
    print("Loaded the file.")
   

Loaded the file.


In [18]:
# Data Cleaning
def clean_dataframe(df):
    """
    Return a cleaned copy of the raw dataset; the input frame is not modified.

    Steps:
      * fill missing 'likeCount' / 'commentCount' with 0 and cast them to int
      * fill missing 'description' with an empty string
      * parse 'published_at' into datetimes
      * drop rows whose 'title' repeats an earlier row (the first one is kept)
      * reset the index to 0..n-1

    Prints the number of dropped rows and a completion message.
    """
    cleaned = df.copy()
    count_columns = ('likeCount', 'commentCount')

    # Fill the gaps first; the int cast below cannot handle NaN.
    for column in count_columns:
        cleaned[column] = cleaned[column].fillna(0)
    cleaned['description'] = cleaned['description'].fillna('')

    # Counts arrive as floats because of the NaNs; store them as ints.
    for column in count_columns:
        cleaned[column] = cleaned[column].astype(int)
    cleaned['published_at'] = pd.to_datetime(cleaned['published_at'])

    # De-duplicate on title, keeping the first occurrence.
    n_before = len(cleaned)
    cleaned = cleaned.drop_duplicates(subset=['title'], keep='first')
    n_after = len(cleaned)
    print(f"Removed {n_before - n_after} duplicate videos title")

    cleaned = cleaned.reset_index(drop=True)
    print("Data cleaning done")
    return cleaned

# Clean the frame read from 'Main.csv'. The load cell binds it to `df_raw`;
# the previous argument `df` is not defined anywhere in the notebook, so the
# call only worked with leftover kernel state.
df_cleaned = clean_dataframe(df_raw)

Removed 2880 duplicate videos title
Data cleaning done


In [19]:
# feature engineering

def parse_iso8601_duration(duration_str):
    """
    Parse an ISO 8601 duration string and return the total number of seconds.

    Handles time-only values such as 'PT1M5S' as well as values with a date
    part such as 'P1DT2H3M4S' or 'P2D' (used for videos of a day or longer).
    Weeks and days are read from the part before 'T'; hours, minutes and
    seconds from the part after it. Year and month designators are ignored
    because they have no fixed length in seconds.

    Args:
        duration_str: the duration text; any non-string (None, NaN, ...) is
            treated as missing.

    Returns:
        int: total seconds, or 0 when the value is missing, does not start
        with 'P', or contains no recognised component.
    """
    if not isinstance(duration_str, str) or not duration_str.startswith('P'):
        return 0

    # Split on the first 'T' so that 'M' is only ever read as minutes (time
    # part) and never confused with months (date part).
    date_part, _, time_part = duration_str[1:].partition('T')

    # (text to search, unit designator, seconds per unit)
    components = (
        (date_part, 'W', 7 * 24 * 3600),
        (date_part, 'D', 24 * 3600),
        (time_part, 'H', 3600),
        (time_part, 'M', 60),
        (time_part, 'S', 1),
    )

    seconds = 0
    for text, designator, factor in components:
        match = re.search(r'(\d+)' + designator, text)
        if match:
            seconds += int(match.group(1)) * factor
    return seconds

def create_features(df):
    """
    Return a copy of `df` with derived feature columns added.

    Added columns:
      * title_length          - number of characters in 'title'
      * published_year / published_month / published_day_of_week
                              - calendar parts of 'published_at' (read through
                                the `.dt` accessor, so it must be datetime-like)
      * duration_seconds      - 'duration' run through parse_iso8601_duration
      * like_ratio / comment_ratio
                              - 'likeCount' and 'commentCount' divided by
                                'viewCount' plus a small epsilon

    The input frame is left untouched. Prints a completion message.
    """
    df_featured = df.assign(
        title_length=df['title'].str.len(),
        published_year=df['published_at'].dt.year,
        published_month=df['published_at'].dt.month,
        published_day_of_week=df['published_at'].dt.dayofweek,
        duration_seconds=df['duration'].apply(parse_iso8601_duration),
        # The 1e-6 keeps the division finite when viewCount is 0.
        like_ratio=df['likeCount'] / (df['viewCount'] + 1e-6),
        comment_ratio=df['commentCount'] / (df['viewCount'] + 1e-6),
    )

    print("feature engineering complete")
    return df_featured

# Build the feature frame only when the cleaning cell produced a non-empty result.
if 'df_cleaned' in locals() and not df_cleaned.empty:
    df_final = create_features(df_cleaned)

    # Header and preview are emitted by one call; sep="\n" puts them on
    # separate lines.
    print("\nPreview of the final dataframe:", df_final.head(), sep="\n")

feature engineering complete

Preview of the final dataframe:
      video_id                                              title  \
0  4wZwXhoxRIA  15 Tech Gadgets I Use Every Day #shorts #justi...   
1  5fKkXFresYI                      5 Times Tech has SAVED LIVES!   
2  KArch4rjU_0  USB in USB #charger #technology #electronic #f...   
3  gsJAlLOFBv0                      TINY Tech That Actually Works   
4  uvG-WToQfU4                             Old Or New Technology?   

                                         description  \
0                                                      
1  #shorts #technology \n\nI spend a LOT of time ...   
2  Maybe stop plugging things into each other to ...   
3                                                      
4  Follow me here:\nInstagram ► https://www.insta...   

                                                tags  \
0                                                 []   
1                                 ["tech", "shorts"]   
2                 

In [20]:
# save the new file
# Write the engineered frame to CSV, but only if the feature cell created it.
if 'df_final' in locals():
    output_filename = "youtube_data_clean.csv"

    try:
        df_final.to_csv(output_filename, index=False)
        print(f"file saved as '{output_filename}'")

    # Report a failed write (permissions, bad path, full disk, ...) instead of
    # stopping the notebook. The class name was misspelled as `Execption`, so
    # any failure raised NameError and this handler never ran.
    except Exception as e:
        print(f"file not saved. An error occurred: {e}")
else:
    print("file not saved - not found or empty")

file saved as 'youtube_data_clean.csv'
