Step 1: Import Required Libraries

In [2]:
import pandas as pd


Step 2: Read the "Uncleaned" CSV File

In [3]:
# Load the raw (uncleaned) backup export into the working DataFrame.
# The file is decoded as ISO-8859-1.
# NOTE(review): the 'description' preview further down shows garbled
# characters, which suggests the file's real encoding may differ - confirm.
df = pd.read_csv(
    'USvideos_uncleaned_backup.csv',
    encoding='ISO-8859-1',
)

# Plain-text preview of the first five rows as a quick sanity check
print(df.head(n=5))


      video_id trending_date  \
0  2kyS6SvSYSE      17.14.11   
1  1ZAPwfrtAFY      17.14.11   
2  5qpjK5DgCt4      17.14.11   
3  puqaWrEC7tY      17.14.11   
4  d380meD0W0M      17.14.11   

                                               title          channel_title  \
0                 WE WANT TO TALK ABOUT OUR MARRIAGE           CaseyNeistat   
1  The Trump Presidency: Last Week Tonight with J...        LastWeekTonight   
2  Racist Superman | Rudy Mancuso, King Bach & Le...           Rudy Mancuso   
3                   Nickelback Lyrics: Real or Fake?  Good Mythical Morning   
4                           I Dare You: GOING BALD!?               nigahiga   

   category_id              publish_time  \
0           22  2017-11-13T17:13:01.000Z   
1           24  2017-11-13T07:30:00.000Z   
2           23  2017-11-12T19:05:24.000Z   
3           24  2017-11-13T11:00:04.000Z   
4           24  2017-11-12T18:01:41.000Z   

                                                tags    views   lik

Step 3: Remove Duplicate Rows

In [4]:
# Keep only the first occurrence of every row. A row counts as a repeat
# only when all of its column values match an earlier row.
df = df[~df.duplicated(keep='first')]

print("Duplicates removed.")


Duplicates removed.


Step 4: Handle Missing Values

In [5]:
# Placeholder text used for missing entries in each free-text column.
text_placeholders = {
    'description': 'No Description',
    'tags': 'No Tags',
}

# Only touch columns that are actually present in the frame.
for text_col, placeholder in text_placeholders.items():
    if text_col in df.columns:
        df[text_col] = df[text_col].fillna(placeholder)

# Example rule: any gap left in a float64/int64 column is replaced with 0.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].fillna(0)

print("Missing values handled.")


Missing values handled.


Step 5: Correct Data Formats

In [6]:
# Convert 'publish_time' to timezone-aware datetimes (if the column exists).
if 'publish_time' in df.columns:
    # Remember which entries were already missing, so that only values lost
    # to coercion are counted below.
    missing_before = df['publish_time'].isna()

    # errors='coerce' turns unparseable entries into NaT instead of raising.
    # utc=True normalises every timestamp to UTC, so inputs that carry
    # different offsets still produce a single datetime column (timestamps
    # without an offset are interpreted as UTC).
    df['publish_time'] = pd.to_datetime(
        df['publish_time'], errors='coerce', utc=True
    )

    # Make coercion losses visible instead of letting them pass silently.
    unparsed = int((df['publish_time'].isna() & ~missing_before).sum())
    if unparsed:
        print(f"Warning: {unparsed} 'publish_time' value(s) could not be parsed and were set to NaT.")

# NOTE(review): 'trending_date' values such as 17.14.11 are left as plain
# strings by this step - confirm whether they should be parsed as well.
print("Date formats corrected.")


Date formats corrected.


Step 6: Filter Unwanted Data (Example Rule)

In [7]:
# Example filter: remove videos with fewer views than MIN_VIEWS.
# The threshold is named so that it can be tuned in one place.
MIN_VIEWS = 1000

if 'views' in df.columns:
    rows_before = len(df)

    # Rows whose 'views' value is missing compare as False and are dropped too.
    # .copy() makes the result an independent frame, so assigning to its
    # columns in later cells does not trigger SettingWithCopyWarning.
    df = df.loc[df['views'] >= MIN_VIEWS].copy()

    # Report the effect of the filter so the reader can judge the rule.
    print(f"Removed {rows_before - len(df)} row(s) with fewer than {MIN_VIEWS} views.")

print("Unwanted data filtered.")


Unwanted data filtered.


In [8]:
# Show the first five rows of the current frame as the cell's output
df.head(n=5)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13 17:13:01+00:00,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13 07:30:00+00:00,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12 19:05:24+00:00,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO Ã¢ÂÂ¶ \n\nSUBSCRIBE Ã...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13 11:00:04+00:00,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12 18:01:41+00:00,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


Step 7: Save Cleaned Data

In [9]:
# Write the current DataFrame to a new CSV file.
# index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf='USvideos_cleaned_final.csv',
    index=False,
)

print("Cleaned data saved as 'USvideos_cleaned_final.csv'.")


Cleaned data saved as 'USvideos_cleaned_final.csv'.
