In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import re


In [2]:
# Step 2: Read the raw Netflix catalogue from CSV into a DataFrame
netflix = pd.read_csv(filepath_or_buffer='netflix_titles.csv')

In [3]:
# Step 3: Handle missing values
# Fill missing director, cast, and country with 'Unknown'.
# 'date_added' is deliberately left missing: a text placeholder such as
# '01-Jan-1900' does not match the '%B %d, %Y' format used when the column
# is parsed in Step 4, so it would be coerced to NaT anyway. Leaving the gap
# as-is yields the same NaT without implying a fake date.
netflix = netflix.fillna({
    'director': 'Unknown',
    'cast': 'Unknown',
    'country': 'Unknown',
})

In [4]:
# Step 4: Convert 'date_added' to datetime
# Strip surrounding whitespace from string entries first: the strict
# '%B %d, %Y' format rejects values such as ' August 4, 2017', and
# errors='coerce' would silently turn those rows into NaT.
# Non-string entries (NaN, or timestamps if this cell is re-run) pass through
# untouched, so unparseable or missing values still end up as NaT.
netflix['date_added'] = pd.to_datetime(
    netflix['date_added'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    ),
    format='%B %d, %Y',
    errors='coerce',
)


In [5]:
# Step 5: Derive the calendar year and the month name from 'date_added'
netflix = netflix.assign(
    year_added=netflix['date_added'].dt.year,
    month_added=netflix['date_added'].dt.month_name(),
)

In [6]:
# Step 6: Clean 'duration' column
# Create a new column for movie duration in minutes
def extract_duration(x):
    """Return the duration in minutes encoded in a string like '90 min'.

    Parameters
    ----------
    x : object
        A raw 'duration' cell value.

    Returns
    -------
    int or float
        The first run of digits in ``x`` as an ``int`` when ``x`` is a string
        containing 'min'; ``np.nan`` otherwise (missing values, non-string
        values, strings without 'min' such as '2 Seasons', or strings that
        contain 'min' but no digits).
    """
    # Non-strings (NaN, None, numbers) cannot be searched for 'min';
    # treat them as "no movie duration" instead of raising TypeError.
    if not isinstance(x, str) or 'min' not in x:
        return np.nan  # For TV shows, you can handle seasons separately if needed
    digits = re.search(r'\d+', x)
    # A string mentioning 'min' without any number has no usable duration.
    return int(digits.group()) if digits else np.nan

# Run every raw 'duration' value through extract_duration, element by element
netflix['duration_minutes'] = netflix['duration'].map(extract_duration)


In [7]:
# Step 7: Keep only the first row for each (title, type) pair
netflix = netflix.drop_duplicates(subset=['title', 'type'])

In [8]:
# Step 8: Optional – renumber rows 0..n-1 and discard the old index
netflix = netflix.reset_index(drop=True)

In [9]:
# Step 9: Print the first five rows of the cleaned frame as a sanity check
print(netflix.head(n=5))

  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water          Unknown   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans          Unknown   
4      s5  TV Show           Kota Factory          Unknown   

                                                cast        country  \
0                                            Unknown  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...        Unknown   
3                                            Unknown        Unknown   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

  date_added  release_year rating   duration  \
0 2021-09-25          2020  PG-13     90 min   
1 2021-09-24          2021  TV-MA  2 Seasons   
2 2021-09-24          2021  TV-MA   1 Season   
3 2021-0

In [10]:
# Write the cleaned frame to disk, leaving out the row index
netflix.to_csv(path_or_buf='netflix_cleaned.csv', index=False)
