In [3]:
# Import necessary libraries
import pandas as pd

# Load the dataset
df = pd.read_csv("netflix_titles.csv")

# STEP 1: Basic Overview
print("Initial Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())

# STEP 2: Rename Columns (clean & consistent)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# STEP 3: Handle Missing Values
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Not specified')
df['country'] = df['country'].fillna('Unknown')
df['rating'] = df['rating'].fillna('Unknown')

# ✅ FIX: Convert 'date_added' using automatic date parsing
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# Drop rows with essential fields missing
df.dropna(subset=['type', 'title'], inplace=True)

# STEP 4: Remove Duplicates
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")
df.drop_duplicates(inplace=True)

# STEP 5: Standardize Text Values
df['type'] = df['type'].str.title()
df['rating'] = df['rating'].str.upper()

# STEP 6: Convert Data Formats
df['release_year'] = df['release_year'].astype(int)

# STEP 7: Final Data Check
print("\nFinal Dataset Info:")
print(df.info())

# Save cleaned dataset
df.to_csv("netflix_titles_cleaned.csv", index=False)
print("\n✅ Cleaned dataset saved as 'netflix_titles_cleaned.csv'")


Initial Shape: (8807, 12)

Missing Values:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

Number of duplicate rows: 0

Final Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8807 non-null   object        
 1   type          8807 non-null   object        
 2   title         8807 non-null   object        
 3   director      8807 non-null   object        
 4   cast          8807 non-null   object        
 5   country       8807 non-null   object        
 6   date_added    8709 non-null   datetime64[ns]
 7   release_year  8807 non-null   int32         
 8   rating        8807 non-null  