In [2]:
import pandas as pd

# Input/output locations (update INPUT_CSV to your actual file name/path).
INPUT_CSV = 'netflix_titles.csv'
OUTPUT_CSV = 'netflix_cleaned.csv'


def clean_netflix_titles(df):
    """Return a cleaned copy of a Netflix titles table.

    Steps, in order:
      1. Normalise column names (trimmed, lowercase, spaces -> underscores)
         so every later step can rely on the normalised names.
      2. Fill missing 'title' with 'Unknown'; drop rows without 'show_id'.
      3. Drop fully duplicated rows.
      4. Standardise text: 'country' is trimmed and lowercased (if present),
         'type' is trimmed and title-cased with the 'TV' acronym kept intact.
      5. Parse 'date_added' as '%B %d, %Y' (unparseable values become NaT)
         and cast 'release_year' to int (missing years become 0).
      6. Store 'duration' (if present) as text while keeping missing values
         missing.

    Args:
        df: DataFrame with at least the columns 'show_id', 'type', 'title',
            'date_added' and 'release_year' (any letter case, spaces allowed
            in place of underscores). The input frame is not modified.

    Returns:
        A ``(cleaned, duplicates_removed)`` tuple: the cleaned DataFrame and
        the number of duplicate rows that were dropped in step 3.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    cleaned = df.copy()

    # Normalise the header first; the steps below index by these names.
    cleaned.columns = [
        str(col).strip().lower().replace(' ', '_') for col in cleaned.columns
    ]

    # 1. Handle missing values: keep untitled rows, require an identifier.
    cleaned['title'] = cleaned['title'].fillna('Unknown')
    cleaned = cleaned.dropna(subset=['show_id'])

    # 2. Remove duplicate rows.
    duplicates_removed = int(cleaned.duplicated().sum())
    cleaned = cleaned.drop_duplicates()

    # 3. Standardise text values.
    if 'country' in cleaned.columns:
        cleaned['country'] = cleaned['country'].str.strip().str.lower()

    # str.title() alone would turn 'TV Show' into 'Tv Show'; restore the
    # acronym after title-casing.
    cleaned['type'] = (
        cleaned['type']
        .str.strip()
        .str.title()
        .str.replace(r'\bTv\b', 'TV', regex=True)
    )

    # 4. Convert dates. Strip first: with an explicit format, a value such as
    # ' August 4, 2017' would otherwise be coerced to NaT.
    cleaned['date_added'] = pd.to_datetime(
        cleaned['date_added'].str.strip(),
        errors='coerce',
        format='%B %d, %Y',
    )
    # Missing years are stored as 0 so the column can be a plain int.
    cleaned['release_year'] = cleaned['release_year'].fillna(0).astype(int)

    # 5. Fix data types: duration as text. A bare astype(str) would turn
    # missing values into the literal string 'nan', so mask them back out.
    if 'duration' in cleaned.columns:
        duration_missing = cleaned['duration'].isna()
        cleaned['duration'] = cleaned['duration'].astype(str).mask(duration_missing)

    return cleaned, duplicates_removed


# The guard is true in a notebook cell and when run as a script, so the cell
# behaves as before; it only keeps the file I/O from running on import.
if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv(INPUT_CSV)

    # Summary: count missing values per column before any cleaning
    missing_summary = df.isnull().sum()
    print("Missing values per column:\n", missing_summary)

    df, duplicates_count = clean_netflix_titles(df)
    print(f"Removed {duplicates_count} duplicate rows.")

    # Save cleaned data to new file
    df.to_csv(OUTPUT_CSV, index=False)

    # Short summary of changes:
    print("\nSummary of Cleaning:")
    for summary_line in (
        "- Filled missing 'title' with 'Unknown'",
        "- Dropped rows with missing 'show_id'",
        "- Removed duplicate rows",
        "- Standardized text in 'type' and 'country' columns",
        "- Converted 'date_added' to datetime, 'release_year' to int",
        "- Renamed columns to lowercase with underscores",
        "- Saved cleaned data to 'netflix_cleaned.csv'",
    ):
        print(summary_line)


Missing values per column:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64
Removed 0 duplicate rows.

Summary of Cleaning:
- Filled missing 'title' with 'Unknown'
- Dropped rows with missing 'show_id'
- Removed duplicate rows
- Standardized text in 'type' and 'country' columns
- Converted 'date_added' to datetime, 'release_year' to int
- Renamed columns to lowercase with underscores
- Saved cleaned data to 'netflix_cleaned.csv'
