# In [None]:

# Task 1: Data Cleaning and Preprocessing
# Dataset: Netflix Movies and TV Shows

import pandas as pd

def clean_netflix_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the Netflix titles table.

    Steps, in order:

    1. Normalise column headers (strip, lowercase, spaces -> underscores).
    2. Fill missing 'director', 'cast' and 'country' values with 'Unknown'.
    3. Parse 'date_added' into datetimes; unparseable values become NaT.
    4. Drop fully duplicated rows.
    5. Standardise 'type' (Title Case) and 'rating' (UPPER CASE).

    Args:
        df: Raw table as loaded from the CSV. It is not modified.

    Returns:
        A new DataFrame with the cleaning steps applied.

    Raises:
        KeyError: If one of the columns listed above is absent after the
            headers have been normalised.
    """
    cleaned = df.copy()

    # Headers are normalised first so that every lookup below uses the
    # canonical name even when the raw file has stray spaces or capitals.
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
    )

    # Assign the filled column back instead of `df[col].fillna(...,
    # inplace=True)`: the in-place form acts on a temporary selection, which
    # warns in pandas 2.x and leaves the frame untouched under Copy-on-Write.
    for column in ('director', 'cast', 'country'):
        cleaned[column] = cleaned[column].fillna('Unknown')

    # Strip surrounding whitespace before parsing; otherwise values such as
    # " August 4, 2017" may not match the inferred format and, with
    # errors='coerce', silently turn into NaT.
    cleaned['date_added'] = pd.to_datetime(
        cleaned['date_added'].str.strip(), errors='coerce'
    )

    cleaned = cleaned.drop_duplicates()

    cleaned['type'] = cleaned['type'].str.strip().str.title()
    cleaned['rating'] = cleaned['rating'].str.strip().str.upper()

    return cleaned


if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv("netflix_titles.csv")
    print("Initial Shape:", df.shape)
    # A bare `df.head()` is only rendered when it is the last expression of a
    # notebook cell, so print the preview explicitly.
    print(df.head())

    # Basic info
    df.info()

    # Check missing values
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Handle missing values, parse dates, drop duplicates, standardise text
    # columns and clean column names.
    df = clean_netflix_data(df)

    # Final check
    print("\nFinal Shape:", df.shape)
    df.info()

    # Save cleaned dataset
    df.to_csv("netflix_titles_cleaned.csv", index=False)

    # Summary of changes:
    summary = '''
Summary of Cleaning:

- Filled missing values in 'director', 'cast', 'country' with 'Unknown'.
- Converted 'date_added' to datetime format.
- Removed duplicate records.
- Standardized text formats in 'type' and 'rating' columns.
- Cleaned column headers (lowercase, underscores).
'''
    print(summary)
