# In [4]:
import pandas as pd

# Input/output locations. The input default matches the original Colab path.
INPUT_PATH = "/content/netflix_titles.csv"
OUTPUT_PATH = "Netflix_Titles_Cleaned_Formatted.csv"

# Final column layout of the cleaned dataset.
column_order = [
    'show_id', 'type', 'title', 'director', 'cast', 'country',
    'date_added', 'added_year', 'added_month',
    'release_year', 'rating', 'duration', 'listed_in', 'description'
]


def clean_netflix_titles(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, consistently formatted copy of the raw titles table.

    Steps: drop exact duplicate rows, normalise column names, fill missing
    categorical values, parse ``date_added`` (dropping rows whose date cannot
    be parsed), derive ``added_year`` / ``added_month``, cast
    ``release_year`` to int, trim text columns and reorder the columns to
    ``column_order``.

    Args:
        raw: Frame as loaded from the CSV. It is not modified.

    Returns:
        A new DataFrame containing exactly the columns in ``column_order``.

    Raises:
        KeyError: If a column required by the cleaning steps is missing.
        ValueError: If ``release_year`` holds values that cannot be cast to
            int (for example NaN).
    """
    # Remove duplicates; the copy keeps the caller's frame untouched.
    df = raw.drop_duplicates().copy()

    # Standardize column names (lowercase, underscores instead of spaces).
    # regex=False: a literal replacement, independent of the pandas default.
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
    )

    # Handle missing values.
    df['director'] = df['director'].fillna('Not Available')
    df['cast'] = df['cast'].fillna('Not Available')
    # mode() is empty when the column has no non-null value; indexing it
    # directly would raise, so fall back to the generic placeholder.
    country_modes = df['country'].mode()
    country_fill = (
        country_modes.iloc[0] if not country_modes.empty else 'Not Available'
    )
    df['country'] = df['country'].fillna(country_fill)
    df['rating'] = df['rating'].fillna('Not Rated')

    # Trim whitespace around date strings before parsing. pandas >= 2.0
    # infers one format from the first value, so with errors='coerce' an
    # entry such as ' August 4, 2017' would become NaT and the row would be
    # silently dropped below. Non-string values are passed through as-is.
    date_text = df['date_added'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    df['date_added'] = pd.to_datetime(date_text, errors='coerce')
    # Explicit copy so the assignments below never target a view.
    df = df.dropna(subset=['date_added']).copy()

    # Create new features: year and month added.
    df['added_year'] = df['date_added'].dt.year
    df['added_month'] = df['date_added'].dt.month

    # Convert 'release_year' to int (if not already).
    df['release_year'] = df['release_year'].astype(int)

    # Standardize text columns.
    df['type'] = df['type'].str.strip().str.title()
    df['title'] = df['title'].str.strip()
    df['rating'] = df['rating'].str.strip()

    # Reorder columns logically.
    return df[column_order]


# Run the pipeline when executed as a notebook cell or script; `df` stays
# available at module level for any code that follows.
if __name__ == "__main__":
    df = clean_netflix_titles(pd.read_csv(INPUT_PATH))
    df.to_csv(OUTPUT_PATH, index=False)
