In [1]:
import pandas as pd

In [3]:
# Load dataset
df = pd.read_csv("netflix_titles.csv")

In [5]:
# Step 1: Identify and handle missing values
print("Missing values before cleaning:\n", df.isnull().sum())

Missing values before cleaning:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [7]:
# Step 2: Remove duplicate rows
df_cleaned = df.drop_duplicates()

In [9]:
# Step 3: Standardize text values (e.g., strip and title-case country names)
df_cleaned['country'] = df_cleaned['country'].str.strip().str.title()

In [11]:
# Step 4: Convert date formats to a consistent type
df_cleaned['date_added'] = pd.to_datetime(df_cleaned['date_added'], errors='coerce')

In [13]:
# Step 5: Rename column headers to lowercase and remove spaces
df_cleaned.columns = df_cleaned.columns.str.strip().str.lower().str.replace(" ", "_")

In [15]:
# Step 6: Check and fix data types
df_cleaned['release_year'] = pd.to_numeric(df_cleaned['release_year'], errors='coerce')

In [17]:
print("\nMissing values after cleaning:\n", df_cleaned.isnull().sum())


Missing values after cleaning:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        98
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [19]:
print("\nColumn names:\n", df_cleaned.columns.tolist())


Column names:
 ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description']


In [21]:
print("\nData types:\n", df_cleaned.dtypes)


Data types:
 show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


In [23]:
# Handle missing values
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Not Specified')
df['country'] = df['country'].fillna(df['country'].mode()[0])  # Fill with most common country
df = df.dropna(subset=['date_added'])  # Drop rows with missing date
df['rating'] = df['rating'].fillna('Not Rated')
df['duration'] = df['duration'].fillna('Not Specified')

In [25]:
# Ensure correct data types
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')

In [29]:
#Final check
print("Missing values after full cleaning:\n", df.isnull().sum())

Missing values after full cleaning:
 show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [31]:
print("Cleaned dataset shape:", df.shape)

Cleaned dataset shape: (8797, 12)


In [33]:
#Save cleaned dataset
df.to_csv("netflix_titles_cleaned_v2.csv", index=False)

In [35]:
import os
print(os.getcwd())

C:\Users\debji
