In [2]:
# File: 2_data_cleaning.py

import pandas as pd

# Read the raw catalogue into a DataFrame.
df = pd.read_csv('NetflixClustering_DB.csv')

# Step 1 - descriptive text columns ('director', 'cast', 'country').
# Rows with gaps in these columns are kept; each gap is replaced by an
# explicit 'Unknown' placeholder instead of dropping the row.
for text_col in ('director', 'cast', 'country'):
    df[text_col] = df[text_col].fillna('Unknown')

# Step 2 - 'rating' and 'date_added'.
# Gaps are imputed with the column's most frequent value. The original notes
# state that only one value is missing in each column (not verified here).
# .mode() returns a Series, so [0] picks its first entry.
for sparse_col in ('rating', 'date_added'):
    most_common = df[sparse_col].mode()[0]
    df[sparse_col] = df[sparse_col].fillna(most_common)

# Step 3 - report the per-column null counts that remain after cleaning.
print("--- Missing Values After Cleaning ---")
remaining_nulls = df.isnull().sum()
print(remaining_nulls)

# Step 4 - persist the cleaned frame (without the index) for the next step.
df.to_csv('cleaned_netflix_data.csv', index=False)
print("\nCleaned data saved as 'cleaned_netflix_data.csv'")

--- Missing Values After Cleaning ---
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

Cleaned data saved as 'cleaned_netflix_data.csv'
