In [1]:
import pandas as pd

In [2]:
# Read the catalogue CSV (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")

In [3]:
# Show (row count, column count) of the frame as loaded, before any cleaning.
print("Original shape:", (len(df.index), len(df.columns)))

Original shape: (8807, 12)


In [4]:
# Per-column count of missing entries, printed under a heading line.
print("Missing values before cleaning:\n", df.isna().sum())

Missing values before cleaning:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [5]:
# Drop every row that has no value in at least one of the three listed columns.
df = df.dropna(axis="index", how="any", subset=["date_added", "duration", "rating"])

In [6]:
# Replace missing director / cast / country entries with the "Unknown" sentinel.
# Later cells filter on this exact value to skip titles without credits.
df = df.fillna(value={"director": "Unknown", "cast": "Unknown", "country": "Unknown"})

In [7]:
# Split `duration` into its leading digits and the word that follows them.
df[['duration_value', 'duration_type']] = df['duration'].str.extract(pat=r'(\d+)\s*(\w+)', expand=True)
# The extracted digits are still strings; convert them to numbers (unparsable -> NaN).
df = df.assign(
    duration_value=lambda frame: pd.to_numeric(frame['duration_value'], errors='coerce')
)

In [8]:
# Parse `date_added` into datetimes.
# String values are stripped first: pandas >= 2.0 infers a single format from the
# first value, so a date string padded with whitespace would not match it and,
# under errors='coerce', would become NaT and be silently removed by the dropna
# below. Non-string values are passed through untouched, which keeps this cell
# re-runnable after `date_added` has already been converted further down.
df['date_added'] = pd.to_datetime(
    df['date_added'].map(lambda value: value.strip() if isinstance(value, str) else value),
    errors='coerce',
)
# Anything that still failed to parse is NaT; drop those rows.
df = df.dropna(subset=['date_added'])

# Derive the calendar parts while the column is still datetime64 (`.dt` needs it).
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month_name()

# Keep only the date part (plain `datetime.date` objects, object dtype).
# NOTE(review): presumably so the Excel export carries no time component - confirm.
df['date_added'] = df['date_added'].dt.date

In [9]:
# The main genre is the first comma-separated entry of `listed_in`, trimmed.
# Only the first piece is used, so a single split (n=1) is enough.
df['main_genre'] = df['listed_in'].str.split(',', n=1).str.get(0).str.strip()

In [10]:
# Build a (show_id, director) table with one row per individual director.
# Titles whose director is the "Unknown" sentinel are left out.
df_directors = (
    df.loc[df['director'].ne("Unknown"), ['show_id', 'director']]
    # Comma-separated names -> list of names per title.
    .assign(director=lambda credits: credits['director'].str.split(','))
    # One row per list element; show_id is repeated for each name.
    .explode('director')
    # Remove the whitespace left around each name by the comma split.
    .assign(director=lambda credits: credits['director'].str.strip())
)

In [11]:
# Build a (show_id, actor) table with one row per individual cast member.
# Titles whose cast is the "Unknown" sentinel are left out.
df_actors = (
    df.loc[df['cast'].ne("Unknown"), ['show_id', 'cast']]
    # Comma-separated names -> list of names per title.
    .assign(cast=lambda credits: credits['cast'].str.split(','))
    # One row per list element; show_id is repeated for each name.
    .explode('cast')
    # Trimmed name goes into a new `actor` column; the raw `cast` column is dropped.
    .assign(actor=lambda credits: credits['cast'].str.strip())
    .drop(columns='cast')
)

In [12]:
# Write the three frames to one workbook, one sheet each, without the index column.
with pd.ExcelWriter("netflix_data.xlsx", engine="xlsxwriter") as writer:
    for sheet_name, frame in (
        ("movies", df),
        ("directors", df_directors),
        ("actors", df_actors),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Confirmation message, printed once the workbook has been closed by the `with` block.
print("netflix_data.xlsx' created with sheets: movies, directors, actors")

netflix_data.xlsx' created with sheets: movies, directors, actors
