In [2]:
# Netflix Data Analytics Project
# Author: Anju Saradha
# Date: 2025

# ======================
# 1. Import libraries
# ======================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
pd.set_option('display.max_columns', None)

# ======================
# 2. Load the dataset
# ======================

file_path = "D:/Anju_phython/your_project/netflix_titles.csv"

df = pd.read_csv(file_path)

# Check top rows
print("Dataset preview:")
print(df.head())

# ======================
# 3. Explore dataset
# ======================
print("\nBasic info:")
print(df.info())

print("\nMissing values:")
print(df.isnull().sum())

print("\nShape of dataset:", df.shape)

# ======================
# 4. Data Cleaning (ETL)
# ======================

# Handle missing values
df['country'].fillna('Unknown', inplace=True)
df['cast'].fillna('Not Available', inplace=True)
df['director'].fillna('Not Available', inplace=True)

# Drop duplicate entries
df.drop_duplicates(inplace=True)

# Convert 'date_added' to datetime
df['date_added'] = pd.to_datetime(df['date_added'])

# Extract year from 'release_year'
df['release_year'] = df['release_year'].astype(int)

# ======================
# 5. Exploratory Data Analysis
# ======================

# 5.1 Distribution by content type
plt.figure(figsize=(6,4))
sns.countplot(x='type', data=df, palette='pastel')
plt.title('Movies vs TV Shows on Netflix')
plt.xlabel('Content Type')
plt.ylabel('Count')
plt.show()

# 5.2 Top 10 countries producing content
plt.figure(figsize=(10,5))
top_countries = df['country'].value_counts().head(10)
sns.barplot(x=top_countries.values, y=top_countries.index, palette='cool')
plt.title('Top 10 Countries Producing Netflix Content')
plt.xlabel('Count')
plt.ylabel('Country')
plt.show()

# 5.3 Trend over release years
plt.figure(figsize=(10,5))
df['release_year'].value_counts().sort_index().plot(kind='line')
plt.title('Content Release Trend Over Years')
plt.xlabel('Year')
plt.ylabel('Number of Titles')
plt.grid()
plt.show()

# ======================
# 6. Generate Insights
# ======================

# Most frequent genres
df['listed_in'] = df['listed_in'].astype(str)
genres = df['listed_in'].str.split(',').explode().str.strip()
top_genres = genres.value_counts().head(10)
print("\nTop 10 genres:\n", top_genres)

# Most common directors
top_directors = df['director'].value_counts().head(5)
print("\nTop 5 directors:\n", top_directors)

# ======================
# 7. Export cleaned data
# ======================
df.to_csv("netflix_cleaned.csv", index=False)
print("\nCleaned data saved as netflix_cleaned.csv")


Dataset preview:
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['country'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('Not Available', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

ValueError: time data " August 4, 2017" doesn't match format "%B %d, %Y", at position 1442. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
# =========================================
# NETFLIX DATA ANALYTICS PROJECT
# Author: Anju Saradha
# Date: 2025
# =========================================

# ---- 1. Import libraries ----
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

pd.set_option('display.max_columns', None)

# ---- 2. Load the dataset ----
# If using Colab, first upload the file using:
# from google.colab import files
# uploaded = files.upload()
# df = pd.read_csv("netflix_titles.csv")

# If using local environment, give full path OR keep file in same folder
file_path = "D:/Anju_phython/your_project/netflix_titles.csv"

df = pd.read_csv(file_path)

print("✅ Dataset loaded successfully!")
print("Rows, Columns:", df.shape)
print(df.head())

# ---- 3. Basic info & missing values ----
print("\n--- Dataset Info ---")
print(df.info())

print("\n--- Missing Values ---")
print(df.isnull().sum())

# ---- 4. Data Cleaning (ETL) ----
# Fill missing textual fields
df['country'].fillna('Unknown', inplace=True)
df['cast'].fillna('Not Available', inplace=True)
df['director'].fillna('Not Available', inplace=True)

# Drop duplicates
df.drop_duplicates(inplace=True)

# --- SAFE DATE & YEAR PARSING ---
df['date_added'] = (
    df['date_added']
      .astype(str)
      .str.strip()
      .str.replace(r'(\d)(st|nd|rd|th)', r'\1', regex=True)
      .replace({'nan': None})
)

try:
    df['date_added'] = pd.to_datetime(
        df['date_added'],
        format='mixed',
        errors='coerce',
        dayfirst=False
    )
except TypeError:
    df['date_added'] = pd.to_datetime(
        df['date_added'],
        errors='coerce',
        infer_datetime_format=True
    )

df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce').astype('Int64')
df['year'] = df['date_added'].dt.year.astype('Int64')
df['year'] = df['year'].fillna(df['release_year'])

print("\n✅ Cleaned date and year columns")

# ---- 5. Exploratory Data Analysis ----
plt.style.use('ggplot')

# 5.1 Distribution of Movies vs TV Shows
plt.figure(figsize=(6,4))
sns.countplot(x='type', data=df, palette='Set2')
plt.title('Movies vs TV Shows on Netflix')
plt.xlabel('Content Type')
plt.ylabel('Count')
plt.show()

# 5.2 Top 10 countries producing content
plt.figure(figsize=(10,5))
top_countries = df['country'].value_counts().head(10)
sns.barplot(x=top_countries.values, y=top_countries.index, palette='coolwarm')
plt.title('Top 10 Countries Producing Netflix Content')
plt.xlabel('Number of Titles')
plt.ylabel('Country')
plt.show()

# 5.3 Trend over years (using cleaned 'year' column)
plt.figure(figsize=(10,5))
df['year'].value_counts().sort_index().plot(kind='line', marker='o')
plt.title('Netflix Content Trend Over Years')
plt.xlabel('Year')
plt.ylabel('Number of Titles')
plt.grid(True)
plt.show()

# ---- 6. Generate Insights ----
df['listed_in'] = df['listed_in'].astype(str)
genres = df['listed_in'].str.split(',').explode().str.strip()
top_genres = genres.value_counts().head(10)
print("\n🎬 Top 10 Genres:\n", top_genres)

top_directors = df['director'].value_counts().head(5)
print("\n🎥 Top 5 Directors:\n", top_directors)

# ---- 7. Export cleaned data ----
df.to_csv("netflix_cleaned.csv", index=False)
print("\n✅ Cleaned dataset saved as netflix_cleaned.csv")

# ---- 8. Summary Insights ----
print("\n📊 Summary Insights:")
print("- Movies make up around", 
      round((df[df['type']=='Movie'].shape[0] / df.shape[0]) * 100, 1), "% of total content.")
print("- Top 3 countries producing content:", ", ".join(top_countries.index[:3]))
print("- Most popular genre:", top_genres.index[0])
print("- Content growth peaked around year:", df['year'].value_counts().idxmax())
