In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ----------------------------------
# Global Styling (Professional Look)
# ----------------------------------
sns.set_theme(style="whitegrid", context="talk")
plt.rcParams["figure.figsize"] = (10, 6)

# ----------------------------------
# Paths
# ----------------------------------
DATA_PATH = "netflix_titles.csv"
OUTPUT_DIR = "eda_report"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ----------------------------------
# Load Dataset
# ----------------------------------
df = pd.read_csv(DATA_PATH)

# ----------------------------------
# Data Cleaning & Feature Engineering
# ----------------------------------
df = df.dropna(subset=["type", "release_year", "duration", "listed_in"])

# Convert date_added to datetime
df["date_added"] = pd.to_datetime(df["date_added"], errors="coerce")
df["year_added"] = df["date_added"].dt.year

# Extract numeric duration
df["duration_num"] = df["duration"].str.extract("(\d+)").astype(int)

# ----------------------------------
# 1. Content Distribution by Type
# ----------------------------------
plt.figure()
sns.countplot(data=df, x="type", palette="Set2")
plt.title("Distribution of Content by Type", pad=15)
plt.xlabel("Content Type")
plt.ylabel("Number of Titles")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/content_by_type.png", dpi=100)
plt.close()

# ----------------------------------
# 2. Content Growth Over Time (Release Year)
# ----------------------------------
yearly_content = df["release_year"].value_counts().sort_index()

plt.figure()
plt.plot(yearly_content.index, yearly_content.values, linewidth=2)
plt.title("Netflix Content Growth Over Time", pad=15)
plt.xlabel("Release Year")
plt.ylabel("Number of Titles")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/content_growth_over_time.png", dpi=100)
plt.close()

# ----------------------------------
# 3. Content Added Over Time (Netflix Expansion)
# ----------------------------------
added_year = df["year_added"].value_counts().sort_index()

plt.figure()
plt.plot(added_year.index, added_year.values, linewidth=2)
plt.title("Titles Added to Netflix Per Year", pad=15)
plt.xlabel("Year Added")
plt.ylabel("Number of Titles")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/titles_added_per_year.png", dpi=100)
plt.close()

# ----------------------------------
# 4. Top 10 Genres
# ----------------------------------
genres = df["listed_in"].str.split(", ").explode()
top_genres = genres.value_counts().head(10)

plt.figure()
sns.barplot(x=top_genres.values, y=top_genres.index, palette="viridis")
plt.title("Top 10 Most Common Genres on Netflix", pad=15)
plt.xlabel("Number of Titles")
plt.ylabel("Genre")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/top_10_genres.png", dpi=100)
plt.close()

# ----------------------------------
# 5. Movie Runtime Distribution
# ----------------------------------
movies = df[df["type"] == "Movie"]

plt.figure()
sns.histplot(movies["duration_num"], bins=30, kde=True)
plt.title("Movie Runtime Distribution", pad=15)
plt.xlabel("Duration (Minutes)")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/movie_runtime_distribution.png", dpi=100)
plt.close()

# ----------------------------------
# 6. Top 10 Years with Most Content
# ----------------------------------
top_years = df["release_year"].value_counts().head(10)

plt.figure()
sns.barplot(x=top_years.index, y=top_years.values, palette="magma")
plt.title("Top 10 Years with Highest Content Production", pad=15)
plt.xlabel("Release Year")
plt.ylabel("Number of Titles")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/top_10_release_years.png", dpi=100)
plt.close()

# ----------------------------------
# 7. Summary Report
# ----------------------------------
summary = f"""
NETFLIX DATASET ‚Äì EXPLORATORY DATA ANALYSIS SUMMARY
==================================================

Total Titles Analyzed: {len(df)}

Content Type Distribution:
{df['type'].value_counts().to_string()}

Top 10 Genres:
{top_genres.to_string()}

Top 10 Release Years:
{top_years.to_string()}

Key Insights:
- Movies dominate Netflix‚Äôs content library.
- Strong acceleration in content production post-2015.
- Drama and International content are the most frequent genres.
- Majority of movies range between 80‚Äì120 minutes.
- Netflix aggressively expanded its catalog between 2016‚Äì2020.
"""

with open(f"{OUTPUT_DIR}/summary.txt", "w", encoding="utf-8") as f:
    f.write(summary)

print("‚úÖ Professional EDA Report Generated Successfully!")
print(f"üìÅ Check the '{OUTPUT_DIR}' folder for plots and summary.")

  df["duration_num"] = df["duration"].str.extract("(\d+)").astype(int)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x="type", palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_genres.values, y=top_genres.index, palette="viridis")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_years.index, y=top_years.values, palette="magma")


‚úÖ Professional EDA Report Generated Successfully!
üìÅ Check the 'eda_report' folder for plots and summary.
