In [8]:
# 1️⃣ Prepare data for statistics & probability calculations
import pandas as pd
import numpy as np

df = pd.read_csv("../data/netflix_cleaned.csv", parse_dates=['date_added'])

# Ensure critical columns exist and have no surprising nulls
print("Rows:", len(df))
print("Columns:", df.columns.tolist())

# Prepare an exploded genre table (one row per show_id per genre)
if 'genres_exploded' not in globals():
    genres = df[['show_id','type','listed_in']].copy()
    genres['listed_in'] = genres['listed_in'].fillna('Unknown')
    genres_exploded = genres.assign(genre = genres['listed_in'].str.split(',')).explode('genre')
    genres_exploded['genre'] = genres_exploded['genre'].str.strip()
    # quick check
    print("Exploded genres rows:", len(genres_exploded))
else:
    print("genres_exploded already exists.")


Rows: 8807
Columns: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'date_added_raw', 'duration_int', 'duration_unit', 'year_added', 'month_added', 'content_age', 'is_modern', 'num_genres', 'has_known_director']
genres_exploded already exists.


In [17]:
# 2️⃣ Central tendency & dispersion for numeric columns

# For release_year
release_stats = df['release_year'].agg(['count','mean','median','std','min','max']).to_dict()
release_stats['mode'] = df['release_year'].mode().iloc[0] if not df['release_year'].mode().empty else np.nan

duration_stats = df['duration_int'].agg(['count','mean','median','std','min','max']).to_dict()
duration_stats['mode'] = df['duration_int'].mode().iloc[0] if not df['duration_int'].mode().empty else np.nan

release_stats, duration_stats


({'count': 8807.0,
  'mean': 2014.1801975701146,
  'median': 2017.0,
  'std': 8.819312130833966,
  'min': 1925.0,
  'max': 2021.0,
  'mode': 2018},
 {'count': 8804.0,
  'mean': 69.84688777828259,
  'median': 88.0,
  'std': 50.81482778918896,
  'min': 1.0,
  'max': 312.0,
  'mode': 1.0})

In [18]:
# 3️⃣ Distribution percentages

# Content type distribution (Movies vs TV Shows)
type_dist = df['type'].fillna('Unknown').value_counts().mul(100).round(2)  # percent
type_counts = df['type'].value_counts()

# Top genres distribution (percentage of titles mentioning the genre)
top_genres_pct = genres_exploded['genre'].value_counts(normalize=True).mul(100).round(2).head(20)

# Top countries distribution
countries = df[['show_id','country']].copy()
countries['country'] = countries['country'].fillna('Unknown')
countries_exploded = countries.assign(country = countries['country'].str.split(',')).explode('country')
countries_exploded['country'] = countries_exploded['country'].str.strip()
top_countries_pct = countries_exploded['country'].value_counts(normalize=True).mul(100).round(2).head(15)

type_dist, type_counts, top_genres_pct, top_countries_pct


(type
 Movie      69.62
 TV Show    30.38
 Name: proportion, dtype: float64,
 type
 Movie      6131
 TV Show    2676
 Name: count, dtype: int64,
 genre
 International Movies        14.24
 Dramas                      12.56
 Comedies                     8.66
 International Tv Shows       6.99
 Documentaries                4.50
 Action & Adventure           4.45
 Tv Dramas                    3.95
 Independent Movies           3.91
 Children & Family Movies     3.32
 Romantic Movies              3.19
 Tv Comedies                  3.01
 Thrillers                    2.99
 Crime Tv Shows               2.43
 Kids' Tv                     2.33
 Docuseries                   2.04
 Music & Musicals             1.94
 Romantic Tv Shows            1.91
 Horror Movies                1.85
 Stand-Up Comedy              1.78
 Reality Tv                   1.32
 Name: proportion, dtype: float64,
 country
 United States     34.01
 India              9.64
 Unknown            7.66
 United Kingdom     7.43
 Can