In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns

In [17]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)

# Location of the Netflix titles CSV. Set the NETFLIX_CSV_PATH environment
# variable to load the file from somewhere else; when it is unset, the
# original machine-specific location is used as the fallback.
DATA_PATH = os.environ.get(
    'NETFLIX_CSV_PATH',
    '/Users/abughdaryan/Documents/repos/asds/ASDS-DataVisualization/project/data/netflix1.csv',
)

df = pd.read_csv(DATA_PATH)

# Unparseable dates become NaT instead of raising.
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# De-duplicate on a case- and whitespace-insensitive version of the title,
# keeping the first occurrence of each.
df['normalized_title'] = df['title'].str.lower().str.strip()
df = df.drop_duplicates(subset='normalized_title', keep='first')

# Turn the 'Not Given' placeholder into a real missing value.
for placeholder_col in ['country', 'director', 'rating']:
    df[placeholder_col] = df[placeholder_col].replace('Not Given', np.nan)

# Remove the letter 's' from show_id so it can be cast to int, then use it
# as a sorted index.
df['show_id'] = df['show_id'].str.replace('s', '', regex=False).astype(int)
df = df.set_index('show_id').sort_index()

# Split the date a title was added into year and month parts.
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month

# The helper column and the raw date are not used after this point.
df = df.drop(columns=['normalized_title', 'date_added'])

# Quick overview of the cleaned frame.
print("DataFrame Info:")
df.info()
print("\nMissing Values (%):")
print((df.isnull().sum() / len(df)) * 100)
print("\nFirst 5 Rows:")
print(df.head())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 8781 entries, 1 to 8807
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8781 non-null   object
 1   title         8781 non-null   object
 2   director      6197 non-null   object
 3   country       8494 non-null   object
 4   release_year  8781 non-null   int64 
 5   rating        8781 non-null   object
 6   duration      8781 non-null   object
 7   listed_in     8781 non-null   object
 8   year_added    8781 non-null   int32 
 9   month_added   8781 non-null   int32 
dtypes: int32(2), int64(1), object(7)
memory usage: 686.0+ KB

Missing Values (%):
type             0.000000
title            0.000000
director        29.427172
country          3.268420
release_year     0.000000
rating           0.000000
duration         0.000000
listed_in        0.000000
year_added       0.000000
month_added      0.000000
dtype: float64

First 5 Rows:
     

### Content Type Distribution (Movie vs. TV Show)

In [20]:
# Number of titles for each value of the 'type' column.
content_counts = df["type"].value_counts()

# Donut chart (hole=0.3) of the share of each content type, with the
# percentage and label drawn inside every slice.
fig_pie = (
    px.pie(
        title="Distribution of Content Types on Netflix",
        names=content_counts.index,
        values=content_counts.values,
        hole=0.3,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    .update_traces(textposition="inside", textinfo="percent+label")
    .update_layout(legend_title_text="Content Type", title_x=0.5)
)

fig_pie.show()

In [21]:
# Titles per 'director' value, most frequent first (missing directors excluded).
directors_count = df['director'].dropna().value_counts()

# Ten most frequent entries as a two-column frame: director, count.
top_10_directors = (
    directors_count
    .head(10)
    .rename_axis('director')
    .reset_index(name='count')
)

# Bar chart coloured by the same count that sets the bar height.
fig_directors = px.bar(
    top_10_directors,
    x='director',
    y='count',
    text='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Reds,
    title='Top 10 Directors on Netflix by Number of Titles',
    labels={'count': 'Number of Titles', 'director': 'Director'},
)

fig_directors.update_layout(
    title_x=0.5,
    xaxis_tickangle=-45,
    xaxis_title="Director",
    yaxis_title="Number of Titles",
).update_traces(textposition='outside')

fig_directors.show()

### Top 10 Countries Producing Content

In [22]:
# Count titles by the first country named in each comma-separated
# 'country' value; rows without a country are left out.
country_counts = (
    df['country']
    .dropna()
    .str.split(',')
    .str[0]
    .str.strip()
    .value_counts()
)

# Ten most frequent countries as a two-column frame: country, count.
top_10_countries = country_counts.head(10).reset_index()
top_10_countries.columns = ['country', 'count']

fig_countries = px.bar(
    top_10_countries,
    x='country',
    y='count',
    text='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Blues,
    title='Top 10 Countries Producing Content on Netflix (Based on First Country Listed)',
    labels={'count': 'Number of Titles', 'country': 'Country'},
)

fig_countries.update_layout(
    title_x=0.5,
    xaxis_tickangle=-30,
    xaxis_title="Country",
    yaxis_title="Number of Titles",
).update_traces(textposition='outside')

fig_countries.show()

### Content Release Over the Years (Based on 'release_year')

In [24]:
# Number of titles per 'release_year', ordered chronologically by year.
released_year_counts = df['release_year'].value_counts().sort_index()

fig_release_year = px.line(
    x=released_year_counts.index,
    y=released_year_counts.values,
    title='Netflix Content Release Trend Over the Years',
    # The counts are grouped by 'release_year', so the y label must say
    # "Released"; it previously read "Added", which disagreed with the axis
    # title set below and mislabelled the hover text.
    labels={'x': 'Release Year', 'y': 'Number of Titles Released'},
    markers=True
)

fig_release_year.update_layout(
    xaxis_title="Release Year",
    yaxis_title="Number of Titles Released",
    title_x=0.5
)

fig_release_year.show()

### Monthly Content Additions (Based on 'year_added' and 'month_added')

In [25]:
# Keep only rows that have both parts of the "added" date.
monthly_added = df.dropna(subset=['year_added', 'month_added'])

# Titles per (year, month, type). Unstacking with fill_value=0 gives every
# year/month row a value for each type, including types with no titles.
monthly_counts = (
    monthly_added
    .groupby(['year_added', 'month_added', 'type'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Back to long form (one row per year/month/type) with a zero-padded
# 'YYYY-MM' key, so sorting the key as text is also chronological.
monthly_melted = (
    monthly_counts
    .melt(id_vars=['year_added', 'month_added'], var_name='type', value_name='count')
    .assign(
        year_month=lambda frame: (
            frame['year_added'].astype(str)
            + '-'
            + frame['month_added'].astype(str).str.zfill(2)
        )
    )
    .sort_values('year_month')
)


fig_monthly_added = px.line(
    monthly_melted,
    x='year_month',
    y='count',
    color='type',
    markers=True,
    color_discrete_map={'Movie': 'red', 'TV Show': 'black'},
    title='Monthly Content Additions on Netflix (Movies vs. TV Shows)',
    labels={'count': 'Number of Titles Added', 'year_month': 'Year-Month Added', 'type': 'Content Type'},
).update_layout(
    title_x=0.5,
    xaxis_tickangle=-45,
    xaxis_title="Year-Month Added",
    yaxis_title="Number of Titles Added",
)

fig_monthly_added.show()

### Movie Duration Distribution

In [26]:
# Movies only, with the ' min' text removed from 'duration' and the rest
# parsed as a number; values that do not parse become NaN and are dropped.
is_movie = df["type"] == "Movie"
movies_df = df[is_movie].copy()
duration_text = movies_df["duration"].str.replace(" min", "")
movies_df["duration_min"] = pd.to_numeric(duration_text, errors="coerce")
movies_df = movies_df.dropna(subset=["duration_min"])


# Histogram of durations with a box plot in the margin.
fig_movie_duration_hist = px.histogram(
    movies_df,
    x="duration_min",
    marginal="box",
    color_discrete_sequence=["coral"],
    title="Distribution of Movie Durations",
    labels={"duration_min": "Duration (minutes)"},
).update_layout(
    title_x=0.5,
    xaxis_title="Duration (minutes)",
    yaxis_title="Number of Movies",
)
fig_movie_duration_hist.show()

# Stand-alone box plot of the same column, drawing only outlier points.
fig_movie_duration_box = px.box(
    movies_df,
    y="duration_min",
    points="outliers",
    color_discrete_sequence=["lightcoral"],
    title="Summary of Movie Durations",
    labels={"duration_min": "Duration (minutes)"},
).update_layout(title_x=0.5)
fig_movie_duration_box.show()

### TV Show Season Distribution

In [27]:
# TV shows only.
is_tv_show = df['type'] == 'TV Show'
tv_show_df = df[is_tv_show].copy()

# The first space-separated token of 'duration' is parsed as the season
# count; rows where it is not numeric are dropped before the int cast.
first_token = tv_show_df['duration'].str.split(' ').str.get(0)
tv_show_df['seasons'] = pd.to_numeric(first_token, errors='coerce')
tv_show_df = tv_show_df.dropna(subset=['seasons'])
tv_show_df['seasons'] = tv_show_df['seasons'].astype(int)

# Number of shows for each season count, ordered by season count.
season_counts = (
    tv_show_df['seasons']
    .value_counts()
    .rename_axis('seasons')
    .reset_index(name='count')
    .sort_values('seasons')
)

fig_tv_seasons = px.bar(
    season_counts,
    x='seasons',
    y='count',
    text='count',
    title='Distribution of TV Show Seasons on Netflix',
    labels={'seasons': 'Number of Seasons', 'count': 'Number of TV Shows'},
)

# Linear ticks with a step of 1 on the season axis.
fig_tv_seasons.update_layout(
    title_x=0.5,
    xaxis_title="Number of Seasons",
    yaxis_title="Number of TV Shows",
    xaxis={'tickmode': 'linear', 'dtick': 1},
).update_traces(textposition='outside')
fig_tv_seasons.show()

### Genre Analysis (Top 20 Genres)

In [28]:
# Each 'listed_in' value is a ', '-separated string; split it into a list
# of genre labels per title.
genre_list = df['listed_in'].dropna().str.split(', ')

# Flatten the per-title lists into one list of labels and tally them.
all_genres = genre_list.explode().tolist()
genre_counts = Counter(all_genres)

# The twenty most frequent genres as a two-column frame: genre, count.
top_20_records = genre_counts.most_common(20)
top_20_genres = pd.DataFrame(top_20_records, columns=['genre', 'count'])

# Horizontal bar chart, with the category axis ordered by total count.
fig_genres = px.bar(
    top_20_genres.sort_values(by='count', ascending=True),
    x='count',
    y='genre',
    orientation='h',
    text='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis,
    title='Top 20 Most Common Genres on Netflix',
    labels={'count': 'Number of Titles', 'genre': 'Genre'},
)

fig_genres.update_layout(
    title_x=0.5,
    xaxis_title="Number of Titles",
    yaxis_title="Genre",
    yaxis={'categoryorder':'total ascending'},
).update_traces(textposition='outside')
fig_genres.show()

### Distribution of Content Ratings

In [29]:
# Titles per rating value (missing ratings excluded), as a two-column
# frame: rating, count.
rating_tally = df['rating'].dropna().value_counts()
rating_counts = rating_tally.reset_index()
rating_counts.columns = ['rating', 'count']

# Bar chart with bars ordered from the largest total to the smallest.
fig_ratings = px.bar(
    rating_counts,
    x='rating',
    y='count',
    text='count',
    color='count',
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Distribution of Content Ratings on Netflix',
    labels={'rating': 'Rating', 'count': 'Number of Titles'},
)

fig_ratings.update_layout(
    title_x=0.5,
    xaxis_title="Rating Category",
    yaxis_title="Number of Titles",
    xaxis={'categoryorder':'total descending'},
).update_traces(textposition='outside')
fig_ratings.show()

### Trend of Content Types Added Over Years

In [30]:
# Titles added per year with one column per content type; year/type
# combinations with no titles are filled with 0.
type_trend = (
    df.dropna(subset=['year_added'])
    .groupby(['year_added', 'type'])
    .size()
    .unstack(fill_value=0)
)

# Area chart of the 'Movie' and 'TV Show' columns.
fig_type_trend_area = px.area(
    type_trend,
    x=type_trend.index,
    y=['Movie', 'TV Show'],
    color_discrete_map={'Movie': 'red', 'TV Show': 'black'},
    title='Trend of Content Types Added to Netflix Over Years',
    labels={'year_added': 'Year Added', 'value': 'Number of Titles Added', 'type': 'Content Type'},
).update_layout(
    title_x=0.5,
    xaxis_title="Year Added",
    yaxis_title="Number of Titles Added",
)
fig_type_trend_area.show()


# Same columns drawn as lines with markers.
fig_type_trend_line = px.line(
    type_trend,
    x=type_trend.index,
    y=['Movie', 'TV Show'],
    markers=True,
    color_discrete_map={'Movie': 'red', 'TV Show': 'black'},
    title='Trend of Content Types Added to Netflix Over Years (Absolute)',
    labels={'year_added': 'Year Added', 'value': 'Number of Titles Added', 'type': 'Content Type'},
).update_layout(
    title_x=0.5,
    xaxis_title="Year Added",
    yaxis_title="Number of Titles Added",
)
fig_type_trend_line.show()

### Treemap for Genre Distribution

In [31]:
# All genre tallies from `genre_counts` as a frame (genre, count), sorted
# with the most frequent genre first.
genre_pairs = genre_counts.items()
genre_counts_df = pd.DataFrame(genre_pairs, columns=['genre', 'count'])
genre_counts_df = genre_counts_df.sort_values(by='count', ascending=False)


# Treemap of the first 30 rows under a single constant root node; each
# tile shows its label, value and share of the root.
fig_treemap = (
    px.treemap(
        genre_counts_df.iloc[:30],
        path=[px.Constant("All Genres"), 'genre'],
        values='count',
        color='count',
        color_continuous_scale='viridis',
        title='Hierarchical Distribution of Top 30 Genres by Title Count',
    )
    .update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25}, title_x=0.5)
    .update_traces(textinfo = 'label+value+percent root')
)
fig_treemap.show()