In [24]:
#Import all necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

In [25]:
#Load the dataset and look through the first 5 records
# netflix_data is the frame exactly as read from disk.
netflix_data = pd.read_csv('netflix1.csv')

# read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame adds
# nothing and, depending on the pandas version, can leave both names sharing
# the same underlying data. An explicit copy guarantees that the cleaning and
# feature-engineering cells below only ever change net_df.
net_df = netflix_data.copy()

net_df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [26]:
#Drop duplicates via show_id
# keep='first' retains one row for every show_id. keep=False would instead
# discard *all* rows whose show_id is repeated, removing those titles from the
# analysis entirely.
# The result is reassigned (no inplace mutation) and the index is renumbered so
# it stays a contiguous 0..n-1 range after any rows are removed.
net_df = (
    net_df
    .drop_duplicates(subset = 'show_id', keep = 'first')
    .reset_index(drop = True)
)
net_df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [27]:
#Print a summary of net_df: index range, column names, non-null counts,
#dtypes and memory usage
net_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB


In [28]:
#Parse the date_added strings into datetime values, store them back in the
#same column, then re-run info() to confirm the dtype changed
parsed_dates = pd.to_datetime(net_df['date_added'])
net_df['date_added'] = parsed_dates
net_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8790 non-null   object        
 1   type          8790 non-null   object        
 2   title         8790 non-null   object        
 3   director      8790 non-null   object        
 4   country       8790 non-null   object        
 5   date_added    8790 non-null   datetime64[ns]
 6   release_year  8790 non-null   int64         
 7   rating        8790 non-null   object        
 8   duration      8790 non-null   object        
 9   listed_in     8790 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 686.8+ KB


In [29]:
#Separate date_added into new columns: weekday name, month name and year
added_on = net_df['date_added'].dt  # datetime accessor shared by the three derived columns
net_df['Day_added'] = added_on.day_name()
net_df['month_added'] = added_on.month_name()
net_df['year_added'] = added_on.year
net_df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in,Day_added,month_added,year_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,2021-09-25,2020,PG-13,90 min,Documentaries,Saturday,September,2021
1,s3,TV Show,Ganglands,Julien Leclercq,France,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",Friday,September,2021
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,2021-09-24,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",Friday,September,2021
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,2021-09-22,2021,TV-PG,91 min,"Children & Family Movies, Comedies",Wednesday,September,2021
4,s8,Movie,Sankofa,Haile Gerima,United States,2021-09-24,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies",Friday,September,2021


In [30]:
# Share of each show type. normalize=True yields fractions that sum to 1,
# stored in a two-column frame: 'type' and 'percentage'.
type_counts = (
    net_df['type']
    .value_counts(normalize=True)
    .rename_axis('type')
    .reset_index(name='percentage')
)
type_counts.head()

Unnamed: 0,type,percentage
0,Movie,0.696928
1,TV Show,0.303072


In [31]:
#Top 20 countries who produced the most Tv shows and Movies combined
# Rows whose country is the placeholder 'Not Given' are excluded.
known_countries = net_df[net_df['country'] != 'Not Given']

# Number of titles per (country, type), largest first.
top_countries = known_countries.groupby(['country', 'type'])['show_id'].count()
top_countries = top_countries.sort_values(ascending = False)

# Rank countries by their combined Movie + TV Show total. Taking the first 20
# (country, type) rows instead would return fewer than 20 distinct countries
# and could drop one of a country's two types from the chart.
top_20_names = known_countries['country'].value_counts().head(20).index

# Keep every (country, type) row belonging to those 20 countries, still
# ordered by count so the largest bars come first.
top_20 = top_countries.reset_index(name = 'shows')
top_20 = top_20[top_20['country'].isin(top_20_names)].reset_index(drop = True)
top_20.head()

Unnamed: 0,country,type,shows
0,United States,Movie,2395
1,India,Movie,976
2,United States,TV Show,845
3,United Kingdom,Movie,387
4,Pakistan,TV Show,350


In [32]:
#Top 10 (director, type) pairs by number of titles
# Rows whose director is the placeholder 'Not Given' are left out; the
# remaining titles are counted per director and show type, largest first.
directors = (
    net_df.loc[net_df['director'].ne('Not Given')]
    .groupby(['director', 'type'])['show_id']
    .count()
    .sort_values(ascending = False)
)
top_10 = directors.reset_index(name = 'shows').head(10)
top_10.head()

Unnamed: 0,director,type,shows
0,Rajiv Chilaka,Movie,19
1,"Raúl Campos, Jan Suter",Movie,18
2,Suhas Kadav,Movie,16
3,Marcus Raboy,Movie,15
4,Alastair Fothergill,TV Show,14


In [33]:
#Top 15 Genres
# Each distinct listed_in string is counted as one genre entry (a title tagged
# "Dramas, International Movies" counts towards that combined label), split by
# show type and ordered from most to least frequent.
genres = (
    net_df
    .groupby(['listed_in', 'type'])['type']
    .count()
    .sort_values(ascending = False)
)
top_15 = genres.reset_index(name = 'genre_count').head(15)
top_15.head()

Unnamed: 0,listed_in,type,genre_count
0,"Dramas, International Movies",Movie,362
1,Documentaries,Movie,359
2,Stand-Up Comedy,Movie,334
3,"Comedies, Dramas, International Movies",Movie,274
4,"Dramas, Independent Movies, International Movies",Movie,252


In [34]:
#Count movies per rating
# 'type' stays in the grouping keys so it is carried through as a column.
movies_only = net_df.loc[net_df['type'].eq('Movie')]
movie_rating = (
    movies_only
    .groupby(['rating', 'type'])['rating']
    .count()
    .reset_index(name = 'count')
)
movie_rating.head()

Unnamed: 0,rating,type,count
0,G,Movie,41
1,NC-17,Movie,3
2,NR,Movie,75
3,PG,Movie,287
4,PG-13,Movie,490


In [35]:
#Count TV shows per rating
# 'type' stays in the grouping keys so it is carried through as a column.
tv_shows_only = net_df.loc[net_df['type'].eq('TV Show')]
show_rating = (
    tv_shows_only
    .groupby(['rating', 'type'])['rating']
    .count()
    .reset_index(name = 'count')
)
show_rating.head()

Unnamed: 0,rating,type,count
0,NR,TV Show,4
1,R,TV Show,2
2,TV-14,TV Show,730
3,TV-G,TV Show,94
4,TV-MA,TV Show,1143


In [36]:
#Number of titles added to the catalogue per year, split by show type
year = (
    net_df
    .groupby(['year_added', 'type'])['type']
    .count()
    .reset_index(name = 'count')
)
year.head()

Unnamed: 0,year_added,type,count
0,2008,Movie,1
1,2008,TV Show,1
2,2009,Movie,2
3,2010,Movie,1
4,2011,Movie,13


In [49]:
#Create dash app
net_dashapp = dash.Dash(__name__)

# (label shown to the user, value passed to the callbacks) for each chart.
graph_choices = [
    ('Percentage of Show Types', 'pie'),
    ('Top 20 Countries with the Most Tv Shows and Movies Produced', 'bar'),
    ('Top 10 Directors who produced the most Tv Shows/Movies', 'histogram'),
    ('Top 15 Genres', 'bubble'),
    ('Ratings Count', 'waterfall'),
    ('Release Per Year', 'line'),
]

# Dropdown that drives both callbacks; it cannot be cleared, so a chart is
# always selected.
graph_selector = dcc.Dropdown(
    id = 'graph-dropdown',
    options = [{'label': label, 'value': value} for label, value in graph_choices],
    value = 'pie',#Default
    clearable = False,
)

# Page structure: heading, chart selector, the chart itself and the
# accompanying written report.
net_dashapp.layout = html.Div([
    html.H1('Netflix Data Analysis', style = {'color': 'white'}),
    graph_selector,
    dcc.Graph(id = 'dynamic-graph'),
    html.Div(id = 'report-info', style = {'marginTop' : 20}),
])


In [50]:
#Create dynamic interface
@net_dashapp.callback(
    Output('dynamic-graph', 'figure'),
    [Input('graph-dropdown', 'value')]
)
def update_graph(selected_graph):
    """Build the figure that matches the current dropdown selection.

    Parameters
    ----------
    selected_graph : str
        Value of the 'graph-dropdown' component. Handled values are 'pie',
        'bar', 'histogram', 'bubble', 'waterfall' and 'line'.

    Returns
    -------
    plotly figure
        The chart rendered in the 'dynamic-graph' component.

    Raises
    ------
    dash.exceptions.PreventUpdate
        If ``selected_graph`` is not one of the handled values, so the chart
        currently on screen is left unchanged instead of the callback failing
        on an unassigned ``fig``.
    """
    # Local import: the notebook's import cell only brings in dash, dcc, html,
    # Input and Output.
    from dash.exceptions import PreventUpdate

    # Shared palette for every plotly express chart.
    colours = ['#436272', '#A3BBC8', '#243F4D', '#FCD7B6', '#AD886E']

    def rating_waterfall(rating_counts):
        """Return a waterfall trace with one relative step per rating row."""
        return go.Waterfall(
            x = rating_counts['rating'],
            y = rating_counts['count'],
            # One 'relative' entry per row. A fixed five-item list ending in
            # 'total' would draw the fifth rating as a running total rather
            # than its own count and would not match the number of ratings.
            measure = ['relative'] * len(rating_counts),
            # Label every bar with the count from the same frame.
            text = rating_counts['count'],
            textposition = 'outside',
            connector = dict(line = dict(color = "RoyalBlue")),
        )

    #Create Pie chart
    if selected_graph == 'pie':
        fig = px.pie(type_counts, values = 'percentage', names = 'type', title ='Percentage of Show Types', color_discrete_sequence = colours)

    #Create Bar Graph
    elif selected_graph == 'bar':
        fig = px.bar(top_20, x = 'country', y = 'shows', color = 'type', title = "Top 20 Countries with the Most Tv Shows and Movies Produced", color_discrete_sequence = colours)
        # Show Movie and TV Show bars side by side for each country.
        fig.update_layout(barmode = 'group')

    #Create Histogram
    elif selected_graph == 'histogram':
        fig = px.histogram(top_10, x = 'director', y = 'shows', color = 'type', title = "Top 10 Directors who produced the most Tv Shows/Movies", color_discrete_sequence = colours)

    #Create Bubble chart
    elif selected_graph == 'bubble':
        # size scales each marker by its count, which is what makes this a
        # bubble chart rather than a plain scatter plot.
        fig = px.scatter(top_15, x = 'listed_in', y = 'genre_count', size = 'genre_count', color = 'type', title = 'Top 15 Genres', color_discrete_sequence = colours)

    #Create Waterfall graph
    elif selected_graph == 'waterfall':
        fig = make_subplots(rows = 1, cols = 2, subplot_titles = ("Movies", "Tv Shows"))
        fig.add_trace(rating_waterfall(movie_rating), row = 1, col = 1)
        fig.add_trace(rating_waterfall(show_rating), row = 1, col = 2)
        fig.update_layout(title_text = 'Movie and Tv Show Ratings')

    #Create Line Graph
    elif selected_graph == 'line':
        fig = px.line(year, x = 'year_added', y = 'count', color = 'type', title = "Release Per Year", color_discrete_sequence = colours)

    else:
        # Unknown selection: keep whatever chart is currently displayed.
        raise PreventUpdate

    return fig




In [51]:
@net_dashapp.callback(
    Output('report-info', 'children'),
    [Input('graph-dropdown', 'value')]
)
def update_report(selected_graph):
    """Return the written report that accompanies the selected chart.

    Parameters
    ----------
    selected_graph : str
        Value of the 'graph-dropdown' component. Handled values are 'pie',
        'bar', 'histogram', 'bubble', 'waterfall' and 'line'.

    Returns
    -------
    list
        Dash HTML components (headings and paragraphs) placed inside the
        'report-info' container.

    Raises
    ------
    dash.exceptions.PreventUpdate
        If ``selected_graph`` is not one of the handled values, so the report
        currently on screen is left unchanged instead of the callback failing
        on an unassigned ``report``.
    """
    # Local import: the notebook's import cell only brings in dash, dcc, html,
    # Input and Output.
    from dash.exceptions import PreventUpdate

    # Every heading and paragraph shares the same text colour.
    style = {'color': 'white'}

    if selected_graph == 'pie':
        report = [
            html.H3("Percentage of Show Types Report", style = style),
            html.P('This pie chart shows the percentage distribution of different types of Netflix Shows.', style = style),
            html.P('You can see the proportion of movies and Tv shows available on Netflix.', style = style),
            html.P('Between the years 2008 and 2021, it can be seen that there was a 39.4% increase in movie uploads.', style = style)
        ]

    elif selected_graph == 'bar':
        report = [
            html.H3("Top 20 Countries with the Most Tv Shows and Movies Produced", style = style),
            html.P('This bar chart shows the top 20 countries by the number of netflix shows.', style = style),
            html.P('The bars are grouped by show type (Movie or Tv Show).', style = style),
            html.P('Netflix has the most movies from the United States, followed by India in second place and the United Kingdom in third place.', style = style),
            html.P('It also has the most Tv shows from the United States, followed by Pakistan in second place and the United Kingdom in third place.', style = style)
        ]

    elif selected_graph == 'histogram':
        report = [
            html.H3("Top 10 Directors who produced the most Tv Shows/Movies", style = style),
            html.P('This histogram shows the top 10 directors by the number of Netflix shows.', style = style)
        ]

    elif selected_graph == 'bubble':
        report = [
            html.H3("Top 15 Genres Report", style = style),
            html.P('This bubble plot shows the top 15 genres across netflix shows.', style = style),
            html.P("It can be seen that Dramas are the most popular, followed by Documentaries and Stand-Up Comedy.", style = style)
        ]

    elif selected_graph == 'waterfall':
        report = [
            html.H3("Ratings Count Report.", style = style),
            html.P('This Waterfall chart shows the distribution of Netflix show ratings.', style = style),
            html.P('You can observe the frequency of different ratings assigned to the shows.', style = style),
            html.H2('Movie Ratings', style = style),
            html.P('The movie rating chart reveals that 33.7% of movies are rated TV-MA. This indicates that the show is meant for mature audiences.', style = style),
            html.P('The second most movies are classified TV-14, which shows that Parental Guidelines indicates content for mature audiences.', style = style),
            html.P('The third most rated movies are rated R, which implies that they are not acceptable for children to watch due to violence, foul language, or sexual activity.', style = style),
            html.H2('Tv Show Ratings', style = style),
            html.P('The Tv rating chart reveals that 42.9% of Tv shows are TV-MA, which means that the broadcasts are meant for mature audiences.', style = style),
            html.P('The second most Tv shows are rated TV-14, implying that Parental Guidelines indicate content for mature audiences.', style = style),
            html.P('The third most Tv shows are rated TV-PG, which denotes under parental guidance.', style = style)
        ]

    elif selected_graph == 'line':
        report = [
            html.H3("Release Per Year Report", style = style),
            html.P('This line chart shows the distributions of Movies and Tv shows added over the years.', style = style),
            html.P('It can be observed that Netflix began to add Tv shows in large quantities from 2014', style = style)
        ]

    else:
        # Unknown selection: keep whatever report is currently displayed.
        raise PreventUpdate

    return report
    

In [52]:
#Run Dash App
if __name__ == '__main__':
    # Newer Dash releases expose `run` and have retired `run_server`; older
    # releases only provide `run_server`. Prefer `run` and fall back when it
    # is missing. `or` short-circuits, so `run_server` is never looked up on
    # versions where `run` exists.
    start_server = getattr(net_dashapp, 'run', None) or net_dashapp.run_server
    start_server(debug = True)