In [None]:
import numpy as np
import pandas as pd
import plotly.express as px 

In [None]:
filelink = 'https://raw.githubusercontent.com/DevanshRathiji/datasets/main/netflix_titles.csv'
df = pd.read_csv(filelink)
#the dataset has been taken from kaggle 

In [None]:
df.head

<bound method NDFrame.head of      show_id     type                  title         director  \
0         s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1         s2  TV Show          Blood & Water              NaN   
2         s3  TV Show              Ganglands  Julien Leclercq   
3         s4  TV Show  Jailbirds New Orleans              NaN   
4         s5  TV Show           Kota Factory              NaN   
...      ...      ...                    ...              ...   
8802   s8803    Movie                 Zodiac    David Fincher   
8803   s8804  TV Show            Zombie Dumb              NaN   
8804   s8805    Movie             Zombieland  Ruben Fleischer   
8805   s8806    Movie                   Zoom     Peter Hewitt   
8806   s8807    Movie                 Zubaan      Mozez Singh   

                                                   cast        country  \
0                                                   NaN  United States   
1     Ama Qamata, Khosi Ngema, Gail Mabal

In [None]:
df.shape

(8807, 12)

In [None]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [None]:
rating_df = df.groupby(['rating']).size().reset_index(name='counts') 

In [None]:
rating_df.head

<bound method NDFrame.head of       rating  counts
0     66 min       1
1     74 min       1
2     84 min       1
3          G      41
4      NC-17       3
5         NR      80
6         PG     287
7      PG-13     490
8          R     799
9      TV-14    2160
10      TV-G     220
11     TV-MA    3207
12     TV-PG     863
13      TV-Y     307
14     TV-Y7     334
15  TV-Y7-FV       6
16        UR       3>

In [None]:
pieChart = px.pie(rating_df, values='counts', names='rating', title='Distribution of Content Ratings on Netflix', color_discrete_sequence=px.colors.qualitative.Set3)
pieChart.show()

In [None]:
dff = rating_df.copy()
dff.loc[dff['counts'] < 200, 'rating'] = 'Others'

In [None]:
pieChart = px.pie(dff, values='counts', names='rating', title='Distribution of Content Ratings on Netflix', color_discrete_sequence=px.colors.qualitative.Set3)
pieChart.show()

**OBSERVATION 1-**

The graph above shows that the majority of content on Netflix is categorized as “TV-MA”, which means that most of the content available on Netflix is intended for viewing by mature and adult audiences.

In [None]:
df['director']=df['director'].fillna('No Director Specified')
filtered_directors=pd.DataFrame()
filtered_directors=df['director'].str.split(',',expand=True).stack()
filtered_directors=filtered_directors.to_frame()
filtered_directors.columns=['Director']
directors=filtered_directors.groupby(['Director']).size().reset_index(name='Total Content')
directors=directors[directors.Director !='No Director Specified']
directors=directors.sort_values(by=['Total Content'],ascending=False)
directors

Unnamed: 0,Director,Total Content
4021,Rajiv Chilaka,22
4068,Raúl Campos,18
261,Jan Suter,18
4652,Suhas Kadav,16
3235,Marcus Raboy,16
...,...,...
2340,J. Davis,1
2341,J. Lee Thompson,1
2342,J. Michael Long,1
609,Smriti Keshari,1


In [None]:
directorsTop5=directors.head()
directorsTop5=directorsTop5.sort_values(by=['Total Content'])
fig1=px.bar(directorsTop5,x='Total Content',y='Director',title='Top 5 Directors on Netflix')
fig1.show()

**Observation 2-**

From the above graph it is derived that the top 5 directors on this platform are:

Raul Campos

Jan Suter

Jay Karas

Marcus Raboy

Jay Chapman

In [None]:
df['cast']=df['cast'].fillna('No Cast Specified')
filtered_cast=pd.DataFrame()
filtered_cast=df['cast'].str.split(',',expand=True).stack()
filtered_cast=filtered_cast.to_frame()
filtered_cast.columns=['Actor']
actors=filtered_cast.groupby(['Actor']).size().reset_index(name='Total Content')
actors=actors[actors.Actor !='No Cast Specified']
actors=actors.sort_values(by=['Total Content'],ascending=False)
actorsTop10=actors.head(10)
actorsTop10

Unnamed: 0,Actor,Total Content
2612,Anupam Kher,39
26941,Rupa Bhimani,31
30303,Takahiro Sakurai,30
15541,Julie Tejwani,28
23624,Om Puri,27
38446,Shah Rukh Khan,26
25410,Rajesh Kava,26
4186,Boman Irani,25
33367,Yuki Kaji,25
1905,Andrea Libman,25


In [None]:
actorsTop10=actorsTop10.sort_values(by=['Total Content'])
fig2=px.bar(actorsTop10,x='Total Content',y='Actor', title='Top 10 Actors on Netflix')
fig2.show()

In [None]:
content_df=df[['type','release_year']]
content_df=content_df.rename(columns={"release_year": "Release Year"})
content_df=content_df.groupby(['Release Year','type']).size().reset_index(name='Total Content')


In [None]:
content_df=content_df[content_df['Release Year']>=2008]
fig3 = px.line(content_df, x="Release Year", y="Total Content", color='type',title='Trend of content produced over the years on Netflix')
fig3.show()

In [None]:
from textblob import TextBlob # for sentiment analysis
dfsentiment=df[['release_year','description']]
dfsentiment=dfsentiment.rename(columns={'release_year':'Release Year'})
for index,row in dfsentiment.iterrows():
    z=row['description']
    testimonial=TextBlob(z)
    p=testimonial.sentiment.polarity
    if p==0:
        sent='Neutral'
    elif p>0:
        sent='Positive'
    else:
        sent='Negative'
    dfsentiment.loc[[index,2],'Sentiment']=sent


dfsentiment=dfsentiment.groupby(['Release Year','Sentiment']).size().reset_index(name='Total Content')

dfsentiment=dfsentiment[dfsentiment['Release Year']>=2010]
fig4 = px.bar(dfsentiment, x="Release Year", y="Total Content", color="Sentiment", title="Sentiment of content on Netflix")
fig4.show()