In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.offline as py


In [3]:
# Load the Netflix Originals dataset; the file is decoded as Latin-1 text.
df = pd.read_csv("NetflixOriginals.csv", encoding='latin1')

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(584, 6)

In [6]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       584 non-null    object 
 1   Genre       584 non-null    object 
 2   Premiere    584 non-null    object 
 3   Runtime     584 non-null    int64  
 4   IMDB Score  584 non-null    float64
 5   Language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


In [7]:
# Summary statistics for the numeric columns, transposed so each column is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Runtime,584.0,93.577055,27.761683,4.0,86.0,97.0,108.0,209.0
IMDB Score,584.0,6.271747,0.979256,2.5,5.7,6.35,7.0,9.0


In [8]:
# Summary (count / unique / top / freq) for the object-dtype columns, one column per row.
df.describe(include="O").T

Unnamed: 0,count,unique,top,freq
Title,584,584,Enter the Anime,1
Genre,584,115,Documentary,159
Premiere,584,390,"October 2, 2020",6
Language,584,38,English,401


In [9]:
# Number of missing values in each column.
df.isna().sum()

Title         0
Genre         0
Premiere      0
Runtime       0
IMDB Score    0
Language      0
dtype: int64

In [10]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

### Feature Engineering

In [12]:
# Parse the free-text 'Premiere' strings into datetimes. format='mixed' infers the
# format of each value individually; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
# NOTE(review): any NaT would flow into the derived date columns as missing values;
# consider checking df['date'].isna().sum() after this cell.
df['date'] = pd.to_datetime(df['Premiere'], format='mixed', errors='coerce')


In [13]:
# Derive calendar features from the parsed premiere date.
premiere = df['date'].dt
df['year_month'] = premiere.strftime('%Y-%m')
df['year'] = premiere.year
df['month'] = premiere.month
# Note: 'day' holds the day of the week (0 = Monday ... 6 = Sunday), not the day of the month.
df['day'] = premiere.dayofweek

In [15]:
# The raw 'Premiere' text is superseded by the parsed 'date' column.
# Rebind instead of mutating in place, and ignore an already-missing column so
# that re-running this cell does not raise KeyError.
df = df.drop(columns="Premiere", errors="ignore")

In [16]:
# Display the frame after the date features were added and 'Premiere' was dropped.
df

Unnamed: 0,Title,Genre,Runtime,IMDB Score,Language,date,year_month,year,month,day
0,Enter the Anime,Documentary,58,2.5,English/Japanese,2019-08-05,2019-08,2019,8,0
1,Dark Forces,Thriller,81,2.6,Spanish,2020-08-21,2020-08,2020,8,4
2,The App,Science fiction/Drama,79,2.6,Italian,2019-12-26,2019-12,2019,12,3
3,The Open House,Horror thriller,94,3.2,English,2018-01-19,2018-01,2018,1,4
4,Kaali Khuhi,Mystery,90,3.4,Hindi,2020-10-30,2020-10,2020,10,4
...,...,...,...,...,...,...,...,...,...,...
579,Taylor Swift: Reputation Stadium Tour,Concert Film,125,8.4,English,2018-12-31,2018-12,2018,12,0
580,Winter on Fire: Ukraine's Fight for Freedom,Documentary,91,8.4,English/Ukranian/Russian,2015-10-09,2015-10,2015,10,4
581,Springsteen on Broadway,One-man show,153,8.5,English,2018-12-16,2018-12,2018,12,6
582,Emicida: AmarElo - It's All For Yesterday,Documentary,89,8.6,Portuguese,2020-12-08,2020-12,2020,12,1


### Analysis 

In [17]:
# Number of distinct values in the Language column (combined entries such as
# 'English/Japanese' count as their own value).
df['Language'].nunique()

38

In [18]:
# Ten most frequent values of the Language column, most common first.
top_10_languages_used = df['Language'].value_counts().head(10)
top_10_languages_used

Language
English       401
Hindi          33
Spanish        31
French         20
Italian        14
Portuguese     12
Indonesian      9
Japanese        6
Korean          6
German          5
Name: count, dtype: int64

In [19]:
# Bar chart of the ten most common languages.
# Plot from a frame with explicitly named columns so the axis and hover labels
# come from the column names. The previous labels={'index': ...} mapping only
# matches when the Series index is unnamed, which is not the case for this
# value_counts result (its index is named 'Language').
language_counts = top_10_languages_used.rename_axis('Language').reset_index(name='Count')
fig = px.bar(language_counts, x='Language', y='Count')
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

In [20]:
# Distribution of IMDB scores across all titles.
fig = px.histogram(
    df,
    x='IMDB Score',
    title='IMDB Score of the Programs in Netflix',
)
fig.show()

In [21]:
# Number of titles premiered in each year.
Year = df['year'].value_counts()

# Plot from a frame with explicitly named columns so the axis labels are applied
# reliably. The previous labels={'index': 'Year'} mapping only matches when the
# Series index is unnamed.
year_counts = Year.rename_axis('Year').reset_index(name='Count')
fig = px.bar(
    year_counts,
    x='Year',
    y='Count',
    labels={'Count': 'Count of Movies in Each Year'},
)
# NOTE(review): 'Year' is numeric, so this category ordering probably has no
# effect on the axis — confirm whether chronological or by-count order is intended.
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [22]:
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

# Count titles per calendar month and force the result into Jan..Dec order.
# value_counts(sort=False) returns counts in order of first appearance, so pairing
# it positionally with the month names mislabelled the bars. reindex(1..12) aligns
# the counts with `months` and fills months that have no titles with 0, keeping
# both sequences at length 12.
Month = df['month'].value_counts().reindex(range(1, 13), fill_value=0)

fig = px.bar(x=months, y=Month.values, labels={'y':'Count of Movies in Each Month', 'x':'Month'})
fig.show()

In [24]:
day = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']

# Count titles per weekday and force the result into Mon..Sun order.
# df['day'] comes from .dt.dayofweek (0 = Monday ... 6 = Sunday). With
# value_counts(sort=False) the counts come back in order of first appearance, so
# pairing them positionally with the weekday names mislabelled the bars.
# reindex(0..6) aligns the counts with `day` and fills absent weekdays with 0.
days = df['day'].value_counts().reindex(range(7), fill_value=0)

fig = px.bar(x=day, y=days.values, labels={'y':'Count of Movies in Each Day', 'x':'Day'})
fig.show()

In [25]:
# Mean IMDB score per genre, keeping the ten highest averages.
genre_mean_score = df.groupby('Genre')['IMDB Score'].mean()
top_10_ratings_by_genre = genre_mean_score.sort_values(ascending=False).head(10)
top_10_ratings_by_genre

Genre
Animation/Christmas/Comedy/Adventure    8.200000
Musical / Short                         7.700000
Concert Film                            7.633333
Anthology/Dark comedy                   7.600000
Animation / Science Fiction             7.500000
Making-of                               7.450000
Action-adventure                        7.300000
Historical drama                        7.200000
Coming-of-age comedy-drama              7.200000
Drama-Comedy                            7.200000
Name: IMDB Score, dtype: float64

In [26]:
# Bar chart of the ten genres with the highest mean IMDB score.
fig = px.bar(
    top_10_ratings_by_genre,
    x=top_10_ratings_by_genre.index,
    y=top_10_ratings_by_genre.values,
    labels={'y': 'Average Rating Score', 'x': 'Genre'},
)
fig.show()

In [27]:
# Mean IMDB score per genre, keeping the ten lowest averages (ascending).
mean_score_ascending = df.groupby('Genre')['IMDB Score'].mean().sort_values()
lowest_10_ratings_by_genre = mean_score_ascending.head(10)
lowest_10_ratings_by_genre

Genre
Heist film/Thriller        3.700000
Musical/Western/Fantasy    3.900000
Horror anthology           4.300000
Political thriller         4.300000
Superhero-Comedy           4.400000
Science fiction/Drama      4.533333
Romance drama              4.600000
Mystery                    4.650000
Horror thriller            4.700000
Anime / Short              4.700000
Name: IMDB Score, dtype: float64

In [30]:
# Bar chart of the ten genres with the lowest mean IMDB score.
fig = px.bar(
    lowest_10_ratings_by_genre,
    x=lowest_10_ratings_by_genre.index,
    y=lowest_10_ratings_by_genre.values,
    labels={'y': 'Average Rating Score', 'x': 'Genre'},
)
fig.show()

In [33]:
# Pairwise correlation between IMDB score and runtime (2x2 matrix).
score_and_runtime = df[['IMDB Score', 'Runtime']]
corr_matrix = score_and_runtime.corr()

In [None]:
# Heatmap of the score/runtime correlation matrix.
# zmin/zmax pin the colour range to [-1, 1] so the diverging RdBu_r scale is
# centred on zero correlation instead of being stretched over the observed values.
# The figure is bound to `fig` and shown explicitly, like the other plot cells.
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
    labels={'color': 'Correlation'},
    title='Correlation between IMDB Score and Runtime',
)
fig.show()

In [35]:
# Scatter of runtime against IMDB score, one point per title.
fig = px.scatter(
    df,
    x='IMDB Score',
    y='Runtime',
)
fig.show()