<a href="https://colab.research.google.com/github/Ayben06/Netflix_Analysis_Report/blob/main/EDA_Netflix_TV_Shows.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the relevant libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Data Preprocessing

## Importing the Dataset

In [2]:
# Load the raw Netflix titles CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path only exists on Colab — the CSV
# must be uploaded to the Colab session before this cell runs.
data=pd.read_csv('/content/netflix_titles.csv')

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
# Column dtypes and non-null counts: shows which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


## Dealing with NaN Values

In [5]:
# Number of distinct values per column (cardinality check).
data.nunique()

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

In [6]:
# Missing values per column, smallest count first.
data.isna().sum().sort_values()

show_id            0
type               0
title              0
release_year       0
listed_in          0
description        0
duration           3
rating             4
date_added        10
cast             825
country          831
director        2634
dtype: int64

In [7]:
# Inspect the rows that have no rating before deciding how to handle them.
data.loc[data['rating'].isna()]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
5989,s5990,Movie,13TH: A Conversation with Oprah Winfrey & Ava ...,,"Oprah Winfrey, Ava DuVernay",,"January 26, 2017",2017,,37 min,Movies,Oprah Winfrey sits down with director Ava DuVe...
6827,s6828,TV Show,Gargantia on the Verdurous Planet,,"Kaito Ishikawa, Hisako Kanemoto, Ai Kayano, Ka...",Japan,"December 1, 2016",2013,,1 Season,"Anime Series, International TV Shows","After falling through a wormhole, a space-dwel..."
7312,s7313,TV Show,Little Lunch,,"Flynn Curry, Olivia Deeble, Madison Lu, Oisín ...",Australia,"February 1, 2018",2015,,1 Season,"Kids' TV, TV Comedies","Adopting a child's perspective, this show take..."
7537,s7538,Movie,My Honor Was Loyalty,Alessandro Pepe,"Leone Frisa, Paolo Vaccarino, Francesco Miglio...",Italy,"March 1, 2017",2015,,115 min,Dramas,"Amid the chaos and horror of World War II, a c..."


In [8]:
# List the distinct rating labels; anything that is not a rating code
# (e.g. a runtime such as '74 min') indicates a misplaced value.
data['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [9]:
# Drop the few rows without a rating.
# `.copy()` makes `data` an independent frame: the following cells assign into
# it, and assigning into a boolean-mask slice raises SettingWithCopyWarning
# (and is not guaranteed to write through).
data = data[data['rating'].notna()].copy()

In [10]:
# Inspect the rows that have no duration.
data.loc[data['duration'].isna()]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
5541,s5542,Movie,Louis C.K. 2017,Louis C.K.,Louis C.K.,United States,"April 4, 2017",2017,74 min,,Movies,"Louis C.K. muses on religion, eternal love, gi..."
5794,s5795,Movie,Louis C.K.: Hilarious,Louis C.K.,Louis C.K.,United States,"September 16, 2016",2010,84 min,,Movies,Emmy-winning comedy writer Louis C.K. brings h...
5813,s5814,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,Louis C.K.,United States,"August 15, 2016",2015,66 min,,Movies,The comic puts his trademark hilarious/thought...


In [11]:
# The rows with a missing duration carry their runtime in `rating`
# ('74 min', '84 min', '66 min'), so take the value from there.
data['duration'] = data['duration'].where(data['duration'].notna(), data['rating'])

In [12]:
# Sanity check: no row should be left without a duration.
data.loc[data['duration'].isna()]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description


In [13]:
# Some rows hold a runtime (e.g. '74 min') in `rating` instead of a rating code.
# Locate them by pattern rather than by hard-coded row labels, so the cell keeps
# working if the CSV is refreshed or re-indexed.
misplaced_rating = data['rating'].str.match(r'\d+ min$', na=False)

# Impute with the most frequent *valid* rating, computed once and excluding the
# misplaced runtime strings themselves.
most_common_rating = data.loc[~misplaced_rating, 'rating'].mode()[0]
data.loc[misplaced_rating, 'rating'] = most_common_rating

In [14]:
# Most frequent country for each content type (Movie / TV Show).
data.groupby('type')['country'].agg(pd.Series.mode)

type
Movie      United States
TV Show    United States
Name: country, dtype: object

In [15]:
# Impute missing countries with the overall most frequent country.
top_country = data['country'].mode()[0]
data['country'] = data['country'].fillna(top_country)

In [16]:
# Keep only the rows that have a date_added value.
has_date_added = data['date_added'].notna()
data = data.loc[has_date_added]

In [17]:
# `cast` and `director` have the most missing values; remove both columns.
data = data.drop(['cast', 'director'], axis='columns')

In [18]:
# Confirm that no missing values remain in any column.
data.isna().sum().sort_values()

show_id         0
type            0
title           0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [19]:
# Split e.g. '90 min' / '2 Seasons' at the first whitespace into the amount
# (kept in `duration`) and its unit (stored in `min`).
duration_parts = data['duration'].str.split(n=1, expand=True)
data['duration'] = duration_parts[0]
data['min'] = duration_parts[1]

In [20]:
# Convert the amount part of the duration to a number.
# errors='coerce' turns anything non-numeric into NaN instead of raising.
data['duration']=pd.to_numeric(data['duration'],errors='coerce')

In [21]:
# Average duration per unit ('Season', 'Seasons', 'min') as a sanity check of the split.
data.groupby('min')['duration'].mean()

min
Season      1.000000
Seasons     3.294387
min        99.572687
Name: duration, dtype: float64

In [22]:
# The year is the last space-separated token of e.g. 'September 25, 2021'.
date_tokens = data['date_added'].str.split(' ')
data['year_added'] = date_tokens.str[-1]
data['year_added'].head()

0    2021
1    2021
2    2021
3    2021
4    2021
Name: year_added, dtype: object

In [23]:
# The month is the first token of e.g. 'September 25, 2021'.
# Strip first: a date string with a stray leading space would otherwise split
# into ['', 'September', ...] and yield an empty month name.
data['month_added'] = data['date_added'].str.strip().str.split(' ').str[0]
# Show only a preview rather than the whole Series.
data['month_added'].head()

0       September
1       September
2       September
3       September
4       September
          ...    
8802     November
8803         July
8804     November
8805      January
8806        March
Name: month_added, Length: 8793, dtype: object

In [24]:
# Maps each content rating code to the audience group it targets.
ratings_ages = {
    # Kids
    'TV-Y': 'Kids',
    'TV-G': 'Kids',
    'G': 'Kids',
    # Older Kids
    'TV-Y7': 'Older Kids',
    'TV-Y7-FV': 'Older Kids',
    'TV-PG': 'Older Kids',
    'PG': 'Older Kids',
    # Teens
    'TV-14': 'Teens',
    'PG-13': 'Teens',
    # Adults
    'TV-MA': 'Adults',
    'R': 'Adults',
    'NC-17': 'Adults',
    'NR': 'Adults',
    'UR': 'Adults',
}

In [25]:
# Translate each rating code into its audience group via `ratings_ages`.
# replace() leaves a value untouched when it has no entry in the mapping, so
# the unique() check below verifies that only the four group labels remain.
data['targets_age']=data['rating'].replace(ratings_ages)
data['targets_age'].unique()

array(['Teens', 'Adults', 'Older Kids', 'Kids'], dtype=object)

In [26]:
# `country` may list several comma-separated countries; keep the first one.
country_parts = data['country'].str.split(',')
data['prinsipal_country'] = country_parts.str[0]
data['prinsipal_country'].head()

0    United States
1     South Africa
2    United States
3    United States
4            India
Name: prinsipal_country, dtype: object

In [27]:
# `type` only takes a couple of distinct values; store it as a categorical.
data['type'] = data['type'].astype('category')

In [28]:
# Fix the category order from youngest to oldest audience so sorting and
# plotting follow that order.
age_group_dtype = pd.CategoricalDtype(categories=['Kids', 'Older Kids', 'Teens', 'Adults'])
data['targets_age'] = data['targets_age'].astype(age_group_dtype)

# The year was extracted as text; convert it to a number.
data['year_added'] = pd.to_numeric(data['year_added'])

In [29]:
# Verify the dtype of every column after the conversions above.
data.dtypes

show_id                object
type                 category
title                  object
country                object
date_added             object
release_year            int64
rating                 object
duration                int64
listed_in              object
description            object
min                    object
year_added              int64
month_added            object
targets_age          category
prinsipal_country      object
dtype: object

In [30]:
# Preview the cleaned frame including the derived columns.
data.head()

Unnamed: 0,show_id,type,title,country,date_added,release_year,rating,duration,listed_in,description,min,year_added,month_added,targets_age,prinsipal_country
0,s1,Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,90,Documentaries,"As her father nears the end of his life, filmm...",min,2021,September,Teens,United States
1,s2,TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,2,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Seasons,2021,September,Adults,South Africa
2,s3,TV Show,Ganglands,United States,"September 24, 2021",2021,TV-MA,1,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Season,2021,September,Adults,United States
3,s4,TV Show,Jailbirds New Orleans,United States,"September 24, 2021",2021,TV-MA,1,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",Season,2021,September,Adults,United States
4,s5,TV Show,Kota Factory,India,"September 24, 2021",2021,TV-MA,2,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,Seasons,2021,September,Adults,India


In [31]:
# Turn the comma-separated `listed_in` string into a list of genres, removing
# the single space that may sit next to a comma.
genre_text = (
    data['listed_in']
    .str.replace(', ', ',', regex=False)
    .str.replace(' ,', ',', regex=False)
)
data['genre'] = genre_text.str.split(',')
data['genre'].head()

0                                      [Documentaries]
1    [International TV Shows, TV Dramas, TV Mysteries]
2    [Crime TV Shows, International TV Shows, TV Ac...
3                             [Docuseries, Reality TV]
4    [International TV Shows, Romantic TV Shows, TV...
Name: genre, dtype: object

In [32]:
# Separate frames for movies and TV shows, used by the per-type charts.
is_movie = data['type'] == 'Movie'
is_show = data['type'] == 'TV Show'
data_movie = data.loc[is_movie]
data_show = data.loc[is_show]

In [33]:
# Share of Movies vs TV Shows in the catalogue.
# Name the columns explicitly: the labels produced by
# `value_counts().reset_index()` differ between pandas versions
# ('index'/'type' before 2.0, 'type'/'count' from 2.0 on), so relying on the
# defaults breaks the chart on newer pandas.
type_counts = (
    data['type']
    .value_counts()
    .rename_axis('type')
    .reset_index(name='count')
)
fig = px.pie(type_counts, values='count', names='type',
             title='Share of Movies vs TV Shows')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [34]:
def generete_rating(data):
  """Count titles per (rating, target age group) pair.

  Parameters
  ----------
  data : pandas.DataFrame
      Must provide the columns 'rating', 'targets_age' and 'show_id'.

  Returns
  -------
  pandas.DataFrame
      Columns 'rating', 'targets_age' and 'counts', limited to pairs with a
      non-zero count and sorted by 'targets_age'.
  """
  pair_counts = (
      data.groupby(['rating', 'targets_age'])['show_id']
      .count()
      .reset_index(name='counts')
  )
  # Grouping on a categorical column can emit pairs that never occur
  # (count 0); keep only the pairs that actually have titles.
  occurring_pairs = pair_counts[pair_counts['counts'] != 0]
  return occurring_pairs.sort_values('targets_age')

In [35]:
# Number of titles per rating code, coloured by the audience group it maps to.
rating_data=generete_rating(data)
fig=px.bar(rating_data,x='rating',y='counts',color='targets_age')
fig.show()

In [55]:
# Audience-group distribution for movies and TV shows, side by side.
data_movie_rating = generete_rating(data_movie)
data_show_rating = generete_rating(data_show)

# Bind the subplot grid to `fig`. The original assigned it to `fIg` (typo), so
# the add_trace(..., row=, col=) calls below targeted the previous bar figure,
# which has no subplot grid.
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])

# Each trace gets a name because hoverinfo below includes 'name'.
fig.add_trace(
    go.Pie(labels=data_movie_rating['targets_age'],
           values=data_movie_rating['counts'],
           name='Movies'),
    row=1, col=1)

fig.add_trace(
    go.Pie(labels=data_show_rating['targets_age'],
           values=data_show_rating['counts'],
           name='TV Shows'),
    row=1, col=2)

# hole=0.4 renders both pies as donuts so the annotations fit in the centre.
fig.update_traces(textposition='inside', hole=0.4, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='Rating distribution by Type of content',
    annotations=[dict(text='Movies', x=0.16, y=0.5, font_size=12, showarrow=False),
                 dict(text='TV Shows', x=0.82, y=0.5, font_size=12, showarrow=False)])

fig.show()