# Netflix analysis 

The file `netflix_titles.csv` contains information about Netflix content.

## Load, examine, clean, prepare

In [2]:
# Load the raw CSV export located at ../raw_data/dashboard.csv into a DataFrame.
# NOTE(review): the notebook text refers to `netflix_titles.csv`; confirm that
# dashboard.csv is the same dataset under a different name.

import pandas as pd

data = pd.read_csv(filepath_or_buffer="../raw_data/dashboard.csv")

# Preview the first five rows to see which fields are available.
data.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [3]:
# Report the dimensions of the dataset (number of rows, number of columns).

n_rows, n_cols = len(data.index), len(data.columns)
print(f"The dataset has {n_rows} rows and {n_cols} columns.")

The dataset has 6234 rows and 12 columns.


In [4]:
# Print the name of every column, one per line.

print("The columns of this dataset are :")
# Iterating over a DataFrame yields its column labels.
for col_name in data:
    print(f"  - {col_name}")

The columns of this dataset are :
  - show_id
  - type
  - title
  - director
  - cast
  - country
  - date_added
  - release_year
  - rating
  - duration
  - listed_in
  - description


In [5]:
# Does this dataset have any missing information?
# Show a True/False mask of missing cells for the first ten rows only.

data.head(10).isna()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,True,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,True,True,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
# How many missing values are there, per column and overall?

missing_per_column = data.isna().sum()
print(missing_per_column)
# The grand total is the sum of the per-column counts.
print(f"There are {missing_per_column.sum()} missing values in the dataframe.")

show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64
There are 3036 missing values in the dataframe.


In [7]:
# Remember which columns contain at least one missing value.
has_missing = data.isna().any()  # boolean Series indexed by column name
liste_colonnes_avec_nulles = list(has_missing[has_missing].index)
liste_colonnes_avec_nulles

['director', 'cast', 'country', 'date_added', 'rating']

In [8]:
# Impute missing values: every column that contains gaps gets the
# placeholder string "Unknown" in place of NaN.
#
# The filled column is assigned back to the frame instead of calling
# `data[column].fillna(..., inplace=True)`. That chained in-place form acts on
# a temporary selection: it emits a FutureWarning on recent pandas releases
# and leaves `data` unchanged when Copy-on-Write is enabled.

for column in liste_colonnes_avec_nulles:
    data[column] = data[column].fillna("Unknown")
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,Unknown,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,Unknown,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,Unknown,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [9]:
# How many rows are exact duplicates of an earlier row?
n_duplicates = data.duplicated().sum()
print(f"There are {n_duplicates} duplicate values in the dataframe.")

There are 0 duplicate values in the dataframe.


In [10]:
# Save the cleaned frame to a CSV file, leaving out the row index.
data.to_csv(path_or_buf="../suitable_data/clean_dashboard.csv", index=False)