# Netflix Top Rated Movies and TV Shows

[Dataset from Kaggle](https://www.kaggle.com/datasets/thedevastator/netflix-top-rated-movies-and-tv-shows-2020-2022)

In [24]:
import pandas as pd
import numpy as np

First we inspect data files.

In [13]:
# show list of data files

!ls -lh data/

total 9448
-rw-r--r--@ 1 April  staff   1.9K Jan 17 03:47 best_movie_by_year_netflix.csv
-rw-r--r--@ 1 April  staff    19K Jan 17 03:47 best_movies_netflix.csv
-rw-r--r--@ 1 April  staff   1.2K Jan 17 03:47 best_show_by_year_netflix.csv
-rw-r--r--@ 1 April  staff    12K Jan 17 03:47 best_shows_netflix.csv
-rw-r--r--@ 1 April  staff   4.0M Jan 17 03:47 raw_credits.csv
-rw-r--r--@ 1 April  staff   614K Jan 17 03:47 raw_titles.csv


In [19]:
# check the number of lines

!wc -l data/best_movie_by_year_netflix.csv
!wc -l data/best_movies_netflix.csv
!wc -l data/best_show_by_year_netflix.csv
!wc -l data/best_shows_netflix.csv
!wc -l data/raw_titles.csv
!wc -l data/raw_credits.csv

      50 data/best_movie_by_year_netflix.csv
     388 data/best_movies_netflix.csv
      32 data/best_show_by_year_netflix.csv
     247 data/best_shows_netflix.csv
    5807 data/raw_titles.csv
   77214 data/raw_credits.csv


In [21]:
# take a look at the top few rows

!head -n 3 data/best_movie_by_year_netflix.csv
!head -n 3 data/best_movies_netflix.csv
!head -n 3 data/best_show_by_year_netflix.csv
!head -n 3 data/best_shows_netflix.csv

index,TITLE,RELEASE_YEAR,SCORE,MAIN_GENRE,MAIN_PRODUCTION
0,White Christmas,1954,7.5,romance,US
1,The Guns of Navarone,1961,7.5,war,US
index,TITLE,RELEASE_YEAR,SCORE,NUMBER_OF_VOTES,DURATION,MAIN_GENRE,MAIN_PRODUCTION
0,David Attenborough: A Life on Our Planet,2020,9.0,31180,83,documentary,GB
1,Inception,2010,8.8,2268288,148,scifi,GB
index,TITLE,RELEASE_YEAR,SCORE,NUMBER_OF_SEASONS,MAIN_GENRE,MAIN_PRODUCTION
0,Monty Python's Flying Circus,1969,8.8,4,comedy,GB
1,Knight Rider,1982,6.9,4,action,US
index,TITLE,RELEASE_YEAR,SCORE,NUMBER_OF_VOTES,DURATION,NUMBER_OF_SEASONS,MAIN_GENRE,MAIN_PRODUCTION
0,Breaking Bad,2008,9.5,1727694,48,5,drama,US
1,Avatar: The Last Airbender,2005,9.3,297336,24,3,scifi,US


In [22]:
!head -n 3 data/raw_titles.csv

index,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0


In [23]:
!head -n 3 data/raw_credits.csv

index,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR


In [31]:
movie_df = pd.read_csv('data/best_movies_netflix.csv', index_col='index')
movie_df.columns = movie_df.columns.str.lower()
movie_df.sample(10)

Unnamed: 0_level_0,title,release_year,score,number_of_votes,duration,main_genre,main_production
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
169,Les Misérables,2012,7.5,325132,157,drama,GB
63,Her,2013,8.0,586679,126,drama,US
274,The Siege of Jadotville,2016,7.2,38308,108,thriller,IE
266,First They Killed My Father,2017,7.2,17871,136,drama,KH
90,The Irishman,2019,7.8,371209,209,drama,US
181,Sivaji: The Boss,2007,7.5,19556,189,drama,IN
193,Sherlock Holmes: A Game of Shadows,2011,7.4,446531,129,crime,US
93,Lakshya,2004,7.8,23076,186,drama,IN
154,Athlete A,2020,7.6,10544,104,documentary,US
68,Bāhubali: The Beginning,2015,8.0,117333,159,drama,IN


In [32]:
movie_year_df = pd.read_csv('data/best_movie_by_year_netflix.csv', index_col='index')
movie_year_df.columns = movie_year_df.columns.str.lower()
movie_year_df.sample(10)

Unnamed: 0_level_0,title,release_year,score,main_genre,main_production
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
45,Dave Chappelle: Sticks & Stones,2019,8.4,comedy,US
31,Memoirs of a Geisha,2005,7.3,drama,FR
39,Rush,2013,8.1,drama,US
33,Like Stars on Earth,2007,8.3,drama,IN
29,Big Fish,2003,8.0,drama,US
36,Inception,2010,8.8,scifi,GB
35,3 Idiots,2009,8.4,comedy,IN
1,The Guns of Navarone,1961,7.5,war,US
5,The Exorcist,1973,8.1,horror,US
32,Rang De Basanti,2006,8.1,comedy,IN


In [46]:
movie_df.release_year.value_counts().sort_index()

1954     1
1961     1
1964     1
1966     1
1967     1
1971     1
1973     1
1975     1
1976     1
1979     2
1980     1
1982     1
1984     2
1986     2
1987     2
1989     1
1990     1
1991     1
1992     1
1993     2
1994     4
1995     2
1996     1
1997     6
1998     5
1999     2
2000     1
2001     6
2002     4
2003     8
2004     7
2005     3
2006     7
2007    14
2008    11
2009    10
2010    16
2011    15
2012    12
2013    16
2014    19
2015    19
2016    26
2017    35
2018    26
2019    37
2020    19
2021    26
2022     5
Name: release_year, dtype: int64

In [52]:
movie_year_df.query('main_genre == "comedy"')

Unnamed: 0_level_0,title,release_year,score,main_genre,main_production
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,Monty Python and the Holy Grail,1975,8.2,comedy,GB
8,Life of Brian,1979,8.0,comedy,GB
17,My Girl,1991,6.9,comedy,US
22,Happy Gilmore,1996,7.0,comedy,US
32,Rang De Basanti,2006,8.1,comedy,IN
35,3 Idiots,2009,8.4,comedy,IN
45,Dave Chappelle: Sticks & Stones,2019,8.4,comedy,US
47,Bo Burnham: Inside,2021,8.7,comedy,US


In [51]:
movie_year_df.main_genre.value_counts()

drama          22
comedy          8
thriller        5
romance         3
horror          2
crime           2
scifi           2
war             1
western         1
action          1
fantasy         1
documentary     1
Name: main_genre, dtype: int64

In [56]:
show_df = pd.read_csv('data/best_show_by_year_netflix.csv', index_col='index')
show_df.columns = show_df.columns.str.lower()
show_df.sample(10)

Unnamed: 0_level_0,title,release_year,score,number_of_seasons,main_genre,main_production
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25,Anne with an E,2017,8.7,3,drama,CA
1,Knight Rider,1982,6.9,4,action,US
9,Trailer Park Boys,2001,8.6,12,comedy,CA
26,Cobra Kai,2018,8.6,5,action,US
28,The Last Dance,2020,9.1,1,documentary,US
29,Arcane,2021,9.1,1,action,US
0,Monty Python's Flying Circus,1969,8.8,4,comedy,GB
20,Call the Midwife,2012,8.5,11,drama,GB
6,Cowboy Bebop,1998,8.9,1,western,JP
13,Avatar: The Last Airbender,2005,9.3,3,scifi,US


In [57]:
show_df.release_year.sort_values

<bound method Series.sort_values of index
0     1969
1     1982
2     1989
3     1993
4     1995
5     1997
6     1998
7     1999
8     2000
9     2001
10    2002
11    2003
12    2004
13    2005
14    2006
15    2007
16    2008
17    2009
18    2010
19    2011
20    2012
21    2013
22    2014
23    2015
24    2016
25    2017
26    2018
27    2019
28    2020
29    2021
30    2022
Name: release_year, dtype: int64>

In [65]:
show_df.query('release_year == 2017')

Unnamed: 0_level_0,title,release_year,score,number_of_seasons,main_genre,main_production
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25,Anne with an E,2017,8.7,3,drama,CA


In [59]:
show_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31 entries, 0 to 30
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              31 non-null     object 
 1   release_year       31 non-null     int64  
 2   score              31 non-null     float64
 3   number_of_seasons  31 non-null     int64  
 4   main_genre         31 non-null     object 
 5   main_production    31 non-null     object 
dtypes: float64(1), int64(2), object(3)
memory usage: 1.7+ KB
