In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
# Load the two CSV inputs; both paths are relative to the notebook's working directory.
netflixDf = pd.read_csv('netflix_titles.csv')
# This file is decoded as Latin-1 rather than pandas' default encoding.
# NOTE(review): presumably it contains non-UTF-8 bytes — confirm against the source file.
showsDf = pd.read_csv('netflix_shows.csv', encoding='latin-1')

Calculate the percentage of null values in each column

In [3]:
# Percentage of missing values per column: the mean of the boolean null mask
# is the fraction of null rows, scaled here to percent.
netflixDf.isna().mean() * 100

show_id          0.000000
type             0.000000
title            0.000000
director        31.584857
cast             9.143407
country          7.635547
date_added       0.176452
release_year     0.000000
rating           0.160411
duration         0.000000
listed_in        0.000000
description      0.000000
dtype: float64

In [4]:
# Percentage of missing values per column: the mean of the boolean null mask
# is the fraction of null rows, scaled here to percent.
showsDf.isna().mean() * 100

title                 0.0
rating                0.0
ratingLevel           5.9
ratingDescription     0.0
release year          0.0
user rating score    39.5
user rating size      0.0
dtype: float64

Merge the two data frames on title and apply some data cleaning.

In [5]:
pd.set_option('display.max_rows', None)

# Build a two-column (title, rating) table from the shows data and keep only
# the rows that actually carry a user rating score.
tempDf = (
    showsDf[['title', 'user rating score']]
    .rename(columns={'user rating score': 'rating'})
    .dropna()
)

# The shows data lists multiple seasons of a show under the same title.
# Collapse those repeated titles into a single row holding the mean rating,
# so each show contributes one averaged rating instead of one per season.
tempDf = tempDf.groupby('title').mean()

# Inner join of the Netflix catalogue with the averaged ratings on title.
# Both sides have a 'rating' column, so the result carries 'rating_x'
# (from netflixDf) and 'rating_y' (the averaged user rating).
merged_inner = netflixDf.merge(tempDf, how='inner', on='title')

# Keep only rows that satisfy both conditions:
#   * type is 'TV Show' - some movies share a title with a TV show, and the
#     ratings data only covers TV shows;
#   * release_year is before 2018 - the ratings only deal with TV shows
#     released in 2017 and earlier.
merged_inner = merged_inner.loc[
    (merged_inner['type'] == 'TV Show') & (merged_inner['release_year'] < 2018)
]

Final cleaning of the data: drop the columns we won't use in the analysis and rename ambiguous columns.

In [6]:
# Remove the columns that are not needed for the analysis, then give the two
# rating columns produced by the merge ('rating_x' / 'rating_y') explicit names.
finalDf = (
    merged_inner
    .drop(columns=['show_id', 'type', 'director', 'cast'])
    .rename(columns={"rating_x": "mpaa rating", "rating_y": "user_rating"})
)

finalDf

Unnamed: 0,title,country,date_added,release_year,mpaa rating,duration,listed_in,description,user_rating
4,Limitless,United States,"July 1, 2016",2016,TV-14,1 Season,"Crime TV Shows, TV Dramas, TV Sci-Fi & Fantasy",A failed musician unleashes his brain's full p...,84.0
5,Gilmore Girls: A Year in the Life,United States,"November 25, 2016",2016,TV-14,1 Season,"TV Comedies, TV Dramas, Teen TV Shows",Set nearly a decade after the finale of the or...,77.0
8,Charmed,United States,"October 1, 2017",2005,TV-PG,8 Seasons,"Romantic TV Shows, TV Comedies, TV Dramas",After a trio of sisters discover an ancient te...,90.0
10,Revenge,,"May 2, 2019",2017,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Dramas",After a mother and her daughter are gang-raped...,84.0
11,The Jungle Book,"India, Germany, France","May 11, 2019",2010,TV-Y7,1 Season,Kids' TV,This animated series follows young Mowgli and ...,97.0
13,American Crime Story: The People v. O.J. Simpson,United States,"March 6, 2019",2016,TV-MA,1 Season,"Crime TV Shows, TV Dramas",This anthology series dramatizes historic crim...,79.0
14,Dave Chappelle,United States,"March 21, 2017",2017,TV-MA,1 Season,"Stand-Up Comedy & Talk Shows, TV Comedies",Comedy icon Dave Chappelle makes his triumphan...,94.0
17,Cheer Squad,United States,"March 14, 2017",2016,TV-PG,1 Season,"Reality TV, Teen TV Shows","Follow the Great White Sharks, a world-champio...",61.0
21,Barbie Life in the Dreamhouse,United States,"July 1, 2017",2012,TV-G,1 Season,"Kids' TV, TV Comedies",Join blond icon Barbie and her fabulous friend...,85.0
26,DreamWorks Shrek's Swamp Stories,United States,"August 17, 2012",2008,TV-PG,1 Season,"Kids' TV, TV Action & Adventure, TV Comedies","Shrek celebrates Halloween, Puss in Boots is c...",56.0


In [7]:
# Percentage of missing values per column in the cleaned frame: the mean of
# the boolean null mask is the fraction of null rows, scaled here to percent.
finalDf.isna().mean() * 100

title           0.000000
country         1.724138
date_added      1.724138
release_year    0.000000
mpaa rating     0.000000
duration        0.000000
listed_in       0.000000
description     0.000000
user_rating     0.000000
dtype: float64