In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

In [7]:
netflixDf = pd.read_csv('netflix_titles.csv')
showsDf = pd.read_csv('netflix_shows.csv', encoding='latin-1')

Calculate the amount of null values in each row

In [8]:
netflixDf.isnull().sum()/6234*100

show_id          0.000000
type             0.000000
title            0.000000
director        31.584857
cast             9.143407
country          7.635547
date_added       0.176452
release_year     0.000000
rating           0.160411
duration         0.000000
listed_in        0.000000
description      0.000000
dtype: float64

In [9]:
showsDf.isnull().sum()/6234*100

title                0.000000
rating               0.000000
ratingLevel          0.946423
ratingDescription    0.000000
release year         0.000000
user rating score    6.336221
user rating size     0.000000
dtype: float64

Merge the two data frames, apply some data cleaning.

In [34]:
pd.set_option('display.max_rows', None)

tempDf=pd.DataFrame(columns=['title', 'rating'])
tempDf['title'] = showsDf['title']
tempDf['rating'] = showsDf['user rating score']

# Drop entries that don't have a user score
tempDf = tempDf.dropna()

# In this data set, multiple seasons of a show are input with the same title.
# To circumvent duplicate data, we grouped the common titles together and
# took the average of the ratings. Instead of ratings for each season, we now
# have the average rating of the show.
tempDf = tempDf.groupby(['title']).mean()

# Inner join from new dataframe with the netflix dataframe
merged_inner = pd.merge(left=netflixDf, right=tempDf, left_on='title', right_on='title')

# Some movies and TV Shows have the same name, so we drop all the movies
# As the ratings data frame only has ratings for TV Shows
merged_inner = merged_inner.loc[merged_inner['type'] == 'TV Show']

# Remove all entries that were released after 2018, since the ratings only
# deals with TV Shows released 2017 and before.
merged_inner = merged_inner.loc[merged_inner['release_year'] < 2018]

merged_inner.shape

(58, 13)