In [1]:
# Preprocessing Dataframes for simplicity.

In [2]:
import pandas as pd
import os 

from pymongo import MongoClient
from dotenv import load_dotenv

In [3]:
def _read_indexed(csv_path):
    """Read a CSV file, using its 'index' column as the DataFrame index."""
    return pd.read_csv(csv_path, index_col='index')


# Movie-only frames.
df_awarded_movies = _read_indexed("../Data/Best Movie by Year Netflix.csv")
df_best_movies = _read_indexed("../Data/Best Movies Netflix.csv")

# Show-only frames.
df_awarded_shows = _read_indexed('../Data/Best Show by Year Netflix.csv')
df_best_shows = _read_indexed('../Data/Best Shows Netflix.csv')

# Every Netflix title.
df_titles = _read_indexed("../Data/raw_titles.csv")

# Cast and crew credits.
df_credits = _read_indexed("../Data/raw_credits.csv")

### Taking a look at the Data

In [4]:
# Preview the first rows of the awarded-movies frame.
df_awarded_movies.head()

Unnamed: 0_level_0,TITLE,RELEASE_YEAR,SCORE,MAIN_GENRE,MAIN_PRODUCTION
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,White Christmas,1954,7.5,romance,US
1,The Guns of Navarone,1961,7.5,war,US
2,My Fair Lady,1964,7.8,drama,US
3,Bonnie and Clyde,1967,7.7,drama,US
4,Dirty Harry,1971,7.7,thriller,US


In [5]:
# Preview the first rows of the best-movies frame.
df_best_movies.head()

Unnamed: 0_level_0,TITLE,RELEASE_YEAR,SCORE,NUMBER_OF_VOTES,DURATION,MAIN_GENRE,MAIN_PRODUCTION
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,David Attenborough: A Life on Our Planet,2020,9.0,31180,83,documentary,GB
1,Inception,2010,8.8,2268288,148,scifi,GB
2,Forrest Gump,1994,8.8,1994599,142,drama,US
3,Anbe Sivam,2003,8.7,20595,160,comedy,IN
4,Bo Burnham: Inside,2021,8.7,44074,87,comedy,US


In [6]:
# Preview the first rows of the full titles frame.
df_titles.head()

Unnamed: 0_level_0,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0


In [7]:
# Preview the first rows of the credits frame.
df_credits.head()

Unnamed: 0_level_0,person_id,id,name,character,role
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


### Simplifying the data: add an `is_awarded` column (True/False) to the DataFrame of all Netflix titles, marking every title that appears in the awarded movies/shows tables

In [8]:
def set_isAwarded(title):
    """Return True when the row's title name appears in either awarded list.

    `title` is one row of df_titles; only its 'title' field is read.
    """
    name = title['title']
    if name in awarded_movies:
        return True
    return name in awarded_shows


# First pass: match on the title name alone.
awarded_movies = list(df_awarded_movies['TITLE'])
awarded_shows = list(df_awarded_shows['TITLE'])

total_awarded = len(awarded_shows) + len(awarded_movies)
print("Expected number of awarded titles == True: ", int(total_awarded), '\n')

# Normalize the dtypes of the columns used for matching.
df_titles['release_year'] = df_titles['release_year'].astype(int)
df_titles['title'] = df_titles['title'].astype(str)
df_titles['is_awarded'] = df_titles.apply(set_isAwarded, axis=1)

print(df_titles['is_awarded'].value_counts())
# There are more True values than expected, so the name-only match needs a closer look.

Expected number of awarded titles == True:  80 

is_awarded
False    5722
True       84
Name: count, dtype: int64


In [9]:
# Uniqueness checks on the titles frame, computed up front and reported below.
has_dup_title_type = df_titles[['title', 'type']].duplicated().any()
ids_are_unique = df_titles['id'].is_unique
has_dup_title_type_year = df_titles[['title', 'type', 'release_year']].duplicated().any()

print(f"""
Are There duplicate combinations of title and type?:" {has_dup_title_type}
Are all ID values unique?: {ids_are_unique}
Are there duplicate combinations of title, type and release year?: {has_dup_title_type_year} 
""")
# Some (title, type) combinations repeat. That is not a data problem, because every
# row still has a unique ID.
# No (title, type, release_year) combination repeats, so we can hypothesize that the
# repeated names are remakes of movies/shows released in different years.


Are There duplicate combinations of title and type?:" True
Are all ID values unique?: True
Are there duplicate combinations of title, type and release year?: False 



In [10]:
# Build a [title, release_year] composite key for every awarded movie and show.
year_movie = list(df_awarded_movies['RELEASE_YEAR'])
year_show = list(df_awarded_shows['RELEASE_YEAR'])

awarded_movies = [[name, year] for name, year in zip(df_awarded_movies['TITLE'], year_movie)]
awarded_shows = [[name, year] for name, year in zip(df_awarded_shows['TITLE'], year_show)]

In [11]:
def set_isAwarded(title):
    """Return True when the row matches an awarded title of its own type.

    Parameters
    ----------
    title : pandas.Series
        One row of df_titles. The 'title', 'release_year' and 'type' fields are read.

    Returns
    -------
    bool
        True when [title, release_year] is in awarded_movies for a 'MOVIE' row,
        or in awarded_shows for a 'SHOW' row; False otherwise.
    """
    key = [title['title'], title['release_year']]
    kind = title['type']
    # Compare each row only with the list for its own type. Checking both lists
    # would also flag a show that shares its title and release year with an
    # awarded movie (or the other way round).
    if kind == 'MOVIE':
        return key in awarded_movies
    if kind == 'SHOW':
        return key in awarded_shows
    return False


# Normalize the dtypes of the key columns before matching.
df_titles['title'] = df_titles['title'].astype(str)
df_titles['release_year'] = df_titles['release_year'].astype(int)
df_titles['is_awarded'] = df_titles.apply(set_isAwarded, axis=1)

print(df_titles.is_awarded.value_counts())
# With the title + release-year key the recorded run produced the expected 80 awarded
# titles. The extra type check should leave that count unchanged unless a movie and a
# show share both title and release year.

is_awarded
False    5726
True       80
Name: count, dtype: int64


In [12]:
# Preview df_titles, which now carries the is_awarded column.
df_titles.head()

Unnamed: 0_level_0,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,is_awarded
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,,False
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,True
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,True
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,True
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,True


### Simplifying the data: add an `is_best` column (True/False) to the DataFrame of all Netflix titles, marking every title that appears in the best movies/shows tables

In [13]:
def set_isBestTitle(title):
    """Return True when the row's [title, release_year] pair is in either best list.

    `title` is one row of df_titles; its 'title' and 'release_year' fields are read.
    """
    key = [title['title'], title['release_year']]
    return key in best_movies or key in best_shows


# [title, release_year] composite keys for the best movies and the best shows.
year_bestMovie = list(df_best_movies['RELEASE_YEAR'])
year_bestShows = list(df_best_shows['RELEASE_YEAR'])

best_movies = [[name, year] for name, year in zip(df_best_movies['TITLE'], year_bestMovie)]
best_shows = [[name, year] for name, year in zip(df_best_shows['TITLE'], year_bestShows)]

total_best = len(best_shows) + len(best_movies)
print("Expected number of best titles: ", int(total_best), '\n')

# Normalize the dtypes of the key columns before matching.
df_titles['release_year'] = df_titles['release_year'].astype(int)
df_titles['title'] = df_titles['title'].astype(str)
df_titles['is_best'] = df_titles.apply(set_isBestTitle, axis=1)

print(df_titles['is_best'].value_counts())
# The recorded run expected 633 best titles but flagged 634.
# NOTE(review): the key ignores 'type', so a movie and a show that share a title and
# release year would both be flagged. This is a likely source of the extra match.

Expected number of best titles:  633 

is_best
False    5172
True      634
Name: count, dtype: int64


In [14]:
# Build a [title, release_year] composite key for every best movie and show.
year_bestMovie = list(df_best_movies['RELEASE_YEAR'])
year_bestShow = list(df_best_shows['RELEASE_YEAR'])

best_movies = [[name, year] for name, year in zip(df_best_movies['TITLE'], year_bestMovie)]
best_shows = [[name, year] for name, year in zip(df_best_shows['TITLE'], year_bestShow)]

In [15]:
# Preview df_titles, which now also carries the is_best column.
df_titles.head()

Unnamed: 0_level_0,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,is_awarded,is_best
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,,False,False
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,True,True
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,True,True
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,True,True
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,True,True


In [16]:
def set_isBestTitle(title):
    """Return True when the row matches a best title of its own type.

    Parameters
    ----------
    title : pandas.Series
        One row of df_titles. The 'title', 'release_year' and 'type' fields are read.

    Returns
    -------
    bool
        True when [title, release_year] is in best_movies for a 'MOVIE' row,
        or in best_shows for a 'SHOW' row; False otherwise.
    """
    key = [title['title'], title['release_year']]
    kind = title['type']
    # Compare each row only with the list for its own type. Checking both lists
    # also flags a show that shares its title and release year with a best movie
    # (or the other way round).
    if kind == 'MOVIE':
        return key in best_movies
    if kind == 'SHOW':
        return key in best_shows
    return False


# Normalize the dtypes of the key columns before matching.
df_titles['title'] = df_titles['title'].astype(str)
df_titles['release_year'] = df_titles['release_year'].astype(int)
df_titles['is_best'] = df_titles.apply(set_isBestTitle, axis=1)

print(df_titles['is_best'].value_counts())
# Matching on title + release year alone flagged 634 rows against 633 listed best titles.
# Every (title, type, release_year) combination in df_titles is unique, so the surplus row
# can only be a movie and a show sharing the same title and release year. Matching on the
# row's own type removes that false positive.

is_best
False    5172
True      634
Name: count, dtype: int64


In [17]:
# Persist the enriched titles frame. `index` is a boolean switch, so the column
# header for the index is passed through `index_label` instead.
df_titles.to_csv('../Data/complete_titles.csv', index=True, index_label='index')

# Reshaping the data to fit the relational schema

In [18]:
# Preview the titles frame that the titles table below is derived from.
df_titles.head()

Unnamed: 0_level_0,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,is_awarded,is_best
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,,False,False
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,True,True
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,True,True
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,True,True
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,True,True


In [19]:
# Preview the credits frame that the persons and roles tables below are derived from.
df_credits.head()

Unnamed: 0_level_0,person_id,id,name,character,role
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [20]:
# Empty placeholder frames, one per table of the relational schema.
table_titles, table_genres, table_countries, table_persons, table_roles = (
    pd.DataFrame() for _ in range(5)
)

In [32]:
# Build the normalized Titles table.
# Each source column of df_titles is mapped explicitly to its target name, so a
# source can no longer be paired with the wrong target by list position (the
# original paired 'runtime' with 'num_seasons' and 'seasons' with 'runtime').
# Entries are listed in the output column order.
# NOTE(review): the 'score_imbd' / 'votes_imbd' spellings are kept as they are in
# case other code refers to these column names. Confirm before renaming.
title_column_map = {
    'id': 'title_id',
    'title': 'title_name',
    'release_year': 'release_year',
    'seasons': 'num_seasons',
    'runtime': 'runtime',
    'imdb_score': 'score_imbd',
    'imdb_votes': 'votes_imbd',
    'is_awarded': 'is_awarded',
    'is_best': 'is_best',
}
table_titles = (
    df_titles[list(title_column_map)]
    .rename(columns=title_column_map)
    # True for rows whose type is 'MOVIE', False for every other type.
    .assign(is_movie=df_titles['type'].eq('MOVIE'))
)
table_titles.columns

Index(['title_id', 'title_name', 'release_year', 'num_seasons', 'runtime',
       'score_imbd', 'votes_imbd', 'is_awarded', 'is_best', 'is_movie'],
      dtype='object')

In [22]:
# Build the normalized Persons table.
# df_credits holds one row per credit, so the same person_id appears once for every
# credit that person has. Keep only the first row per person_id so that each person
# is listed once. The original index labels of the kept rows are preserved.
table_persons = (
    df_credits[['person_id', 'name']]
    .rename(columns={'name': 'person_name'})
    .drop_duplicates(subset='person_id')
)

In [23]:
# Build the normalized Roles table: one row per credit, with the title id column
# renamed from 'id' to 'title_id'.
table_roles = (
    df_credits[['id', 'person_id', 'character', 'role']]
    .rename(columns={'id': 'title_id'})
)

In [24]:
# Preview the source credits frame for comparison with the roles table.
df_credits.head()

Unnamed: 0_level_0,person_id,id,name,character,role
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [25]:
# Preview the first rows of the roles table.
table_roles.head()

Unnamed: 0_level_0,title_id,person_id,character,role
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,tm84618,3748,Travis Bickle,ACTOR
1,tm84618,14658,Iris Steensma,ACTOR
2,tm84618,7064,Tom,ACTOR
3,tm84618,3739,Matthew 'Sport' Higgins,ACTOR
4,tm84618,48933,Betsy,ACTOR


In [30]:
# (person_id, title_id) pairs repeat, so they cannot serve as a composite key.
# A role_id primary key will be created and auto-incremented in the DB instead.
table_roles.duplicated(subset=['person_id', 'title_id']).any()

np.True_