In [1]:
import os
import pandas as pd
from database_loader import DatabaseLoader
from dataframe_cleaner import DataframeCleaner
import models
from query_recommender_api import QueryRecommender_api
from functions import compute_missing_values, lowercase_column_names, factorize_id

# Imports

In [2]:
# Folder that holds the raw CSV exports read by this notebook.
directory = "data/"
files = os.listdir(directory)

# Both raw tables are read the same way: the CSV's "index" column becomes the frame index.
raw_titles_df, raw_credits_df = (
    pd.read_csv(os.path.join(directory, csv_name), index_col="index")
    for csv_name in ('raw_titles.csv', 'raw_credits.csv')
)

In [3]:
# Tabulate the missing-value count and percentage per column of the raw titles table.
compute_missing_values(raw_titles_df)

Unnamed: 0,Missing Values,Percentage (%)
seasons,3759,64.74
age_certification,2610,44.95
imdb_votes,539,9.28
imdb_score,523,9.01
imdb_id,444,7.65
title,1,0.02


In [4]:
# Tabulate the missing-value count and percentage per column of the raw credits table.
compute_missing_values(raw_credits_df)

Unnamed: 0,Missing Values,Percentage (%)
character,9627,12.47


In [5]:
"""Cleaning titles data."""
# Wrap the raw titles frame in the project's cleaner; the processed frame is read back from `.data`.
processing_titles = DataframeCleaner(raw_titles_df)

# Full-row duplicates first, then duplicates within each of the two identifier columns.
# NOTE(review): how duplicates (and missing imdb_id values) are treated is defined in
# DataframeCleaner, not here — confirm against that class.
processing_titles.remove_duplicates()
processing_titles.remove_column_duplicates('id')
processing_titles.remove_column_duplicates('imdb_id')

# Rebind the notebook-level name to the cleaner's frame.
raw_titles_df = processing_titles.data

"""Cleaning credits data."""
processing_credits = DataframeCleaner(raw_credits_df)

# Credits only get the full-row duplicate pass.
processing_credits.remove_duplicates()
raw_credits_df = processing_credits.data

In [6]:
"""Lists storing information needed for DataFrame cleaning."""
# Titles columns that have brackets and quotes stripped from their values.
titles_columns_for_brackets = ['genres', 'production_countries']

# Titles columns passed to the cleaner's `missing_to_zero`.
titles_columns_for_null = [
    'age_certification',
    'seasons',
    'imdb_id',
    *titles_columns_for_brackets,
]

# Titles columns passed to the cleaner's `missing_to_median_or_mode`.
titles_columns_for_median_or_mode = [
    'imdb_score',
    'imdb_votes',
]

# Credits columns passed to the cleaner's `missing_to_zero`.
credits_columns_for_null = [
    'name',
    'character',
    'role',
]

In [7]:
# Each pass pairs one DataframeCleaner method with the columns it handles. Passes run in
# the listed order, and every returned column is written back into the titles frame.
cleaning_passes = (
    (processing_titles.remove_brackets_and_quotes, titles_columns_for_brackets),
    (processing_titles.missing_to_zero, titles_columns_for_null),
    (processing_titles.missing_to_median_or_mode, titles_columns_for_median_or_mode),
)
for clean_column, columns in cleaning_passes:
    for column in columns:
        raw_titles_df[column] = clean_column(column_name=column)

# Drop rows that still contain a missing value and keep an independent copy.
raw_titles_df = raw_titles_df.dropna().copy()

In [8]:
# Overwrite each listed credits column with the cleaner's `missing_to_zero` result.
for column in credits_columns_for_null:
    raw_credits_df[column] = processing_credits.missing_to_zero(column_name=column)
    
# Rebind to a copy of the frame.
raw_credits_df = raw_credits_df.copy()

# Save data for EDA

In [9]:
# Write both cleaned tables next to the notebook, without the index column.
eda_exports = (
    ("raw_titles_df.csv", raw_titles_df),
    ("raw_credits_df.csv", raw_credits_df),
)
for file_path, frame in eda_exports:
    frame.to_csv(file_path, index=False)

In [10]:
# Reload the cleaned tables from the CSV files written by the previous cell.
# `pd.read_csv` already returns a DataFrame, so wrapping the result in `pd.DataFrame(...)`
# (and the throwaway `data` name) added nothing.
raw_credits_df = pd.read_csv("raw_credits_df.csv")
raw_titles_df = pd.read_csv("raw_titles_df.csv")

In [11]:
"""Mapping for column names and types."""
# Target dtype for every titles column, applied one column at a time below.
columns_dict = {
                'id': 'string',
                'title': 'string',
                'type': 'string',
                'release_year': 'int',
                'age_certification': 'string',
                'runtime': 'int', 
                'genres': 'string',
                'production_countries': 'string',
                'seasons': 'int',
                'imdb_id': 'string',
                'imdb_score': 'float64',
                'imdb_votes': 'int'
                }

# One message per column whose conversion raised ValueError.
errors = []

for column, dtype in columns_dict.items():
    try:
        raw_titles_df[column] = raw_titles_df[column].astype(dtype)
    except ValueError:
        # Keep going so a single bad column does not block the remaining casts.
        errors.append(f'Column {column} dtype change failed!')

# Report failed casts instead of leaving them unseen in `errors`; otherwise the
# affected columns silently keep their old dtype for the rest of the notebook.
if errors:
    print("\n".join(errors))

# Normalization

In [12]:
"""Normalizing Titles tables."""
def _explode_multi_value_column(source_df, column, id_field, new_name):
    """Build a one-value-per-row table from a comma-separated column.

    Takes `id` and `column` from `source_df`, splits `column` on ", ", explodes it so
    each value gets its own row, passes the result through the project's `factorize_id`
    (with `id_field` as the id column name) and renames `column` to `new_name`.

    Returns the resulting DataFrame; `source_df` is not modified.
    """
    exploded = source_df[['id', column]].copy()
    exploded[column] = exploded[column].str.split(", ")
    exploded = exploded.explode(column)
    exploded = factorize_id(exploded, column, id_field=id_field)
    return exploded.rename(columns={column: new_name})


# Core title attributes; type, genres and production countries go to their own tables below.
titles_df = raw_titles_df[['id', 'title', 'release_year', 'age_certification', 'runtime', 'imdb_id', 'imdb_score', 'imdb_votes']]

# Title type is single-valued, so it only needs the factorize step.
title_type_df = raw_titles_df[['id', 'type']].copy()
title_type_df = factorize_id(title_type_df, 'type', id_field='type_id')

# Genres and production countries hold several values per title; both use the same helper.
title_genres_df = _explode_multi_value_column(
    raw_titles_df, 'genres', id_field='genre_id', new_name='genre'
)
title_production_countries_df = _explode_multi_value_column(
    raw_titles_df, 'production_countries', id_field='production_id', new_name='country'
)

In [13]:
"""Normalizing Credits tables."""
# `factorize_id` supplies the `person_id` column used below
# (presumably one id per distinct name — see the helper in functions.py).
raw_credits_df = factorize_id(raw_credits_df, 'name', id_field='person_id')

# Persons table: each distinct (person_id, name) pair once, with a fresh index.
credits_persons_df = (
    raw_credits_df[['person_id', 'name']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Credits keep only the person id; `credit_id` is the row index plus one.
credits_df = raw_credits_df.drop(columns=['name'])
credits_df['credit_id'] = credits_df.index + 1
credit_columns = ['credit_id', 'id', 'person_id', 'character', 'role']
credits_df = credits_df[credit_columns]

# Data Enrichment

In [14]:
"""Importing related tables with lowercase column names."""
def _read_best_list(csv_name):
    """Read one 'Best ...' CSV from `directory` (index column "index") and lowercase its column names."""
    best_df = pd.read_csv(os.path.join(directory, csv_name), index_col="index")
    return lowercase_column_names(best_df)


def _match_to_titles(best_df):
    """Inner-join `titles_df` with a best-of list on title and release year."""
    return pd.merge(titles_df, best_df, how='inner', on=['title', 'release_year'])


best_movie_by_year_df = _read_best_list('Best Movie by Year Netflix.csv')
best_movies_df = _read_best_list('Best Movies Netflix.csv')
best_shows_df = _read_best_list('Best Shows Netflix.csv')
best_show_by_year_df = _read_best_list('Best Show by Year Netflix.csv')

"""Merge datasets with the titles_df based on title and release year"""
best_movie_by_year_merged = _match_to_titles(best_movie_by_year_df)
best_movies_merged = _match_to_titles(best_movies_df)
best_show_by_year_merged = _match_to_titles(best_show_by_year_df)
best_shows_merged = _match_to_titles(best_shows_df)

# Sanity check: a merged frame should have as many titles as the list it came from.
row_count_checks = (
    (best_movie_by_year_merged, best_movie_by_year_df),
    (best_movies_merged, best_movies_df),
    (best_show_by_year_merged, best_show_by_year_df),
)
for merged_df, best_df in row_count_checks:
    print(merged_df['title'].count() == best_df['title'].count())

# The shows list additionally reports by how many rows the two counts differ.
merged_shows_count = best_shows_merged['title'].count()
listed_shows_count = best_shows_df['title'].count()
print(merged_shows_count == listed_shows_count, 
      "\nDifference between: ", merged_shows_count - listed_shows_count)

True
True
True
False 
Difference between:  1


In [15]:
# Collect title ids that credits refer to but that are absent from titles_df.
missing_ids = set(credits_df['id']).difference(titles_df['id'])
if len(missing_ids) > 0:
    print(f"Missing ids: {missing_ids}")


Missing ids: {'tm1063792'}


In [16]:
# Drop credits whose title id is in `missing_ids`. `missing_ids` is a set, so membership
# has to be tested with `isin`; comparing the column to the set with `!=` checks every id
# against the whole set, is always unequal, and therefore removes nothing.
credits_df = credits_df[~credits_df['id'].isin(missing_ids)]

In [17]:
# Outer-join the best-shows list with its matched rows; the `_merge` column added by
# indicator=True records which side each row came from.
comparison_df = pd.merge(
    best_shows_df,
    best_shows_merged,
    how='outer',
    on=['title', 'release_year'],
    indicator=True,
)

# Rows tagged 'left_only' are in best_shows_df but have no matched counterpart.
only_in_best_shows = comparison_df['_merge'] == 'left_only'
missing_title = comparison_df.loc[only_in_best_shows, ['title', 'release_year']]
print(missing_title)


Empty DataFrame
Columns: [title, release_year]
Index: []


In [18]:
"""Check if there are any unmatched titles. Find titles that don't have a match in titles_df."""
# Each label maps to (best-of list, its merge with titles_df).
best_list_pairs = {
    'best_movie_by_year': (best_movie_by_year_df, best_movie_by_year_merged),
    'best_movies': (best_movies_df, best_movies_merged),
    'best_show_by_year': (best_show_by_year_df, best_show_by_year_merged),
    'best_shows': (best_shows_df, best_shows_merged),
}

# For every list, keep the titles that do not occur among the merged titles.
# The comparison is on title only, not on release year.
unmatched_titles = {
    label: listed_df.loc[~listed_df['title'].isin(merged_df['title']), 'title'].tolist()
    for label, (listed_df, merged_df) in best_list_pairs.items()
}

unmatched_titles


{'best_movie_by_year': [],
 'best_movies': [],
 'best_show_by_year': [],
 'best_shows': []}

# Creating Flags for most popular Shows & Movies

In [19]:
"""Create a Flag in the original Titles dataset for each of the merged datasets, when a title is present in one of those datasets.
Instantiate Flags, set them to 1 if the movies/shows are present in 'most popular' dataframe."""
# Work on independent copies of both frames.
titles_df = titles_df.copy()
credits_df = credits_df.copy()

# Flag column -> merged frame whose ids earn that flag.
# Insertion order determines the order of the new columns.
flag_sources = {
    'is_best_movie_by_year': best_movie_by_year_merged,
    'is_best_movie': best_movies_merged,
    'is_best_show_by_year': best_show_by_year_merged,
    'is_best_show': best_shows_merged,
}

# Every flag starts at 0 ...
for flag_column in flag_sources:
    titles_df[flag_column] = 0

# ... and becomes 1 where the title id occurs in the matching merged frame.
for flag_column, merged_df in flag_sources.items():
    titles_df.loc[titles_df['id'].isin(merged_df['id']), flag_column] = 1

titles_df.head()


Unnamed: 0,id,title,release_year,age_certification,runtime,imdb_id,imdb_score,imdb_votes,is_best_movie_by_year,is_best_movie,is_best_show_by_year,is_best_show
0,tm19608,The Blazing Sun,1954,,100,tt0044429,7.4,1219,0,0,0,0
1,tm16479,White Christmas,1954,,115,tt0047673,7.5,42373,1,1,0,0
2,tm204541,Dark Waters,1956,,120,tt0049761,6.7,590,0,0,0,0
3,tm135083,Cairo Station,1958,,77,tt0051390,7.5,4385,0,0,0,0
4,tm44204,The Guns of Navarone,1961,,158,tt0054953,7.5,50150,1,1,0,0


# Save Normalized and Enriched data

In [20]:
# Destination folder for the normalized tables; created if it does not exist yet.
output_dir = "normalized_tables/"
os.makedirs(output_dir, exist_ok=True)

# File name -> frame, written in this order and without the index column.
normalized_exports = (
    ('titles.csv', titles_df),
    ('title_type.csv', title_type_df),
    ('title_genres.csv', title_genres_df),
    ('title_production_countries.csv', title_production_countries_df),
    ('credits.csv', credits_df),
    ('credits_persons.csv', credits_persons_df),
)
for csv_name, table_df in normalized_exports:
    table_df.to_csv(os.path.join(output_dir, csv_name), index=False)

In [21]:
# Read the normalized tables back from disk.
output_dir = "normalized_tables/"
os.makedirs(output_dir, exist_ok=True)

# The file names are listed in the same order as the names they are assigned to.
(
    titles_df,
    title_type_df,
    title_genres_df,
    title_production_countries_df,
    credits_df,
    credits_persons_df,
) = (
    pd.read_csv(os.path.join(output_dir, csv_name))
    for csv_name in (
        'titles.csv',
        'title_type.csv',
        'title_genres.csv',
        'title_production_countries.csv',
        'credits.csv',
        'credits_persons.csv',
    )
)

# Database Setup

In [22]:
# DatabaseLoader is the project class imported from database_loader.
# NOTE(review): connection settings are not visible in this notebook — confirm where
# DatabaseLoader reads them from.
"""Create database connection and database."""
db_loader = DatabaseLoader()
db_loader.create_engine()

"""Create database and update engine."""
# The recorded output of this call is "Engine updated."
db_loader.create_database()

Engine updated.


In [23]:
"""Create all tables in MySQL database."""
# NOTE(review): the table definitions presumably come from the imported `models` module — confirm.
db_loader.create_all()

In [24]:
# Database table name paired with the frame uploaded into it, in upload order.
table_frames = [
    ('titles', titles_df),
    ('title_type', title_type_df),
    ('title_genres', title_genres_df),
    ('title_production_countries', title_production_countries_df),
    ('persons', credits_persons_df),
    ('credits', credits_df),
]

# The parallel lists consumed by the upload cell.
db_tables = [table_name for table_name, _ in table_frames]
dfs_list = [table_df for _, table_df in table_frames]

In [25]:
"""Turn off foregin key constaint checks."""
# `zip` stops at the shorter input, so a length mismatch would silently skip tables.
assert len(dfs_list) == len(db_tables), "dfs_list and db_tables must have the same length"

db_loader.turn_off_fk_check()

try:
    # Upload every dataframe into its MySQL table, in list order.
    for df, db_table in zip(dfs_list, db_tables):
        print(f"Inserting into table: {db_table}")
        db_loader.send_data(df, db_table)
    print("Upload complete.")
finally:
    # Re-enable foreign key checks even when an upload fails, so an error in
    # `send_data` cannot leave the checks switched off.
    db_loader.turn_on_fk_check()

Foreign key checks turned off.
Inserting into table: titles
Inserting into table: title_type
Inserting into table: title_genres
Inserting into table: title_production_countries
Inserting into table: persons
Inserting into table: credits
Upload complete.
Foreign key checks turned on.


# SQL Read/Write through API

In [27]:
# Create the project's recommender API object and run its retrieval step.
# NOTE(review): what `recommender_retrieve` loads is defined in query_recommender_api — confirm there.
recommender_api = QueryRecommender_api()
recommender_api.recommender_retrieve()

# Read the whole titles table through the API; the result is this cell's displayed output.
query = "SELECT * FROM titles"
recommender_api.read(query)

Unnamed: 0,id,title,release_year,age_certification,runtime,imdb_id,imdb_score,imdb_votes,is_best_movie_by_year,is_best_movie,is_best_show_by_year,is_best_show
0,tm1000037,Je suis Karl,2021,R,126,tt9205538,5.4,889,0,0,0,0
1,tm1000147,Zone 414,2021,R,98,tt8545482,4.9,3878,0,0,0,0
2,tm1000166,Wave of Cinema: Surat Dari Timur,2020,,57,,6.6,2279,0,0,0,0
3,tm1000185,Squared Love,2021,,102,tt13846542,5.0,3059,0,0,0,0
4,tm100027,Alibaba Aur 40 Chor,1979,,138,tt0079749,6.2,555,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5800,ts98316,Edge of the Universe,2002,TV-G,49,tt6951568,7.2,206,0,0,0,0
5801,ts98340,Metallica: Some Kind of Monster,2004,TV-MA,84,tt6954486,6.6,2279,0,0,0,0
5802,ts98353,DreamWorks Spooky Stories,2009,TV-PG,26,tt6958564,6.2,19,0,0,0,0
5803,ts987,Danger Mouse,1981,TV-Y,15,tt0081848,7.4,3514,0,0,0,0


In [28]:
# Attach genres to titles with a LEFT JOIN on id: a title is returned once per genre row
# it has in title_genres (the recorded output lists tm1000037 four times).
query = """SELECT 
            t1.id,
            t1.title,
            t1.imdb_score,
            t2.genre
            FROM titles AS t1
            LEFT JOIN title_genres AS t2
            ON t1.id=t2.id"""
# Only the first five rows are displayed.
recommender_api.read(query).head(5)

Unnamed: 0,id,title,imdb_score,genre
0,tm1000037,Je suis Karl,5.4,romance
1,tm1000037,Je suis Karl,5.4,drama
2,tm1000037,Je suis Karl,5.4,thriller
3,tm1000037,Je suis Karl,5.4,european
4,tm1000147,Zone 414,4.9,thriller


In [29]:
"""Showcasing validation on SELECT. """
# A CREATE VIEW statement is deliberately sent through `read` to show that it is rejected.
query = """CREATE OR REPLACE VIEW titles_and_genres AS
            SELECT
            t1.id,
            t1.title,
            t1.imdb_score,
            t2.genre
            FROM titles AS t1
            LEFT JOIN title_genres AS t2
            ON t1.id=t2.id"""

# The rejection is the point of this cell, so catch and print it. An uncaught exception
# would stop "Restart & Run All" here and leave the remaining cells unexecuted.
try:
    recommender_api.read(query).head(5)
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: Not a SELECT statement

In [30]:
# Demonstrate the write path: create a small demo table, then insert three rows into it.
# NOTE(review): the CREATE TABLE has no IF NOT EXISTS, so re-running this cell against a
# database that already contains demo_table will presumably fail — confirm how `write` reacts.
write_query = "CREATE TABLE demo_table (demo_id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(30) NOT NULL)"
recommender_api.write(write_query)

insert_query = "INSERT INTO demo_table (name) VALUES ('Some name'), ('Another name'), ('Last name')"
recommender_api.write(insert_query)

Successfully executed query: 
CREATE TABLE demo_table (demo_id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(30) NOT NULL)
Successfully executed query: 
INSERT INTO demo_table (name) VALUES ('Some name'), ('Another name'), ('Last name')


## Recommender showcase

In [31]:
# Request recommendations for two title ids.
# NOTE(review): the recorded output has a row for 'tm1000185' only; 'ts99814' may not exist
# in the titles table — confirm whether an unknown id is meant to be skipped silently.
recommender_api.recommend_title(input_id=['tm1000185', 'ts99814'])

Unnamed: 0,input_id,input_title,imdb_score,genre,rec_title_1,rec_title_2,rec_title_3,rec_title_4,rec_title_5
0,tm1000185,Squared Love,5.0,romance,Who Rules The World,Reply 1988,Seinfeld,Still Game,Heartstopper


In [32]:
# Send the recommender's data to MySQL; the recorded output shows foreign key checks
# being turned off before the transfer and on again after it.
recommender_api.recommender_send_data()

Foreign key checks turned off.
Data sent to MySQL database successfully.
Foreign key checks turned on.


In [33]:
# Request recommendations for two title ids; the recorded output has one row per input id.
# NOTE(review): this cell calls `recommend_titles`, while an earlier cell calls
# `recommend_title` — confirm both methods are intended to exist.
recommender_api.recommend_titles(['tm1000037', 'tm1000147'])

Unnamed: 0,input_id,input_title,imdb_score,genre,rec_title_1,rec_title_2,rec_title_3,rec_title_4,rec_title_5
0,tm1000037,Je suis Karl,5.4,romance,Who Rules The World,Reply 1988,Heartstopper,Forrest Gump,The Untamed
1,tm1000147,Zone 414,4.9,thriller,Shot in the Dark,Icarus,Andhaghaaram,Human Nature,The Conjuring


In [34]:
# Send the recommender's data to MySQL again after the second recommendation request.
recommender_api.recommender_send_data()

Foreign key checks turned off.
Data sent to MySQL database successfully.
Foreign key checks turned on.


# Creating requirements.txt

In [None]:
python3 -m pip freeze

In [None]:
pip freeze > requirements.txt