# BDA Project

- MSc Data Science, FCUL Dec 2024

## Settings

In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
import time
# db imports
import mysql.connector
from pymongo import MongoClient

In [2]:
load_dotenv('./../mysecrets.env')
PASSWORD = os.getenv('PASSWORD')
IP = os.getenv('IP')

In [3]:
# Database connection parameters
username = 'valente'
password = PASSWORD
host = IP
port = '3306'
database_name = 'DV_student_netlixProject'

In [4]:
def connectMySQL(use_db=True):
    if use_db:
        connection = mysql.connector.connect(
            host=IP,
            user="valente",
            password=PASSWORD,
            database="DV_student_netlixProject",
            autocommit=True
        )
    else:
        connection = mysql.connector.connect(
            host=IP,
            user="valente",
            password=PASSWORD,
            autocommit=True
        )
    return connection

## Preparing Data

In [5]:
# Load data into pandas
df_titles = pd.read_csv('./../Data/complete_titles.csv')
df_credits = pd.read_csv('./../Data/raw_credits.csv')

In [6]:
# setting feature normalized Titles Table
table_titles = pd.DataFrame()
table_titles[
    [
    'title_id','title_name','release_year',
    'num_seasons','runtime','score_imbd', 'votes_imbd', 'is_awarded',
    'is_best']] = df_titles[[
    'id', 'title', 'release_year',
    'seasons', 'runtime' , 'imdb_score',
    'imdb_votes', 'is_awarded', 'is_best']]

table_titles['is_movie'] = df_titles['type'].apply(lambda x: x == 'MOVIE')

table_titles.head()

Unnamed: 0,title_id,title_name,release_year,num_seasons,runtime,score_imbd,votes_imbd,is_awarded,is_best,is_movie
0,ts300399,Five Came Back: The Reference Films,1945,1.0,48,,,False,False,False
1,tm84618,Taxi Driver,1976,,113,8.3,795222.0,True,True,True
2,tm127384,Monty Python and the Holy Grail,1975,,91,8.2,530877.0,True,True,True
3,tm70993,Life of Brian,1979,,94,8.0,392419.0,True,True,True
4,tm190788,The Exorcist,1973,,133,8.1,391942.0,True,True,True


In [7]:
# setting feature normalized Persons Table
table_persons = pd.DataFrame()
table_persons['person_id'] = df_credits['person_id']
table_persons['person_name'] = df_credits['name']
table_persons['person_id'] = table_persons['person_id'].drop_duplicates(keep='first')
table_persons = table_persons.dropna()
table_persons.head()

Unnamed: 0,person_id,person_name
0,3748.0,Robert De Niro
1,14658.0,Jodie Foster
2,7064.0,Albert Brooks
3,3739.0,Harvey Keitel
4,48933.0,Cybill Shepherd


In [8]:
# setting feature normlized Roles Table
table_roles = pd.DataFrame()
table_roles[['title_id','person_id', 'character']] = df_credits[['id', 'person_id', 'character']]
table_roles['is_actor'] = df_credits['role'].apply(lambda x: x == 'ACTOR')
table_roles.head()

Unnamed: 0,title_id,person_id,character,is_actor
0,tm84618,3748,Travis Bickle,True
1,tm84618,14658,Iris Steensma,True
2,tm84618,7064,Tom,True
3,tm84618,3739,Matthew 'Sport' Higgins,True
4,tm84618,48933,Betsy,True


In [9]:
table_genres = pd.DataFrame()
table_countries = pd.DataFrame()

## Database Creation

### Relational MySQL DB Creation

In [10]:
# Drop and recreate the database
connection=connectMySQL(use_db=False)

with connection.cursor() as cursor:
    try:
        cursor.execute(f"DROP DATABASE IF EXISTS {database_name}")
        print(f"Database {database_name} dropped.")
        cursor.execute(f"CREATE DATABASE {database_name}")
        print(f"Database {database_name} created.")
    except mysql.connector.Error as err:
        print(f"Error: {err}")


Database DV_student_netlixProject dropped.
Database DV_student_netlixProject created.


In [11]:
# Drop and recreate the tables
table_names = ['Titles', 'Genres','Countries', 'Roles', 'Persons']

def delete_tables(table_names:list) -> None: 
    with connection.cursor as cursor:
        for table in table_names:
            try:
                cursor.execute(f"DROP TABLE IF EXISTS {table}")
                print(f"Database {table} dropped.")
            except mysql.connector.Error as err:
                print(f"Error: {err}")

In [12]:
# Create Titles table

connection=connectMySQL(use_db=True)

while True:
    try:
        with connection.cursor() as cursor:
            cursor.execute("""
            CREATE TABLE Titles (
                title_id CHAR(10),
                title_name VARCHAR(128) NOT NULL,
                release_year INT NOT NULL,
                media_type_is_movie BOOLEAN NOT NULL,
                show_seasons INT,
                title_runtime INT,
                votes_imdb INT,
                score_imdb FLOAT,
                is_awarded BOOLEAN NOT NULL,
                is_best BOOLEAN NOT NULL,

                PRIMARY KEY (title_id)
            );
            """)
            # Create Genres Table
            cursor.execute( """
            CREATE TABLE Genres(
                Genre_name VARCHAR(50),
                Title_id CHAR(10),

                PRIMARY KEY (Title_id, Genre_name),
                FOREIGN KEY (Title_id) REFERENCES Titles(Title_id)
            );
            """)

            # Create Country Table
            cursor.execute("""
            CREATE TABLE Countries(
                Country_code VARCHAR(10),
                Title_id CHAR(10),

                PRIMARY KEY (Title_id, Country_code),
                FOREIGN KEY (Title_id) REFERENCES Titles(Title_id)
            );
            """)

            # Create Persons Table
            cursor.execute(""" 
            CREATE TABLE Persons(
                person_id INT,
                personName VARCHAR(128) NOT NULL,

                PRIMARY KEY (person_id)
            );
            """)

            # Crate Roles Table
            cursor.execute(""" 
            CREATE TABLE Roles(
                role_id INT AUTO_INCREMENT,
                person_id INT NOT NULL,
                title_id CHAR(10) NOT NULL,
                role_character VARCHAR(1000),
                role_isActor BOOLEAN NOT NULL,

                PRIMARY KEY (role_id),
                FOREIGN KEY (person_id) REFERENCES Persons(person_id),
                FOREIGN KEY (title_id) REFERENCES Titles(title_id)
            );
            """)
            break

    except mysql.connector.Error as err:
        print(err)
        delete_tables(table_names)

## Inserting Data into Database

### Relational MySQL DB Inserting Data

In [13]:
for row in table_titles.iloc: # Adding Data to Titles Table
    title_id, title_name, release_year, num_seasons, runtime, score_imbd, votes_imbd, is_awarded, is_best, is_movie = row

    query = """
INSERT INTO Titles (title_id, title_name, release_year, media_type_is_movie, show_seasons, title_runtime, votes_imdb, score_imdb, is_awarded, is_best)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
    with connection.cursor() as cursor:
        cursor.execute(query, (str(title_id),
                            str(title_name),
                            int(release_year),
                            bool(is_movie),
                            int(num_seasons) if not np.isnan(num_seasons) else None,
                            int(runtime) if not np.isnan(runtime) else None, 
                            int(votes_imbd) if not np.isnan(votes_imbd) else None,
                            float(score_imbd) if not np.isnan(score_imbd) else None,
                            bool(is_awarded),
                            bool(is_best)))

In [14]:
for row in table_persons.iloc: # Adding Data to Persons Table

    person_id, person_name = row

    query = "INSERT INTO Persons (person_id, personName) VALUES (%s, %s)"
    
    with connection.cursor() as cursor:
        cursor.execute(query, (int(person_id),
                            str(person_name)))

In [15]:
for row in table_roles.iloc: # Adding Data to Roles Table
    title_id, person_id, character, role = row
    query = """ 
INSERT INTO Roles (person_id, title_id, role_character, role_isActor)
VALUES (%s,%s,%s,%s)
"""
    with connection.cursor() as cursor:
        cursor.execute(query, (int(person_id),
                            str(title_id),
                            str(character),
                            bool(role)))

In [16]:
connection.commit() # Commit Changes to DB

## Query Databases with&without optimization

### Relational MySQL Queries

In [17]:
query_1_SQL = []
query_2_SQL = []
query_3_SQL = []
query_4_SQL = []

#### Query 1

Get Average Movie of the Year Runtime

##### No Index

In [18]:
connection = connectMySQL()
cursor = connection.cursor()
query_1 = "SELECT AVG(title_runtime) FROM Titles WHERE is_awarded = 1 AND media_type_is_movie = 1;"

start_time = time.time()
cursor.execute(query_1)
end_time = time.time()
result = cursor.fetchone()
cursor.close()
connection.close()

print("MOTY Average Runtime:",result[0], "time:", end_time - start_time)
query_1_SQL.append(end_time - start_time)

MOTY Average Runtime: 129.9592 time: 0.0005464553833007812


##### With Index

Using Hash Index because we are working with set values and not with operations that would involve iterating trough the data. 
Not only this but the query doesn't involve any operation that loops through all data comparing values.

In [19]:
connection = connectMySQL()
cursor = connection.cursor()

index_query_1 = "CREATE INDEX index_isAwardedMovie ON Titles (is_awarded, media_type_is_movie) USING HASH;"

try:
    print(f"Running: {index_query_1}")
    cursor.execute(index_query_1)
    print("Index Created Successfully")
except mysql.connector.Error as err:
    print(f"Error creating indexes: {err}")

start_time = time.time()
cursor.execute(query_1)
end_time = time.time()
result = cursor.fetchone()

cursor.close()
connection.close()

print("MOTY Average Runtime:",result[0], "time:", end_time - start_time)
query_1_SQL.append(end_time - start_time)

Running: CREATE INDEX index_isAwardedMovie ON Titles (is_awarded, media_type_is_movie) USING HASH;
Index Created Successfully
MOTY Average Runtime: 129.9592 time: 0.0010297298431396484


#### Query 2

Get Highest Scored Movie released in 2020 that hasn't been awarded

##### NoIndex

In [20]:
connection = connectMySQL()
cursor = connection.cursor()
query_2 = """
SELECT title_name, score_imdb, release_year FROM Titles
WHERE is_awarded = 0 AND media_type_is_movie = 1 AND release_year = 2020
ORDER BY score_imdb DESC LIMIT 1;"""
start_time = time.time()
cursor.execute(query_2)
end_time = time.time()
result = cursor.fetchone()
cursor.close()
connection.close()

print(f"""
Movie: {result[0]}
Release Year: {result[2]}
IMDB Score: {result[1]}
Execution Time > {end_time-start_time}
""")
query_2_SQL.append(end_time - start_time)


Movie: Sky Tour: The Movie
Release Year: 2020
IMDB Score: 8.8
Execution Time > 0.003163576126098633



##### With Index

For this query, even though at first glance it seems to be a situation in which using a hash index would be prefereable, we are going to use b-tree index.\
This is because the query uses an operation that will loop and compare all items, sorting them and presenting them in descending order.

In [21]:
connection = connectMySQL()
cursor = connection.cursor()
index_query_2 = "CREATE INDEX index_AwardedMovieScore2020 ON Titles (is_awarded, media_type_is_movie, release_year, score_imdb DESC);"

try:
    print(f"Running: {index_query_2}")
    cursor.execute(index_query_2)
    print("Index Created Successfully")
except mysql.connector.Error as err:
    print(f"Error creating indexes: {err}")

start_time = time.time()
cursor.execute(query_2)
end_time = time.time()
result = cursor.fetchone()
cursor.close()
connection.close()

print(f"""
Movie: {result[0]}
Release Year: {result[2]}
IMDB Score: {result[1]}
Execution Time > {end_time-start_time}
""")
query_2_SQL.append(end_time - start_time)

Running: CREATE INDEX index_AwardedMovieScore2020 ON Titles (is_awarded, media_type_is_movie, release_year, score_imdb DESC);
Index Created Successfully

Movie: Sky Tour: The Movie
Release Year: 2020
IMDB Score: 8.8
Execution Time > 0.0



#### Query 3

Find the Actor with the most roles in Awarded Movies 

##### No Index

In [22]:
connection = connectMySQL()
cursor = connection.cursor()
query_3 = """
SELECT pTable.personName, COUNT(rTable.role_id) AS role_count
FROM Roles rTable
JOIN Persons pTable ON rTable.person_id = pTable.person_id      -- Join Persons PK to Roles FK (person_id)
JOIN Titles tTable ON rTable.title_id = tTable.title_id         -- Join Titles PK to Roles FK (title_id)
WHERE rTable.role_isActor = 1 AND tTable.is_awarded = 1         -- Filthers Actor & Awarded
GROUP BY pTable.personName                                      -- Apply to Person Name
ORDER BY role_count DESC                                        -- Top to Bottom
LIMIT 1;
"""
start_time = time.time()
cursor.execute(query_3)
end_time = time.time()
result = cursor.fetchone()
cursor.close()
connection.close()

print(f"""
Actor: {result[0]}
Role Count: {result[1]}
Execution Time > {end_time-start_time}
""")
query_3_SQL.append(end_time - start_time)


Actor: Aamir Khan
Role Count: 5
Execution Time > 0.003587961196899414



##### With Index

As query-3 is the first complex querie we will be debunking the elements to index for clarity.

- The target operations for our querie will be the WHERE, JOIN and GROUP/ORDER BY operations.
1. Starting in Roles Table where we simply "merge" all relevant features into a single index.
2. In Titles Table. Here we index the WHERE clause and the title id.
3. On Persons Table we simply index the name/id into feature.

All these indexes are BTree indexes (MySQL default) because we are not looking any particular condition but rather the first element of an ordered list.

In [23]:
indexes = [
    "CREATE INDEX index_ActorKeys ON Roles (role_isActor, title_id, person_id);",
    "CREATE INDEX index_AwardedTitles ON Titles (is_awarded, title_id);",
    "CREATE INDEX index_Person ON Persons (person_id, personName);"
]

connection = connectMySQL()
cursor = connection.cursor()

try:
    for index_query in indexes:
        print(f"Running: {index_query}")
        cursor.execute(index_query)
except mysql.connector.Error as err:
    print(f"Error creating indexes: {err}")

start_time = time.time()
cursor.execute(query_3)
end_time = time.time()
result = cursor.fetchone()
cursor.close()
connection.close()

print(f"""
Actor: {result[0]}
Role Count: {result[1]}
Execution Time > {end_time-start_time}
""")
query_3_SQL.append(end_time - start_time)

Running: CREATE INDEX index_ActorKeys ON Roles (role_isActor, title_id, person_id);
Running: CREATE INDEX index_AwardedTitles ON Titles (is_awarded, title_id);
Running: CREATE INDEX index_Person ON Persons (person_id, personName);

Actor: Aamir Khan
Role Count: 5
Execution Time > 0.005101680755615234



#### Query 4

Find Actor with the highest average title IMDB score before the year of 2010 and their titles total runtime.

##### No Index

In [24]:
connection = connectMySQL()
cursor = connection.cursor()
query_4 = """
SELECT pTable.personName, SUM(tTable.title_runtime)AS total_runtime, AVG(tTable.score_imdb) AS average_score, COUNT(tTable.title_id) AS num_titles -- Set outputs
FROM Titles tTable
JOIN Roles rTable ON tTable.title_id = rTable.title_id          -- Join Roles FK to Titles PK  (title_id)
JOIN Persons pTable ON rTable.person_id = pTable.person_id      -- Join Roles FK to Persons PK (person_id) 
WHERE tTable.release_year > 2009
GROUP BY pTable.personName                                      -- Set personName as the SELECT target 
ORDER BY average_score DESC                                     -- Highest to Smallest sum of the runtime
LIMIT 1;                                                        -- filter only the 1st result (Highest)
"""
start_time = time.time()
cursor.execute(query_4)
end_time = time.time()
result = cursor.fetchone()
cursor.close()
connection.close()

print(f"Actor Name: {result[0]},\nTotal Runtime: {result[1]},\nAverage Score: {result[2]},\nTotal Titles: {result[3]}")
print("Time >", end_time-start_time)
query_4_SQL.append(end_time - start_time)

Actor Name: Kim Sung-kyun,
Total Runtime: 157,
Average Score: 9.199999809265137,
Total Titles: 2
Time > 0.2917149066925049


##### With Index

This 4th and final query is relatively simmilar to query 3 so we will skip the index explanation.\
All indexes are BTree indexes as we will be ordering the average score in the end.

In [25]:
connection = connectMySQL()
cursor = connection.cursor()

indexes = [
    "CREATE INDEX index_TitleYearScoreTime ON Titles (release_year, title_runtime, score_imdb, title_id);",
    "CREATE INDEX index_RolesIDs ON Roles (title_id, person_id);",
    "CREATE INDEX index_Person ON Persons (person_id, personName);" # Already created in query_3
]

try:
    for index_query in indexes:
        print(f"Running: {index_query}")
        cursor.execute(index_query)
except mysql.connector.Error as err:
    print(f"Error creating indexes: {err}")

start_time = time.time()
cursor.execute(query_4)
end_time = time.time()
result = cursor.fetchone()
cursor.close()
connection.close()

print(f"Actor Name: {result[0]},\nTotal Runtime: {result[1]},\nAverage Score: {result[2]},\nTotal Titles: {result[3]}")
print("Time >", end_time-start_time)
query_4_SQL.append(end_time - start_time)

Running: CREATE INDEX index_TitleYearScoreTime ON Titles (release_year, title_runtime, score_imdb, title_id);
Running: CREATE INDEX index_RolesIDs ON Roles (title_id, person_id);
Running: CREATE INDEX index_Person ON Persons (person_id, personName);
Error creating indexes: 1061 (42000): Duplicate key name 'index_Person'
Actor Name: Kim Sung-kyun,
Total Runtime: 157,
Average Score: 9.199999809265137,
Total Titles: 2
Time > 0.2508509159088135
