# Loading and Processing Raw Data

### Adapting Data to fit the DB

The definition of *df_titles* can be found in `./workingWithData.ipynb`

In [1]:
import pandas as pd

In [2]:
df_titles = pd.read_csv('../Data/complete_titles.csv')
df_credits = pd.read_csv('../Data/raw_credits.csv')
df_titles.head()

Unnamed: 0,index,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,is_awarded,is_best
0,0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,,False,False
1,1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,True,True
2,2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,True,True
3,3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,True,True
4,4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,True,True


In [3]:
table_titles = pd.DataFrame()
table_genres = pd.DataFrame()
table_countries = pd.DataFrame()
table_persons = pd.DataFrame()
table_roles = pd.DataFrame()

In [4]:
# setting feature normalized Titles Table
table_titles[['title_id',
              'title_name',
              'release_year',
              'num_seasons',
              'runtime',
              'score_imbd',
              'votes_imbd',
              'is_awarded',
              'is_best']] = df_titles[['id', 'title', 'release_year','seasons', 'runtime' , 'imdb_score', 'imdb_votes', 'is_awarded', 'is_best']]
table_titles['is_movie'] = df_titles['type'].apply(lambda x: x == 'MOVIE')

table_titles.head()

Unnamed: 0,title_id,title_name,release_year,num_seasons,runtime,score_imbd,votes_imbd,is_awarded,is_best,is_movie
0,ts300399,Five Came Back: The Reference Films,1945,1.0,48,,,False,False,False
1,tm84618,Taxi Driver,1976,,113,8.3,795222.0,True,True,True
2,tm127384,Monty Python and the Holy Grail,1975,,91,8.2,530877.0,True,True,True
3,tm70993,Life of Brian,1979,,94,8.0,392419.0,True,True,True
4,tm190788,The Exorcist,1973,,133,8.1,391942.0,True,True,True


In [5]:
# setting feature normalized Persons Table
table_persons['person_id'] = df_credits['person_id']
table_persons['person_name'] = df_credits['name']
table_persons.head()

Unnamed: 0,person_id,person_name
0,3748,Robert De Niro
1,14658,Jodie Foster
2,7064,Albert Brooks
3,3739,Harvey Keitel
4,48933,Cybill Shepherd


In [6]:
table_persons['person_id'] = table_persons['person_id'].drop_duplicates(keep='first')
table_persons = table_persons.dropna()
table_persons.isnull().sum()

person_id      0
person_name    0
dtype: int64

In [7]:
table_persons.shape

(53956, 2)

In [8]:
# setting feature normlized Roles Table
table_roles[['title_id','person_id', 'character']] = df_credits[['id', 'person_id', 'character']]
table_roles['is_actor'] = df_credits['role'].apply(lambda x: x == 'ACTOR')
table_roles.head()

Unnamed: 0,title_id,person_id,character,is_actor
0,tm84618,3748,Travis Bickle,True
1,tm84618,14658,Iris Steensma,True
2,tm84618,7064,Tom,True
3,tm84618,3739,Matthew 'Sport' Higgins,True
4,tm84618,48933,Betsy,True


# Creating Relational DB using MySQL

In [9]:
import pandas as pd
import numpy as np
import os

from dotenv import load_dotenv

import mysql.connector

### DB Creation

In [10]:
# Load environment variables
load_dotenv('./../mysecrets.env')
PASSWORD = os.getenv('PASSWORD')
IP = os.getenv('IP')

# Database connection parameters
username = 'valente'
password = PASSWORD
host = IP
port = '3306'
database_name = 'DV_student_netlixProject'

# Load data into pandas
df_titles = pd.read_csv('./../Data/complete_titles.csv')
df_credits = pd.read_csv('./../Data/raw_credits.csv')

# Establish MySQL connection
connection = mysql.connector.connect(
    host=host,
    user=username,
    password=password,
    port=port
)

In [11]:
# Drop and recreate the database
with connection.cursor() as cursor:
    try:
        cursor.execute(f"DROP DATABASE IF EXISTS {database_name}")
        print(f"Database {database_name} dropped.")
        cursor.execute(f"CREATE DATABASE {database_name}")
        print(f"Database {database_name} created.")
    except mysql.connector.Error as err:
        print(f"Error: {err}")

with connection.cursor() as cursor:
    # Use the new database
    try:
        cursor.execute(f"USE {database_name}")
        print(f"Using database {database_name}.")
    except mysql.connector.Error as err:
        print(f"Error: {err}")

Database DV_student_netlixProject dropped.
Database DV_student_netlixProject created.
Using database DV_student_netlixProject.


In [12]:
table_names = ['Titles', 'Genres','Countries', 'Roles', 'Persons']

def delete_tables(table_names:list) -> None: 
    with connection.cursor as cursor:
        for table in table_names:
            try:
                cursor.execute(f"DROP TABLE IF EXISTS {table}")
                print(f"Database {table} dropped.")
            except mysql.connector.Error as err:
                print(f"Error: {err}")


In [13]:
# Create Titles table
while True:
    try:
        with connection.cursor() as cursor:
            cursor.execute("""
            CREATE TABLE Titles (
                title_id CHAR(10),
                title_name VARCHAR(128) NOT NULL,
                release_year INT NOT NULL,
                media_type_is_movie BOOLEAN NOT NULL,
                show_seasons INT,
                title_runtime INT,
                votes_imdb INT,
                score_imdb FLOAT,
                is_awarded BOOLEAN NOT NULL,
                is_best BOOLEAN NOT NULL,

                PRIMARY KEY (title_id)
            );
            """)
            # Create Genres Table
            cursor.execute( """
            CREATE TABLE Genres(
                Genre_name VARCHAR(50),
                Title_id CHAR(10),

                PRIMARY KEY (Title_id, Genre_name),
                FOREIGN KEY (Title_id) REFERENCES Titles(Title_id)
            );
            """)

            # Create Country Table
            cursor.execute("""
            CREATE TABLE Countries(
                Country_code VARCHAR(10),
                Title_id CHAR(10),

                PRIMARY KEY (Title_id, Country_code),
                FOREIGN KEY (Title_id) REFERENCES Titles(Title_id)
            );
            """)

            # Create Persons Table
            cursor.execute(""" 
            CREATE TABLE Persons(
                person_id INT,
                personName VARCHAR(128) NOT NULL,

                PRIMARY KEY (person_id)
            );
            """)

            # Crate Roles Table
            cursor.execute(""" 
            CREATE TABLE Roles(
                role_id INT AUTO_INCREMENT,
                person_id INT NOT NULL,
                title_id CHAR(10) NOT NULL,
                role_character VARCHAR(1000),
                role_isActor BOOLEAN NOT NULL,

                PRIMARY KEY (role_id),
                FOREIGN KEY (person_id) REFERENCES Persons(person_id),
                FOREIGN KEY (title_id) REFERENCES Titles(title_id)
            );
            """)
            break

    except mysql.connector.Error as err:
        print(err)
        delete_tables(table_names)

### Adding Data to DB

In [14]:
# adding Data to Titles Table
table_titles.head()

Unnamed: 0,title_id,title_name,release_year,num_seasons,runtime,score_imbd,votes_imbd,is_awarded,is_best,is_movie
0,ts300399,Five Came Back: The Reference Films,1945,1.0,48,,,False,False,False
1,tm84618,Taxi Driver,1976,,113,8.3,795222.0,True,True,True
2,tm127384,Monty Python and the Holy Grail,1975,,91,8.2,530877.0,True,True,True
3,tm70993,Life of Brian,1979,,94,8.0,392419.0,True,True,True
4,tm190788,The Exorcist,1973,,133,8.1,391942.0,True,True,True


In [15]:
for row in table_titles.iloc: # Adding Data to Titles Table
    title_id, title_name, release_year, num_seasons, runtime, score_imbd, votes_imbd, is_awarded, is_best, is_movie = row

    query = """
INSERT INTO Titles (title_id, title_name, release_year, media_type_is_movie, show_seasons, title_runtime, votes_imdb, score_imdb, is_awarded, is_best)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
    with connection.cursor() as cursor:
        cursor.execute(query, (str(title_id),
                            str(title_name),
                            int(release_year),
                            bool(is_movie),
                            int(num_seasons) if not np.isnan(num_seasons) else None,
                            int(runtime) if not np.isnan(runtime) else None, 
                            int(votes_imbd) if not np.isnan(votes_imbd) else None,
                            float(score_imbd) if not np.isnan(score_imbd) else None,
                            bool(is_awarded),
                            bool(is_best)))
    


In [16]:
table_persons.head()

Unnamed: 0,person_id,person_name
0,3748.0,Robert De Niro
1,14658.0,Jodie Foster
2,7064.0,Albert Brooks
3,3739.0,Harvey Keitel
4,48933.0,Cybill Shepherd


In [17]:
for row in table_persons.iloc: # Adding Data to Persons Table

    person_id, person_name = row

    query = """ 

INSERT INTO Persons (person_id, personName) VALUES (%s, %s)
"""
    with connection.cursor() as cursor:
        cursor.execute(query, (int(person_id),
                            str(person_name)))

In [18]:
table_roles.head()

Unnamed: 0,title_id,person_id,character,is_actor
0,tm84618,3748,Travis Bickle,True
1,tm84618,14658,Iris Steensma,True
2,tm84618,7064,Tom,True
3,tm84618,3739,Matthew 'Sport' Higgins,True
4,tm84618,48933,Betsy,True


In [19]:
for row in table_roles.iloc: # Adding Data to Roles Table
    title_id, person_id, character, role = row
    query = """ 

INSERT INTO Roles (person_id, title_id, role_character, role_isActor)
VALUES (%s,%s,%s,%s)

"""
    with connection.cursor() as cursor:
        cursor.execute(query, (int(person_id),
                            str(title_id),
                            str(character),
                            bool(role)))


In [20]:
connection.commit() # Commit Changes to DB

### Querying on Relational DB

In [21]:
connection = mysql.connector.connect(
    host=host,
    user=username,
    password=password,
    port=port,
    database='DV_student_netlixProject'
)

In [22]:
import time
mysql_times = []

In [23]:
# Simple Querie -1.
# Selecting the average runtime on all awarded movies.
query_1_1 = " SELECT AVG(title_runtime) FROM Titles WHERE is_awarded = 1 AND media_type_is_movie = 1;"

with connection.cursor() as cursor:
    start_time = time.time()
    cursor.execute(query_1_1)
    results_1_1 = cursor.fetchone()
    end_time = time.time()

mysql_times.append(end_time - start_time)
avg_movies_runtime = results_1_1
print("MOTY Average Runtime:",avg_movies_runtime[0])

MOTY Average Runtime: 129.9592


In [24]:
# Selecting the average runtime on all awarded Shows.
query_1_2 = " SELECT AVG(title_runtime) FROM Titles WHERE is_awarded = 1 AND media_type_is_movie = 0;"

with connection.cursor() as cursor:
    start_time = time.time()
    cursor.execute(query_1_2)
    results_1_2 = cursor.fetchone()
    end_time = time.time()

mysql_times.append(end_time - start_time)
avg_shows_runtime = results_1_2
print("SOTY Average Runtime:",avg_shows_runtime[0])

SOTY Average Runtime: 35.4194


In [25]:
# Simple Querie -2.
# Selecting all Table information on the movie released in 2020 with the highest score that hasn't been awarded.
query_2 = """

SELECT * FROM Titles
WHERE is_awarded = 0 AND media_type_is_movie = 1 AND release_year = 2020
ORDER BY score_imdb DESC LIMIT 1;

"""

with connection.cursor() as cursor:
    start_time = time.time()
    cursor.execute(query_2)
    results_2 = cursor.fetchone()
    end_time = time.time()

mysql_times.append(end_time - start_time)
query_2_movie_info = results_2

query_2_movieName = query_2_movie_info[1]
query_2_movieScore = query_2_movie_info[7]

print(f"Name: {query_2_movieName}\nScore: {query_2_movieScore}")

Name: Sky Tour: The Movie
Score: 8.8


In [26]:
if not connection.is_connected():
    connection.ping(reconnect=True)

In [27]:
# Complex Querie -1.
# Find the actor that appeared on most awarded titles

query_3 = """

SELECT pTable.personName, COUNT(rTable.role_id) AS role_count, tTable.title_name, rTable.role_character -- Set the outputs we want and save the ammount of matching roles with the name role_count
FROM Roles rTable
JOIN Persons pTable ON rTable.person_id = pTable.person_id -- reference the PK on table Person to the FK on table Roles
JOIN Titles tTable ON rTable.title_id = tTable.title_id    -- reference tge PK on table Titles to the FK on table Roles
WHERE rTable.role_isActor = 1 -- apply is_actor filter
GROUP BY pTable.personName    -- setting PersonName the target feature of the querie
ORDER BY role_count DESC      -- simply ordering the ammount of roles each actor had in awarded movies in descending order
LIMIT 1;                      -- limiting the 1st result

"""

with connection.cursor() as cursor:
    start_time = time.time()
    cursor.execute(query_3)
    results_3 = cursor.fetchone()
    end_time = time.time()

mysql_times.append(end_time - start_time)
best_actor = results_3

best_actor_name = best_actor[0]
best_actor_awardedCount = best_actor[1]

print(f"Actor Name: {best_actor_name},\nTotal Awarded Titles: {best_actor_awardedCount}")
best_actor

Actor Name: Shah Rukh Khan,
Total Awarded Titles: 30


('Shah Rukh Khan', 30, 'Chaahat', 'Roop Singh Rathod')

In [28]:
connection = mysql.connector.connect(
    host=host,
    user=username,
    password=password,
    port=port,
    database='DV_student_netlixProject'
)

In [29]:
# Complex Querie -2.
# Find the actor with highest score before 2010 and their titles total runtime

query_4 = """
SELECT pTable.personName, SUM(tTable.title_runtime)AS total_runtime, AVG(tTable.score_imdb) AS average_score, COUNT(tTable.title_id) AS num_titles -- Set outputs
FROM Titles tTable
JOIN Roles rTable ON tTable.title_id = rTable.title_id          -- Join Roles FK to Titles PK  (title_id)
JOIN Persons pTable ON rTable.person_id = pTable.person_id      -- Join Roles FK to Persons PK (person_id) 
WHERE tTable.release_year > 2009
GROUP BY pTable.personName                                      -- Set personName as the SELECT target 
ORDER BY average_score DESC                                     -- Highest to Smallest sum of the runtime
LIMIT 1;                                                        -- filter only the 1st result (Highest)
"""

with connection.cursor() as cursor:
    cursor.execute(query_4)
    results_4 = cursor.fetchone()
cursor.close()

False

In [30]:
consistent_actor_2010 = results_4
consistent_actor_name = consistent_actor_2010[0]
consistent_actor_awardedCount = consistent_actor_2010[1]
consistent_actor_avgScore = consistent_actor_2010[2]
consistent_actor_numTitles = consistent_actor_2010[3]

print(f"Actor Name: {consistent_actor_name},\nTotal Runtime: {consistent_actor_awardedCount},\nAverage Score: {consistent_actor_avgScore},\nTotal Titles: {consistent_actor_numTitles}")

Actor Name: Kim Sung-kyun,
Total Runtime: 157,
Average Score: 9.199999809265137,
Total Titles: 2


# Creating Non-Relational DB using MongoDB

### Creating DB and inserting Data

In [31]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

load_dotenv('./../mysecrets.env')
PASSWORD = os.getenv('PASSWORD')
IP = os.getenv('IP')

In [32]:
client = MongoClient(f"mongodb+srv://fc64335:{PASSWORD}@fcul.l0hna.mongodb.net/?retryWrites=true&w=majority&appName=FCUL") # YOU SHOULD CHANGE TO YOUR CREDENTIALS AND SERVER INFO, just copy the ones from mongo atlas connect
client.drop_database("DV_student_netlixProject")
print("Database dropped")
db = client['DV_student_netflixProject']

Database dropped


In [33]:
# # Drop the collection if you need
db['Titles'].drop()
db['Roles'].drop()
db['Persons'].drop()
print("All collections dropped")

All collections dropped


In [34]:
table_titles.head()

Unnamed: 0,title_id,title_name,release_year,num_seasons,runtime,score_imbd,votes_imbd,is_awarded,is_best,is_movie
0,ts300399,Five Came Back: The Reference Films,1945,1.0,48,,,False,False,False
1,tm84618,Taxi Driver,1976,,113,8.3,795222.0,True,True,True
2,tm127384,Monty Python and the Holy Grail,1975,,91,8.2,530877.0,True,True,True
3,tm70993,Life of Brian,1979,,94,8.0,392419.0,True,True,True
4,tm190788,The Exorcist,1973,,133,8.1,391942.0,True,True,True


In [35]:
# Create the Titles collection with data validation
db.create_collection("Titles", validator={
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['title_id', 'title_name', 'is_movie', 'is_best', 'is_awarded'],
        'properties': {
            'title_id': {
                'bsonType': 'string',
                'description': 'Unique Title Identifier (String). REQUIRED'
            },
            'title_name': {
                'bsonType': 'string',
                'description': 'must be a string and is required'
            },
            'release_year': {
                'bsonType': 'int',
                'minimum': 1900,
                'maximum': 2024,
                'description': 'must be a valid year(INT) if provided'
            },
            'num_seasons': {
                'bsonType': 'int', #### idealy int
                'minimum': 0,
                'description': 'must be a non-negative integer if provided'
            },
            'runtime': {
                'bsonType': 'int',
                'minimum': 0,
                'description': "must be a integer and can't be negative if provided"
            },
            'score_imbd': {
                'bsonType': 'double',
                'minimum': 0,
                'maximum': 10,
                'description': "must be a non-int numeric value and can't be negative if provided"
            },
            'votes_imbd': {
                'bsonType': 'int',
                'minimum': 0,
                'description': "must be a int and can't be negative if provided"
            },
            'is_awarded': {
                'bsonType': 'bool',
                'description': "must be True/False and is required"
            },
            'is_best': {
                'bsonType': 'bool',
                'description': "must be True/False and is required"
            },
            'is_movie': {
                'bsonType': 'bool',
                'description': "must be True/False and is required"
            },
        }
    }
})

Collection(Database(MongoClient(host=['fcul-shard-00-00.l0hna.mongodb.net:27017', 'fcul-shard-00-01.l0hna.mongodb.net:27017', 'fcul-shard-00-02.l0hna.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='FCUL', authsource='admin', replicaset='atlas-r5n8a4-shard-0', tls=True), 'DV_student_netflixProject'), 'Titles')

In [36]:
table_titles['title_id'] = table_titles['title_id'].apply(lambda x: str(x) if pd.notna(x) else None)
table_titles['title_name'] = table_titles['title_name'].apply(lambda x: str(x) if pd.notna(x) else None)

table_titles['num_seasons'] = table_titles['num_seasons'].where(pd.notna(table_titles['num_seasons']), None)
table_titles['num_seasons'] = table_titles['num_seasons'].fillna(0).astype(int)
table_titles['num_seasons'] = table_titles['num_seasons'].replace(0, None)

table_titles['votes_imbd'] = table_titles['votes_imbd'].where(pd.notna(table_titles['votes_imbd']), None)
table_titles['votes_imbd'] = table_titles['votes_imbd'].fillna(0).astype(int)
table_titles['votes_imbd'] = table_titles['votes_imbd'].replace(0, None)

table_titles['score_imbd'] = table_titles['score_imbd'].where(pd.notna(table_titles['score_imbd']), None)
table_titles['score_imbd'] = table_titles['score_imbd'].fillna(0).astype(float)
table_titles['score_imbd'] = table_titles['score_imbd'].replace(0, None)

In [37]:
def remove_nan_values(entry):
    return {key: value for key, value in entry.items() if pd.notna(value)}

In [38]:
collection_titles = db["Titles"]
initial_count_titles = collection_titles.count_documents({})
print(f"# documents: {initial_count_titles}")

try:
    data_dict = table_titles.where(pd.notna(table_titles), None).to_dict(orient="records")
    data_dict = [remove_nan_values(entry) for entry in data_dict]
    result = collection_titles.insert_many(data_dict, ordered=False)
    valid_count = collection_titles.count_documents({}) - initial_count_titles
    print(f"{valid_count} documents inserted.")

except Exception as e:
    attempted_count = len(data_dict)
    valid_count = collection_titles.count_documents({}) - initial_count_titles
    print(f"{valid_count} documents inserted")
    print(f"{attempted_count - valid_count} documents failed to insert")

print(f"# documents: {collection_titles.count_documents({})}") 

# documents: 0
5805 documents inserted
1 documents failed to insert
# documents: 5805


In [39]:
table_persons.head()

Unnamed: 0,person_id,person_name
0,3748.0,Robert De Niro
1,14658.0,Jodie Foster
2,7064.0,Albert Brooks
3,3739.0,Harvey Keitel
4,48933.0,Cybill Shepherd


In [40]:
# Create the Persons collection with data validation
db.create_collection("Persons", validator={
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['person_id', 'person_name'],
        'properties': {
            'person_id': {
                'bsonType': 'int',
                'description': 'Unique Title Identifier (INT). REQUIRED'
            },
            'title_name': {
                'bsonType': 'string',
                'description': 'must be a string and is required'
            },
        }
    }
})

Collection(Database(MongoClient(host=['fcul-shard-00-00.l0hna.mongodb.net:27017', 'fcul-shard-00-01.l0hna.mongodb.net:27017', 'fcul-shard-00-02.l0hna.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='FCUL', authsource='admin', replicaset='atlas-r5n8a4-shard-0', tls=True), 'DV_student_netflixProject'), 'Persons')

In [41]:
table_persons['person_id'] = table_persons['person_id'].astype(int)

In [42]:
collection_persons = db["Persons"]
initial_count_persons = collection_persons.count_documents({})
print(f"# documents: {initial_count_persons}")

try:
    data_dict = table_persons.where(pd.notna(table_persons), None).to_dict(orient="records")
    data_dict = [remove_nan_values(entry) for entry in data_dict]
    result = collection_persons.insert_many(data_dict, ordered=False)
    valid_count = collection_persons.count_documents({}) - initial_count_persons
    print(f"{valid_count} documents inserted.")

except Exception as e:
    attempted_count = len(data_dict)
    valid_count = collection_persons.count_documents({}) - initial_count_persons
    print(f"{valid_count} documents inserted")
    print(f"{attempted_count - valid_count} documents failed to insert")

print(f"# documents: {collection_persons.count_documents({})}") 

# documents: 0
53956 documents inserted.
# documents: 53956


In [43]:
table_roles.head()

Unnamed: 0,title_id,person_id,character,is_actor
0,tm84618,3748,Travis Bickle,True
1,tm84618,14658,Iris Steensma,True
2,tm84618,7064,Tom,True
3,tm84618,3739,Matthew 'Sport' Higgins,True
4,tm84618,48933,Betsy,True


In [44]:
# Create the Roles collection with data validation
db.create_collection("Roles", validator={
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['title_id', 'person_id', 'is_actor'],
        'properties': {
            'title_id': {
                'bsonType': 'string',
                'description': 'Unique identifier for Title (String) at Titles collection. REQUIRED'
            },
            'person_id': {
                'bsonType': 'int',
                'description': 'unique identifier for Person (INT) at Persons collection. Required'
            },
            'character': {
                'bsonType': 'string',
                'description': 'must be string if provided.'
            },
            'is_actor': {
                'bsonType': 'bool',
                'description': 'must be True/False and is required.'
            },
        }
    }
})

Collection(Database(MongoClient(host=['fcul-shard-00-00.l0hna.mongodb.net:27017', 'fcul-shard-00-01.l0hna.mongodb.net:27017', 'fcul-shard-00-02.l0hna.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='FCUL', authsource='admin', replicaset='atlas-r5n8a4-shard-0', tls=True), 'DV_student_netflixProject'), 'Roles')

In [45]:
table_roles.columns

Index(['title_id', 'person_id', 'character', 'is_actor'], dtype='object')

In [46]:
table_roles['character'] = table_roles['character'].astype(str)

In [47]:
collection_roles = db["Roles"]
initial_count_roles = collection_roles.count_documents({})
print(f"# documents: {initial_count_roles}") 
try:
    data_dict = table_roles.where(pd.notna(table_roles), None).to_dict(orient="records")
    result = collection_roles.insert_many(data_dict, ordered=False)
    valid_count = collection_roles.count_documents({}) - initial_count_roles
    print(f"{valid_count} documents inserted.")

except Exception as e:
    attempted_count = len(data_dict)
    valid_count = collection_roles.count_documents({}) - initial_count_roles
    print(f"{valid_count} documents inserted")
    print(f"{attempted_count - valid_count} documents failed to insert")

print(f"# documents: {collection_roles.count_documents({})}") 

# documents: 0
77213 documents inserted.
# documents: 77213


### Non-Relational Queries

In [48]:
queries = [
    {"description": "Find average runtime of titles that have been awarded."}, 
    {"description": "Fing the non-awarded movie of 2020 with the highest score"},
    {"description": "Find the actor with the most roles in awarded movies"},
    {"description": "Find the actor with the higherst average IMDB score before the year of 2010 and their titles total runtime."}
     ]

In [49]:
mysql_times

[0.001505136489868164,
 0.0010001659393310547,
 0.0010764598846435547,
 0.5354599952697754]

In [50]:
mongo_times = []

In [51]:
query_1 = [
    {
        '$match': {
            'is_awarded': True,
            'is_movie': True,
            'runtime': { '$exists': True, '$ne': None}
        }
    },
    {
        '$group': {
            '_id': None,
            'averageRuntime': { '$avg': "$runtime" }
        }
    },
    {
        '$project': {
            '_id': 0,  # Exclude the default `_id` field
            'averageRuntime': 1
        }
    }
]

# Execute the aggregation query
start_time = time.time()
result_1 = db.Titles.aggregate(query_1)
end_time = time.time()

# Print the result
for doc in result_1:
    print(doc)

mongo_times.append(end_time - start_time)

{'averageRuntime': 129.9591836734694}


In [52]:
query_2 = [
    {
        '$match': {
            'is_awarded': False,
            'is_movie': True,
            'release_year': 2020,
        },
    },
    {
        '$sort': {
            'score_imbd': -1
        },
    },
    {
        '$limit': 1
    },
    {
        '$project': {
            '_id':0,
            'title_name':1,
            'release_year':1,
            'score_imbd':1,
        }
    }
]

start_time = time.time()
result_2 = db.Titles.aggregate(query_2)
end_time = time.time()

# Print the result
for doc in result_2:
    print(doc)

mongo_times.append(end_time - start_time)

{'title_name': 'Sky Tour: The Movie', 'release_year': 2020, 'score_imbd': 8.8}


In [53]:
table_roles.columns

Index(['title_id', 'person_id', 'character', 'is_actor'], dtype='object')

In [54]:
table_persons.columns

Index(['person_id', 'person_name'], dtype='object')

In [55]:
roles_count = db.Roles.count_documents({'is_actor': True, 'is_awarded': True})
print("Number of matching roles:", roles_count)

Number of matching roles: 0


In [56]:
query_debug = [
    {
        "$match": {
            'is_actor': True,  # Filter for actors
        }
    },
    { 
        '$lookup': {
            'from': 'Titles',  # Join Titles collection
            'localField': 'title_id',  # Match on title_id from Roles
            'foreignField': 'title_id',  # Match on title_id from Titles
            'as': 'titleInfo'  # Store result as titleInfo
        }
    },
    {
        '$unwind': '$titleInfo'  # Unwind the array-like result of titleInfo
    },
    {
        "$match": {
            'titleInfo.is_awarded': True  # Filter based on the is_awarded field in Titles collection
        }
    },
    {
        '$group': {
            '_id': '$person_id',  # Group by person_id (or person_name, depending on your requirement)
            'num_roles': {'$sum': 1}  # Count the number of roles for each person
        }
    },
    {
        '$sort': {
            'num_roles': -1  # Sort by number of roles in descending order
        }
    },
    {
        '$limit': 1  # Limit the result to the person with the most roles
    },
    {
        '$project': {
            '_id': 0,  # Exclude the default _id field
            'person_id': '$_id',  # Include person_id in the result
            'num_roles': 1  # Include the number of roles
        }
    }
]

result = db.Roles.aggregate(query_debug)

# Print the results
for doc in result:
    print(doc)

{'num_roles': 5, 'person_id': 61013}


In [57]:
query_full = [
    {
        "$match": {
            'is_actor': True,  # Filter for roles where the person is an actor
        }
    },
    { 
        '$lookup': {
            'from': 'Titles',  # Join Titles collection
            'localField': 'title_id',  # Match on title_id from Roles
            'foreignField': 'title_id',  # Match on title_id from Titles
            'as': 'titleInfo'  # Store result as titleInfo
        }
    },
    {
        '$unwind': '$titleInfo'  # Unwind the array of titleInfo to work with individual documents
    },
    {
        "$match": {
            'titleInfo.is_awarded': True  # Filter only awarded titles
        }
    },
    {
        '$group': {
            '_id': '$person_id',  # Group by person_id (or person_name)
            'num_roles': {'$sum': 1},  # Count the number of roles for each person
            'total_runtime': {'$sum': '$titleInfo.runtime'}  # Sum the runtime of all their titles
        }
    },
    {
        '$sort': {
            'num_roles': -1  # Sort by the number of roles in descending order
        }
    },
    {
        '$limit': 1  # Limit the result to only the person with the highest number of roles
    },
    {
        '$lookup': {
            'from': 'Persons',  # Join with Persons collection to get the actor's name
            'localField': '_id',  # Match on person_id from grouped result
            'foreignField': 'person_id',  # Match on person_id in Persons collection
            'as': 'personInfo'  # Store result as personInfo
        }
    },
    {
        '$unwind': '$personInfo'  # Unwind the array of personInfo
    },
    {
        '$project': {
            '_id': 0,  # Exclude the default _id field
            'person_name': '$personInfo.person_name',  # Include the actor's name
            'num_roles': 1,  # Include the number of roles
            'total_runtime': 1  # Include the total runtime of all awarded titles
        }
    }
]

start = time.time()
result = db.Roles.aggregate(query_full)
end  = time.time()

for doc in result:
    print(doc)

mongo_times.append(end-start)

{'num_roles': 5, 'total_runtime': 877, 'person_name': 'Aamir Khan'}


In [58]:
mysql_times

[0.001505136489868164,
 0.0010001659393310547,
 0.0010764598846435547,
 0.5354599952697754]

In [59]:
mongo_times

[0.04387021064758301, 0.04331374168395996, 263.8148055076599]

In [61]:
query_4 = [
    # Join Roles with Titles to get title details
    {
        "$lookup": {
            "from": "Titles",
            "localField": "title_id",
            "foreignField": "title_id",
            "as": "titleDetails"
        }
    },
    # Flatten the title details array
    { "$unwind": "$titleDetails" },
    # Filter titles released before 2010
    {
        "$match": {
            "titleDetails.release_year": { "$lt": 2010 },
            "titleDetails.score_imdb": { "$exists": True, "$ne": None }  # Ensure IMDb score exists
        }
    },
    # Join with Persons to get actor names
    {
        "$lookup": {
            "from": "Persons",
            "localField": "actor_id",
            "foreignField": "actor_id",
            "as": "personDetails"
        }
    },
    # Flatten the person details array
    { "$unwind": "$personDetails" },
    # Group by actor to calculate average IMDb score and total runtime
    {
        "$group": {
            "_id": "$personDetails.name",  # Group by actor's name
            "averageImdbScore": { "$avg": "$titleDetails.score_imdb" },  # Calculate average IMDb score
            "totalRuntime": { "$sum": "$titleDetails.runtime" },  # Calculate total runtime
            "titles": { "$push": "$titleDetails.title_name" }  # Optional: Collect titles for reference
        }
    },
    # Sort by highest average IMDb score
    { "$sort": { "averageImdbScore": -1 } },
    # Limit to the top actor
    { "$limit": 1 }
]

# Run the aggregation query
start = time.time()
result_4 = db.Roles.aggregate(query_4)
end = time.time()

mongo_times.append(end-start)

for doc in result_4:
    print(doc)

In [62]:
mongo_times

[0.04387021064758301,
 0.04331374168395996,
 263.8148055076599,
 273.0004372596741]