2. Select relevant attributes from the IMDb dataset and reload.

In [20]:
import pandas as pd 
import numpy as np 
import time

In [4]:
# Load only the person columns used later in the notebook from the tab-separated names file.
name_basics = pd.read_csv(
    'data/name.basics.tsv',
    sep='\t',
    usecols=['nconst', 'primaryName', 'primaryProfession', 'knownForTitles'],
)

In [14]:
# Distinct raw values of primaryProfession; each value is still a comma-separated list.
primary_profession = name_basics.loc[:, 'primaryProfession'].unique()
print("primary_profession types:", primary_profession)

primary_profession types: ['actor,miscellaneous,producer' 'actress,soundtrack,archive_footage'
 'actress,music_department,producer' ... 'stunts,publicist,executive'
 'costume_department,camera_department,location_management'
 'composer,director,archive_footage']


In [15]:
# Split every comma-separated profession list into one row per profession,
# then keep each profession once.
unique_primary_professions = (
    name_basics['primaryProfession']
    .str.split(',')
    .explode()
    .unique()
)
print("Unique primary professions:", unique_primary_professions)

Unique primary professions: ['actor' 'miscellaneous' 'producer' 'actress' 'soundtrack'
 'archive_footage' 'music_department' 'writer' 'director' 'stunts'
 'make_up_department' 'composer' 'assistant_director' 'camera_department'
 'music_artist' 'art_department' 'editor' 'cinematographer'
 'casting_director' 'executive' 'costume_designer' 'script_department'
 'art_director' 'editorial_department' 'costume_department'
 'animation_department' 'talent_agent' 'archive_sound'
 'production_designer' 'special_effects' 'production_manager'
 'sound_department' 'casting_department' 'visual_effects'
 'location_management' 'set_decorator' 'transportation_department' '\\N'
 'choreographer' 'legal' 'manager' 'publicist' 'podcaster' 'assistant'
 'production_department' 'accountant' 'electrical_department']


In [5]:
# Load only the title columns used later in the notebook.
# quoting=3 is csv.QUOTE_NONE: the '"' character is read as ordinary text.
# NOTE(review): the IMDb dumps are plain tab-separated text without CSV-style quoting,
# so with the default quoting a title that starts with '"' can absorb the following
# tabs/newlines and merge several rows into one. Confirm by comparing the row count
# against the line count of the file.
title_basics = pd.read_csv(
    'data/title.basics.tsv',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'genres'],
    quoting=3,
)

In [6]:
# Collect the distinct values of both categorical columns, then report them.
# genres values are still comma-separated combinations at this point.
title_types = title_basics.loc[:, 'titleType'].unique()
genres = title_basics.loc[:, 'genres'].unique()

print("titleType categories:", title_types)
print("Genres categories:", genres)

titleType categories: ['short' 'movie' 'tvShort' 'tvMovie' 'tvEpisode' 'tvSeries' 'tvMiniSeries'
 'tvSpecial' 'video' 'videoGame' 'tvPilot']
Genres categories: ['Documentary,Short' 'Animation,Short' 'Animation,Comedy,Romance' ...
 'Biography,Crime,Fantasy' 'Mystery,Reality-TV,Thriller'
 'Musical,Reality-TV,Talk-Show']


My quiz focuses only on the titleType category 'movie'.

In [None]:
# Restrict to movies, then list every individual genre that appears on them.
movies = title_basics.loc[title_basics['titleType'].eq('movie')]
unique_genres_movies = (
    movies['genres']
    .str.split(',')
    .explode()
    .dropna()  # removes real NaN only; the literal '\N' string stays in the result
    .unique()
)
print("Unique genres movies:", unique_genres_movies)

Unique genres movies: ['Romance' 'Documentary' 'News' 'Sport' '\\N' 'Action' 'Adventure'
 'Biography' 'Drama' 'Fantasy' 'Comedy' 'War' 'Crime' 'Family' 'History'
 'Sci-Fi' 'Thriller' 'Western' 'Mystery' 'Horror' 'Music' 'Animation'
 'Musical' 'Film-Noir' 'Talk-Show' 'Adult' 'Reality-TV' 'Game-Show']


My quiz does not cover the following genre categories: 'News', '\\N', 'Animation', 'Film-Noir', 'Talk-Show', 'Adult', 'Reality-TV', 'Game-Show'.

In [30]:
# Keep movies whose genre string mentions none of the excluded genres.
# The pattern also matches the literal '\N' value; na=False treats missing genres
# as "no match", so such rows are kept by the negation.
filtered_titles = title_basics.loc[
    title_basics['titleType'].eq('movie')
    & ~title_basics['genres'].str.contains(
        r'Animation|News|\\N|Film-Noir|Talk-Show|Adult|Reality-TV|Game-Show',
        na=False,
        regex=True,
    )
]

In [10]:
# Load the title id and its directors column from the crew file.
title_crew = pd.read_csv(
    'data/title.crew.tsv',
    sep='\t',
    usecols=['tconst', 'directors'],
)

In [11]:
# Load every column of the tab-separated ratings file.
title_ratings = pd.read_csv(
    'data/title.ratings.tsv',
    sep='\t',
)

In [12]:
# Load the title/person link columns from the principals file.
title_principals = pd.read_csv(
    'data/title.principals.tsv',
    sep='\t',
    usecols=['tconst', 'ordering', 'nconst', 'category'],
)

In [19]:
# Size of the principals table: row count first, then column count.
num_rows, num_columns = title_principals.shape
print(num_rows)
print(num_columns)

88962941
4


In [16]:
# Distinct values of the category column in the principals table.
categories = title_principals.loc[:, 'category'].unique()
print(categories)

['self' 'director' 'producer' 'cinematographer' 'composer' 'editor'
 'actor' 'actress' 'writer' 'production_designer' 'archive_footage'
 'casting_director' 'archive_sound']


In [24]:

# Time a full pass over the principals file for several chunk sizes.
# Each pass only iterates the chunks; nothing is kept in memory.
chunk_sizes = [500000, 5000, 200000]

for chunk_size in chunk_sizes:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    for chunk in pd.read_csv('data/title.principals.tsv', sep='\t', chunksize=chunk_size):
        pass
    end_time = time.perf_counter()
    # NOTE(review): each size is timed once, in list order, so the figures may be
    # influenced by run order (e.g. file caching) — treat them as rough indications.
    print(f"Chunk_size time={chunk_size}: {end_time - start_time:.2f} seconds")


Chunk_size time=500000: 152.55 seconds
Chunk_size time=5000: 166.42 seconds
Chunk_size time=200000: 86.73 seconds


In [28]:
# Keep only the principal rows for actors, actresses and directors.
filtered_title_principals = title_principals.loc[
    title_principals['category'].isin(['actor', 'actress', 'director'])
]

3. Define the structure of the quiz

In [3]:
import pandas as pd

# quoting=3 is csv.QUOTE_NONE: the '"' character is read as ordinary text.
# NOTE(review): the IMDb dumps are plain tab-separated text without CSV-style quoting,
# so with the default quoting a title that starts with '"' can absorb the following
# tabs/newlines and merge several rows into one. Confirm by comparing the row count
# against the line count of the file.
title_basics = pd.read_csv(
    'data/title.basics.tsv',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'genres'],
    quoting=3,
)
title_ratings = pd.read_csv('data/title.ratings.tsv', sep='\t')

# Keep movies whose genre string mentions none of the excluded genres
# (the pattern also matches the literal '\N' value seen in the genres column).
filtered_titles = title_basics[
    (title_basics['titleType'] == 'movie') &
    (~title_basics['genres'].str.contains(r'Animation|News|\\N|Film-Noir|Talk-Show|Adult|Reality-TV|Game-Show', na=False, regex=True))
]

# Inner join on tconst: movies without a ratings row are dropped.
filtered_movies_with_ratings = filtered_titles.merge(title_ratings, on='tconst', how='inner')

# Alternative ranking by average rating, kept disabled.
#top_10_average = filtered_movies_with_ratings.nlargest(10, 'averageRating')
#print("Top 10 movies with higher averageRating:")
#print(top_10_average[['primaryTitle', 'averageRating', 'numVotes']])

# Rank by number of votes and show the ten most-voted movies.
top_10_votes = filtered_movies_with_ratings.nlargest(10, 'numVotes')
print("Top 10 movies with higher numVotes:")
print(top_10_votes[['primaryTitle', 'averageRating', 'numVotes']])


Top 10 movies with higher numVotes:
                                             primaryTitle  averageRating  \
59128                            The Shawshank Redemption            9.3   
126328                                    The Dark Knight            9.0   
165886                                          Inception            8.8   
67896                                          Fight Club            8.8   
58424                                        Forrest Gump            8.8   
58990                                        Pulp Fiction            8.9   
131941                                       Interstellar            8.7   
66852                                          The Matrix            8.7   
35562                                       The Godfather            9.2   
63626   The Lord of the Rings: The Fellowship of the Ring            8.9   

        numVotes  
59128    2959184  
126328   2940814  
165886   2609684  
67896    2391254  
58424    2315661  
58990    2272

In [None]:
# The 300 most-voted movies, ordered by descending numVotes.
top_300_votes = filtered_movies_with_ratings.nlargest(n=300, columns='numVotes')

# Disabled previews of the three 100-movie slices of that ranking.
#easy_category = top_300_votes.iloc[0:100]
#print(easy_category[['primaryTitle', 'averageRating', 'numVotes']])

#medium_category = top_300_votes.iloc[100:200]
#print(medium_category[['primaryTitle', 'averageRating', 'numVotes']])

#hard_category = top_300_votes.iloc[200:300]
#print(hard_category[['primaryTitle', 'averageRating', 'numVotes']])


                                     primaryTitle  averageRating  numVotes
59128                    The Shawshank Redemption            9.3   2959184
126328                            The Dark Knight            9.0   2940814
165886                                  Inception            8.8   2609684
67896                                  Fight Club            8.8   2391254
58424                                Forrest Gump            8.8   2315661
...                                           ...            ...       ...
90333       Harry Potter and the Sorcerer's Stone            7.6    877801
119989                                        300            7.6    877100
278721           Once Upon a Time... in Hollywood            7.6    876393
63740   Star Wars: Episode I - The Phantom Menace            6.5    875050
62439                            The Big Lebowski            8.1    874795

[100 rows x 3 columns]
                      primaryTitle  averageRating  numVotes
171833          

In [6]:
# Positions 200-299 of the top-300 vote ranking (the last hundred).
hard_category = top_300_votes.iloc[200:300]
print(hard_category.loc[:, ['primaryTitle', 'averageRating', 'numVotes']])

                                     primaryTitle  averageRating  numVotes
132524                                   Superbad            7.6    648176
114448  Harry Potter and the Order of the Phoenix            7.5    648171
138758                                       Argo            7.7    645729
135251                                      Taken            7.7    644708
39578                                       Rocky            8.1    643441
...                                           ...            ...       ...
235439    Fantastic Beasts and Where to Find Them            7.2    515915
49107                                     Top Gun            6.9    514941
184044                   Avatar: The Way of Water            7.5    514526
196889      The Hunger Games: Mockingjay - Part 1            6.6    513518
195159                                  Spotlight            8.1    509645

[100 rows x 3 columns]


In [7]:
# The 450 most-voted movies, ordered by descending numVotes.
top_450_votes = filtered_movies_with_ratings.nlargest(n=450, columns='numVotes')

In [8]:
# Positions 300-449 of the top-450 vote ranking (the last 150).
hard_category = top_450_votes.iloc[300:450]
print(hard_category.loc[:, ['primaryTitle', 'averageRating', 'numVotes']])

                                         primaryTitle  averageRating  numVotes
15547                           It's a Wonderful Life            8.6    508186
217774  Star Wars: Episode IX - The Rise of Skywalker            6.4    505519
158941                                        Warrior            8.1    505134
191182                                American Hustle            7.2    505076
112576                                   The Terminal            7.4    504886
...                                               ...            ...       ...
283959                                     Hereditary            7.3    399909
206840                                     About Time            7.8    399696
222952                                       Bird Box            6.6    397133
107428                                    Man on Fire            7.7    397085
105903                        Mission: Impossible III            6.9    396773

[150 rows x 3 columns]


Let's reload with the desired attributes

In [3]:

import pandas as pd
import numpy as np

# Reload each IMDb file with only the columns the quiz uses.
name_basics = pd.read_csv('data/name.basics.tsv', sep = '\t', usecols=['nconst', 'primaryName', 'primaryProfession', 'knownForTitles'])
# quoting=3 is csv.QUOTE_NONE: the '"' character is read as ordinary text.
# NOTE(review): the IMDb dumps are plain tab-separated text without CSV-style quoting,
# so with the default quoting a title that starts with '"' can absorb the following
# tabs/newlines and merge several rows into one. Confirm by comparing the row count
# against the line count of the file.
title_basics = pd.read_csv('data/title.basics.tsv', sep = '\t', usecols=['tconst', 'titleType', 'primaryTitle', 'genres'], quoting=3)
title_ratings = pd.read_csv('data/title.ratings.tsv', sep = '\t', usecols=['tconst', 'numVotes'])
title_principals = pd.read_csv('data/title.principals.tsv', sep = '\t', usecols=['tconst', 'ordering', 'nconst', 'category'])

In [4]:
# Keep movie titles and the actor/actress/director principal rows, then attach vote counts.
# NOTE(review): unlike the earlier filtering cells, no genre exclusion is applied here —
# confirm whether the excluded genres are meant to be part of the final quiz pool.
filtered_titles = title_basics.loc[title_basics['titleType'].eq('movie')]
filtered_title_principals = title_principals.loc[
    title_principals['category'].isin(['actor', 'actress', 'director'])
]
# Inner join on tconst: movies without a ratings row are dropped.
filtered_movies_with_ratings = pd.merge(
    filtered_titles,
    title_ratings,
    on='tconst',
    how='inner',
)

Three levels of difficulty: easy, medium and hard.

In [5]:

# Rank the movies by vote count and keep the 450 most-voted ones.
top_450_votes = filtered_movies_with_ratings.nlargest(n=450, columns='numVotes')

# Three consecutive blocks of 150 movies, from most-voted to least-voted:
# positions 0-149 -> easy, 150-299 -> medium, 300-449 -> hard.
easy, medium, hard = (
    top_450_votes.iloc[start:start + 150]
    for start in (0, 150, 300)
)

In [6]:
# Join relevant data for the quiz: attach each person's name_basics columns to the
# filtered principal rows (inner join on nconst, so unmatched rows are dropped).
movie_data = pd.merge(
    filtered_title_principals,
    name_basics,
    on='nconst',
    how='inner',
)

In [7]:
# Definition of points from each dificulty
def get_questions(level):
    if level == 'easy':
        movies = easy
        points = 10
    elif level == 'medium':
        movies = medium
        points = 20
    elif level == 'hard':
        movies = hard
        points = 30
    else:
        print("Invalid level selected. Please choose easy, medium, or hard.")
        return