In [1]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [4]:
print(basics.columns)

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')


In [5]:
# Title Basics
basics.replace(r'\\N', np.nan, inplace=True)
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])
basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')  # Convert 'startYear' to numeric
basics = basics[(basics['titleType'] == 'movie') & (basics['startYear'].between(2000, 2022))]
basics = basics[~basics['genres'].str.contains('Documentary', na=False)]

In [6]:
print(akas.columns)

Index(['titleId', 'ordering', 'title', 'region', 'language', 'types',
       'attributes', 'isOriginalTitle'],
      dtype='object')


In [7]:
# AKAs
akas = akas[akas['region'] == 'US']
akas.replace(r'\\N', np.nan, inplace=True)

In [8]:
print(ratings.columns)

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')


In [9]:
# Ratings
ratings.replace(r'\\N', np.nan, inplace=True)
ratings = ratings[ratings['tconst'].isin(akas['titleId'])]

In [10]:
print("Basics DataFrame Info:")
basics.info()

Basics DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 202717 entries, 11636 to 10282958
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          202717 non-null  object 
 1   titleType       202717 non-null  object 
 2   primaryTitle    202717 non-null  object 
 3   originalTitle   202717 non-null  object 
 4   isAdult         202717 non-null  object 
 5   startYear       202717 non-null  float64
 6   endYear         202717 non-null  object 
 7   runtimeMinutes  202717 non-null  object 
 8   genres          202717 non-null  object 
dtypes: float64(1), object(8)
memory usage: 15.5+ MB


In [11]:
print("AKAs DataFrame Info:")
akas.info()

AKAs DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1474948 entries, 5 to 37653297
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1474948 non-null  object
 1   ordering         1474948 non-null  int64 
 2   title            1474948 non-null  object
 3   region           1474948 non-null  object
 4   language         1474948 non-null  object
 5   types            1474948 non-null  object
 6   attributes       1474948 non-null  object
 7   isOriginalTitle  1474948 non-null  object
dtypes: int64(1), object(7)
memory usage: 101.3+ MB


In [12]:
print("Ratings DataFrame Info:")
ratings.info()

Ratings DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 513266 entries, 0 to 1365844
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         513266 non-null  object 
 1   averageRating  513266 non-null  float64
 2   numVotes       513266 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.7+ MB


In [13]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

11636       False
15176       False
34800        True
61111        True
67485        True
            ...  
10282784     True
10282816     True
10282868    False
10282948    False
10282958    False
Name: tconst, Length: 202717, dtype: bool

In [14]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,\N,118,"Comedy,Fantasy,Romance"
61111,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,\N,70,Drama
67485,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,\N,90,Drama
67663,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,\N,122,Drama
80548,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008.0,\N,94,Horror
...,...,...,...,...,...,...,...,...,...
10282551,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019.0,\N,97,"Comedy,Drama,Fantasy"
10282691,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,\N,51,Drama
10282700,tt9916190,movie,Safeguard,Safeguard,0,2020.0,\N,95,"Action,Adventure,Thriller"
10282784,tt9916362,movie,Coven,Akelarre,0,2020.0,\N,92,"Drama,History"


In [15]:
os.makedirs("Data", exist_ok=True)

In [16]:
basics.to_csv("Data/basics_cleaned.csv", index=False, compression="gzip")
akas.to_csv("Data/akas_cleaned.csv", index=False, compression="gzip")
ratings.to_csv("Data/ratings_cleaned.csv", index=False, compression="gzip")

In [17]:
import os, time,json
import tmdbsimple as tmdb 
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'akas_cleaned.csv',
 'basics_cleaned.csv',
 'ratings_cleaned.csv']

In [18]:
with open('/Users/om88920/.secret/tmdb_api.json', 'r') as f:

    login = json.load(f)

tmdb.API_KEY =  login['api-key']

In [19]:
def get_movie_with_rating(movie_id):
    #get movie object for current id
    movie = tmdb.Movies(movie_id)

    #save .info .release dicts
    info = movie.info()

    releases = movie.releases()

    # loop through countries in releases
    for c in releases['countries']:
        # if thr country abbr ==US
        if c['iso_3166_1'] =='US':
            # save a certification key in the info
            info['certifcation'] = c['certification']

    return info

In [20]:
test = get_movie_with_rating("tt0848228") #put your function name here
test

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 122.648,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [21]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [22]:
YEARS_TO_GET = [2000,2001]

In [23]:
errors = [ ]

In [24]:
from tqdm.notebook import tqdm_notebook
!pip install tqdm



In [25]:
# OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    # If it does not exist: create it
    if file_exists == False:
    # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1859 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/2013 [00:00<?, ?it/s]

In [26]:
print(f"- Total errors: {len(errors)}")

- Total errors: 1069


In [30]:
import glob

# Load CSV files for each year and concatenate them into one DataFrame
files = glob.glob('Data/final_tmdb_data_2000.csv.gz')
df_list = []

for file in files:
    df = pd.read_csv(file, compression='gzip')
    df_list.append(df)

# Concatenate all DataFrames into one
movies_df = pd.concat(df_list, ignore_index=True)

In [31]:
# Exclude movies with budget and revenue both equal to 0
valid_movies_df = movies_df[(movies_df['budget'] > 0) | (movies_df['revenue'] > 0)]

# Count the number of movies with valid financial information
valid_movies_count = len(valid_movies_df)
print(f'Number of movies with valid financial information: {valid_movies_count}')

Number of movies with valid financial information: 304


In [34]:
# Count movies in each certification category
certification_counts = valid_movies_df['certifcation'].value_counts()
print('Number of movies in each certification category:')
print(certification_counts)

Number of movies in each certification category:
R        107
PG-13     63
PG        18
NR        11
G          8
Name: certifcation, dtype: int64


In [36]:
# Calculate average revenue and budget per certification category
average_revenue_by_certification = valid_movies_df.groupby('certifcation')['revenue'].mean()
average_budget_by_certification = valid_movies_df.groupby('certifcation')['budget'].mean()

print('Average revenue per certification category:')
print(average_revenue_by_certification)

print('Average budget per certification category:')
print(average_budget_by_certification)

Average revenue per certification category:
certifcation
G        1.053431e+08
NR       2.655907e+07
PG       7.673264e+07
PG-13    9.991738e+07
R        2.510319e+07
Name: revenue, dtype: float64
Average budget per certification category:
certifcation
G        4.500000e+07
NR       1.594383e+07
PG       4.555000e+07
PG-13    4.564706e+07
R        1.606491e+07
Name: budget, dtype: float64


In [None]:
tmdb_results.to_csv('tmdb_results_combined.csv.gz', compression='gzip', index=False)

In [39]:
# Load JSON data from both years into DataFrames
data_2000 = pd.read_json('Data/tmdb_api_results_2000.json')
data_2001 = pd.read_json('Data/tmdb_api_results_2001.json')

# Concatenate DataFrames to merge the data
merged_data = pd.concat([data_2000, data_2001], ignore_index=True)

# Save merged data as a compressed CSV file
merged_data.to_csv('Data/tmdb_results_combined.csv.gz', compression='gzip', index=False)