In [43]:
#Imports
import pandas as pd
import numpy as np
import os, time, json
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

# Loading Data

In [2]:
# Public IMDb dataset endpoints (.tsv.gz files)
title_akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

# Read each tab-separated file in turn: akas first, then basics, then ratings
akas, basics, ratings = (
    pd.read_csv(source_url, sep='\t', low_memory=False)
    for source_url in (title_akas_url, title_basics_url, title_ratings_url)
)

In [3]:
# Loading in as local files after being cleaned
#akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
#basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
#ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)

In [4]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


# Checking/Fixing \N Placeholder Values

In [5]:
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      10
originalTitle     10
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [6]:
akas.isna().sum()

titleId              0
ordering             0
title                5
region             105
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [7]:
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [8]:
#Handling \N placeholder values
# The raw files carry the literal string '\N' where a value is missing (see the
# akas preview above), so isna() does not count those cells yet. Swap that
# string for NaN in each frame, in place, so the null checks that follow
# reflect the real amount of missing data.
basics.replace({'\\N':np.nan}, inplace=True)
akas.replace({'\\N':np.nan}, inplace=True)
ratings.replace({'\\N':np.nan}, inplace=True)

In [9]:
#sanity checking dataframes
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           10
originalTitle          10
isAdult                 1
startYear         1213610
endYear           9073997
runtimeMinutes    6705858
genres             416842
dtype: int64

In [10]:
akas.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1860358
language            6226450
types              27668931
attributes         32678127
isOriginalTitle        2187
dtype: int64

In [11]:
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

Ratings does not have any null values and does not need any preprocessing.

# Preprocessing Dataframes

## Basics

In [12]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9169391 entries, 0 to 9169390
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 629.6+ MB


### Eliminating movies that are null for runtimeMinutes

In [13]:
# Keep only the rows that have a runtime recorded
basics = basics[basics['runtimeMinutes'].notna()]
# Sanity check: number of null runtimes left
basics['runtimeMinutes'].isna().sum()

0

In [14]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37657
endYear           2416909
runtimeMinutes          0
genres              67320
dtype: int64

In [15]:
### Eliminating movies that are null for genre
# Keep only the rows that have at least one genre listed
basics = basics[basics['genres'].notna()]
# Sanity check: number of null genres left
basics['genres'].isna().sum()

0

### Keeping only titleType == 'movie'

In [16]:
#checking value counts
basics['titleType'].value_counts()

tvEpisode       1063316
short            574080
movie            366829
video            175009
tvMovie           88274
tvSeries          86523
tvSpecial         16456
tvMiniSeries      16055
tvShort            9376
videoGame           295
Name: titleType, dtype: int64

In [17]:
# Boolean mask: True where the title is classified exactly as 'movie'
movie_filter = basics['titleType'].eq('movie')

# Keep only the rows selected by the mask
basics = basics.loc[movie_filter]

# Sanity check: 'movie' should be the only titleType left
basics['titleType'].value_counts()

movie    366829
Name: titleType, dtype: int64

### Keeping startYear 2000-2022

In [18]:
basics['startYear'].value_counts()

2017    14186
2018    14131
2016    13819
2019    13798
2015    13324
        ...  
1894        1
1899        1
1904        1
1906        1
1896        1
Name: startYear, Length: 129, dtype: int64

In [24]:
#ensuring the dtype is int
# startYear is still an object (string) column and may contain nulls: the
# earlier drops only covered runtimeMinutes and genres, and NaN cannot be cast
# to int64. Drop the undated titles first (they could never satisfy the
# 2000-2022 filter anyway), then convert the column to a true integer dtype.
# Using DataFrame.astype with a mapping avoids assigning into a filtered slice.
basics = basics.dropna(subset=['startYear']).astype({'startYear': 'int64'})

In [25]:
#applying filter to only have movies that started between 2000-2022
# Series.between includes both endpoints by default
basics = basics[basics['startYear'].between(2000, 2022)]

In [27]:
#checking results
basics['startYear'].value_counts()

2017    14186
2018    14131
2016    13819
2019    13798
2015    13324
2014    12981
2013    12272
2021    11711
2012    11540
2020    11253
2011    10680
2010    10121
2009     9268
2008     8069
2022     7654
2007     6885
2006     6428
2005     5761
2004     5129
2003     4526
2002     4085
2001     3813
2000     3591
Name: startYear, dtype: int64

### Eliminate movies that include "Documentary" in genre

In [28]:
basics['genres'].value_counts()

Documentary                    50766
Drama                          34722
Comedy                         13098
Comedy,Drama                    6250
Horror                          5598
                               ...  
Family,Musical,Sport               1
Comedy,Game-Show                   1
Horror,Music,Mystery               1
Documentary,Sci-Fi,Thriller        1
Crime,Fantasy,Sci-Fi               1
Name: genres, Length: 1186, dtype: int64

In [29]:
# Flag every title whose genre string mentions "documentary" (any casing),
# including multi-genre entries, then keep everything that is not flagged.
is_documentary = basics['genres'].str.contains(pat='documentary', case=False)
basics = basics.loc[~is_documentary]
# Remaining genre combinations
basics['genres'].value_counts()

Drama                        34722
Comedy                       13098
Comedy,Drama                  6250
Horror                        5598
Drama,Romance                 4163
                             ...  
Biography,Family,Mystery         1
Biography,Music,Mystery          1
Crime,Music,Mystery              1
Comedy,Reality-TV,Romance        1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 967, dtype: int64

## AKAs

### Keeping only US entries

In [30]:
akas['region'].value_counts()

FR    3929827
JP    3929608
DE    3912894
IN    3857988
ES    3852970
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 246, dtype: int64

In [31]:
# Boolean mask: True for alternate-title rows whose region is 'US'
US_filter = akas['region'].eq('US')
akas = akas.loc[US_filter]

In [32]:
akas['region'].value_counts()

US    1344052
Name: region, dtype: int64

### Filtering basics to only include movies that have a US entry in akas

In [33]:
# True for each basics row whose tconst appears among the titleIds left in akas
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[keepers]
# Preview the filtered frame
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61090,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67636,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86767,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
92732,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


## Checking results of preprocessing


In [34]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82250 entries, 34790 to 9169156
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          82250 non-null  object
 1   titleType       82250 non-null  object
 2   primaryTitle    82250 non-null  object
 3   originalTitle   82250 non-null  object
 4   isAdult         82250 non-null  object
 5   startYear       82250 non-null  int64 
 6   endYear         0 non-null      object
 7   runtimeMinutes  82250 non-null  object
 8   genres          82250 non-null  object
dtypes: int64(1), object(8)
memory usage: 6.3+ MB


In [35]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1344052 entries, 5 to 32923263
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1344052 non-null  object
 1   ordering         1344052 non-null  int64 
 2   title            1344052 non-null  object
 3   region           1344052 non-null  object
 4   language         3686 non-null     object
 5   types            963565 non-null   object
 6   attributes       44795 non-null    object
 7   isOriginalTitle  1342677 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.3+ MB


In [36]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1256534 entries, 0 to 1256533
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1256534 non-null  object 
 1   averageRating  1256534 non-null  float64
 2   numVotes       1256534 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [37]:
#Creating a folder to save preprocessed dataframes in
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [38]:
#Saving dataframes to folder
# Written as gzip-compressed CSV without the index column
basics.to_csv("Data/title_basics.csv.gz", index=False, compression='gzip')
akas.to_csv("Data/title_akas.csv.gz", index=False, compression='gzip')
ratings.to_csv("Data/title_ratings.csv.gz", index=False, compression='gzip')

In [43]:
# Opening saved files and previewing again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head(2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama


In [40]:
akas.head(2)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0


In [41]:
ratings.head(2)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256


In [4]:
# Loading in as local files after being cleaned
# Read the three saved files in turn: akas first, then basics, then ratings
akas, basics, ratings = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "Data/title_akas.csv.gz",
        "Data/title_basics.csv.gz",
        "Data/title_ratings.csv.gz",
    )
)

# Part 2

## Setup to use the tmdb API

In [52]:
# Read the TMDB credentials from ~/.secret/tmdb_api.json. expanduser resolves
# "~" to the current user's home directory, so the notebook is not tied to one
# machine's absolute path.
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [53]:
#Setting the API_KEY variable to my api-key
tmdb.API_KEY =  login['api-key']

In [40]:
# Designate a folder to save API call data in
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [62]:
#Function to get the .info results with the certification added 
#of a movie

def get_movie_with_rating(movie_id):
    """Return a movie's TMDB info dict with its US certification added.

    Parameters
    ----------
    movie_id
        Identifier passed straight to ``tmdb.Movies``; the notebook calls this
        with IMDb ids such as ``'tt0848228'``.

    Returns
    -------
    dict
        The dictionary returned by ``movie.info()``. When the release list
        contains an entry whose ``iso_3166_1`` is ``'US'``, a
        ``'certification'`` key holding that entry's certification is added.
        If several US entries exist the last one wins; if none exist the key
        is not added.
    """
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # Save the .info and .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # Only the US release carries the certification we want
        if c['iso_3166_1'] == 'US':
            # Store it on the dict that is returned (the previous code wrote
            # to an undefined name `info`, raising NameError)
            movie_info['certification'] = c['certification']
    return movie_info

### Testing the Function

In [63]:
test1 = get_movie_with_rating('tt0848228')
test1

{'adult': False,
 'backdrop_path': '/nNmJRkg8wWnRmzQDe2FwKbPIsJV.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 245.974,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [61]:
test2 = get_movie_with_rating('tt0332280')
test2

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 70.325,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

In [49]:
# Function to append new results to existing JSON file
def write_json(new_data, filename):
    """Append or extend the JSON list stored in ``filename`` with ``new_data``.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data
        A single record to append, or a list of records to extend with.
    filename
        Path to an existing JSON file whose top-level value is a list.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (the file is opened in ``'r+'`` mode).
    json.JSONDecodeError
        If the existing file content is not valid JSON.
    """
    with open(filename, 'r+') as file:
        # Load the existing records
        file_data = json.load(file)
        # A list of new records is merged in; anything else is added as one item
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewrite the file from the start ...
        file.seek(0)
        json.dump(file_data, file)
        # ... and cut off any leftover bytes from the previous content, which
        # would otherwise corrupt the file when the new text is shorter
        # (e.g. the file had been pretty-printed).
        file.truncate()

In [55]:
# Results from the first two years
YEARS_TO_GET = [2000,2001]

# Failed API calls are recorded here as [movie_id, exception] so they can be
# inspected afterwards instead of disappearing silently.
errors = []

# Load in the dataframe from project part 1 as basics.
# Done once up front: the file does not change between years.
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)

# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    
    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    
    # If it does not exist: create it
    if not file_exists:
        # save a list holding one placeholder record with just "imdb_id",
        # so the file can be read back with an 'imdb_id' column
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids (kept as a Series so .isin can be used below)
    movie_ids = df['tconst'].copy()
    
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    
    # INNER Loop over the ids that still need to be fetched
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve the data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        # If it fails, remember the id and the error, then move on to the
        # next movie (best-effort collection).
        except Exception as e:
            errors.append([movie_id, e])

# Make failures visible: a large count usually means a systematic problem
print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1408 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1528 [00:00<?, ?it/s]