# Imports

In [1]:
#import libraries
import pandas as pd
import numpy as np

In [2]:
#save links
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'

# Preprocessing Basics

In [3]:
#creating Basics DF
basics_df = pd.read_csv(basics_url, sep='\t', low_memory=False)



In [4]:
#inspecting basics df
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9399297 entries, 0 to 9399296
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 645.4+ MB


In [5]:
#replacing missing values with nan
basics_df = basics_df.replace({'\\N':np.nan})
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [6]:
basics_df.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1261909
endYear           9300239
runtimeMinutes    6770303
genres             431060
dtype: int64

In [7]:
#drop nan Running Time and Genre 
basics_df = basics_df.dropna(subset=['runtimeMinutes', 'genres'])
basics_df.head()


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [8]:
#confirming dropped values
basics_df.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           97593
endYear           2513454
runtimeMinutes          0
genres                  0
dtype: int64

In [9]:
#checking titleType values for movie filter
basics_df['titleType'].value_counts()

tvEpisode       1206231
short            583951
movie            372416
video            177033
tvMovie           89146
tvSeries          87958
tvSpecial         17068
tvMiniSeries      16476
tvShort            9475
videoGame           312
Name: titleType, dtype: int64

In [10]:
#only movie types for df
basics_df= basics_df.loc[basics_df['titleType'] == 'movie']
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,,45,Drama


In [11]:
#excluding documentaries
basics_df['genres'].value_counts()

is_documentary = basics_df['genres'].str.contains('Documentary',case=False)
basics_df = basics_df[~is_documentary]



In [12]:
#only movies after 2000
basics_df['startYear'] = basics_df['startYear'].astype(float)
basics_df.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [13]:
#inspecting akas
basics_df=basics_df.loc[(basics_df['startYear']>=2000)&(basics_df['startYear']<2022)]
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34793,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61095,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67643,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77937,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86773,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9398969,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9398978,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9399017,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9399062,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"


# Preprocessing AKAS

In [14]:
#loading akas
akas_df = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [15]:
#replacing nan values
akas_df = akas_df.replace({'\\N':np.nan})
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


# Preprocessing Ratings


In [16]:
#creating Ratings DF
ratings_df = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [17]:
ratings_df = ratings_df.replace({'\\N':np.nan})
ratings_df.head()



Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1924
1,tt0000002,5.8,259
2,tt0000003,6.5,1737
3,tt0000004,5.6,174
4,tt0000005,6.2,2550


In [18]:
#filtering only us movies
us = basics_df['tconst'].isin(akas_df['titleId'])
us

34793      True
61095      True
67643      True
77937      True
86773      True
           ... 
9398969    True
9398978    True
9399017    True
9399062    True
9399146    True
Name: tconst, Length: 137188, dtype: bool

In [19]:
#saving df with only us movies
basics_df = basics_df[us]
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34793,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61095,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67643,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77937,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86773,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


# Saving Data


In [20]:
# Saving DF to Data Folder
basics_df.to_csv("DATA/title_basics.csv.gz",compression='gzip',index=False)
akas_df.to_csv("DATA/title_akas.csv.gz",compression='gzip',index=False)
ratings_df.to_csv("DATA/title_ratings.csv.gz", compression='gzip', index=False)



In [21]:
#replacing basics variable path
basics_df = pd.read_csv("DATA/title_basics.csv.gz", low_memory = False)
basics_df.head()



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [22]:
#replacing akas variable path
akas_df = pd.read_csv("DATA/title_akas.csv.gz", low_memory=False)
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0.0
1,tt0000001,2,Carmencita,DE,,,literal title,0.0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0.0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0.0


In [23]:
#replacing ratings variable path
ratings_df = pd.read_csv("DATA/title_ratings.csv.gz", low_memory=False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1924
1,tt0000002,5.8,259
2,tt0000003,6.5,1737
3,tt0000004,5.6,174
4,tt0000005,6.2,2550


#  Importing TMDB API

In [24]:
#importing json librarie
import json
with open(r"\Users\camer\.secret\timdb_api.json", "r") as f:
    login = json.load(f)
    
login.keys()

dict_keys(['api-key'])

In [25]:
#importing TMDB api
import tmdbsimple as tmdb
tmdb.API_KEY = login['api-key']

In [26]:
#test search to ensure success
tmdb.Movies(599).info()

{'adult': False,
 'backdrop_path': '/p47ihFj4A7EpBjmPHdTj4ipyq1S.jpg',
 'belongs_to_collection': None,
 'budget': 1752000,
 'genres': [{'id': 18, 'name': 'Drama'}],
 'homepage': '',
 'id': 599,
 'imdb_id': 'tt0043014',
 'original_language': 'en',
 'original_title': 'Sunset Boulevard',
 'overview': 'A hack screenwriter writes a screenplay for a former silent film star who has faded into Hollywood obscurity.',
 'popularity': 18.814,
 'poster_path': '/sC4Dpmn87oz9AuxZ15Lmip0Ftgr.jpg',
 'production_companies': [{'id': 4,
   'logo_path': '/gz66EfNoYPqHTYI4q9UEN4CbHRc.png',
   'name': 'Paramount',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1950-08-10',
 'revenue': 5000000,
 'runtime': 110,
 'spoken_languages': [{'english_name': 'English',
   'iso_639_1': 'en',
   'name': 'English'}],
 'status': 'Released',
 'tagline': 'A Hollywood Story.',
 'title': 'Sunset Boulevard',
 'video': False,
 'vote_average'

# Creating and Defining Our Two Functions

In [27]:
#creating function for movie rating
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        if c['iso_3166_1']=='US':
            info['certification'] = c['certification']
    return info

![image.png](attachment:image.png)

In [28]:
#first test
test1 = get_movie_with_rating('tt0848228')
test1

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 165.284,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [29]:
#second test 
test2 = get_movie_with_rating('tt0332280')
test2

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 70.09,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 260

In [30]:
#prepping second function
import os, time
import tmdbsimple as tmdb 
from tqdm.notebook import tqdm_notebook
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [31]:
#create function to make json files
def write_json(new_data, filename): 
  
    with open(filename,'r+') as file:
       
        file_data = json.load(file)
      
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
             file.seek(0)
       
             json.dump(file_data, file)

In [32]:
#adding info for years 2000 and 2001
basics = pd.read_csv('Data/title_basics.csv.gz')
YEARS_TO_GET = [2000,2001]
errors = []
 

In [38]:
#start ouloop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id':0}], f)
    df = basics.loc[basics['startYear']==YEAR].copy()
    movie_ids = df['tconst'].copy()
    previous_df = pd.read_json(JSON_FILE)
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    #inner loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc = f'Movies from {YEAR}',
                                  position = 1,
                                  leave = True):
        try: 
            temp = get_movie_with_ratings(movie_id)
            write_json(temp, JSON_FILE)
            time.sleep(0.02)
        except Exception as e:
            errors.append([movie_id, e])
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

print(f'total error {len(errors)}')
    
                                
                                                                 
                                                             

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/2722 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/2847 [00:00<?, ?it/s]

total error 11138


# Exploratory Data Analysis

In [44]:
df= pd.read_csv('DATA/final_tmdb_data_2001.csv.gz')
df.head()


Unnamed: 0,imdb_id
0,0


This is far as I could make it, it triple checked the code many times but my final csv's don't have anything in them. I will shedule a one on one to go over