In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb


In [2]:
import seaborn as sns
import plotly.express as px

# Title Basics DF

In [3]:
basics_url='https://datasets.imdbws.com/title.basics.tsv.gz'
basics_df = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [4]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Replace "\N" with nan
- keep only title type 'Movie'

In [5]:
basics_df = basics_df.replace({'\\N':np.nan})
basics_df['endYear'].value_counts()

2017    5888
2018    5817
2019    5700
2020    5233
2016    4624
        ... 
1906       1
2028       1
1944       1
1925       1
1935       1
Name: endYear, Length: 97, dtype: int64

In [6]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [7]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9187629 entries, 0 to 9187628
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 630.9+ MB


In [8]:
basics_df.duplicated().sum()

0

In [9]:
basics_df.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1224986
endYear           9092101
runtimeMinutes    6722451
genres             426729
dtype: int64

In [10]:
movie_filter= basics_df['titleType']=='movie'
movie_filter

0          False
1          False
2          False
3          False
4          False
           ...  
9187624    False
9187625    False
9187626    False
9187627    False
9187628    False
Name: titleType, Length: 9187629, dtype: bool

In [11]:
basics_df[movie_filter]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama
...,...,...,...,...,...,...,...,...,...
9187519,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9187546,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
9187558,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,,,Comedy
9187569,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,


In [12]:
basics_movie_df= basics_df[basics_df['titleType']=='movie']
basics_movie_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45.0,Romance
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100.0,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70.0,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90.0,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama


## Eliminate movies that are null for runtime Minutes, genres, and movies that include 'Documentary' Type. 



In [13]:
basics_movie_df = basics_movie_df.dropna(subset=['runtimeMinutes', 'genres'])

In [14]:
basics_movie_df.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           5817
endYear           367090
runtimeMinutes         0
genres                 0
dtype: int64

In [15]:
is_documentary = basics_movie_df['genres'].str.contains('documentary',case=False)
basics_movie_df = basics_movie_df[~is_documentary]

In [16]:
basics_movie_df['genres'].value_counts()

Drama                        67909
Comedy                       28906
Comedy,Drama                 10394
Drama,Romance                 9786
Horror                        7358
                             ...  
Short,Thriller                   1
Comedy,Reality-TV,Romance        1
Biography,Music,Mystery          1
Adventure,Horror,Musical         1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 1154, dtype: int64

In [17]:
basics_movie_df['titleType'].value_counts()

movie    279603
Name: titleType, dtype: int64

##keep start year 2000-2022

In [18]:
basics_movie_df['startYear'].value_counts()

2018    9578
2017    9392
2019    9307
2016    8993
2015    8541
        ... 
1906       1
1903       1
1908       1
2027       1
1894       1
Name: startYear, Length: 124, dtype: int64

In [19]:
basics_movie_df = basics_movie_df.dropna(subset=['startYear'])

In [20]:
basics_movie_df['startYear'] = basics_movie_df['startYear'].astype(int)
print (basics_movie_df.dtypes)

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int64
endYear           object
runtimeMinutes    object
genres            object
dtype: object


In [21]:
basics_movie_df.loc[basics_movie_df['startYear']>=2000]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61090,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67636,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77930,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86767,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9187301,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9187310,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9187349,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9187394,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [22]:
basics_movie_df = basics_movie_df.loc[basics_movie_df['startYear']>=2000]
basics_movie_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61090,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67636,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77930,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86767,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [23]:
basics_movie_df['startYear'].min()

2000

In [24]:
basics_movie_df['startYear'].max()

2027

In [25]:
basics_movie_df = basics_movie_df.loc[basics_movie_df['startYear']>=2000]
basics_movie_df = basics_movie_df.loc[basics_movie_df['startYear']<=2022]



In [26]:
basics_movie_df.max()

tconst                            tt9916538
titleType                             movie
primaryTitle      è solo questione di tempo
originalTitle     è solo questione di tempo
isAdult                                   1
startYear                              2022
endYear                                None
runtimeMinutes                          999
genres                              Western
dtype: object

In [27]:
basics_movie_df.min()

tconst                   tt0035423
titleType                    movie
primaryTitle      #1 Serial Killer
originalTitle     #1 Serial Killer
isAdult                          0
startYear                     2000
endYear                       None
runtimeMinutes                   1
genres                      Action
dtype: object

In [28]:
basics_movie_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61090,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67636,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77930,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86767,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [29]:
basics_movie_df['runtimeMinutes'] = basics_movie_df['runtimeMinutes'].astype(int)
print (basics_movie_df.dtypes)

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int64
endYear           object
runtimeMinutes     int64
genres            object
dtype: object


# Title AKA Dataframe

In [30]:
aka_url='https://datasets.imdbws.com/title.akas.tsv.gz'
aka_df = pd.read_csv(aka_url,sep='\t', low_memory=False)
aka_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [31]:
aka_df.shape

(33028351, 8)

In [32]:
aka_df = aka_df.replace({'\\N':np.nan})

In [33]:
aka_df.duplicated().sum()

0

In [34]:
aka_df.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1860704
language            6231980
types              27767976
attributes         32782777
isOriginalTitle        2187
dtype: int64

In [35]:
aka_df['region'].value_counts()

FR    3943876
JP    3943733
DE    3926949
IN    3872196
ES    3866955
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [36]:
aka_df['region'].value_counts()

FR    3943876
JP    3943733
DE    3926949
IN    3872196
ES    3866955
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [37]:
aka_region_filter= aka_df['region']== 'US'
aka_df[aka_region_filter]

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
33028023,tt9916702,1,Loving London: The Playground,US,,,,0
33028060,tt9916720,10,The Demonic Nun,US,,tv,,0
33028062,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
33028079,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [38]:
akas_df= aka_df[aka_df['region']=='US']
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [39]:
aka_df[aka_df['region'] == 'US']

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
33028023,tt9916702,1,Loving London: The Playground,US,,,,0
33028060,tt9916720,10,The Demonic Nun,US,,tv,,0
33028062,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
33028079,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [40]:
akas_df['region'].value_counts()

US    1345024
Name: region, dtype: int64

# Title Ratings DF

In [41]:
ratings_url='https://datasets.imdbws.com/title.ratings.tsv.gz'
ratings_df = pd.read_csv(ratings_url,sep='\t', low_memory=False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
2,tt0000003,6.5,1705
3,tt0000004,5.6,168
4,tt0000005,6.2,2519


In [42]:
ratings_df = ratings_df.replace({'\\N':np.nan})

In [43]:
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
2,tt0000003,6.5,1705
3,tt0000004,5.6,168
4,tt0000005,6.2,2519


# Filtering one dataframe based on another - basics  and akas DF.

In [44]:
keepers = basics_movie_df['tconst'].isin(akas_df['titleId'])
keepers

34790       True
61090       True
67636       True
77930      False
86767       True
           ...  
9187301     True
9187310     True
9187349    False
9187394     True
9187478    False
Name: tconst, Length: 142483, dtype: bool

In [45]:
basics = basics_movie_df[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61090,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67636,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86767,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
92732,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy
...,...,...,...,...,...,...,...,...,...
9186765,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9187161,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9187301,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9187310,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [46]:
keepers = ratings_df['tconst'].isin(akas_df['titleId'])
ratings_df = ratings_df[keepers]
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1257559    False
1257560     True
1257561    False
1257562    False
1257563    False
Name: tconst, Length: 1257564, dtype: bool

saving_clean_files

In [47]:
basics_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142483 entries, 34790 to 9187478
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          142483 non-null  object
 1   titleType       142483 non-null  object
 2   primaryTitle    142483 non-null  object
 3   originalTitle   142483 non-null  object
 4   isAdult         142483 non-null  object
 5   startYear       142483 non-null  int64 
 6   endYear         0 non-null       object
 7   runtimeMinutes  142483 non-null  int64 
 8   genres          142483 non-null  object
dtypes: int64(2), object(7)
memory usage: 10.9+ MB


In [48]:
akas_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1345024 entries, 5 to 33028095
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1345024 non-null  object
 1   ordering         1345024 non-null  int64 
 2   title            1345024 non-null  object
 3   region           1345024 non-null  object
 4   language         3692 non-null     object
 5   types            963717 non-null   object
 6   attributes       44815 non-null    object
 7   isOriginalTitle  1343649 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.4+ MB


In [49]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 475008 entries, 0 to 1257560
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         475008 non-null  object 
 1   averageRating  475008 non-null  float64
 2   numVotes       475008 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ MB


In [50]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82333 entries, 34790 to 9187394
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          82333 non-null  object
 1   titleType       82333 non-null  object
 2   primaryTitle    82333 non-null  object
 3   originalTitle   82333 non-null  object
 4   isAdult         82333 non-null  object
 5   startYear       82333 non-null  int64 
 6   endYear         0 non-null      object
 7   runtimeMinutes  82333 non-null  int64 
 8   genres          82333 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.3+ MB


In [51]:
import os
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [52]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [53]:
ratings_df = pd.read_csv("Data/title_title.title.ratings.tsv.gz", low_memory = False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
2,tt0000005,6.2,2519
3,tt0000006,5.1,173
4,tt0000007,5.4,783


In [54]:
ratings_df.to_csv("Data/title_title.title.ratings.tsv.gz",compression='gzip',index=False)

In [55]:
akas_df.to_csv("Data/title_title.title.akas.tsv.gz",compression='gzip',index=False)

In [56]:
akas_df = pd.read_csv("Data/title_title.title.akas.tsv.gz", low_memory = False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
2,tt0000005,6.2,2519
3,tt0000006,5.1,173
4,tt0000007,5.4,783


## Using  Json , API , tmdb


In [57]:
with open('/Users/daviankalopez/.secret/tmbd_api.json', 'r') as f:
    login = json.load(f)
    
    login.keys()

In [58]:
import tmdbsimple as tmdb
tmdb.API_KEY = login['api-key']

In [59]:
movie = tmdb.Movies(603)

In [60]:
tmdb.Movies(603)

<tmdbsimple.movies.Movies at 0x31d0061f0>

In [61]:
info= movie.info()
info

{'adult': False,
 'backdrop_path': '/n2nm4aZRmXyJ9LT4xQX9X6ThcP7.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 95.086,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png'

In [62]:
info['budget']

63000000

In [63]:
info['revenue']

463517383

In [64]:
info['imdb_id']

'tt0133093'

In [65]:
movie = tmdb.Movies('tt1361336')
info = movie.info()
info['budget']

50000000

In [66]:
#find certification
#example from package README
response = movie.releases()
for c in movie.countries:
    if c['iso_3166_1'] == 'US':
         print(c['certification'])
        

PG
PG
PG


In [67]:
# Get the movie object for the current id
movie = tmdb.Movies('tt1361336')
# save the .info .releases dictionaries
info = movie.info()
releases = movie.releases()
# Loop through countries in releases
for c in releases['countries']:
    # if the country abbreviation==US
    if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
       info['certification'] = c['certification']

In [68]:
#create the function
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies('tt1361336')
# save the .info .releases dictionaries
    info = movie.info()
    releases = movie.releases()
# Loop through countries in releases
    for c in releases['countries']:
    # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
           info['certification'] = c['certification']
    return info

In [69]:
id= 'tt0848228'

In [70]:
get_movie_with_rating(id)

{'adult': False,
 'backdrop_path': '/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg',
 'belongs_to_collection': None,
 'budget': 50000000,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 10751, 'name': 'Family'},
  {'id': 16, 'name': 'Animation'}],
 'homepage': 'https://www.tomandjerrymovie.com',
 'id': 587807,
 'imdb_id': 'tt1361336',
 'original_language': 'en',
 'original_title': 'Tom & Jerry',
 'overview': 'Tom the cat and Jerry the mouse get kicked out of their home and relocate to a fancy New York hotel, where a scrappy employee named Kayla will lose her job if she can’t evict Jerry before a high-class wedding at the hotel. Her solution? Hiring Tom to get rid of the pesky mouse.',
 'popularity': 151.699,
 'poster_path': '/8XZI9QZ7Pm3fVkigWJPbrXCMzjq.jpg',
 'production_companies': [{'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png',
   'name': 'Warner Bros. Pictures',
   'origin_country': 'US'},
  {'id': 8922,
   'logo_path': '/yZWehAyjfKi4KvKeg1bkJ1bm5H8.png',
   'name': 'Turner En

# Before the loops

In [71]:
FOLDER= "Data/"
os.makedirs(FOLDER, exist_ok = True)
os.listdir(FOLDER)

['final_tmdb_data_2006.csv.gz',
 'tmdb_api_results_2010.json',
 'tmdb_api_results_2006.json',
 'title_title.title.akas.tsv.gz',
 'final_tmdb_data_2008.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'tmdb_api_results_2007.json',
 'title_title.title.ratings.tsv.gz',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2010.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_2002.json',
 'final_tmdb_data_2007.csv.gz',
 'tmdb_api_results_2003.json',
 'final_tmdb_data_2009.csv.gz',
 'final_tmdb_data_2005.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'tmdb_api_results_2004.json',
 'tmdb_api_results_2008.json',
 'tmdb_api_results_2009.json',
 'final_tmdb_data_2003.csv.gz',
 'tmdb_api_results_2005.json']

In [72]:
#using 
YEARS_TO_GET = [2000,2001, 2002, 2003, 2004, 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]

In [73]:
#YEAR=2000

## Querying Movies by ID

In [74]:
#Load in the dataframe from project part 1 as basics
basics = pd.read_csv('data/title_basics.csv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [75]:
def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [76]:
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS', position = 0):
    #DEFINING the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    #CHECK IF THE FILE EXISTS    
    file_exists = os.path.isfile(JSON_FILE)

    #IF IT DOESN'T EXIST, CREATE IT.
    if file_exists == False:
        #save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id':0}],f)

    #Saving new Year as the current df
    df=basics.loc[basics['startYear']==YEAR].copy()

    #SAVING MOVIE ID'S TO LIST
    movie_ids= df['tconst'].copy()#to_list()
    movie_ids

    #LOAD EXISTING DATA FROM JSON INTO A DF CALLED 'PREVIOUS_DF'
    previous_df = pd.read_json(JSON_FILE)
    previous_df

    #Filter out any IDs that are already in the JSON_FILE
    movie_ids_to_get= movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                      desc=f'Movies from {YEAR}',
                                      position=1,
                                      leave=True):
            # Attempt to retrieve then data for the movie id
            try:
                temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
                # Append/extend results to existing file using a pre-made function
                write_json(temp,JSON_FILE)
                # Short 20 ms sleep to prevent overwhelming server
                time.sleep(0.02)

            # If it fails,  make a dict with just the id and None for certification.
            except Exception as e:
                continue

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)


YEARS:   0%|          | 0/21 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1410 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1529 [00:00<?, ?it/s]

Movies from 2002:   0%|          | 0/1518 [00:00<?, ?it/s]

Movies from 2003:   0%|          | 0/1637 [00:00<?, ?it/s]

Movies from 2004:   0%|          | 0/1843 [00:00<?, ?it/s]

Movies from 2005:   0%|          | 0/2135 [00:00<?, ?it/s]

Movies from 2006:   0%|          | 0/2356 [00:00<?, ?it/s]

Movies from 2007:   0%|          | 0/2487 [00:00<?, ?it/s]

Movies from 2008:   0%|          | 0/2837 [00:00<?, ?it/s]

Movies from 2009:   0%|          | 0/3463 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3766 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4143 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4436 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4640 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4785 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/4931 [00:00<?, ?it/s]

Movies from 2016:   0%|          | 0/5147 [00:00<?, ?it/s]

Movies from 2017:   0%|          | 0/5506 [00:00<?, ?it/s]

Movies from 2018:   0%|          | 0/5632 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/5732 [00:00<?, ?it/s]

Movies from 2020:   0%|          | 0/4871 [00:00<?, ?it/s]

# Exploratory data analysis 

In [78]:
df_2000=pd.read_csv('Data/final_tmdb_data_2000.csv.gz')
df_2000.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
2,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
3,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
4,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG


In [79]:
df_2001=pd.read_csv('Data/final_tmdb_data_2001.csv.gz')
df_2001.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
2,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
3,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
4,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG


In [80]:
df_2000.shape

(3146, 26)

In [81]:
df_2001.shape

(3059, 26)

In [83]:
df_2000['imdb_id'].value_counts()

tt1361336    3145
0               1
Name: imdb_id, dtype: int64