In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb


In [2]:
import seaborn as sns
import plotly.express as px

# Title Basics DF

In [3]:
basics_url='https://datasets.imdbws.com/title.basics.tsv.gz'
basics_df = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [4]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Replace "\N" with nan
- keep only title type 'Movie'

In [5]:
basics_df = basics_df.replace({'\\N':np.nan})
basics_df['endYear'].value_counts()

2017    5893
2018    5826
2019    5716
2020    5245
2016    4630
        ... 
1906       1
2028       1
1944       1
1925       1
1935       1
Name: endYear, Length: 97, dtype: int64

In [6]:
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [7]:
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9204231 entries, 0 to 9204230
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 632.0+ MB


In [8]:
basics_df.duplicated().sum()

0

In [9]:
basics_df.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1225777
endYear           9108442
runtimeMinutes    6735442
genres             426954
dtype: int64

In [10]:
movie_filter= basics_df['titleType']=='movie'
movie_filter

0          False
1          False
2          False
3          False
4          False
           ...  
9204226    False
9204227    False
9204228    False
9204229    False
9204230    False
Name: titleType, Length: 9204231, dtype: bool

In [11]:
basics_df[movie_filter]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama
...,...,...,...,...,...,...,...,...,...
9204121,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9204148,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
9204160,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,,,Comedy
9204171,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,


In [12]:
basics_movie_df= basics_df[basics_df['titleType']=='movie']
basics_movie_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45.0,Romance
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100.0,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70.0,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90.0,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama


## Eliminate movies that are null for runtime Minutes, genres, and movies that include 'Documentary' Type. 



In [13]:
basics_movie_df = basics_movie_df.dropna(subset=['runtimeMinutes', 'genres'])

In [14]:
basics_movie_df.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           5852
endYear           367542
runtimeMinutes         0
genres                 0
dtype: int64

In [15]:
is_documentary = basics_movie_df['genres'].str.contains('documentary',case=False)
basics_movie_df = basics_movie_df[~is_documentary]

In [16]:
basics_movie_df['genres'].value_counts()

Drama                        68020
Comedy                       28925
Comedy,Drama                 10412
Drama,Romance                 9794
Horror                        7372
                             ...  
Short,Thriller                   1
Comedy,Reality-TV,Romance        1
Biography,Music,Mystery          1
Adventure,Horror,Musical         1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 1157, dtype: int64

In [17]:
basics_movie_df['titleType'].value_counts()

movie    279956
Name: titleType, dtype: int64

##keep start year 2000-2022

In [18]:
basics_movie_df['startYear'].value_counts()

2018    9580
2017    9399
2019    9314
2016    8996
2015    8544
        ... 
1906       1
1903       1
1908       1
2027       1
1894       1
Name: startYear, Length: 124, dtype: int64

In [19]:
basics_movie_df = basics_movie_df.dropna(subset=['startYear'])

In [20]:
basics_movie_df['startYear'] = basics_movie_df['startYear'].astype(int)
print (basics_movie_df.dtypes)

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int64
endYear           object
runtimeMinutes    object
genres            object
dtype: object


In [21]:
basics_movie_df.loc[basics_movie_df['startYear']>=2000]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34791,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61092,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67638,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77932,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86769,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9203903,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9203912,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9203951,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9203996,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [22]:
basics_movie_df = basics_movie_df.loc[basics_movie_df['startYear']>=2000]
basics_movie_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34791,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61092,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67638,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77932,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86769,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [23]:
basics_movie_df['startYear'].min()

2000

In [24]:
basics_movie_df['startYear'].max()

2027

In [25]:
basics_movie_df = basics_movie_df.loc[basics_movie_df['startYear']>=2000]
basics_movie_df = basics_movie_df.loc[basics_movie_df['startYear']<=2022]



In [26]:
basics_movie_df.max()

tconst                            tt9916538
titleType                             movie
primaryTitle      è solo questione di tempo
originalTitle     è solo questione di tempo
isAdult                                   1
startYear                              2022
endYear                                None
runtimeMinutes                          999
genres                              Western
dtype: object

In [27]:
basics_movie_df.min()

tconst                   tt0035423
titleType                    movie
primaryTitle      #1 Serial Killer
originalTitle     #1 Serial Killer
isAdult                          0
startYear                     2000
endYear                       None
runtimeMinutes                   1
genres                      Action
dtype: object

In [28]:
basics_movie_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34791,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61092,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67638,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77932,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86769,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [29]:
basics_movie_df['runtimeMinutes'] = basics_movie_df['runtimeMinutes'].astype(int)
print (basics_movie_df.dtypes)

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int64
endYear           object
runtimeMinutes     int64
genres            object
dtype: object


# Title AKA Dataframe

In [30]:
aka_url='https://datasets.imdbws.com/title.akas.tsv.gz'
aka_df = pd.read_csv(aka_url,sep='\t', low_memory=False)
aka_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [31]:
aka_df.shape

(33097846, 8)

In [32]:
aka_df = aka_df.replace({'\\N':np.nan})

In [33]:
aka_df.duplicated().sum()

0

In [34]:
aka_df.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1861546
language            6242523
types              27826707
attributes         32851584
isOriginalTitle        2187
dtype: int64

In [35]:
aka_df['region'].value_counts()

JP    3951923
FR    3951792
DE    3934949
IN    3880396
ES    3874828
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [36]:
aka_df['region'].value_counts()

JP    3951923
FR    3951792
DE    3934949
IN    3880396
ES    3874828
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [37]:
aka_region_filter= aka_df['region']== 'US'
aka_df[aka_region_filter]

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
33097518,tt9916702,1,Loving London: The Playground,US,,,,0
33097555,tt9916720,10,The Demonic Nun,US,,tv,,0
33097557,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
33097574,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [38]:
akas_df= aka_df[aka_df['region']=='US']
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [39]:
aka_df[aka_df['region'] == 'US']

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
33097518,tt9916702,1,Loving London: The Playground,US,,,,0
33097555,tt9916720,10,The Demonic Nun,US,,tv,,0
33097557,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
33097574,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


In [40]:
akas_df['region'].value_counts()

US    1347304
Name: region, dtype: int64

# Title Ratings DF

In [41]:
ratings_url='https://datasets.imdbws.com/title.ratings.tsv.gz'
ratings_df = pd.read_csv(ratings_url,sep='\t', low_memory=False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1909
1,tt0000002,5.8,256
2,tt0000003,6.5,1710
3,tt0000004,5.6,169
4,tt0000005,6.2,2525


In [42]:
ratings_df = ratings_df.replace({'\\N':np.nan})

In [43]:
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1909
1,tt0000002,5.8,256
2,tt0000003,6.5,1710
3,tt0000004,5.6,169
4,tt0000005,6.2,2525


# Filtering one dataframe based on another - basics  and akas DF.

In [44]:
keepers = basics_movie_df['tconst'].isin(akas_df['titleId'])
keepers

34791       True
61092       True
67638       True
77932      False
86769       True
           ...  
9203903     True
9203912     True
9203951    False
9203996     True
9204080    False
Name: tconst, Length: 142674, dtype: bool

In [45]:
basics = basics_movie_df[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34791,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61092,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67638,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86769,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93905,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9203367,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9203763,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9203903,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9203912,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [46]:
keepers = ratings_df['tconst'].isin(akas_df['titleId'])
ratings_df = ratings_df[keepers]
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1258871    False
1258872     True
1258873    False
1258874    False
1258875    False
Name: tconst, Length: 1258876, dtype: bool

saving_clean_files

In [47]:
basics_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142674 entries, 34791 to 9204080
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          142674 non-null  object
 1   titleType       142674 non-null  object
 2   primaryTitle    142674 non-null  object
 3   originalTitle   142674 non-null  object
 4   isAdult         142674 non-null  object
 5   startYear       142674 non-null  int64 
 6   endYear         0 non-null       object
 7   runtimeMinutes  142674 non-null  int64 
 8   genres          142674 non-null  object
dtypes: int64(2), object(7)
memory usage: 10.9+ MB


In [48]:
akas_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1347304 entries, 5 to 33097590
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1347304 non-null  object
 1   ordering         1347304 non-null  int64 
 2   title            1347304 non-null  object
 3   region           1347304 non-null  object
 4   language         3702 non-null     object
 5   types            963869 non-null   object
 6   attributes       44877 non-null    object
 7   isOriginalTitle  1345929 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.5+ MB


In [49]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 475857 entries, 0 to 1258872
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         475857 non-null  object 
 1   averageRating  475857 non-null  float64
 2   numVotes       475857 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ MB


In [50]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82423 entries, 34791 to 9203996
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          82423 non-null  object
 1   titleType       82423 non-null  object
 2   primaryTitle    82423 non-null  object
 3   originalTitle   82423 non-null  object
 4   isAdult         82423 non-null  object
 5   startYear       82423 non-null  int64 
 6   endYear         0 non-null      object
 7   runtimeMinutes  82423 non-null  int64 
 8   genres          82423 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.3+ MB


In [51]:
import os
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [52]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [53]:
ratings_df = pd.read_csv("Data/title_title.title.ratings.tsv.gz", low_memory = False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
2,tt0000005,6.2,2519
3,tt0000006,5.1,173
4,tt0000007,5.4,783


In [54]:
ratings_df.to_csv("Data/title_title.title.ratings.tsv.gz",compression='gzip',index=False)

In [55]:
akas_df.to_csv("Data/title_title.title.akas.tsv.gz",compression='gzip',index=False)

In [184]:
akas_df = pd.read_csv("Data/title_title.title.akas.tsv.gz", low_memory = False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
2,tt0000005,6.2,2519
3,tt0000006,5.1,173
4,tt0000007,5.4,783


# Using  Json , API , tmdb


In [193]:
with open('/users/daviankalopez/.secret/tmdb_api.json') as f:
    login = json.load(f)
    
    login.keys()

In [194]:
import tmdbsimple as tmdb
tmdb.API_KEY = login['api-key']

In [195]:
movie = tmdb.Movies(603)

In [196]:
tmdb.Movies(603)

<tmdbsimple.movies.Movies at 0x2bce0e220>

In [197]:
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/n2nm4aZRmXyJ9LT4xQX9X6ThcP7.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 68.051,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png'

In [199]:
info['budget']

63000000

In [200]:
info['revenue']

463517383

In [201]:
info['imdb_id']

'tt0133093'

In [202]:
#find certification
#example from package README
response = movie.releases()
for c in movie.countries:
    if c['iso_3166_1'] == 'US':
         print(c['certification'])
        

R
R


# Get the movie object for the current id
movie = tmdb.Movies()
# save the .info .releases dictionaries
info = movie.info()
releases = movie.releases()
# Loop through countries in releases
for c in releases['countries']:
    # if the country abbreviation==US
    if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
       info['certification'] = c['certification']

In [214]:
#create the function
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
# save the .info .releases dictionaries
    info = movie.info()
    releases = movie.releases()
# Loop through countries in releases
    for c in releases['countries']:
    # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
           info['certification'] = c['certification']
    return info

In [215]:
test = get_movie_with_rating('tt0848228') #put your function name here
test


{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 196.797,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

# Before the loops

In [216]:
FOLDER= "Data/"
os.makedirs(FOLDER, exist_ok = True)
os.listdir(FOLDER)

['tmdb_api_results_2010.json',
 'final_tmdb_data_2018.csv.gz',
 'tmdb_api_results_2006.json',
 'title_title.title.akas.tsv.gz',
 'final_tmdb_data_2016.csv.gz',
 'tmdb_api_results_2007.json',
 'final_tmdb_data_2020.csv.gz',
 'tmdb_api_results_2011.json',
 'tmdb_api_results_2020.json',
 'title_title.title.ratings.tsv.gz',
 'tmdb_api_results_2016.json',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'tmdb_api_results_2001.json',
 'final_tmdb_data_2002.csv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_2017.json',
 'tmdb_api_results_2018.json',
 'tmdb_api_results_2002.json',
 'final_tmdb_data_2019.csv.gz',
 'tmdb_api_results_2014.json',
 'final_tmdb_data_2015.csv.gz',
 'tmdb_api_results_2015.json',
 'tmdb_api_results_2003.json',
 'final_tmdb_data_2017.csv.gz',
 'tmdb_api_results_2019.json',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'tmdb_api_results_2004.json',
 'tmdb_api_results_2012.json',
 'tmdb_api_results_2008.json',
 'tmdb_api_results_2009.json',
 'tmd

In [234]:
#using 
#YEARS_TO_GET = [2003,2004 ,2005 , 2006, 2007 , 2008, 2009, 2010]

In [241]:
#YEARS_TO_GET = [2011,2012,2013,2014,2015,2016, 2017, 2018, 2019, 2020]
YEARS_TO_GET = [2020]

## Querying Movies by ID

In [230]:
#Load in the dataframe from project part 1 as basics
basics = pd.read_csv('data/title_basics.csv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [231]:
def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [242]:
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS', position = 0):
    #DEFINING the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    #CHECK IF THE FILE EXISTS    
    file_exists = os.path.isfile(JSON_FILE)

    #IF IT DOESN'T EXIST, CREATE IT.
    if file_exists == False:
        #save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id':0}],f)

    #Saving new Year as the current df
    df=basics.loc[basics['startYear']==YEAR].copy()

    #SAVING MOVIE ID'S TO LIST
    movie_ids= df['tconst'].copy()#to_list()
    movie_ids

    #LOAD EXISTING DATA FROM JSON INTO A DF CALLED 'PREVIOUS_DF'
    previous_df = pd.read_json(JSON_FILE)
    previous_df

    #Filter out any IDs that are already in the JSON_FILE
    movie_ids_to_get= movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                      desc=f'Movies from {YEAR}',
                                      position=1,
                                      leave=True):
            # Attempt to retrieve then data for the movie id
            try:
                temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
                # Append/extend results to existing file using a pre-made function
                write_json(temp,JSON_FILE)
                # Short 20 ms sleep to prevent overwhelming server
                time.sleep(0.02)

            # If it fails,  make a dict with just the id and None for certification.
            except Exception as e:
                #print(e)
                continue

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)


YEARS:   0%|          | 0/1 [00:00<?, ?it/s]

Movies from 2020:   0%|          | 0/2288 [00:00<?, ?it/s]

# Exploratory data analysis 

In [243]:
df_2000=pd.read_csv('Data/final_tmdb_data_2000.csv.gz')
df_2000.tail()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
4347,tt6174238,0.0,,"{'id': 404302, 'name': 'Cold War Collection', ...",0.0,"[{'id': 80, 'name': 'Crime'}]",,223878.0,cn,冷战,...,0.0,0.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,,Cold War,0.0,2.0,2.0,
4348,tt7029820,0.0,,,0.0,[],,604889.0,en,Scream For Christmas,...,0.0,80.0,[],Released,,Scream For Christmas,0.0,0.0,0.0,
4349,tt7197642,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,872676.0,en,"Goodbye, Merry-Go-Round",...,0.0,90.0,[],Released,,"Goodbye, Merry-Go-Round",0.0,0.0,0.0,
4350,tt7631368,0.0,/sF0gUHE0YzZNXYugTB2LFxJIppf.jpg,,10000000.0,"[{'id': 27, 'name': 'Horror'}]",,97186.0,fr,"I, Vampire",...,0.0,85.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,"I, Vampire",0.0,6.4,4.0,NR
4351,tt7802790,0.0,/etxml2M8GQzb31jcH0cdA489WCX.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",,610992.0,fa,مومیایی ۳,...,0.0,98.0,"[{'english_name': 'Persian', 'iso_639_1': 'fa'...",Released,,The Mummy 3,0.0,7.0,1.0,


In [244]:
df_2001 = pd.read_csv('Data/final_tmdb_data_2001.csv.gz')
df_2001.tail()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
4341,tt7797670,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}]",,956214.0,en,Edmund Kemper Part 2: La Mort C'est La Vie,...,0.0,91.0,[],Released,,Edmund Kemper Part 2: La Mort C'est La Vie,0.0,0.0,0.0,
4342,tt7797790,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}]",,956219.0,en,Edmund Kemper Part 3: La mort sévit,...,0.0,72.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,,Edmund Kemper Part 3: La mort sévit,0.0,0.0,0.0,
4343,tt8665056,0.0,,,0.0,"[{'id': 37, 'name': 'Western'}]",http://skeletoncreekproductions.com/p-movie-br...,885436.0,en,Guns Along The Bravo,...,0.0,85.0,[],Released,Evil came to the Southwest until three blazing...,Guns Along The Bravo,0.0,0.0,0.0,
4344,tt8795764,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}]",https://www.utahwolf.com/films/coming-soon-new...,871624.0,en,New Breed,...,0.0,57.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,New Breed,0.0,0.0,0.0,NR
4345,tt9071078,0.0,,,0.0,"[{'id': 28, 'name': 'Action'}]",http://www.hkcinemagic.com/en/movie.asp?id=6627,201706.0,cn,致命密函,...,0.0,90.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,,Chinese Heroes,0.0,3.0,2.0,


In [246]:
df_2002 = pd.read_csv('Data/final_tmdb_data_2002.csv.gz')
df_2002.tail()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
4264,tt6449044,0.0,/a9pkw8stijESGx1flSGPqcXLkHu.jpg,"{'id': 957260, 'name': 'The Conman Collection'...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,314105.0,cn,賭俠2002,...,0.0,97.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,,The Conman 2002,0.0,6.0,2.0,
4265,tt6694126,0.0,/sXjVpTZyDvwzPVZve3AmyCUBeHk.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,819174.0,fa,عروس خوش‌قدم,...,0.0,101.0,"[{'english_name': 'Persian', 'iso_639_1': 'fa'...",Released,,The Lucky Bride,0.0,0.0,0.0,
4266,tt8302928,0.0,,,0.0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",,866533.0,el,Movie Toons: Treasure Island,...,0.0,0.0,[],Released,,Movie Toons: Treasure Island,0.0,0.0,0.0,
4267,tt8474326,0.0,,,0.0,[],,292027.0,en,Skin Eating Jungle Vampires,...,0.0,0.0,[],Released,,Skin Eating Jungle Vampires,0.0,0.0,0.0,
4268,tt8825252,0.0,,,0.0,[],,989195.0,en,Circle of Fire: The Dark Lord Kylnor,...,0.0,45.0,[],Released,,Circle of Fire: The Dark Lord Kylnor,0.0,0.0,0.0,


In [249]:
basics.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [256]:
test_df = [df_2000, df_2001]
df = pd.concat(test_df)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8698 entries, 0 to 4345
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                8698 non-null   object 
 1   adult                  8696 non-null   float64
 2   backdrop_path          7498 non-null   object 
 3   belongs_to_collection  197 non-null    object 
 4   budget                 8696 non-null   float64
 5   genres                 8696 non-null   object 
 6   homepage               6377 non-null   object 
 7   id                     8696 non-null   float64
 8   original_language      8696 non-null   object 
 9   original_title         8696 non-null   object 
 10  overview               8646 non-null   object 
 11  popularity             8696 non-null   float64
 12  poster_path            8403 non-null   object 
 13  production_companies   8696 non-null   object 
 14  production_countries   8696 non-null   object 
 15  rele

In [257]:
df.to_csv("Data/Two_Years_tmdb_combined_data.csv.gz", compression="gzip", index=False)

In [263]:
financials = df[['title', 'budget', 'revenue']]
financial_filter = financials.groupby(['budget', 'revenue'])
financials.head()

Unnamed: 0,title,budget,revenue
0,,,
1,Tom & Jerry,50000000.0,132000000.0
2,Tom & Jerry,50000000.0,132000000.0
3,Tom & Jerry,50000000.0,132000000.0
4,Tom & Jerry,50000000.0,132000000.0


In [265]:
budget_fil = df['budget'] > 0
revenue_fil = df['revenue'] >0

In [267]:
financials= df.loc[budget_fil | revenue_fil, :]
financials

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
1,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
2,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
3,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
4,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
5,tt1361336,0.0,/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg,,50000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",https://www.tomandjerrymovie.com,587807.0,en,Tom & Jerry,...,132000000.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Best of enemies. Worst of friends.,Tom & Jerry,0.0,6.975,2069.0,PG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4285,tt0445841,0.0,,,12500000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,233308.0,zh,一个烂赌的传说,...,123021750.0,94.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,,A Gambler's Story,0.0,6.800,3.0,
4301,tt0867181,0.0,,,5000.0,"[{'id': 53, 'name': 'Thriller'}]",,749151.0,en,Patient,...,0.0,88.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Only a miracle can save Sean now,Patient,0.0,0.000,0.0,
4303,tt1039952,0.0,/yTGmGjAd0gfpVK0ezoqWgx7AJkp.jpg,,500000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,69399.0,ta,சிட்டிசன்,...,500000.0,157.0,"[{'english_name': 'Tamil', 'iso_639_1': 'ta', ...",Released,,Citizen,0.0,6.500,12.0,
4315,tt1764172,0.0,,,250000.0,"[{'id': 53, 'name': 'Thriller'}]",,228977.0,en,Among Thieves,...,0.0,0.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Among Thieves,0.0,0.000,0.0,


In [268]:
financials.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6825 entries, 1 to 4326
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                6825 non-null   object 
 1   adult                  6825 non-null   float64
 2   backdrop_path          6724 non-null   object 
 3   belongs_to_collection  102 non-null    object 
 4   budget                 6825 non-null   float64
 5   genres                 6825 non-null   object 
 6   homepage               6283 non-null   object 
 7   id                     6825 non-null   float64
 8   original_language      6825 non-null   object 
 9   original_title         6825 non-null   object 
 10  overview               6822 non-null   object 
 11  popularity             6825 non-null   float64
 12  poster_path            6804 non-null   object 
 13  production_companies   6825 non-null   object 
 14  production_countries   6825 non-null   object 
 15  rele

In [270]:
fig, ax = plt.subplots()
valid_financials['budget'].hist(alpha =0.5)
valid_financials['revenue'].hist(alpha = 0.5)
ax.set_xlabel("Amount budget or revenue in $")
ax.set_ylabel("Count")


NameError: name 'plt' is not defined