In [43]:
import imdb
import pandas as pd
import csv

In [44]:
data = imdb.IMDb() # How we fetch the data

In [45]:
top = data.get_top250_movies() # Getting the 250 movie objects that are the highest ranked on the website

In [47]:
top_IDs = list()
for i in range(len(top)):
    top_IDs.append(top[i].getID())
    
top_titles = list()
for i in range(len(top)):
    top_titles.append(top[i]['title'])

### Why is this next chunk (defining get_top_billing) necessary?
- The IMDb movie objects imported into the *top* variable are **not** the same as movie objects that are retrieved by calling the *get_movie()* function. The information in each movie object from the *get_top250_movies()* function is severly limited. 
- The way we access the information per each movie object is to use different keys, ie
        movie_obj['cast']
        
  this should print out a large number of people objects, each containing the person's name and information
- But the keys that are available to be used with the objects fetched by using the get_top250_movies() function are limited to the following:
        ['rating',
         'title',
         'year',
         'votes',
         'top 250 rank',
         'kind',
         'canonical title',
         'long imdb title',
         'long imdb canonical title',
         'smart canonical title',
         'smart long imdb canonical title']
         
- While the keys that are available to movies fetched by the *get_movie()* function are much more extensive:
        ['cast',
         'genres',
         'runtimes',
         'countries',
         'country codes',
         'language codes',
         'color info',
         'aspect ratio',
         'sound mix',
         'box office',
         'certificates',
         'original air date',
         'rating',
         'votes',
         'cover url',
         'plot outline',
         'languages',
         'title',
         'year',
         'kind',
         'directors',
         'writers',
         'producers',
         'composers',
         'cinematographers',
         'editors',
         'editorial department',
         'casting directors',
         'production designers',
         'art directors',
         'set decorators',
         'costume designers',
         'make up department',
         'production managers',
         'assistant directors',
         'art department',
         'sound department',
         'special effects',
         'visual effects',
         'stunts',
         'camera department',
         'animation department',
         'casting department',
         'costume departmen',
         'location management',
         'music department',
         'transportation department',
         'miscellaneous',
         'thanks',
         'akas',
         'writer',
         'director',
         'production companies',
         'distributors',
         'special effects companies',
         'other companies',
         'plot',
         'synopsis',
         'canonical title',
         'long imdb title',
         'long imdb canonical title',
         'smart canonical title',
         'smart long imdb canonical title',
         'full-size cover url']

There are very few items in this list that I desire to work with but things like director and cast are rather important to include. Instead of finding the top 250 films by hand and fetch them as Movie objects by hand, which would take a considerable amount of time, I am defining a function that will only pull the top 5 billed casts' names.

In [48]:
## Why is this necessary?

def get_top_billing(self, size = 5):
    " Get the top 5 highest billed actors"
    "must be passed a get_top250 movies object, not the original movie object"
    top_billed = []
    if(type(self) == list):
        for i in range(len(self)):
            top_billed.append(data.get_movie(self[i].getID())['cast'][0:size])
    else:
        top_billed = data.get_movie(self.getID())['cast'][0:size]
    return top_billed


In [49]:
## NOTE: This line of code takes a considerable amount of time to run
top_cast = get_top_billing(top) ## Already ran this line of code

In [54]:
with open("top_IDs.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(top_cast)
    
with open("top_titles.csv", "w", newline = "") as f:
    writer = csv.writer(f)
    writer.writerows(top_titles)
    
with open("top_IDs.csv", "w", newline = "") as f:
    writer = csv.writer(f)
    writer.writerows(top_IDs)

In [67]:
top250 = pd.DataFrame({"IDs" : top_IDs, "Titles" : top_titles})

In [73]:
top_cast

[[<Person id:0000209[http] name:_Tim Robbins_>,
  <Person id:0000151[http] name:_Morgan Freeman_>,
  <Person id:0348409[http] name:_Bob Gunton_>,
  <Person id:0006669[http] name:_William Sadler_>,
  <Person id:0000317[http] name:_Clancy Brown_>],
 [<Person id:0000008[http] name:_Marlon Brando_>,
  <Person id:0000199[http] name:_Al Pacino_>,
  <Person id:0001001[http] name:_James Caan_>,
  <Person id:0144710[http] name:_Richard S. Castellano_>,
  <Person id:0000380[http] name:_Robert Duvall_>],
 [<Person id:0000199[http] name:_Al Pacino_>,
  <Person id:0000380[http] name:_Robert Duvall_>,
  <Person id:0000473[http] name:_Diane Keaton_>,
  <Person id:0000134[http] name:_Robert De Niro_>,
  <Person id:0001030[http] name:_John Cazale_>],
 [<Person id:0000288[http] name:_Christian Bale_>,
  <Person id:0005132[http] name:_Heath Ledger_>,
  <Person id:0001173[http] name:_Aaron Eckhart_>,
  <Person id:0000323[http] name:_Michael Caine_>,
  <Person id:0350454[http] name:_Maggie Gyllenhaal_>],
 

['The Shawshank Redemption',
 'The Godfather',
 'The Godfather: Part II',
 'The Dark Knight',
 '12 Angry Men',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 'The Good, the Bad and the Ugly',
 'Fight Club',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Forrest Gump',
 'Inception',
 'Star Wars: Episode V - The Empire Strikes Back',
 'The Lord of the Rings: The Two Towers',
 'The Matrix',
 "One Flew Over the Cuckoo's Nest",
 'Goodfellas',
 'Seven Samurai',
 'Se7en',
 'City of God',
 'Life Is Beautiful',
 'The Silence of the Lambs',
 'Star Wars: Episode IV - A New Hope',
 "It's a Wonderful Life",
 'Saving Private Ryan',
 'Spirited Away',
 'The Green Mile',
 'Léon: The Professional',
 'Harakiri',
 'Interstellar',
 'The Usual Suspects',
 'The Lion King',
 'American History X',
 'Back to the Future',
 'The Pianist',
 'Modern Times',
 'Terminator 2: Judgment Day',
 'The Intouchables',
 'Psycho',
 'Gladiator',
 'City Lights',
 'The Departed',

In [187]:
top = data.get_top250_movies()

In [334]:
shawshankID = '0111161'
ironmanID = '0371746'
ironman = data.get_movie(ironmanID)
shawshank = data.get_movie(shawshankID)
top_titles = list()
top_objects = list()
top100 = top[0:100]

In [408]:
df = pd.DataFrame()
df['id'] = top_IDs
df['Title'] = top_titles
df['Year'] = top[i]['year'] for i in range(len(top))


SyntaxError: invalid syntax (<ipython-input-408-0ff9fbd57db5>, line 4)

In [304]:

    
ironman_stars = get_top_billing(ironman)

In [252]:
keys = shawshank.infoset2keys
keys
my_key = {'my_info': ['cast',
                      'title',
                     'year',
                     'directors',
                     'writers',
                     'producers']}
ironman._additional_keys()


['canonical title',
 'long imdb title',
 'long imdb canonical title',
 'smart canonical title',
 'smart long imdb canonical title',
 'full-size cover url']

In [320]:
top100_cast

with open("out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(top_cast)

In [333]:
top_IDs

[]

In [199]:
df = pd.DataFrame()
df['ID'] = top_IDs
df['Title'] = top_titles


In [201]:
df.head()

Unnamed: 0,ID,Title
0,111161,The Shawshank Redemption
1,68646,The Godfather
2,71562,The Godfather: Part II
3,468569,The Dark Knight
4,50083,12 Angry Men


In [149]:
top[0].keys()

['rating',
 'title',
 'year',
 'votes',
 'top 250 rank',
 'kind',
 'canonical title',
 'long imdb title',
 'long imdb canonical title',
 'smart canonical title',
 'smart long imdb canonical title']

AttributeError: 'IMDbHTTPAccessSystem' object has no attribute 'default_info'