In [43]:
import imdb
import pandas as pd
import csv

In [44]:
data = imdb.IMDb() # IMDb access object; the top-250 list and the per-movie fetches below all go through it

In [45]:
top = data.get_top250_movies() # The 250 highest-ranked movie objects; these expose only a limited set of keys (title, year, rating, ...), not 'cast'

In [112]:
# Pull the ID, title and year out of every top-250 movie object into
# three parallel lists (same order as `top`).
top_IDs = [movie.getID() for movie in top]

top_titles = [movie['title'] for movie in top]

top_years = [movie['year'] for movie in top]

### Why is this next chunk (defining get_top_billing) necessary?
- The IMDb movie objects imported into the *top* variable are **not** the same as movie objects that are retrieved by calling the *get_movie()* function. The information in each movie object from the *get_top250_movies()* function is severely limited.
- We access the information in each movie object by key, e.g.
        movie_obj['cast']
        
  this should print out a large number of person objects, each containing the person's name and information
- But the keys that are available to be used with the objects fetched by using the get_top250_movies() function are limited to the following:
        ['rating',
         'title',
         'year',
         'votes',
         'top 250 rank',
         'kind',
         'canonical title',
         'long imdb title',
         'long imdb canonical title',
         'smart canonical title',
         'smart long imdb canonical title']
         
- While the keys that are available to movies fetched by the *get_movie()* function are much more extensive:
        ['cast',
         'genres',
         'runtimes',
         'countries',
         'country codes',
         'language codes',
         'color info',
         'aspect ratio',
         'sound mix',
         'box office',
         'certificates',
         'original air date',
         'rating',
         'votes',
         'cover url',
         'plot outline',
         'languages',
         'title',
         'year',
         'kind',
         'directors',
         'writers',
           ... etc

There are very few items in this list that I want to work with, but fields such as director and cast are important to include. Instead of looking up each of the top 250 films and fetching it as a Movie object by hand, which would take a considerable amount of time, I am defining a function that pulls only the top 5 billed cast members for each film.

In [48]:
## Why is this necessary?

def get_top_billing(self, size = 5, client = None):
    """Return the first ``size`` cast entries for one movie or a list of movies.

    Each movie is re-fetched by ID through ``client.get_movie`` and the
    ``'cast'`` entry of the fetched result is sliced. This is what makes the
    function usable with the objects returned by ``get_top250_movies()``.

    Parameters
    ----------
    self : movie object or list of movie objects
        Anything exposing ``getID()``, or a list (or list subclass) of such
        objects.
    size : int, default 5
        Maximum number of cast entries to keep per movie.
    client : optional
        Object providing ``get_movie(movie_id)``. When omitted, the
        module-level ``data`` accessor is used.

    Returns
    -------
    list
        For a single movie: a list of up to ``size`` cast entries.
        For a list of movies: one such list per movie, in input order
        (an empty input list yields an empty list).
    """
    if client is None:
        client = data  # fall back to the notebook-wide IMDb accessor

    def _billing(movie):
        # One fetch per movie; the slice simply returns fewer entries when
        # the cast list is shorter than `size`.
        return client.get_movie(movie.getID())['cast'][0:size]

    # isinstance (rather than an exact type comparison) also accepts list subclasses.
    if isinstance(self, list):
        return [_billing(movie) for movie in self]
    return _billing(self)


In [49]:
## NOTE: This line of code takes a considerable amount of time to run:
## get_top_billing calls data.get_movie once for every movie in `top`.
top_cast = get_top_billing(top) ## default size=5: one list of up to 5 cast entries per movie

In [114]:
# Persist the fetched data, one CSV file per list, one row per movie.
# The cast list gets its own file: writing it to "top_IDs.csv" would be
# overwritten by the IDs export below and the cast data would be lost.
# UTF-8 is set explicitly so the output does not depend on the platform's
# default encoding (some names contain non-ASCII characters).
with open("top_cast.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(top_cast)

# csv.writer.writerows expects an iterable of rows. A bare string would be
# treated as a row of single characters, so each value is wrapped in its
# own one-element row.
with open("top_titles.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows([title] for title in top_titles)

with open("top_IDs.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows([movie_id] for movie_id in top_IDs)

In [115]:
# Reduce every cast entry to its 'name' value: one list of names per movie.
# Slicing to the first 5 entries (instead of indexing 0..4) keeps the
# five-name cap while not raising IndexError for a movie whose fetched cast
# list has fewer than 5 entries.
top_cast_name = [
    [person['name'] for person in cast[:5]]
    for cast in top_cast
]

In [116]:
# One row per movie, one column per cast name; columns are auto-numbered here and renamed later.
cast_df = pd.DataFrame(top_cast_name)

In [117]:
# Base table: one row per movie built from the three parallel lists (ID, title, year).
top250 = pd.DataFrame({"IDs" : top_IDs, "Titles" : top_titles, "Year": top_years})

In [118]:
# Append the cast-name columns side by side (axis=1), aligned on the row index.
# NOTE: this rebinds top250, so the cell is not idempotent — re-running it
# without first re-creating top250 appends the cast columns a second time.
top250 = pd.concat([top250, cast_df], axis=1)

In [119]:
# Final column names: the 3 base columns followed by the 5 cast columns.
# The list length must equal the number of columns in top250.
top250.columns = ["ID", "Title", "Year", "Star1", "Star2", "Star3", "Star4", "Star5"]

Unnamed: 0,ID,Title,Year,Star1,Star2,Star3,Star4,Star5
0,0111161,The Shawshank Redemption,1994,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,Clancy Brown
1,0068646,The Godfather,1972,Marlon Brando,Al Pacino,James Caan,Richard S. Castellano,Robert Duvall
2,0071562,The Godfather: Part II,1974,Al Pacino,Robert Duvall,Diane Keaton,Robert De Niro,John Cazale
3,0468569,The Dark Knight,2008,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,Maggie Gyllenhaal
4,0050083,12 Angry Men,1957,Martin Balsam,John Fiedler,Lee J. Cobb,E.G. Marshall,Jack Klugman
5,0108052,Schindler's List,1993,Liam Neeson,Ben Kingsley,Ralph Fiennes,Caroline Goodall,Jonathan Sagall
6,0167260,The Lord of the Rings: The Return of the King,2003,Noel Appleby,Ali Astin,Sean Astin,David Aston,John Bach
7,0110912,Pulp Fiction,1994,Tim Roth,Amanda Plummer,Laura Lovelace,John Travolta,Samuel L. Jackson
8,0060196,"The Good, the Bad and the Ugly",1966,Eli Wallach,Clint Eastwood,Lee Van Cleef,Aldo Giuffrè,Luigi Pistilli
9,0137523,Fight Club,1999,Edward Norton,Brad Pitt,Meat Loaf,Zach Grenier,Richmond Arquette
