In [215]:
import os

import gender_guesser.detector as gender
import pandas as pd
from imdb import IMDb
from lxml import etree
from tqdm.auto import tqdm

In [235]:
def extract_film_ids(list_of_lists):
    """Collect IMDb IDs from saved list pages stored under ``imdb_lists/``.

    Parameters
    ----------
    list_of_lists : iterable of str
        File names (relative to ``imdb_lists/``) of the HTML pages to scan.

    Returns
    -------
    list of str
        One ID per matched title link, in file order then document order.
        Each ID is the third "/"-separated piece of the link's href with
        its first two characters dropped (presumably hrefs look like
        ``/title/tt0111161/`` and the "tt" prefix is removed -- verify
        against the saved pages).
    """
    # A single parser instance is shared by every file.
    html_parser = etree.HTMLParser()
    # Absolute path to the <a> inside each entry's <h3>; tied to the exact
    # page layout of the saved files.
    title_href_xpath = "/html/body/div/div/div/div/div/div/div/div/div/div/h3/a/@href"

    collected = []
    for filename in list_of_lists:
        document = etree.parse("imdb_lists/" + filename, html_parser)
        collected.extend(
            href.split("/")[2][2:] for href in document.xpath(title_href_xpath)
        )
    return collected

In [242]:
def query_api(imdb_ids):
    """Fetch one movie object per IMDb ID, showing a progress bar.

    Parameters
    ----------
    imdb_ids : iterable
        IDs passed one at a time to ``IMDb().get_movie``.

    Returns
    -------
    list
        The ``get_movie`` results, in the same order as ``imdb_ids``.
    """
    # One client is created per call and reused for every lookup.
    client = IMDb()
    return [client.get_movie(movie_id) for movie_id in tqdm(imdb_ids)]

In [237]:
def create_movies_df(imdb_ids, movie_objects):
    """Build a per-movie metadata table indexed by IMDb ID.

    Parameters
    ----------
    imdb_ids : sequence of str
        IMDb IDs in list order; they become the index of the result.
    movie_objects : iterable
        One object per ID, in the same order. Each object is only read
        through ``.get(key)`` with the keys "title", "year", "country",
        "box office" and "rating".

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``imdb_ids`` with the columns ``imdb_rank``
        (1-based position in ``imdb_ids``), ``Title``, ``Year``,
        ``Countries`` (comma-joined), ``Budget``,
        ``Cumulative Worldwide Gross``, ``Opening Weekend United States``
        and ``Rating``. Values a movie does not provide are left missing.
    """
    box_office_fields = [
        "Budget",
        "Cumulative Worldwide Gross",
        "Opening Weekend United States",
    ]
    # Fixed column set and order, so the exported CSV has the same layout
    # no matter which movies happen to carry box-office data.
    columns = ["Title", "Year", "Countries"] + box_office_fields + ["Rating"]

    records = []
    for _, m in zip(imdb_ids, movie_objects):
        countries = m.get("country")
        # "box office" may be absent; treat that as "no fields available".
        box_office = m.get("box office") or {}
        record = {
            "Title": m.get("title"),
            "Year": m.get("year"),
            # Joining a missing country list would raise TypeError.
            "Countries": ",".join(countries) if countries else None,
            "Rating": m.get("rating"),
        }
        for field in box_office_fields:
            record[field] = box_office.get(field)
        records.append(record)

    # If fewer movie objects than IDs were supplied, keep the remaining IDs
    # as all-missing rows.
    records.extend({} for _ in range(len(imdb_ids) - len(records)))

    movies = pd.DataFrame(records, index=list(imdb_ids), columns=columns)
    # Rank follows list position and adapts to however many IDs were given.
    movies.insert(0, "imdb_rank", list(range(1, len(movies) + 1)))
    return movies

def create_people_df(imdb_ids, movie_objects):
    """Build a table of writers, directors and leading actors per movie.

    Parameters
    ----------
    imdb_ids : iterable of str
        IMDb IDs, in the same order as ``movie_objects``.
    movie_objects : iterable
        Objects read through ``.get(role)`` for the roles "writers",
        "directors" and "actors"; each returned person must expose a
        ``.data`` mapping.

    Returns
    -------
    pandas.DataFrame
        Columns ``Role`` ("writer", "director" or "actor"), ``Name``,
        ``Movie`` (the IMDb ID) and ``gender`` (whatever
        ``gender_guesser``'s ``Detector.get_gender`` returns for the first
        space-separated token of the name). The index counts rows from 1.
    """
    # Collect plain records and build the frame once; growing a DataFrame
    # one cell at a time through .loc re-allocates on every new row.
    records = []
    for imdbid, m in zip(imdb_ids, movie_objects):
        for role in ['writers', 'directors', 'actors']:
            folks = m.get(role)
            if not folks:
                continue
            if role == "actors":
                # Only the first three listed actors are kept; the slice is
                # taken before nameless entries are filtered out.
                folks = folks[0:3]
            for folk in folks:
                # Entries without a name are skipped.
                if 'name' in folk.data:
                    records.append({
                        "Role": role[:-1],  # drop the plural "s"
                        "Name": folk.data['name'],
                        "Movie": imdbid,
                    })

    people = pd.DataFrame(records, columns=["Role", "Name", "Movie"])
    # Keep the 1-based row labels that end up in the exported CSV.
    people.index = pd.RangeIndex(1, len(people) + 1)

    d = gender.Detector()
    people['gender'] = people["Name"].map(lambda x: d.get_gender(x.split(" ")[0]))

    return people

In [None]:
# Saved HTML pages for each list (two pages per list); the file names are
# resolved relative to the imdb_lists/ folder by extract_film_ids.
list1 = [
    "top100_comedy_1.html",
    "top100_comedy_2.html",
]
list2 = [
    "top100_comedy_selection_1.html",
    "top100_comedy_selection_2.html",
]
list_lists = [list1, list2]

# Output folder for each entry of list_lists, in the same order.
top_types = ["top100", "top100_selection"]

In [None]:
# For each list: parse the saved pages, fetch metadata, build the two tables
# and write them to that list's output folder.
for l, t in zip(list_lists, top_types):
    # Create the output folder up front so a missing directory cannot make
    # the export fail after the slow API queries have already run.
    os.makedirs(t, exist_ok=True)

    # Extract IMDB movie IDs
    imdb_ids = extract_film_ids(l)

    # Query API for movie metadata
    movie_objects = query_api(imdb_ids)

    # Create movies and people dataframes
    movies = create_movies_df(imdb_ids, movie_objects)
    people = create_people_df(imdb_ids, movie_objects)

    # Export dataframes
    movies.to_csv(t + "/movies.csv")
    people.to_csv(t + "/people.csv")