In [31]:
import time
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [36]:
def open_href_in_new_window(driver, href, wait_by, timeout=10, settle_seconds=2):
    """Open ``href`` in a new browser window, focus it and wait until it is usable.

    Args:
        driver: Selenium WebDriver instance controlling the browser.
        href: URL to open. It is handed to ``window.open`` in the currently
            focused page, so a relative URL resolves against that page.
        wait_by: Locator tuple ``(By.<strategy>, value)``. The call blocks until
            an element matching it is present in the new window's DOM.
        timeout: Maximum number of seconds to wait for ``wait_by``.
        settle_seconds: Fixed pause after the element was found; presumably it
            gives client-side rendering time to finish. Pass 0 to skip it.

    Raises:
        selenium.common.exceptions.TimeoutException: if no element matching
            ``wait_by`` shows up within ``timeout`` seconds.
    """
    handles_before = set(driver.window_handles)

    # Hand the URL over as a script argument instead of splicing it into the
    # JavaScript source: a quote or backslash in href can then neither break
    # the script nor inject code.
    driver.execute_script("window.open(arguments[0], '_blank');", href)

    # Identify the new window by comparing handle sets; the position of a
    # handle inside window_handles is not guaranteed to reflect creation order.
    new_handles = [h for h in driver.window_handles if h not in handles_before]
    driver.switch_to.window(new_handles[-1] if new_handles else driver.window_handles[-1])

    # Only the explicit wait is used here. Selenium advises against combining
    # implicit and explicit waits because the resulting timeouts are unpredictable.
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located(wait_by))
    if settle_seconds > 0:
        time.sleep(settle_seconds)
    
def close_current_window(driver):
    """Close the focused window and move focus to the last remaining handle.

    Args:
        driver: Selenium WebDriver instance whose focused window is closed.
    """
    driver.close()
    remaining_handles = driver.window_handles
    last_handle = remaining_handles[-1]
    driver.switch_to.window(last_handle)

In [33]:
# Start Firefox, load the add-on from the local .xpi file and open the IMDb
# top chart sorted by release date (descending).
url = "https://m.imdb.com/chart/top/?sort=release_date%2Cdesc"
driver = webdriver.Firefox()
try:
    # The .xpi path is relative to the notebook's working directory.
    driver.install_addon('uBlock0@raymondhill.net.xpi', temporary=True)
    driver.get(url)
    # implicitly_wait() only configures an element-lookup timeout; it never
    # pauses by itself. Block explicitly until the chart list has entries so the
    # page source read afterwards is not captured too early under "Run All".
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list li"))
    )
except Exception:
    # Do not leave an orphaned browser behind when setup fails.
    driver.quit()
    raise

In [34]:
# Snapshot the rendered chart page and collect the <li> entries of the chart list.
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
ul = soup.find("ul", class_="ipc-metadata-list")
if ul is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(
        "Chart list <ul class='ipc-metadata-list'> not found; "
        "the page may not have finished loading or its markup changed."
    )
li_items = ul.find_all("li")
# The scraping loop expects exactly one entry per movie of the top 250.
assert len(li_items) == 250, f"expected 250 chart entries, found {len(li_items)}"

In [37]:
# Column-oriented accumulators: every key becomes one column of the exported table.
movies_dict = {"id": [], "name": [], "metascore": [], "imdbscore": [], "categories": [], "director_id": []}
actors_dict = {"id": [], "name": [], "movie_id": [], "role": []}
# Chart entries (<li> tags) that could not be scraped.
failed_lis = []


def _scrape_movie(driver, href):
    """Scrape one title page and its full-credits page.

    Opens two additional windows (title page, then credits page) and leaves
    them open; the caller is responsible for closing them.

    Args:
        driver: Selenium WebDriver instance, focused on the chart window.
        href: Link to the title page, taken from the chart entry.

    Returns:
        tuple: ``(movie_row, people_rows)``. ``movie_row`` maps every
        ``movies_dict`` column to its value. ``people_rows`` is a list of dicts
        keyed like ``actors_dict``, with the director as first element.

    Raises:
        Exception: any Selenium or parsing error (for example ``AttributeError``
            when an expected tag is missing) is propagated unchanged.
    """
    open_href_in_new_window(driver, href, (By.CSS_SELECTOR, 'li[data-testid="title-pc-principal-credit"]'))
    movie_soup = BeautifulSoup(driver.page_source, "html.parser")
    url_split = driver.current_url.split('/')
    movie_id = url_split[url_split.index("title") + 1]

    # The director is also needed for the actor table.
    # The first link of the first principal-credit entry is treated as the director.
    director_link = movie_soup.find('li', {'data-testid': 'title-pc-principal-credit'}).find('a')
    director_id = director_link.get("href").split('/')[2]
    people_rows = [
        {"id": director_id, "name": director_link.text, "role": "Director", "movie_id": movie_id}
    ]

    metascore_span = movie_soup.find('span', class_="metacritic-score-box")
    rating_div = movie_soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'})
    genre_links = movie_soup.find("div", {'data-testid': 'genres'}).find_all("a")
    movie_row = {
        "id": movie_id,
        "name": movie_soup.find("span", class_="hero__primary-text").text,
        # Not every title has a Metascore box.
        "metascore": metascore_span.text if metascore_span else None,
        # Normalise a decimal comma to a decimal point.
        "imdbscore": rating_div.find('span').text.replace(',', '.'),
        "categories": [category.find("span").text for category in genre_links],
        "director_id": director_id,
    }

    # The first link inside the cast section leads to the full-credits page.
    actor_href = movie_soup.find("section", {'data-testid': 'title-cast'}).find("a").get("href")
    open_href_in_new_window(driver, actor_href, (By.ID, 'fullcredits-content'))
    actor_soup = BeautifulSoup(driver.page_source, "html.parser")
    for actor in actor_soup.find("section", {'id': 'fullcredits-content'}).find_all("a"):
        role_tag = actor.find("p")
        people_rows.append({
            "id": actor.get("href").split('/')[2],
            "name": actor.find("h4").text,
            "role": role_tag.text if role_tag else None,
            "movie_id": movie_id,
        })
    return movie_row, people_rows


def _close_extra_windows(driver, keep_handle):
    """Close every window except ``keep_handle`` and focus ``keep_handle`` again."""
    for handle in driver.window_handles:
        if handle != keep_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(keep_handle)


# Remember the chart window so it can be restored after every movie.
main_window = driver.current_window_handle

for li in li_items:
    a = li.find("a")
    if not a:
        failed_lis.append(li)
        continue  # nothing to open; also keeps the entry from being recorded twice
    try:
        movie_row, people_rows = _scrape_movie(driver, a.get("href"))
    except Exception as exc:
        # Best effort: note the failure and go on with the next movie.
        # KeyboardInterrupt is deliberately not caught so the run can be aborted.
        failed_lis.append(li)
        print(f"Skipped {a.get('href')}: {type(exc).__name__}")
    else:
        # Commit only complete records so all columns keep the same length.
        for column, value in movie_row.items():
            movies_dict[column].append(value)
        for person in people_rows:
            for column, value in person.items():
                actors_dict[column].append(value)
    finally:
        # Close the title/credits windows on success and on failure alike.
        _close_extra_windows(driver, main_window)


if len(failed_lis) > 0:
    print(f"{len(failed_lis)} konnten nicht vollständig erfasst werden.")

In [38]:
# Number of movie ids collected by the scraping loop.
len(movies_dict["id"])

250

In [39]:
# Turn the collected columns into tables and export them as CSV.
movies = pd.DataFrame(movies_dict)
actors = pd.DataFrame(actors_dict)
# to_csv() does not create missing directories, so make sure ./data exists.
Path("./data").mkdir(parents=True, exist_ok=True)
movies.to_csv("./data/movies.csv", index=False)
actors.to_csv("./data/actors.csv", index=False)

In [40]:
# End the WebDriver session once scraping and export are finished.
driver.quit()