In [1]:
from bs4 import BeautifulSoup
from typing import Optional
import concurrent.futures
import pandas as pd
import requests
import os

ROOT_URL="https://en.wikipedia.org" 
URL=f"{ROOT_URL}/wiki/List_of_Academy_Award%E2%80%93winning_films"
NOMINEES_PATH = os.path.join(os.getcwd(), "oscar_nominees")

In [2]:
nominees_response=requests.get(URL)
nominees_html=nominees_response.content.decode()
nominees_soup=BeautifulSoup(nominees_html, 'html.parser')
nominees_table=nominees_soup.select_one(".wikitable")
nominees_links=[a["href"] for a in nominees_table.select("a") if "_in_film" not in a["href"] and "#" not in a["href"]]

In [17]:
# EXECUTE ONLY IF IT FOLDER DOES NOT EXISTS
if os.path.isdir(NOMINEES_PATH):
    os.removedirs(NOMINEES_PATH)

os.makedirs(NOMINEES_PATH)

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    get_nominee_page=lambda link: requests.get(ROOT_URL + link).content.decode()
    write_page_to_file=lambda link, index: open(os.path.join(NOMINEES_PATH, f"{index}.html"), "w").write(get_nominee_page(link))
    
    futures = [executor.submit(write_page_to_file, link, index) for index, link in enumerate(nominees_links)]
    for future in futures:
        future.result()

In [6]:
nominee_page_file_content=lambda file: open(os.path.join(NOMINEES_PATH, file), 'r').read()
nominee_page_as_bs4=lambda file: BeautifulSoup(nominee_page_file_content(file), "html.parser") if ".html" in file else None

def nominee_page_data(file: str) -> Optional[tuple[str, str]]:
    try:
    
        info_table = nominee_page_as_bs4(file).select_one(".infobox").select_one('tbody')
    
        if info_table is not None:
            country = info_table.find_all(lambda tag: tag.text == "Country")[0].find_next("td")
            title = info_table.select_one(".infobox-above").text
    
            return (country, title)
            
    except IndexError:
        return None
    except AttributeError:
        return None

    return None
        

def nominee_page_data_proccess() -> dict[str, list]:    
    data: dict[str, list] = {"countries": [], "titles": []}
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        
        futures = [executor.submit(nominee_page_data, file) for file in os.listdir(NOMINEES_PATH)]
        results = [future.result() for future in futures if future.result() is not None]
    
        for result in results:
            data["countries"].append(result[0])
            data["titles"].append(result[1])
    
    return data


df = pd.DataFrame(nominee_page_data_proccess())
df.dropna()
df

Unnamed: 0,countries,titles
0,[United States],Jaws
1,[Germany],The Titan: Story of Michelangelo
2,[United States],E.T. the Extra-Terrestrial
3,[United States],Bill and Coo
4,[United States],Bram Stoker's Dracula
...,...,...
927,[United Kingdom],Anne of the Thousand Days
928,[United States],The Neighbors' Window
929,[United States],The Living Desert
930,[United States],Dr. Jekyll and Mr. Hyde


In [8]:
df.value_counts()

TypeError: '<' not supported between instances of 'Tag' and 'Tag'