In [394]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [395]:
def get_director_and_country(link):
    """
    Retrieves the director's name and country of origin for a given film title
    from its Wikipedia page.

    Args:
        title (str): The title of the film.

    Returns:
        tuple: A tuple containing the director's name and the country of origin.
               Returns (None, None) if the information is not found or if an error occurs.
    """
    response = requests.get(link)
    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    soup = BeautifulSoup(response.content, 'html.parser')
    infobox = soup.find('table', {'class': 'infobox vevent'})

    if infobox is None:
        print("No infobox found on this page.")
        return None

    director = []
    country = []

    # Find director(s)
    director_row = infobox.find('th', string='Directed by')
    if not director_row:
        country_row = infobox.find('th', string='director')
    if director_row:
        director_values = director_row.find_next('td')
        if director_values:
            for link in director_values.find_all('a', href=True):
                director.append(re.sub(r'[^a-zA-Z\s]', '', link.get_text()).strip()) # Extract text from links, TODO make directors a list of correct strings


    # Find country(ies)
    country_row = infobox.find('th', string='Country')
    if not country_row:
        country_row = infobox.find('th', string='Country of origin')
    if not country_row:
        country_row = infobox.find('th', string='Countries')
    if country_row:
        country_values = country_row.find_next('td')
        if country_values:
            plain_text = country_values.get_text(strip=True)  # Get all text, stripped
            if plain_text:
                countries = [re.sub(r'[^a-zA-Z\s]', '', c).strip() for c in plain_text.split(',')] # TODO make country a list of correct strings
                country.extend(countries)
    
    return director, country

In [None]:
def get_film_data():
    url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Error during the GET request: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='wikitable')
    data = []
    if table:
        rows = table.find_all('tr')
        for row in rows:
            cells = row.find_all(['td', 'th'])  # Include th for header and row headers
            row_data = []
            for cell in cells:
                #if there is a link, extract it
                link = cell.find('a')
                if link and 'href' in link.attrs and 'title' in link.attrs:
                    url = f"https://en.wikipedia.org{link['href']}"
                else:
                    url = None
                text = cell.get_text(strip=True)
                row_data.append(text)
                if url:
                    row_data.append(url)
            data.append(row_data)

    df = pd.DataFrame(data)
    df = df.iloc[1:]  # Skip the original header row

    df = df.dropna(axis=1, how='all')
    df = df.rename(columns={df.columns[2]: 'Title', 
                            df.columns[4]: 'Worldwide gross', 
                            df.columns[5]: 'Year',
                            df.columns[6]: 'Ref',
                            df.columns[0]: 'Rank',
                            df.columns[1]: 'Peak',
                            df.columns[3]: 'Link'})

    # change to appropriate types
    try:
        df['Title'] = df['Title'].astype(str).replace('†', '', regex=True) 
        df['Worldwide gross'] = df['Worldwide gross'].replace({'.*\$': '', ',': ''}, regex=True).astype(int)
        df['Year'] = df['Year'].astype(int)
        df['Rank'] = df['Rank'].astype(int)
    except KeyError as e:
        print(f"Error: Column '{e}' not found. Please check your column names.")
    except ValueError as e:
        print(f"Error: Could not convert data type.  Check the contents of the columns. {e}")

    # add columns for directors and countries
    directors = []
    countries = []
    for title in df['Link']:
        director, country = get_director_and_country(title)
        directors.append(director)
        countries.append(country)


    df['Director'] = directors
    df['Country'] = countries

    df = df.drop(['Peak', 'Ref', 'Link'], axis=1)
    df.to_csv('output.txt', sep='|', index=False)
    return df


  df['Worldwide gross'] = df['Worldwide gross'].replace({'.*\$': '', ',': ''}, regex=True).astype(int)


In [397]:
if __name__ == "__main__":
    get_film_data()