In [7]:
# attempt 3, parsing html source files
    # The HTML files being parsed are formatted in "brief display." Previously, I tried to parse HTML files whose display was set to "for print,"
    # but the "for print" HTML files were not structured with any kind of hierarchy, so they were very hard to parse.


from bs4 import BeautifulSoup
import json

# Accumulates one dict per <li> element found across all parsed HTML files.
bfi_early_collection = []

# Label that follows the company name inside the first <em> of a film entry.
_PRODUCTION_COMPANY_SUFFIX = "(Production company),"

# The combined records are written here once every HTML file has been parsed.
_OUTPUT_JSON_PATH = 'output_bfi_early_collection.json'


def _parse_film(film_html):
    """Build the record for one film from its ``<li>`` element.

    Args:
        film_html: A BeautifulSoup tag for a single ``<li>`` search result.

    Returns:
        A dict with the keys ``title``, ``production_company``, ``director``
        and ``production_info``. A value stays ``None`` when the matching
        markup is absent from the entry.
    """
    film_data = {
        'title': None,
        'production_company': None,
        'director': None,
        'production_info': None,
    }

    # The title is the only <span> carrying this exact inline style.
    film_title = film_html.find("span", attrs={'style': 'font-size:110%; font-weight:bold;'})
    if film_title is not None:
        film_data['title'] = film_title.text.strip()

    # Production company and director both live in <em> tags. The company is
    # recognised by its trailing label instead of by position, so a film that
    # lists a director but no production company is no longer truncated and
    # stored under the wrong key.
    # NOTE(review): assumes an <em> without the label is the director, and
    # that nothing but the director follows the company -- confirm against
    # the source pages.
    em_texts = [em.text.strip() for em in film_html.find_all("em")]
    if em_texts and em_texts[0].endswith(_PRODUCTION_COMPANY_SUFFIX):
        company_text = em_texts.pop(0)
        # Drop the label and any whitespace that separated it from the name.
        film_data['production_company'] = company_text[:-len(_PRODUCTION_COMPANY_SUFFIX)].rstrip()
    if em_texts:
        film_data['director'] = em_texts[0]

    # The second <p> holds the production info. Entries with fewer than two
    # <p> tags keep None here instead of raising IndexError.
    # It would be nice to break this down into release date, country and film
    # genre, but that info is concatenated and varies in length per film.
    paragraphs = film_html.find_all("p")
    if len(paragraphs) > 1:
        film_data['production_info'] = paragraphs[1].text.strip()

    # TODO: record which image icons are grouped with each <li> item, so the
    # dataset can show the types of material the BFI collections hold for each
    # title (the film itself, digital images, scripts and ephemera, etc.).

    return film_data


# Parse every saved search-results page (files numbered 1 through 4).
for num in range(1, 5):
    html_file_path = f"BFI_collection_search_results_{num}.html"

    with open(html_file_path, 'r', encoding='utf-8') as html_file:
        html_content = html_file.read()

    soup = BeautifulSoup(html_content, "html.parser")

    # Each <li> element is treated as one film entry.
    films_html = soup.find_all("li")

    for film_html in films_html:
        single_film_data = _parse_film(film_html)
        bfi_early_collection.append(single_film_data)
        # print(single_film_data)  # uncomment to spot-check the parsed records


# Write the JSON data file once, after all HTML files have been parsed,
# rather than rewriting it after every input file.
with open(_OUTPUT_JSON_PATH, 'w', encoding='utf-8') as json_file:
    json.dump(bfi_early_collection, json_file, indent=2)