In [None]:
from importlib.resources import files

import pandas as pd
from bs4 import BeautifulSoup

from src import time_conversions

In [None]:
# Load the HTML content.
# Candidates are sorted by name so the choice is deterministic when the data
# directory holds more than one .html file (iterdir() order is arbitrary).
data_dir = files('src').joinpath('data')
html_files = sorted(
    (f for f in data_dir.iterdir() if f.name.endswith('.html')),
    key=lambda f: f.name,
)
if not html_files:
    # Fail with a descriptive error instead of a bare IndexError on [0].
    raise FileNotFoundError(f"No .html file found in {data_dir}")
data_file = html_files[0]
# read_text() is part of the Traversable API returned by files(), and the
# explicit encoding avoids depending on the platform's default codec.
# NOTE(review): assumes the saved page is UTF-8 — confirm.
content = data_file.read_text(encoding='utf-8')

In [None]:
# Build the BeautifulSoup tree from the raw markup using the 'html.parser' backend
soup = BeautifulSoup(markup=content, features='html.parser')

In [None]:
# Locate the results table: the first <table> whose CSS class is "results"
table = soup.find('table', class_='results')
if table is None:
    # find() returns None when nothing matches; stop here with a clear message
    # rather than hitting an AttributeError on None in a later cell.
    raise ValueError("No <table class='results'> element found in the HTML content")
table

In [None]:
# Collect every <tr> element found inside the results table
rows = table.find_all(name='tr')

In [None]:
# The first <tr> of the table is treated as the header row
header = rows[0]

In [None]:
# Every row after the header row is treated as table data
data = rows[1:]

In [None]:
# Put the data into a dictionary: text of each row's first cell -> list with
# the text of the remaining cells.
# Note: rows sharing the same first-cell text overwrite each other (last wins).
data_dict = {}
for row in data:
    cells = row.find_all('td')
    if not cells:
        # Rows without any <td> carry no data; indexing cells[0] on them
        # would raise IndexError, so skip them.
        continue
    key_cell, *value_cells = cells
    data_dict[key_cell.text] = [cell.text for cell in value_cells]
data_dict

In [None]:
# Collect the <td> cells of the header row
header_cells = header.find_all(name='td')
header_cells

In [None]:
# Keep the text of each header cell, leaving out cells whose text is empty
header_names = [label for label in (cell.text for cell in header_cells) if label]
header_names

In [None]:
# Put the data into a pandas DataFrame: one row per data_dict entry (the keys
# become the index), with the columns labelled by the header names.
# from_dict(orient='index') builds the rows directly instead of creating a
# column-per-key frame and transposing it, and it still yields a correctly
# labelled (empty) frame when data_dict is empty.
df = pd.DataFrame.from_dict(data_dict, orient='index', columns=header_names)
df

In [None]:
# Keep only the rows whose Building value is not 'TOTAL'
df = df.loc[df['Building'].ne('TOTAL')]

In [None]:
# Replace the current index with a fresh 0..n-1 range; drop=True discards the
# old index values instead of keeping them as a new column
df = df.reset_index(drop=True)

In [42]:
# Convert the columns to numeric.
# Every column except the ones listed below is parsed as a number after its
# commas (presumably thousands separators — confirm) have been removed.
non_numeric_cols = ['Building', 'Duration']
cols_to_convert = [col for col in df.columns if col not in non_numeric_cols]


def _to_number(col):
    """Parse one column to numbers, leaving already-numeric columns untouched."""
    if pd.api.types.is_numeric_dtype(col):
        # Already converted (e.g. the cell was re-run): the .str accessor
        # would raise on a numeric column, so return it unchanged.
        return col
    # regex=False: the comma is a literal character, not a pattern.
    return pd.to_numeric(col.str.replace(',', '', regex=False))


df[cols_to_convert] = df[cols_to_convert].apply(_to_number)

In [45]:
# Create a new column for Duration in minutes by applying
# time_conversions.to_minutes (imported from src, not shown here) to each
# value of the Duration column.
# NOTE(review): the Duration format that to_minutes expects is not visible
# from this notebook — confirm against src/time_conversions.
df['Duration (min)'] = df['Duration'].apply(time_conversions.to_minutes)

In [46]:
# Save the DataFrame to 'data.csv' (the path is relative to the current working
# directory); index=False leaves the row index out of the file
df.to_csv('data.csv', index=False)