In [18]:
import requests
from bs4 import BeautifulSoup
import json
import jsonschema
from jsonschema import validate
# JSON Schemas describing the scraped timetable, built bottom-up:
# one module (a timetable row) -> one day (list of modules) -> the whole week.

# Field names of a single module entry, in the order they are listed in the schema.
MODULE_FIELDS = (
    "Activity",
    "Name",
    "Type",
    "Start",
    "End",
    "Duration",
    "Weeks",
    "Room",
    "Staff",
    "Student_Groups",
)

# Day names accepted as top-level keys of the timetable object.
WEEK_DAYS = (
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
)

# A module is an object whose known fields are all strings; unknown keys are rejected.
# NOTE(review): there is no "required" list, so an object missing some (or all)
# of these fields still validates — confirm that this leniency is intended.
module_schema = {
    "type": "object",
    "properties": {field: {"type": "string"} for field in MODULE_FIELDS},
    "additionalProperties": False,
}

# A day is an array of modules.
# "additionalProperties" only constrains object instances and is ignored for
# arrays, so it is deliberately not set here: it would be a no-op that merely
# suggests a strictness the schema does not have.
day_schema = {
    "type": "array",
    "items": module_schema,
}

# The full timetable maps day names to day arrays; any other key is rejected.
timetable_info_schema = {
    "type": "object",
    "properties": {day_name: day_schema for day_name in WEEK_DAYS},
    "additionalProperties": False,
}

In [20]:
# Download the text-spreadsheet timetable of one student set, turn it into
# {day name: [module dict, ...]} and validate the result against
# timetable_info_schema.
timetable_id = """KCSOFD%5F3A%20CW207%20C3"""
text_timetable_url = f"http://timetable.itcarlow.ie/reporting/textspreadsheet;student+set;id;{timetable_id}?t=student+set+textspreadsheet&days=1-5&weeks=&periods=5-40&template=student+set+textspreadsheet"

# Keys given to the cells of each timetable row, in column order.
MODULE_COLUMNS = (
    "Activity",
    "Name",
    "Type",
    "Start",
    "End",
    "Duration",
    "Weeks",
    "Room",
    "Staff",
    "Student_Groups",
)

# requests waits forever by default; bound the wait and fail loudly on an HTTP
# error instead of parsing an error page as if it were a timetable.
timetable_html = requests.get(text_timetable_url, timeout=30)
timetable_html.raise_for_status()

timetable_soup = BeautifulSoup(timetable_html.text, features="html.parser")
body = timetable_soup.find("body")

# Day names come from the <span> inside each top-level <p> of the body.
days = [p.find("span").get_text() for p in body.find_all("p", recursive=False)]

# The first and last top-level tables are dropped — presumably page
# header/footer tables rather than day tables; TODO confirm against the page.
tables = body.find_all("table", recursive=False)
timetable_days = tables[1:-1]

week_modules = {}
for day, day_table in zip(days, timetable_days):
    day_modules = []
    # The first row of each day table is skipped (presumably the column headings).
    for row in day_table.find_all("tr")[1:]:
        # get_text() always yields a str. Tag.string would be None for an empty
        # cell or a cell with several children, and None fails the schema's
        # "type": "string" check.
        cells = [cell.get_text() for cell in row.find_all("td")]
        if len(cells) != len(MODULE_COLUMNS):
            raise ValueError(
                f"Expected {len(MODULE_COLUMNS)} cells in a {day} timetable row, "
                f"found {len(cells)}: {cells!r}"
            )
        day_modules.append(dict(zip(MODULE_COLUMNS, cells)))
    week_modules[day] = day_modules

# Every key and value is a plain str at this point, so the structure can be
# validated directly without a JSON round trip.
validate(instance=week_modules, schema=timetable_info_schema)