In [1]:
# Source PDF to scrape; a relative path, handed to fitz.open() by scrape_courses_description().
COS = "Courses-of-Study_2020-2021.pdf"
# First and last page of the course-description section. Both are 1-based page
# numbers (the scraper subtracts 1 before indexing) and both ends are included.
COURSE_DESCRIPTION_PAGE_FIRST = 161
COURSE_DESCRIPTION_PAGE_LAST = 370

In [2]:
import fitz  # PyMuPDF
import re

In [3]:
# Classifier patterns for the scraped text. They are always applied with
# re.match, so each one is anchored at the start of the string being tested.
# Course header: three capital letters followed by three digits, e.g. "ELL820".
COURSE_PATTERN = r"^[A-Z]{3}\d{3}"
# Credit line: anything that starts with a digit, e.g. "3 Credits (3-0-0)".
CREDIT_PATTERN = r"^\d"
# Pre-requisite line: starts with "Pre-requi".
PREREQ_PATTERN = r"^Pre-requi"
# Overlap line: starts with "Overlaps".
OVERLAP_PATTERN = r"^Overlaps"

# Clip-rectangle coordinates (passed to fitz.Rect) for a standard two-column page.
# X1..X2 bounds the left column and X2..X3 the right column; Y1..Y2 is the
# vertical extent shared by both columns.
# NOTE(review): units are presumably PDF points — confirm against the document.
STANDARD_COS_PAGE_X1 = 49
STANDARD_COS_PAGE_X2 = 300
STANDARD_COS_PAGE_X3 = 596
STANDARD_COS_PAGE_Y1 = 84
STANDARD_COS_PAGE_Y2 = 790

In [4]:
def extract_text_by_coordinates(pdf_document, page_number, x1, y1, x2, y2):
    """Return the plain text found inside a rectangular region of one page.

    Args:
        pdf_document: An open document that can be indexed by 0-based page index.
        page_number: 1-based page number; converted to a 0-based index here.
        x1, y1, x2, y2: Corner coordinates handed to fitz.Rect to build the clip area.

    Returns:
        Whatever ``page.get_text("text", clip=...)`` yields for that region.
    """
    target_page = pdf_document[page_number - 1]
    return target_page.get_text("text", clip=fitz.Rect(x1, y1, x2, y2))

def standard_cos_page_scraper(pdf_doc, page):
    """Scrape both text columns of a standard two-column page.

    Args:
        pdf_doc: An open document, forwarded to extract_text_by_coordinates.
        page: 1-based page number.

    Returns:
        A two-element list: [left column text, right column text].
    """
    top, bottom = STANDARD_COS_PAGE_Y1, STANDARD_COS_PAGE_Y2
    # (left edge, right edge) of each column, left column first; the two
    # columns share the X2 boundary.
    column_bounds = (
        (STANDARD_COS_PAGE_X1, STANDARD_COS_PAGE_X2),
        (STANDARD_COS_PAGE_X2, STANDARD_COS_PAGE_X3),
    )
    return [
        extract_text_by_coordinates(pdf_doc, page, left, top, right, bottom)
        for left, right in column_bounds
    ]


def scrape_courses_description():
    """Scrape every page of the course-description section of the PDF.

    Opens the file named by COS and walks pages COURSE_DESCRIPTION_PAGE_FIRST
    through COURSE_DESCRIPTION_PAGE_LAST (inclusive, 1-based).

    Returns:
        A flat list of strings with two entries per page: the left column
        text followed by the right column text.
    """
    scraped_dataset = []

    # The context manager closes the document even when scraping a page
    # raises, so the file handle is never leaked.
    with fitz.open(COS) as pdf_doc:
        for page in range(COURSE_DESCRIPTION_PAGE_FIRST, COURSE_DESCRIPTION_PAGE_LAST + 1):
            scraped_dataset.extend(standard_cos_page_scraper(pdf_doc, page))

    return scraped_dataset

def attach_courses_description(scraped_dataset):
    """Merge the scraped column texts into one string.

    Each chunk is prefixed before joining: a chunk that begins with a course
    code (COURSE_PATTERN) gets a leading newline so the code starts a fresh
    line; any other chunk gets a leading space, marking it as a continuation
    of the text that came before it.

    The input list is left untouched, so calling this again on the same list
    gives the same result instead of stacking extra prefixes.

    Args:
        scraped_dataset: Iterable of strings, one per scraped column.

    Returns:
        The single concatenated string ("" for an empty input).
    """
    return "".join(
        ("\n" if re.match(COURSE_PATTERN, chunk) else " ") + chunk
        for chunk in scraped_dataset
    )

def string_course_dataset_to_json(single_string_dataset):
    """Parse the merged catalogue text into a list of course dictionaries.

    The text is split on newlines and the first piece is discarded (it is the
    empty string whenever the text begins with a newline). Every remaining
    line is classified by the module-level patterns:

    * COURSE_PATTERN  -> starts a new course; characters 0-5 are the code and
      everything from index 7 onwards is the name.
    * CREDIT_PATTERN  -> stored as "credits" (a later match overwrites an earlier one).
    * PREREQ_PATTERN  -> stored as "preReq".
    * OVERLAP_PATTERN -> stored as "overLap".
    * anything else   -> appended to "description" with no separator added.

    Lines that appear before the first course header belong to no course and
    are skipped, and an input without any course header yields an empty list.

    Args:
        single_string_dataset: The string produced by attach_courses_description.

    Returns:
        A list of dicts with the keys "code", "name", "description",
        "preReq", "overLap" and "credits", in order of appearance.
    """
    all_courses = []
    course = None  # record being filled; stays None until the first course header

    for line in single_string_dataset.split("\n")[1:]:
        if re.match(COURSE_PATTERN, line):
            # A new header closes the record that was being filled, if any.
            if course is not None:
                all_courses.append(course)
            course = {
                "code": line[:6],
                "name": line[7:],
                "description": "",
                "preReq": "",
                "overLap": "",
                "credits": "",
            }
        elif course is None:
            # Text ahead of the first header has no course to attach to.
            continue
        elif re.match(CREDIT_PATTERN, line):
            course["credits"] = line
        elif re.match(PREREQ_PATTERN, line):
            course["preReq"] = line
        elif re.match(OVERLAP_PATTERN, line):
            course["overLap"] = line
        else:
            course["description"] += line

    # Flush the last record; nothing to flush when no header was ever seen.
    if course is not None:
        all_courses.append(course)

    return all_courses



def get_all_courses_description():
    """Run the whole pipeline and return the parsed course records.

    Scrapes the column texts from the PDF, merges them into one string and
    parses that string into a list of course dictionaries.
    """
    column_texts = scrape_courses_description()
    merged_text = attach_courses_description(column_texts)
    return string_course_dataset_to_json(merged_text)



# -------------------------------------------------------------------------------------------------------------
# Run the scrape-and-parse pipeline once; the following cell reads its result from `all_courses`.
all_courses = get_all_courses_description()
# -------------------------------------------------------------------------------------------------------------


In [5]:
#TODO
# modify the pre-req structure and credit structure

# COS 20-21
# edge-case : page number 365
# prereq wrong in HUL275
# weird description in SBL724
# multiline pre-req in SBL720, SBL708, SBL710
# multiline course name in ASD882; the name also contains a "not allowed" condition
# multiline course name in CRL725
# Non-Credit Mandatory field in TXT800
# HUL modifications

# Spot-check: print the parsed record of one known course.
for ele in all_courses:
    if ele["code"] != "ELL820":
        continue
    print(ele)

{'code': 'ELL820', 'name': 'Photonic Switching and Networking', 'description': 'Study of different types of networks, the enabling technologies and devices. Broadcast and Select network. Single and Multi-hop networks with example of Access networks, PoNS etc., Wavelength Routing network, virtual topology, Metro and Wide area networks. Wavelength Routing and Assignment, Traffic Grooming and Protection, Network Control and Management, optical packet and burst switching, Network Simulation Tools and Design guidelines.', 'preReq': '', 'overLap': '', 'credits': '3 Credits (3-0-0)'}
