In [1]:
# COS = "Courses-of-Study_2020-2021.pdf"
# COS = "Courses-of-Study-2020-2021.pdf"
COS = R"Courses of Study 2023-24.pdf"

COURSE_DESCRIPTION_PAGE_FIRST = 178
COURSE_DESCRIPTION_PAGE_LAST = 417

PROGRAMME_PAGE_FIRST = 46
PROGRAMME_PAGE_LAST = 96

In [2]:
import fitz  # PyMuPDF
import re

In [8]:
COURSE_PATTERN = r"^[A-Z]{3}\d{3}"
CREDIT_PATTERN = r"^\d"
PREREQ_PATTERN = r"^Pre-requi"
OVERLAP_PATTERN = r"^[Oo]verlaps"

STANDARD_COS_PAGE_X1 = 49
STANDARD_COS_PAGE_X2 = 300
STANDARD_COS_PAGE_X3 = 596
STANDARD_COS_PAGE_Y1 = 84
STANDARD_COS_PAGE_Y2 = 790


In [9]:
def extract_text_by_coordinates(pdf_document, page_number, x1, y1, x2, y2):
    page = pdf_document[page_number - 1]
    rect = fitz.Rect(x1, y1, x2, y2)
    
    selected_text = page.get_text("text", clip=rect)
    return selected_text

def standard_cos_page_scraper(pdf_doc, page):
    x1, y1 = STANDARD_COS_PAGE_X1, STANDARD_COS_PAGE_Y1  # Starting coordinates of the rectangle
    x2, y2 = STANDARD_COS_PAGE_X2, STANDARD_COS_PAGE_Y2  # Ending coordinates of the rectangle
    x3, y3 = STANDARD_COS_PAGE_X2, STANDARD_COS_PAGE_Y1
    x4, y4 = STANDARD_COS_PAGE_X3, STANDARD_COS_PAGE_Y2
    left_side = extract_text_by_coordinates(pdf_doc, page, x1, y1, x2, y2)
    right_side = extract_text_by_coordinates(pdf_doc, page, x3, y3, x4, y4)
    return [left_side, right_side]


def scrape_courses_description():
    scraped_dataset = []
    
    first_page_i = COURSE_DESCRIPTION_PAGE_FIRST
    last_page_i = COURSE_DESCRIPTION_PAGE_LAST


    pdf_doc = fitz.open(COS)
    for page in range(first_page_i, last_page_i + 1):
        scraped_dataset.extend(standard_cos_page_scraper(pdf_doc, page))

    pdf_doc.close()
    
    return scraped_dataset

def attach_courses_description(scraped_dataset):
    for i in range(len(scraped_dataset)):
        data = scraped_dataset[i]
        if re.match(COURSE_PATTERN, data):
            scraped_dataset[i] = "\n" + data
        else :
            scraped_dataset[i] = " " + data
    return "".join(scraped_dataset)

def string_course_dataset_to_json(single_string_dataset):
    all_lines = single_string_dataset.split("\n")[1:]
    all_courses = []
    course = {}
    is_first_course = True
    for line in all_lines:
        if re.match(COURSE_PATTERN, line):
            if is_first_course: is_first_course = False
            else: all_courses.append(course)
            course = {
                "code" : line[:6],
                "name" : line[7:],
                "description" : "",
                "preReq" : "",
                "overLap" : "",
                "credits" : ""
            }
        elif re.match(CREDIT_PATTERN, line): course["credits"] = line
        elif re.match(PREREQ_PATTERN, line): course["preReq"] = line
        elif re.match(OVERLAP_PATTERN, line): course["overLap"] = line
        else: course["description"]  += line
    all_courses.append(course)

    return all_courses



def get_all_courses_description():
    scraped_dataset : list[str] = scrape_courses_description()
    single_string_dataset : str = attach_courses_description(scraped_dataset)
    courses_json_list : list[object] = string_course_dataset_to_json(single_string_dataset)
    return courses_json_list



# -------------------------------------------------------------------------------------------------------------
all_courses = get_all_courses_description()
# -------------------------------------------------------------------------------------------------------------


## TODO
- modify the pre-req structure and credit structure



#### THE EDGE CASES NOT YET HANDLED
##### COS 20-21
- edge-case : page number 365
- prereq wrong in HUL275
- weird description in SBL724
- multiline pre-req in SBL720, SBL708, SBL710
- multiline course name in ASD882, and also there is a not allowed condition there in the name
- multiline course anem in CRL725
- Non-Credit Mandatory field in TXT800
- HUL modifications


In [10]:
for ele in all_courses:
    if ele["code"] == "COL106":
        for a in ele:
            print(f"{a} :: {ele[a]}")

code :: COL106
name :: Data Structures & Algorithms
description :: Introduction to object-oriented programming through stacks queues and linked lists. Dictionaries; skip-lists, hashing, analysis of collision resolution techniques. Trees, traversals, binary search trees, optimal and average BSTs. Balanced BST: AVL Trees, 2-4 trees, red-black trees, B-trees. Tries and suffix trees. Priority queues and binary heaps. Sorting: merge, quick, radix, selection and heap sort, Graphs: Breadth first search and connected components. Depth first search in directed and undirected graphs. Disjkra’s algorithm, directed acyclic graphs and topological sort. Some geometric data-structures.
preReq :: Pre-requisite(s): CoL100
overLap :: 
credits :: 5 Credits (3-0-4)


In [None]:
for course in all_courses:
    print(course['code'], course['preReq'])

In [43]:
class ProgrammeInformationClass:
    def __init__(self):
        self.insti_core = {
            "BS": 0,
            "BS_list": [],
            "EAS": 0,
            "EAS_list": [],
            "HuSS": 0,
        }
        self.programme_linked = 0
        self.departmental = {
            "core": 0,
            "core_list": [],
            "elective": 0,
            "elective_list": 0,
        }
        self.OC = 0
        self.total_graded = 0
        self.non_graded = 0
        self.suggested_OEC = []
        
    def __getitem__(self, key):
        # Implement __getitem__ to access attributes using square brackets
        if hasattr(self, key):
            return getattr(self, key)
        else:
            raise KeyError(f"'{type(self).__name__}' object has no attribute '{key}'")


In [50]:
# def data_modifier(course_scrapped_body):
course_1_body = course_body.split("\n")
programmeInfo = ProgrammeInformationClass()
print(programmeInfo)
print(course_1_body)
saveNextNumber = False
isCourseArea = False
currProperty = ""
for string in course_1_body:
    string = string.strip()
    if string == "": continue
    elif re.search(r"\(BS\)", string): 
        currProperty = "BS"
        saveNextNumber = True
    elif re.serarch(re"\(EAS\)", string):
        currProperty = 
    elif ""

SyntaxError: invalid syntax (Temp/ipykernel_15008/2491001320.py, line 15)

In [58]:
def scrape_course_header(pdf_doc, page_i):
    X1, X2 = 49, 596
    Y1, Y2 = 0, 84
    return extract_text_by_coordinates(pdf_doc, page_i, X1, Y1, X2, Y2).split("\n")

def is_course_table_page(pdf_doc, page_i):
    X1, X2 = 526, 596
    Y1, Y2 = 60, 200
    extractedText = extract_text_by_coordinates(pdf_doc, page_i, X1, Y1, X2, Y2)
    return "total" in extractedText.lower()
    
def get_all_programme_details():
    FIRST_PAGE_I = PROGRAMME_PAGE_FIRST
    LAST_PAGE_I = PROGRAMME_PAGE_LAST

    pdf_doc = fitz.open(COS)
    all_courses = []
    curr_course = {
        "header" : "",
        "body" : []
    }
    prev_table_page = True
    for page_i in range(FIRST_PAGE_I, LAST_PAGE_I + 1):
#         print(page_i)
        if is_course_table_page(pdf_doc, page_i):
            temp = "\n".join(curr_course["body"])
            curr_course['body'] = list(map(lambda x : x.strip(), temp.split('\n')))
            # curr_course["body_joined"] = 
#             print(curr_course["all_data_joined"], 'yu')
            # del curr_course["body"]
            all_courses.append(curr_course)
            curr_course = {
                "header" : "",
                "body" : []
            }
            prev_table_page = True
            continue
        if prev_table_page:
            prev_table_page = False
            curr_course["header"] = scrape_course_header(pdf_doc, page_i)
#             print("course header : ", curr_course["header"])
        curr_course["body"] += standard_cos_page_scraper(pdf_doc, page_i)
    pdf_doc.close()
    print(len(all_courses))
    return all_courses
#     for ele in all_courses:
#         print(ele["header"])
#     for course in all_courses:
#         print(course["header"])
#     print(curr_course)
#     print("------------------------------------------------------------")
#     print(all_courses[0]["header"])
#     return all_courses[0]["all_data_joined"]
#     for programme in all_courses:
#         programme[] = refactorCourseBody(programme)
#     return all_courses
    
all_courses = get_all_programme_details()

18


### Ignored Stuff in the programme code

- the some starred coordinates, we will need to add these as additional comments(couldn't handle this with code because each * / ** / ***  represents has different things attached to it). Although later we could automate the additional comments and also show the courses other than this which have the same star in the programme


In [75]:
for course in all_courses:
    print('\n'.join(course['body'][:64]))

The overall Credit Structure
Course Category
Credits
Institute Core Courses
Basic Sciences (BS)

24
Engineering Arts and Science (EAS)

19
Humanities and Social Sciences (HuSS)

15
Programme-linked Courses
12.5
Departmental Courses
Departmental Core
65.5
Departmental Electives

12
Open Category Courses

10
Total Graded Credit requirement
158
Non Graded Units

11
Institute Core : Basic Sciences
CML101 Introduction to Chemistry
3 1 0 4
CMP100 Chemistry Laboratory
0 0 4 2
MTL100 Calculus
3 1 0 4
MTL101 Linear Algebra and Differential Equations
3 1 0 4
PYL101 Electromagnetism & Quantum Mechanics
3 1 0 4
PYP100 Physics Laboratory
0 0 4 2
SBL100 Introductory Biology for Engineers
3 0 2 4
Total Credits



24
Institute Core: Engineering Arts and Sciences
APL100 Engineering Mechanics
3 1 0 4
COL100 Introduction to Computer Science
3 0 2 4
CVL100 Environmental Science
2 0 0 2
ELL101 Introduction to Electrical Engineering
3 1 0 4
ELP101 Introduction to Electrical Engineering (Lab)
0 0 2 1
MCP100 

In [143]:
print('\n'.join(all_courses[15]['body']))

The overall Credit Structure
Course Category
Credits
Institute Core Courses
Basic Sciences (BS)

24
Engineering Arts and Science (EAS)

19
Humanities and Social Sciences (HuSS)

15
Programme-linked Courses
12.5
Departmental Courses
Departmental Core
59.5
Departmental Electives

6
Open Category Courses

12
Total B.Tech. Credit requirement
148
Non Graded Units

11
M.Tech. Part
Programme Core Courses

24
Programme Electives Courses

18
Total M.Tech. Requirement

42
Total Graded Requirement
190
Institute Core : Basic Sciences
CML101 Introduction to Chemistry
3 1 0 4
CMP100 Chemistry Laboratory
0 0 4 2
MTL100 Calculus
3 1 0 4
MTL101 Linear Algebra and Differential Equations
3 1 0 4
PYL101 Electromagnetism & Quantum Mechanics
3 1 0 4
PYP100 Physics Laboratory
0 0 4 2
SBL100 Introductory Biology for Engineers
3 0 2 4
Total Credits



24
Institute Core: Engineering Arts and Sciences
APL100 Engineering Mechanics
3 1 0 4
COL100 Introduction to Computer Science
3 0 2 4
CVL100 Environmental Scienc

## TODO

- Extract core course codes for each program
- Construct database of core courses.

In [None]:
def parse_course(course):
    
    

In [None]:
print(course_body)

In [130]:
def parse_course(course):

    template = {
            "code": "CS1",
            "name": "B.Tech in Computer Science and Engineeering",
            "credits" : {
                "BS": 24,
                "EAS": 19,
                "HuSS": 15,
                "PL": 0,
                "DC": 0,
                "DE": 0,
                "OC": 0,
                "MTech" : {
                    "PC": 0,
                    "PE": 0,
                }
            },
            "courses": {
                "PL": [],
                "DC": [],
                "DE": [],
                "MTech": {
                    "PC": [],
                    "PE": []
                }
            },
            "recommended" : { i+1: [] for i in range(8) }
        }

    template['code'] = course['header'][0].split(' ')[-1]
    template['name'] = course['header'][1]
    if template['code'] == 'DD1':
        return template

    i = 0
    cbody = course['body']

    line_map = {
        'Programme-linked Courses': 'PL',
        'Departmental Core': 'DC',
        'Departmental Electives': 'DE',
        'Open Category Courses': 'OC'
    }
    
    while i < len(cbody) :
        for (l, c) in line_map.items():
            if cbody[i] == l and template['credits'][c] == 0:
                i += 1
                while i < len(cbody) and cbody[i] == '':
                    i += 1
                template['credits'][c] = float(cbody[i])

        i += 1

    assert template['credits']['PL'] > 0, f"Could not parse PL for code {template['code']}"
    assert template['credits']['DC'] > 0, f"Could not parse DC for code {template['code']}"
    assert template['credits']['DE'] > 0, f"Could not parse DE for code {template['code']}"
    assert template['credits']['OC'] > 0, f"Could not parse OC for code {template['code']}"

    course_map = {
        'Programme-Linked Basic / Engineering Arts / Sciences Core' : 'PL',
        'Departmental Core': 'DC',
        'Departmental Electives': 'DE'
    }

    re_digit = r'\d'

    i = 28
    while i < len(cbody):
        for (l, c) in course_map.items():
            if cbody[i] == l and not template['courses'][c]:
                # print(f"Parsing {c} for {template['code']}")
                # parse courses 
                while i < len(cbody)-1:
                    i += 1
                    if cbody[i] == '':
                        continue
                    if cbody[i] == "Total Credits":
                        break
                    toks = cbody[i].split(' ')
                    if re.match(COURSE_PATTERN, toks[0]):
                        # print(f"Got course {toks[0]}")
                        template['courses'][c].append(toks[0])
        i += 1

    assert template['courses']['PL'] != [], f"Could not parse PL courses for code {template['code']}"
    assert template['courses']['DC'] != [], f"Could not parse DC courses for code {template['code']}"
    assert template['courses']['DE'] != [], f"Could not parse DE courses for code {template['code']}"
    
    return template

In [132]:
parsed_courses = {}

for course in all_courses:
    p = parse_course(course)
    parsed_courses[p['code']] = p

In [134]:
parsed_courses['EE1']

{'code': 'EE1',
 'name': 'Bachelor of Technology in Electrical Engineering',
 'credits': {'BS': 24,
  'EAS': 19,
  'HuSS': 15,
  'PL': 15.0,
  'DC': 60.0,
  'DE': 10.0,
  'OC': 10.0,
  'MTech': {'PC': 0, 'PE': 0}},
 'courses': {'PL': ['COL106', 'MTL106', 'MCL142', 'PYL102'],
  'DC': ['ELL201',
   'ELL202',
   'ELL203',
   'ELP203',
   'ELL205',
   'ELL211',
   'ELL212',
   'ELP212',
   'ELL225',
   'ELP225',
   'ELL302',
   'ELP302',
   'ELL303',
   'ELP303',
   'ELL304',
   'ELL305',
   'ELP305',
   'ELL311',
   'ELP311',
   'ELD411'],
  'DE': ['ELL301',
   'ELL312',
   'ELL313',
   'ELL315',
   'ELL316',
   'ELL318',
   'ELL319',
   'ELL332',
   'ELL333',
   'ELL365',
   'ELL400',
   'ELL401',
   'ELL402',
   'ELL405',
   'ELL406',
   'ELL407',
   'ELL409',
   'ELL410',
   'ELL411',
   'ELL703',
   'ELL710',
   'ELL715',
   'ELL716',
   'ELL725',
   'ELL730',
   'ELL738',
   'ELL740',
   'ELL758',
   'ELL765',
   'ELS310'],
  'MTech': {'PC': [], 'PE': []}},
 'recommended': {1: [], 2: