In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import networkx as nx
import re
import os
import matplotlib.pyplot as plt
from networkx.drawing.nx_pydot import write_dot
from tqdm.notebook import tqdm

In [2]:
RCDATA = "data/cos_course_data_raw.xml"

In [4]:
with open(RCDATA, 'r', encoding='utf-8') as cdata:
    s = "".join(cdata.readlines())
    tree = BeautifulSoup(s)

In [5]:
rows = []
for dep in tree.findAll("courses"):
    for course in dep.findAll("course"):
        rows.append({
            "code": getattr(course.find("code"), "string", None),
            "name": getattr(course.find("name"), "string", None),
            "credits": getattr(course.find("credits"), "string", None),
            "credit-structure": getattr(course.find("credit-structure"), "string", None),
            "pre-requisites": getattr(course.find("pre-requisites"), "string", None),
            "overlap": getattr(course.find("overlap"), "string", None),
            "department": dep.get("department"),
            "description": getattr(course.find("description"), "string", None)
        })
        
df = pd.DataFrame(rows)

In [6]:
df

Unnamed: 0,code,name,credits,credit-structure,pre-requisites,overlap,department,description
0,APL100,Engineering Mechanics,4,3-1-0,,,Department of Applied Mechanics,"Kinematics, Statics, Equations of Motion, Rigi..."
1,APL101,Applied Mathematics in Engineering Applications,3,3-0-0,,,Department of Applied Mechanics,ordinary Differential Equation: Second order o...
2,APL102,Introduction to Materials Science and Engineering,4,3-0-2,,,Department of Applied Mechanics,Structure of Solids: atomic and inter-atomic b...
3,APL103,Experimental Methods,4,3-0-2,,,Department of Applied Mechanics,Experimental Analysis: Types of measurements a...
4,APL104,Solid Mechanics,4,3-1-0,APL100,"APL105, APL108",Department of Applied Mechanics,"Introduction, State of stress at a point, equa..."
...,...,...,...,...,...,...,...,...
2387,JOL794,Selected Topics-II,3,3-0-0,,,Department of Textile and Fibre Engineering,
2388,JOS795,Independent Study,3,0-3-0,,,Department of Textile and Fibre Engineering,
2389,JOV796,Selected Topics in Photonics,1,1-0-0,,,Department of Textile and Fibre Engineering,
2390,JOD801,Major Project Part-I,6,0-0-12,,,Department of Textile and Fibre Engineering,


## Pre-Processing
Add year + semwise data

In [7]:
YEARS = ["2122", "2021", "1920"]
COURSES_OFFERED_PATH = "data/courses_offered_anon"
year = YEARS[2]
i = 2
for year in YEARS:
    for i in [1,2]:
        sem_yr_data = pd.read_csv(f"{COURSES_OFFERED_PATH}/{i}_{year}.csv")
        sem_yr_data = sem_yr_data.rename(columns={'Course Code': 'code'})
        # drop all courses with the slot as SU1 or X
        sem_yr_data = sem_yr_data.loc[(sem_yr_data['Slot Name'] != 'X') & (~sem_yr_data['Slot Name'].str.startswith('SU'))]
        df = df.merge(sem_yr_data[['code','Slot Name', 'Vacancy', 'Current Strength']],on='code',how='left')
        df = df.rename(columns={'Slot Name': f"{i}_{year}_slot", 'Vacancy': f"{i}_{year}_vacancy", 'Current Strength': f"{i}_{year}_strength"})

df

Unnamed: 0,code,name,credits,credit-structure,pre-requisites,overlap,department,description,1_2122_slot,1_2122_vacancy,...,1_2021_strength,2_2021_slot,2_2021_vacancy,2_2021_strength,1_1920_slot,1_1920_vacancy,1_1920_strength,2_1920_slot,2_1920_vacancy,2_1920_strength
0,APL100,Engineering Mechanics,4,3-1-0,,,Department of Applied Mechanics,"Kinematics, Statics, Equations of Motion, Rigi...",A,500.0,...,24.0,A,600.0,0.0,A,500.0,589.0,A,250.0,568.0
1,APL101,Applied Mathematics in Engineering Applications,3,3-0-0,,,Department of Applied Mechanics,ordinary Differential Equation: Second order o...,B,200.0,...,,,,,,,,,,
2,APL102,Introduction to Materials Science and Engineering,4,3-0-2,,,Department of Applied Mechanics,Structure of Solids: atomic and inter-atomic b...,,,...,371.0,E,225.0,149.0,E,350.0,307.0,E,300.0,134.0
3,APL103,Experimental Methods,4,3-0-2,,,Department of Applied Mechanics,Experimental Analysis: Types of measurements a...,F,150.0,...,,,,,F,150.0,97.0,,,
4,APL104,Solid Mechanics,4,3-1-0,APL100,"APL105, APL108",Department of Applied Mechanics,"Introduction, State of stress at a point, equa...",D,300.0,...,178.0,,,,D,150.0,153.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2747,JOL794,Selected Topics-II,3,3-0-0,,,Department of Textile and Fibre Engineering,,,,...,,,,,,,,,,
2748,JOS795,Independent Study,3,0-3-0,,,Department of Textile and Fibre Engineering,,,,...,,,,,,,,,,
2749,JOV796,Selected Topics in Photonics,1,1-0-0,,,Department of Textile and Fibre Engineering,,,,...,,,,,,,,,,
2750,JOD801,Major Project Part-I,6,0-0-12,,,Department of Textile and Fibre Engineering,,,,...,13.0,P,50.0,0.0,P,60.0,23.0,,,


In [8]:
pd.concat(g for _, g in df.groupby("code") if len(g) > 1)

Unnamed: 0,code,name,credits,credit-structure,pre-requisites,overlap,department,description,1_2122_slot,1_2122_vacancy,...,1_2021_strength,2_2021_slot,2_2021_vacancy,2_2021_strength,1_1920_slot,1_1920_vacancy,1_1920_strength,2_1920_slot,2_1920_vacancy,2_1920_strength
564,COL106,Data Structures and Algorithms,5,3-0-4,COL100,,Department of Computer Science and Engineering,Introduction to object-oriented programming th...,F,405.0,...,473.0,F,200.0,199.0,F,180.0,373.0,F,385.0,344.0
565,COL106,Data Structures and Algorithms,5,3-0-4,COL100,,Department of Computer Science and Engineering,Introduction to object-oriented programming th...,F,405.0,...,473.0,AD,200.0,202.0,F,180.0,373.0,F,385.0,344.0
325,CVL100,Environmental Science,2,2-0-0,,,Department of Chemistry,Pollutant sources and control in air and water...,C,800.0,...,1.0,C,0.0,0.0,C,500.0,529.0,C,533.0,486.0
326,CVL100,Environmental Science,2,2-0-0,,,Department of Chemistry,Pollutant sources and control in air and water...,C,800.0,...,1.0,C1,250.0,308.0,C,500.0,529.0,C,533.0,486.0
327,CVL100,Environmental Science,2,2-0-0,,,Department of Chemistry,Pollutant sources and control in air and water...,C,800.0,...,1.0,C2,260.0,302.0,C,500.0,529.0,C,533.0,486.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,SPL810,Advanced Topics in Policy Studies,3,3-0-0,,,Department of Textile and Fibre Engineering,This course will introduce students to advance...,,,...,,AC,35.0,3.0,,,,,,
2288,TXP222,Yarn Manufacture Laboratory-II,1,0-0-2,TXP221,,Department of Textile and Fibre Engineering,Experiments related to the lecture course enti...,B,120.0,...,,P,0.0,0.0,B,120.0,117.0,,,
2289,TXP222,Yarn Manufacture Laboratory-II,1,0-0-2,TXP221,,Department of Textile and Fibre Engineering,Experiments related to the lecture course enti...,P,0.0,...,,P,0.0,0.0,B,120.0,117.0,,,
2297,TXP242,Technology of Textile Coloration Lab,1,0-0-3,TXL241 and TXP241,,Department of Textile and Fibre Engineering,The principles of dyeing and printing of texti...,E,120.0,...,,P,0.0,0.0,E,120.0,111.0,,,


Naive bayes prediction: Say a course has historically been offered in two slots, A and B. The probability that the course is offered in slot A, given that it's offered is
$$P\left(A|\text{Off}\right) = \frac{P(A)\cdot P(\text{Off}|A)}{P(Off)} = \frac{P(A)}{P(\text{Off})} = \frac{n(A)}{n(\text{Off})}$$

This is easy enough to compute and predict for all courses.

We'll need to do something different for years, though. First year cores alternate, while programme cores (CVL/SBL) are offered in every sem. Department cores generally alternate. DE's/PE's are the main targets of this. We can do a similar classification to the above for them.

In [9]:
#
# calculate the offering scores for the different courses.
# it's ok to do the list comprehension: not too bad a performance hit here.
#

for y in years:
    for i in [1,2]:
        sltdata = df[f'{i}_{y}_slot']

NameError: name 'years' is not defined