In [79]:
from openpyxl import Workbook
from openpyxl import load_workbook
from tqdm.notebook import tqdm
from os import listdir
import pandas as pd
import tabula

## Cleaning up Courses Offered

IITD Eacads publishes the courses offered as a very messy CSV file (probably exported from their own Excel sheet). The Excel file itself has a lot of blank merged columns, so it is easier to download the Excel file, strip the useless parts out of it, and then export the result to CSV.

In [71]:
# Directory holding the raw Excel workbooks; it is listed and loaded by the cells below.
COURSES_OFFERED_RAW_PATH = "data/courses_offered_raw"
# Directory that receives the cleaned CSV files, one per input workbook.
COURSES_OFFERED_CLEAN_PATH = "data/courses_offered"

In [72]:
# Every entry in the raw directory is treated as a workbook to clean (no extension filter is applied).
datasheets = listdir(COURSES_OFFERED_RAW_PATH)

In [73]:
def clean_dataframe(df):
    """Promote the first row to the header and tidy a courses-offered sheet.

    The frame is cleaned **in place and also returned**, so both
    ``clean_dataframe(df)`` and ``df = clean_dataframe(df)`` leave the caller
    holding the cleaned data. Earlier, every step after the header assignment
    only rebound a local name and nothing was returned, so the caller kept
    the uncleaned frame (header row still present, no index, no dtypes).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet values. The row labelled ``0`` must contain the column
        names, which must include ``'S.No'``, ``'Vacancy'``,
        ``'Current Strength'`` and ``'Instructor'``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``: header row removed, the three count
        columns cast to ``int32``, ``'S.No'`` used as the index and
        surrounding whitespace stripped from ``'Instructor'``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a count column holds a value that cannot be converted to an
        integer (for example an empty cell).
    """
    # The first row carries the real column names.
    df.columns = df.iloc[0]
    df.drop([0], inplace=True)
    # Cast column by column so the caller's frame itself is updated.
    for col in ("S.No", "Vacancy", "Current Strength"):
        df[col] = df[col].astype("int32")
    df.set_index(["S.No"], inplace=True)
    df["Instructor"] = df["Instructor"].str.strip()
    return df

In [74]:
# Slow: roughly 10.5 seconds per workbook.
for name in tqdm(datasheets):
    stem = name.split(".")[0]
    wb = load_workbook(f"{COURSES_OFFERED_RAW_PATH}/{name}")
    ws = wb[wb.sheetnames[0]]
    # 1-based positions (in the untouched sheet) of the columns to discard.
    # Each deletion shifts the later columns one place to the left, so the
    # number of columns already removed is subtracted from the position.
    unwanted_cols = [3,5,7,10,12,13,16,18,21]
    for already_removed, position in enumerate(unwanted_cols):
        ws.delete_cols(position - already_removed)
    # Drop the four rows above the header row.
    ws.delete_rows(1, 4)
    df = pd.DataFrame(ws.values)
    clean_dataframe(df)
    df.to_csv(f"{COURSES_OFFERED_CLEAN_PATH}/{stem}.csv")

  0%|          | 0/7 [00:00<?, ?it/s]

## Course Data 

The course structures have to be taken from the Courses of Study. I could not find a _clean_ way of doing this, so it is partly manual labour and partly utility scripts that generate the credit structure/timetable/DE/PL data.

In [182]:
# Path of the Courses of Study PDF that every tabula.read_pdf call below parses.
COS = "Courses-of-Study_2020-2021.pdf"

# Programme code -> PDF page holding that programme's credit-structure table.
# The extraction loop reads the recommended course plan from the following page (page + 1).
PROGRAM_4Y_PAGE = {
    "AM1": 44,
    "BB1": 46,
    "CH1": 48,
    "CE1": 53,
    "CS1": 55,
    "EE1": 60,
    "EE3": 62,
    "MS1": 64,
    "ME1": 66,
    "ME2": 68,
    "MT1": 70,
    "PH1": 75,
    "TT1": 77
}
# Same kind of mapping, presumably for the 5-year programmes; not used by the
# code visible in this notebook section - TODO confirm.
PROGRAM_5Y_PAGE = {
    "CH7": 50,
    "CS5": 57,
    "MT6": 72
}

# Column-boundary and area settings in the form tabula.read_pdf accepts for its
# `columns` / `area` arguments (area is [top, left, bottom, right] per the
# tabula-py docs). Presumably for the left- and right-hand halves of a page;
# not used by the code visible here - TODO confirm.
COLUMNS_LHS = [80, 257, 265, 276, 287]
AREA_LHS = [220, 40, 800, 297]

COLUMNS_RHS = [343, 511, 520, 530, 540]
AREA_RHS = [94, 309, 800, 560]

# Settings for the credit-structure table: a single column boundary plus the
# page area to read. The 4Y area is passed to tabula.read_pdf in the extraction
# loop; the 5Y variant is not used by the code visible here.
COLUMNS_CRED_STRUCT = [242]
AREA_CRED_STRUCT_4Y = [106, 40, 220, 295]
AREA_CRED_STRUCT_5Y = [106, 40, 290, 295]

# Row positions in the extracted course-plan table that hold the course codes
# of semesters 1..8 (one entry per semester, in order).
CPLAN_IDX_4Y = [0, 3, 7, 10, 13, 16, 19, 22]

# XML fragment inserted verbatim between <courses> and </courses> of every
# generated programme file.
COMMON_COURSES = """
    <eas>
      <course>APL100</course>
      <course>CVL100</course>
      <course>COL100</course>
      <course>ELL101</course>
      <course>MCP100</course>
      <course>MCP101</course>
    </eas>
    <bs>
      <course>CML101</course>
      <course>CMP100</course>
      <course>MTL100</course>
      <course>MTL101</course>
      <course>PYL101</course>
      <course>PYP100</course>
      <course>SBL100</course>
    </bs>
"""

# Directory that receives the generated <programme>.xml files.
FILE_LOC = "data/course_plans_temp"

In [208]:
# Generate one XML course-plan file per 4-year programme.
# For each programme the credit structure is read from its page of the Courses
# of Study and the recommended semester-wise plan from the following page.
for dep, pgno in tqdm(PROGRAM_4Y_PAGE.items()):
    strlist = []
    cred_struct = tabula.read_pdf(
        COS, area=AREA_CRED_STRUCT_4Y, columns=COLUMNS_CRED_STRUCT, pages=[pgno]
    )[0]
    # Fixed credits for the three institute-core categories; the remaining
    # categories are read from the extracted table.
    cs = {'bs': 24, 'eas': 19, 'huss': 15}
    # Category tag -> row position in the credit-structure table. The credits
    # sit in the second column; `.iloc[row, 1]` is used because integer-key
    # `Series[...]` lookup by position is deprecated in pandas.
    for tag, row in (('pl', 4), ('dc', 6), ('de', 7), ('oe', 8)):
        cs[tag] = cred_struct.iloc[row, 1]
    strlist.append(f'<program name="{dep}" type="4y">\n  <credits>\n')
    for key, credits in cs.items():
        # `:g` drops the trailing ".0" of whole-number floats.
        strlist.append(f"    <{key}>{credits:g}</{key}>\n")
    strlist.append('  </credits>\n  <courses>')
    strlist.append(COMMON_COURSES)
    strlist.append('  </courses>\n  <recommended>\n')

    plan = tabula.read_pdf(COS, pages=[pgno+1])[0]
    # One <sem> element per entry of CPLAN_IDX_4Y, numbered from 1.
    for sem_num, sem_row in enumerate(CPLAN_IDX_4Y, start=1):
        strlist.append(f'    <sem num="{sem_num}">\n')
        # Columns 1..9 of the row hold the course codes; empty cells are NaN.
        for c in plan.iloc[sem_row].iloc[1:10]:
            if not pd.isna(c):
                course = c.replace(" ","")
                strlist.append(f'      <course>{course}</course>\n')
        strlist.append('    </sem>\n')
    strlist.append('  </recommended>\n</program>')

    # XML without a declaration is read as UTF-8, so write it as UTF-8
    # regardless of the platform default encoding.
    with open(f"{FILE_LOC}/{dep}.xml", "w", encoding="utf-8") as f:
        f.write("".join(strlist))

  0%|          | 0/13 [00:00<?, ?it/s]

### Scratch

In [185]:
# Scratch: first table tabula finds on page 61 of the Courses of Study PDF.
df = tabula.read_pdf(COS, pages=[61])[0]

In [175]:
# Scratch: display whatever credit-structure table `cred_struct` currently holds.
cred_struct

Unnamed: 0,Course Category,Credits
0,Institute Core Courses,
1,Basic Sciences (BS),24.0
2,Engineering Arts and Science (EAS),19.0
3,Humanities and Social Sciences (HuSS),15.0
4,Programme-linked Courses,15.0
5,Departmental Courses,
6,Departmental Core,60.0
7,Departmental Electives,10.0
8,Open Category Courses,10.0
9,Total Graded Credit requirement,153.0


In [202]:
# Scratch: print the non-empty cells in columns 1..9 of row 22 of the table
# loaded above (22 is the last row position listed in CPLAN_IDX_4Y).
for entry in df.iloc[22].iloc[1:10]:
    if pd.notna(entry):
        print(entry)

DE 3
OC2
OC3
HUL3XX
