# Course Graph Notebook

Goal: Process the raw course data in `courses.csv` into a JSON file (`course_graph.json`) that the front end (FE) renders as a network graph with the D3.js library.

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Load data
# index_col=0 uses the first CSV column as the DataFrame index instead of
# loading it as a regular data column.
df = pd.read_csv('./courses.csv', index_col=0)
# Preview the first rows to check the file was read as expected
df.head()

Unnamed: 0,Code,Name,Minor,Certificate,Division,Course Description,Department,Pre-requisites,Course Level,UTSC Breadth,...,Recommended Preparation,Arts and Science Breadth,Arts and Science Distribution,Later term course details,Course,FASEAvailable,MaybeRestricted,MajorsOutcomes,MinorsOutcomes,AIPreReqs
0,ENGB29H3,Shakespeare and Film,[],[],University of Toronto Scarborough,The history of Shakespeare and (on) film is lo...,English (UTSC),"['ENGA11H3', 'ENGA10H3', 'ENGB70H3']",2,"Arts, Literature & Language",...,[],,,,<a href=/course/ENGB29H3>ENGB29H3</a>,False,False,[],[],[]
1,JMU421H1,Advanced Jazz Arranging & Orchestration II,[],[],Faculty of Music,Continuation of JMU420H1.,Faculty of Music,['JMU420H1'],4,,...,[],,,,<a href=/course/JMU421H1>JMU421H1</a>,False,False,[],[],[]
2,PSY220H5,Introduction to Social Psychology,[],[],University of Toronto Mississauga,A survey of classic and contemporary research ...,Psychology,['PSY100Y5'],2,,...,[],,,,<a href=/course/PSY220H5>PSY220H5</a>,False,False,[],[],[]
3,ARC382H1,"Structures, Building Systems, and Environments II",[],[],"John H. Daniels Faculty of Architecture, Lands...",Continued exploration of the principles of str...,"John H. Daniels Faculty of Architecture, Lands...",['ARC281H1'],3,,...,[],,,,<a href=/course/ARC382H1>ARC382H1</a>,False,False,[],[],[]
4,ACMC01H3,ACMEE Applied Practice I,[],[],University of Toronto Scarborough,"A study of the arts, culture and/or media sect...","Dept. of Arts, Culture & Media (UTSC)","['VPAB17H3', 'VPAB16H3']",3,"Arts, Literature & Language",...,[],,,,<a href=/course/ACMC01H3>ACMC01H3</a>,False,False,[],[],[]


In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4698 entries, 0 to 4697
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Code                           4698 non-null   object
 1   Name                           4698 non-null   object
 2   Minor                          4698 non-null   object
 3   Certificate                    4698 non-null   object
 4   Division                       4698 non-null   object
 5   Course Description             4624 non-null   object
 6   Department                     4698 non-null   object
 7   Pre-requisites                 4698 non-null   object
 8   Course Level                   4698 non-null   int64 
 9   UTSC Breadth                   1251 non-null   object
 10  APSC Electives                 1793 non-null   object
 11  Campus                         4698 non-null   object
 12  Term                           4698 non-null   object
 13  Act

In [4]:
# Keep only the columns that are needed to build the course graph
cols = ['Code', 'Name', 'Division', 'Department', 'Pre-requisites', 'Recommended Preparation']
df = df[cols]

# Normalise the headers to lower-case underscore names
# (spaces and hyphens both become underscores)
df.columns = [col.replace(' ', '_').replace('-', '_').lower() for col in cols]
print(df.shape)

# Restrict the frame to Faculty of Applied Science & Engineering courses;
# reset_index() renumbers the rows from 0 and keeps the old labels in an 'index' column
df = df.loc[df['division'] == 'Faculty of Applied Science & Engineering'].reset_index()
df.head()


(4698, 6)


Unnamed: 0,index,code,name,division,department,pre_requisites,recommended_preparation
0,11,ECE557H1,Linear Control Theory,Faculty of Applied Science & Engineering,Division of Engineering Science,['ECE356H1'],[]
1,13,ECE344H1,Operating Systems,Faculty of Applied Science & Engineering,Edward S. Rogers Sr. Dept. of Electrical & Com...,"['ECE243H1', 'ECE244H1']",[]
2,15,AER210H1,Vector Calculus & Fluid Mechanics,Faculty of Applied Science & Engineering,Division of Engineering Science,[],['PHY180H1']
3,30,MIE498Y1,Research Thesis,Faculty of Applied Science & Engineering,Mechanical & Industrial Engineering,[],[]
4,38,CHE210H1,Heat and Mass Transfer,Faculty of Applied Science & Engineering,Chemical Engineering and Applied Chemistry,['CHE221H1'],[]


In [5]:
# Shape of the subset of courses whose recommended preparation is not the
# empty-list literal (the column still holds strings at this point)
df.loc[df['recommended_preparation'].ne('[]')].shape

(16, 7)

In [6]:
import ast # Abstract Syntax Trees


def _parse_list_literal(value):
    """Return the Python object encoded by a list literal string.

    The CSV stores the list columns as their string representation
    (e.g. "['ECE243H1', 'ECE244H1']"), which ``ast.literal_eval`` parses safely.
    Non-string values are returned unchanged, so re-running this cell after the
    columns were already converted does not raise ``ValueError``.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# Convert string representation of list to list type
df['pre_requisites'] = df['pre_requisites'].apply(_parse_list_literal)
df['recommended_preparation'] = df['recommended_preparation'].apply(_parse_list_literal)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    473 non-null    int64 
 1   code                     473 non-null    object
 2   name                     473 non-null    object
 3   division                 473 non-null    object
 4   department               473 non-null    object
 5   pre_requisites           473 non-null    object
 6   recommended_preparation  473 non-null    object
dtypes: int64(1), object(6)
memory usage: 26.0+ KB


In [7]:
# Preview the first rows again now that the list columns have been parsed
df.head()

Unnamed: 0,index,code,name,division,department,pre_requisites,recommended_preparation
0,11,ECE557H1,Linear Control Theory,Faculty of Applied Science & Engineering,Division of Engineering Science,[ECE356H1],[]
1,13,ECE344H1,Operating Systems,Faculty of Applied Science & Engineering,Edward S. Rogers Sr. Dept. of Electrical & Com...,"[ECE243H1, ECE244H1]",[]
2,15,AER210H1,Vector Calculus & Fluid Mechanics,Faculty of Applied Science & Engineering,Division of Engineering Science,[],[PHY180H1]
3,30,MIE498Y1,Research Thesis,Faculty of Applied Science & Engineering,Mechanical & Industrial Engineering,[],[]
4,38,CHE210H1,Heat and Mass Transfer,Faculty of Applied Science & Engineering,Chemical Engineering and Applied Chemistry,[CHE221H1],[]


In [9]:
from sklearn import preprocessing

# Course codes present in the filtered frame; only these may appear as link endpoints
uniq_nodes = list(df.code.unique())
# Set copy of the codes so each membership test below is O(1) instead of a list scan
known_codes = set(uniq_nodes)

# Get unique nodes + their group (Department) first
df_temp = df[['code', 'department']].copy()
df_temp.columns = ['id', 'group'] # to match D3.js library convention
# Encode group column to numbers from 0 to n_classes - 1
df_temp['group'] = preprocessing.LabelEncoder().fit_transform(df_temp['group'])
# Convert df to array of dicts
nodes = df_temp.to_dict(orient='records')

# Then directed links from each pre-requisite / recommended-preparation course
# to the course that lists it. Iterating over the columns directly avoids
# relying on the frame having a 0..n-1 integer index.
links = []
for course_code, pre_reqs, rec_preps in zip(
    df['code'], df['pre_requisites'], df['recommended_preparation']
):
    for pre in pre_reqs + rec_preps:
        # Skip listed courses that have no node of their own (they are not in
        # the filtered frame), so every link endpoint matches a node id
        if pre in known_codes:
            links.append({
                'source': pre,
                'target': course_code,
                'value': 1
            })

print(f'number of nodes: {len(nodes)}')
print(f'number of links: {len(links)}')

number of nodes: 473
number of links: 438


In [10]:
# Naming the columns explicitly keeps the frame (and the groupby below) valid
# even when no links were found and `links` is an empty list
links_df = pd.DataFrame(links, columns=['source', 'target', 'value'])
print(f'before aggregating rows: {links_df.shape[0]}')

# Aggregate rows with same source and target by summing their values;
# as_index=False keeps 'source' and 'target' as regular columns
links_df = links_df.groupby(['source', 'target'], as_index=False).sum()
print(f'after aggregating rows: {links_df.shape[0]}')
links_df.head()


before aggregating rows: 438
after aggregating rows: 438


Unnamed: 0,source,target,value
0,AER210H1,AER301H1,1
1,AER210H1,AER307H1,1
2,AER210H1,ECE259H1,1
3,AER210H1,ECE367H1,1
4,AER210H1,MIE520H1,1


In [11]:
# Turn the aggregated link table back into a list with one dict per row
links = links_df.to_dict('records')
links[0:5]

[{'source': 'AER210H1', 'target': 'AER301H1', 'value': 1},
 {'source': 'AER210H1', 'target': 'AER307H1', 'value': 1},
 {'source': 'AER210H1', 'target': 'ECE259H1', 'value': 1},
 {'source': 'AER210H1', 'target': 'ECE367H1', 'value': 1},
 {'source': 'AER210H1', 'target': 'MIE520H1', 'value': 1}]

In [12]:
# Bundle the node and link lists into a single graph object and serialise it as JSON
g = dict(nodes=nodes, links=links)

with open('./course_graph.json', 'w') as f:
    f.write(json.dumps(g))