# HMC Math Courses Web Scraping

### Imports

In [3]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import requests

from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Start a Chrome WebDriver session. ChromeDriverManager().install() provides the
# chromedriver executable handed to the Service (the cell output below shows it
# being downloaded).
# NOTE(review): no driver.quit() appears in the visible cells -- confirm the
# browser session is closed somewhere once scraping is done.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

[WDM] - Downloading: 100%|█████████████████| 8.84M/8.84M [00:00<00:00, 27.7MB/s]


### General Definitions/Setup

In [6]:
url = "https://www.hmc.edu/mathematics/program/mathematics-course-descriptions/"

# Open the catalog page in the Selenium-controlled browser.
# NOTE(review): nothing below reads driver.page_source -- the HTML that gets
# parsed comes from the separate requests call. The navigation is kept so the
# cell's side effects are unchanged. To parse the browser-rendered page instead,
# build the soup from driver.page_source.
driver.get(url)

# Fetch the raw HTML. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() prevents parsing an HTTP error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html5lib')

# Every later lookup (course headings, per-course <li> entries) is scoped to
# this <article class="content common"> element.
content = soup.find('article', attrs = {'class':'content common'})
if content is None:
    # Fail here with a clear message instead of an AttributeError in a later cell.
    raise RuntimeError(
        "Could not find <article class='content common'> on " + url
        + "; the page layout may have changed."
    )

# example use:
#res = soup.find(id = "content")
#print(res)

# r = requests.get(url)
# print(r.content) --> gives the raw html of the page

# soup = BeautifulSoup(r.content, 'html5lib') # creates a BeautifulSoup object from r.content, parsed with the html5lib parser
# print(soup.prettify()) --> prints a more readable (indented) version of the HTML

### Dictionary Between Course Num+Name and Its HTML Class

In [8]:
# Maps each course heading's link text (e.g. 'MATH073 HM - Linear Algebra') to
# whatever follows the last hyphen of that link's data-target attribute
# (e.g. '6948' in the output below).
course_class = {}

heading_attrs = {'class': 'd-inline font-weight-normal course-li-heading'}

for heading in content.find_all('h2', attrs=heading_attrs):
    link = heading.a
    label = link.text
    # rpartition keeps only the piece after the final '-'.
    course_class[label] = link['data-target'].rpartition('-')[2]

# Uncomment to inspect the mapping:
#print(course_class)

{'CSMT181 HM - Special Topics in Computer Science and Mathematics': '6961', 'CSMT183 HM - Computer Science and Mathematics Clinic I': '6897', 'CSMT184 HM - Computer Science and Mathematics Clinic II': '6898', 'MATH019 HM - Single and Multivariable Calculus': '6943', 'MATH021 HM - Mathematics of Games and Puzzles': '6950', 'MATH055 HM - Discrete Mathematics': '6772', 'MATH055A HM - Topics in Discrete Mathematics': '6902', 'MATH062 HM - Introduction to Probability and Statistics': '6966', 'MATH073 HM - Linear Algebra': '6948', 'MATH082 HM - Differential Equations': '6949', 'MATH092 HM - Mathematical Contest in Modeling/Interdisciplinary Contest in Modeling Seminar': '6773', 'MATH093 HM - Putnam Seminar': '6774', 'MATH094 HM - Problem Solving Seminar': '6775', 'MATH104 HM - Graph Theory': '6776', 'MATH106 HM - Combinatorics': '6777', 'MATH108 PZ - History of Mathematics': '6778', 'MATH109 CM - Introduction to the Mathematics of Finance': '6779', 'MATH119 HM - Advanced Mathematical Biology

##### List of All Course Numbers

In [26]:
# The course number is the first space-separated token of each course_class key
# (e.g. 'MATH073' from 'MATH073 HM - Linear Algebra'); order follows the dict.
course_numbers = [label.split(' ')[0] for label in course_class]

print(course_numbers)

['CSMT181', 'CSMT183', 'CSMT184', 'MATH019', 'MATH021', 'MATH055', 'MATH055A', 'MATH062', 'MATH073', 'MATH082', 'MATH092', 'MATH093', 'MATH094', 'MATH104', 'MATH106', 'MATH108', 'MATH109', 'MATH119', 'MATH131', 'MATH132', 'MATH136', 'MATH137', 'MATH138', 'MATH142', 'MATH143', 'MATH147', 'MATH148', 'MATH152', 'MATH153', 'MATH155', 'MATH156', 'MATH157', 'MATH158', 'MATH164', 'MATH165', 'MATH167', 'MATH168', 'MATH171', 'MATH172', 'MATH173', 'MATH174', 'MATH175', 'MATH176', 'MATH178', 'MATH179', 'MATH180', 'MATH181', 'MATH184', 'MATH187', 'MATH188', 'MATH189', 'MATH193', 'MATH196', 'MATH197', 'MATH198', 'MATH199', 'MCBI117', 'MCBI118A', 'MCBI118B', 'MCBI199']


### Dictionary Between Course Number and Its HTML Class

In [10]:
# Re-key course_class by course number only: the first space-separated token of
# each original key maps to the same value the full key had.
coursenum_class = {
    label.split(' ')[0]: class_id
    for label, class_id in course_class.items()
}
    
#print(coursenum_class)

### Dictionary Between Course Name and Its HTML Class

In [11]:
# Re-key course_class by course title only (the part of each key after the
# first '- ', e.g. 'Linear Algebra' from 'MATH073 HM - Linear Algebra').
coursename_class = {}

for numname in course_class:
    # maxsplit=1: split only at the first '- ' so a title that itself contains
    # '- ' is kept whole instead of being cut off at its second piece.
    name = numname.split('- ', 1)[1]
    coursename_class[name] = course_class[numname]
    
#print(coursename_class)

### Immediate Prereqs of a Course

In [14]:
def prereqs(coursenum):
    '''
    Look up the immediate prerequisites listed for a single course.

    Input: a course number that is a key of coursenum_class (e.g. 'MATH147');
           an unknown number raises KeyError
    Output: list of course numbers read from the 'pre-co-con-link' anchors in the
            last 'no-bottom-margin my-0' paragraph of that course's <li> entry
            (an empty list when that paragraph has no such anchors)
    '''
    entry_id = "course-id-" + coursenum_class[coursenum]
    entry = content.find('li', attrs={'id': entry_id})

    # Only the final paragraph with this class is searched for links.
    paragraphs = entry.find_all('p', attrs={'class': 'no-bottom-margin my-0'})
    links = paragraphs[-1].find_all('a', attrs={'class': 'pre-co-con-link'})

    # From each link's text keep what precedes any '</', then its first
    # space-separated token, i.e. the course number.
    return [anchor.text.split('</')[0].split(' ')[0] for anchor in links]

# Example: immediate prerequisites of MATH147 (result is echoed as the cell output).
prereqs('MATH147')

['MATH131']

### Does This Course Have Any Prereqs?

In [217]:
def hasprereqs(coursenum):
    '''
    Input: a course number
    Output: True if prereqs(coursenum) returns a non-empty list, False otherwise
    '''
    return bool(prereqs(coursenum))

# Example: check whether CSMT181 lists any immediate prerequisites.
hasprereqs('CSMT181')

False

### Total Prereqs of a Course

In [28]:
def totalprereqs(coursenum):
    '''
    Input: course number
    Output: a set of all the prerequisites of the given course, both immediate
            and inherited through other courses. prereqs are represented by
            their course number

    Only courses present in course_numbers are expanded. A prerequisite that is
    not in course_numbers is still included in the result, but its own
    prerequisites cannot be looked up. If coursenum itself is not in
    course_numbers the result is an empty set.

    Each course is expanded at most once, so a cyclic chain (e.g. two courses
    that list each other) terminates instead of recursing until RecursionError.
    In that case coursenum itself can appear in the returned set.
    '''
    known_courses = set(course_numbers)  # O(1) membership checks

    total_prerequisites = set()
    expanded = set()      # courses whose prereqs have already been collected
    pending = [coursenum]

    while pending:
        current = pending.pop()
        if current in expanded or current not in known_courses:
            continue
        expanded.add(current)

        for prereq in prereqs(current):
            total_prerequisites.add(prereq)
            if prereq not in expanded:
                pending.append(prereq)

    return total_prerequisites
        

# Example: every direct and inherited prerequisite of MATH168.
totalprereqs('MATH168')

{'CSCI042',
 'CSCI060',
 'CSCI070',
 'CSCI081',
 'MATH019',
 'MATH055',
 'MATH073',
 'MATH131'}

#### Finding the Largest Number of Immediate Prereqs Any Course Has

In [130]:
# Largest number of immediate prerequisites listed for any single course.
# prereqs() searches the parsed page on every call, so call it once per course;
# default=0 keeps the original result when course_numbers is empty.
largest = max((len(prereqs(num)) for num in course_numbers), default=0)

print(largest)

5


### Dictionary Between Courses and Their Total Prerequisites

In [248]:
# Course number -> set of all its prerequisites (direct and inherited), for
# every course in course_numbers.
totalPrereqs = {number: totalprereqs(number) for number in course_numbers}

print(totalPrereqs)

{'CSMT181': set(), 'CSMT183': set(), 'CSMT184': {'CSMT183'}, 'MATH019': set(), 'MATH021': set(), 'MATH055': {'MATH019', 'MATH073'}, 'MATH055A': set(), 'MATH062': {'MATH019', 'MATH073'}, 'MATH073': {'MATH019'}, 'MATH082': {'MATH073', 'MATH019'}, 'MATH092': set(), 'MATH093': set(), 'MATH094': set(), 'MATH104': {'MATH019', 'MATH073', 'MATH055'}, 'MATH106': {'MATH073', 'MATH019', 'MATH055'}, 'MATH108': {'MATH019'}, 'MATH109': {'MATH082', 'MATH019', 'MATH073'}, 'MATH119': {'MATH082', 'MATH073', 'BIOL046', 'MATH019', 'MCBI118A'}, 'MATH131': {'MATH073', 'MATH019', 'MATH055'}, 'MATH132': {'MATH131', 'MATH073', 'MATH019', 'MATH055'}, 'MATH136': {'MATH082', 'MATH019', 'MATH073'}, 'MATH137': {'MATH131', 'MATH073', 'MATH055', 'MATH132', 'MATH019'}, 'MATH138': {'MATH131', 'MATH073', 'MATH055', 'MATH137', 'MATH132', 'MATH019'}, 'MATH142': {'MATH082', 'MATH019', 'MATH073'}, 'MATH143': {'MATH131', 'MATH082', 'MATH147', 'MATH073', 'MATH055', 'MATH142', 'MATH019'}, 'MATH147': {'MATH131', 'MATH073', 'MAT

In [27]:
# Course number -> how many distinct prerequisites (direct and inherited) it has.
prereqAmounts = {number: len(totalprereqs(number)) for number in course_numbers}

print(prereqAmounts)

{'CSMT181': 0, 'CSMT183': 0, 'CSMT184': 1, 'MATH019': 0, 'MATH021': 0, 'MATH055': 2, 'MATH055A': 0, 'MATH062': 2, 'MATH073': 1, 'MATH082': 2, 'MATH092': 0, 'MATH093': 0, 'MATH094': 0, 'MATH104': 3, 'MATH106': 3, 'MATH108': 1, 'MATH109': 3, 'MATH119': 5, 'MATH131': 3, 'MATH132': 4, 'MATH136': 3, 'MATH137': 5, 'MATH138': 6, 'MATH142': 3, 'MATH143': 7, 'MATH147': 4, 'MATH148': 2, 'MATH152': 1, 'MATH153': 0, 'MATH155': 0, 'MATH156': 5, 'MATH157': 4, 'MATH158': 0, 'MATH164': 4, 'MATH165': 3, 'MATH167': 5, 'MATH168': 8, 'MATH171': 3, 'MATH172': 4, 'MATH173': 4, 'MATH174': 4, 'MATH175': 3, 'MATH176': 4, 'MATH178': 13, 'MATH179': 3, 'MATH180': 5, 'MATH181': 6, 'MATH184': 7, 'MATH187': 2, 'MATH188': 3, 'MATH189': 0, 'MATH193': 0, 'MATH196': 0, 'MATH197': 0, 'MATH198': 0, 'MATH199': 0, 'MCBI117': 0, 'MCBI118A': 4, 'MCBI118B': 2, 'MCBI199': 0}
