# Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# 1. Create URL map

In [2]:
# base URL 
base = 'https://www.scu.edu/bulletin/undergraduate/'

In [3]:
# Send a GET request to fetch the page content
response = requests.get(base)

# Check if the request was successful
if response.status_code == 200:
    page_content = response.text
else:
    print(f"Failed to retrieve content from {base}")
    exit()

# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(page_content, 'html.parser')

# Initialize an empty list to store course details
courses = []

In [4]:
print(soup)

<!DOCTYPE html>

<html lang="en">
<head>
<title>SCU Undergraduate Bulletin 2024-25 - </title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@4.6.2/dist/css/bootstrap.min.css" rel="stylesheet"/>
<script src="https://cdn.jsdelivr.net/npm/jquery@3.6.4/dist/jquery.slim.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.1/dist/umd/popper.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.2/dist/js/bootstrap.bundle.min.js"></script>
<link crossorigin="anonymous" href="https://pro.fontawesome.com/releases/v5.15.3/css/all.css" integrity="sha384-iKbFRxucmOHIcpWdX9NTZ5WETOPm0Goy0WmfyNcl52qSYtc2Buk0NCe6jU1sWWNB" rel="stylesheet"/>
<link href="https://getbootstrap.com/docs/4.0/components/navbar/" rel="canonical"/>
<link href="https://assets.scu.edu/public/scu-bulletin.css" rel="stylesheet"/>
<link crossorigin="anonymous" href="https://pro.fontawesome.com/

In [5]:
sidebar = soup.find_all('ul', class_='bltFolder')

for i, elm in enumerate(sidebar):
    print(i, elm)

0 <ul class="bltFolder"></ul>
1 <ul class="bltFolder"><li id="dd5c50445b36"><span class="lv-2"><a class="a_dd5c50445b36" href="./chapter-1-university-mission/santa-clara-university.html#dd5c50445b36"> Santa Clara University</a></span><li id="3f81b93f4992"><span class="lv-2"><a class="a_3f81b93f4992" href="./chapter-1-university-mission/academic-programs.html#3f81b93f4992"> Academic Programs</a></span><li id="4c6670fd2265"><span class="lv-2"><a class="a_4c6670fd2265" href="./chapter-1-university-mission/centers-of-distinction.html#4c6670fd2265"> Centers of Distinction</a></span><li id="67ac4cdd0bb9"><span class="lv-2"><a class="a_67ac4cdd0bb9" href="./chapter-1-university-mission/faculty.html#67ac4cdd0bb9"> Faculty</a></span><li id="257a1d692343"><span class="lv-2"><a class="a_257a1d692343" href="./chapter-1-university-mission/student-body.html#257a1d692343"> Student Body</a></span><li id="5d68052938a6"><span class="lv-2"><a class="a_5d68052938a6" href="./chapter-1-university-mission/al

In [6]:
college_tag = {
    'CAS': sidebar[3],
    'LSB': sidebar[4],
    'SOE': sidebar[5]
}

print(type(sidebar[0]))

<class 'bs4.element.Tag'>


In [7]:
url_map = {}
skip = set([
    ' Undergraduate Degrees',
    ' Centers Institutes and Special Programs'
])

for college, tag in college_tag.items(): 
    print(college)
    for anchor in tag.find_all('a'):
        department = anchor.get_text()
        if department in skip: 
            continue

        url = base + anchor.get('href')[2:]

        url_map[(college, department)] = url

        print('\t', department, ':', url) 

    print('')

CAS
	  Anthropology : https://www.scu.edu/bulletin/undergraduate/chapter-3-college-of-arts-and-sciences/anthropology.html#f7bf97d9b9e0
	  Art  and  Art  History : https://www.scu.edu/bulletin/undergraduate/chapter-3-college-of-arts-and-sciences/art-and-art-history.html#f7994386b4fd
	  Biology : https://www.scu.edu/bulletin/undergraduate/chapter-3-college-of-arts-and-sciences/biology.html#3aca1358926e
	  Chemistry and Biochemistry : https://www.scu.edu/bulletin/undergraduate/chapter-3-college-of-arts-and-sciences/chemistry-and-biochemistry.html#cb812009d91c
	  Child Studies : https://www.scu.edu/bulletin/undergraduate/chapter-3-college-of-arts-and-sciences/child-studies.html#07937c069e5d
	  Classics : https://www.scu.edu/bulletin/undergraduate/chapter-3-college-of-arts-and-sciences/classics.html#23ebc4ecf83a
	  Communication : https://www.scu.edu/bulletin/undergraduate/chapter-3-college-of-arts-and-sciences/communication.html#fc3a72b696d8
	  Economics : https://www.scu.edu/bulletin/unde

# 2. Retrieve Course Data

In [8]:
def get_course_info(soupObj: BeautifulSoup, college, department, show=False):
    content = soupObj.find('body', class_='doc-content')

    col = []
    dep = []
    num = []
    cou = []
    des = []

    next = False
    for elm in content.children:
        if next:
            des.append(elm.get_text())
            
            next = False

            if show:
                print(elm.get_text())

        if elm.name == 'h3':
            arr = elm.get_text().split('.')
            number = arr[0]
            course = ''.join(arr[1:])

            col.append(college)
            dep.append(department)
            num.append(number)
            cou.append(course)

            next = True

            if show:
                print(elm.get_text())

    if next:
        des.append('-')
    
    return col, dep, num, cou, des 

In [9]:
colleges = []
departments = []
numbers = []
courses = []
descriptions = []

for (college, department), url in url_map.items():
    # Send a GET request to fetch the page content
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        page_content = response.text
    else:
        print(f"Failed to retrieve content from {base}")
        exit()

    # Create a BeautifulSoup object to parse the HTML content
    soupObj = BeautifulSoup(page_content, 'html.parser')
    col, dep, num, cou, des = get_course_info(soupObj, college, department)

    colleges += col
    departments += dep
    numbers += num 
    courses += cou
    descriptions += des

In [10]:
print(len(colleges))
print(len(departments))
print(len(numbers))
print(len(courses))
print(len(descriptions))

2686
2686
2686
2686
2686


In [None]:
tag_map = {'Anthropology': 'ANTH',
 'Art  and  Art  History': 'ARTH',
 'Biology': 'BIOL',
 'Chemistry and Biochemistry': 'CHEM',
 'Child Studies': 'CHST',
 'Classics': 'CLAS',
 'Communication': 'COMM',
 'Economics': 'ECON',
 'English': 'ENGL',
 'Environmental  Studies  and  Sciences': 'ENVS',
 'Ethnic Studies': 'ETHN',
 'Gender and Sexuality Studies': 'WGST',
 'History': 'HIST',
 'Mathematics and Computer Science': 'MATH',
 'Arabic Studies': 'ARAB',
 'Chinese Studies': 'CHIN',
 'French Studies': 'FREN',
 'German Studies': 'GERM',
 'Italian Studies': 'ITAL',
 'Japanese Studies': 'JAPN',
 'Spanish Studies': 'SPAN',
 'Music': 'MUSC',
 'Neuroscience': 'NEUR',
 'Philosophy': 'PHIL',
 'Physics': 'PHYS',
 'Political Science': 'POLI',
 'Psychology': 'PSYC',
 'Public  Health  Department': 'PHSC',
 'Scripture and Tradition (SCTR)' : 'SCTR',
 'Theology, Ethics, and Spirituality (TESP)': 'TESP',
 'Religion and Society (RSOC)': 'RSOC',
 'Sociology': 'SOCI',
 'Theatre': 'THTR',
 'Dance': 'DANC',
 'Accounting': 'ACTG',
 'Economics': 'ECON',
 'Finance': 'FNCE',
 'Management': 'MGMT',
 'Marketing': 'MKTG',
 'Information  Systems &  Analytics': 'OMIS',
 'Applied Mathematics': 'AMTH',
 'Bioengineering': 'BIOE',
 'Civil, Environmental, and Sustainable  Engineering': 'CENG',
 'Computer  Science  and  Engineering': 'CSEN',
 'Electrical and Computer Engineering': 'ECEN',
 'General Engineering': 'ENGR',
 'Mechanical  Engineering': 'MECH',
}

# 3. Create Pandas DF

In [12]:
course_df = pd.DataFrame({
    'college': colleges,
    'department': departments,
    'number': numbers,
    'course': courses,
    'description': descriptions
})

In [13]:
course_df

Unnamed: 0,college,department,number,course,description
0,CAS,Anthropology,Emphasis Programs in Anthropology,,Anthropology majors have the option of complet...
1,CAS,Anthropology,1,Introduction to Biological Anthropology,"Using an evolutionary framework, we examine ho..."
2,CAS,Anthropology,2,Introduction to Archaeology,How do archaeologists understand the past? Thi...
3,CAS,Anthropology,3,Introduction Cultural Anthropology,This course provides an introduction to the su...
4,CAS,Anthropology,4,Vanished Peoples and Lost Civilizations,“Popular archaeology” is addressed by examinin...
...,...,...,...,...,...
2681,SOE,Mechanical Engineering,194,Advanced Design I: Tools,Design tools basic to all aspects of mechanica...
2682,SOE,Mechanical Engineering,195,Advanced Design II: Implementation,Implementation of design strategy. Detail desi...
2683,SOE,Mechanical Engineering,196,Advanced Design III: Completion and Evaluation,"Design projects completed, assembled, tested, ..."
2684,SOE,Mechanical Engineering,198,Independent Study,By arrangement with faculty. (1–5 units)


In [None]:
import re
# loop through description and regex parse out the prereqs and ignore coreqs parsing
# create a new column for prereqs
course_df['prereqs'] = None

for i in range(len(course_df)):
    result = re.search(r'Prerequisite[s]*: (.*)', course_df['description'][i])
    if result:
        course_df['prereqs'][i] = result.group(1)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  course_df['prereqs'][i] = result.group(1)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame 

In [None]:
# save to csv
course_df.to_csv('courses.csv', index=False)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  course_df['coreqs'][i] = result.group(1)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame o