In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Stony Brook CS department pages that the cells below pass to the scraping functions.
url1: str = "https://www.cs.stonybrook.edu/students/Undergraduate-Studies/csecourses" # Undergraduate courses
url2: str = "https://www.cs.stonybrook.edu/students/Graduate-Studies/courses" # Graduate courses

In [3]:
def get_sbu_cse_undergrad_course_offered_info(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Scrape the undergraduate course-offering table at ``url`` into a DataFrame.

    The page must contain a ``<table>`` whose class attribute is exactly
    ``views-table views-view-table cols-7``. Columns whose header mentions
    "Spring", "Summer" or "Fall" are encoded as 1 when the cell contains a
    check mark ("✔") and 0 otherwise; every other column keeps its stripped
    cell text.

    Args:
        url: Address of the course-offering page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with one row per table row that has data cells, using the
        table's header texts as column names.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the expected table is not present in the page.
    """
    table_class = "views-table views-view-table cols-7"
    semester_terms = ("Spring", "Summer", "Fall")

    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of parsing an error page as if it were the course list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", class_=table_class)
    if table is None:
        # find() returns None when nothing matches; report that explicitly rather
        # than letting the next attribute access raise an opaque AttributeError.
        raise ValueError(f"No table with class {table_class!r} found at {url}")

    headers = [header.text.strip() for header in table.find_all("th")]
    # Decide once per column whether it is a semester column, not once per cell.
    is_semester = [any(term in header for term in semester_terms) for header in headers]

    rows = []
    # The first <tr> is the header row.
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if not cells:
            # A row without data cells would become an all-null DataFrame row
            # and turn the 0/1 semester columns non-integer.
            continue
        columns = []
        for idx, cell in enumerate(cells):
            text = cell.text.strip()
            if is_semester[idx]:
                # A check mark means the course is offered that semester.
                columns.append(1 if "✔" in text else 0)
            else:
                columns.append(text)
        rows.append(columns)

    return pd.DataFrame(rows, columns=headers)

In [4]:
# Scrape the undergraduate page; the bare `df1` on the last line displays the result.
df1 = get_sbu_cse_undergrad_course_offered_info(url=url1)
df1

Unnamed: 0,Course Name,Course Title,Spring 2023,Fall 2023,Spring 2024,Summer 2024,Fall 2024
0,CSE101,Computer Science Principles,1,1,1,1,1
1,CSE102,Introduction to Web Design and Programming,0,0,0,1,0
2,CSE114,Introduction to Object-Oriented Programming,1,1,1,1,1
3,CSE150,Foundations of Computer Science: Honors,0,1,0,0,1
4,CSE160,Computer Science A: Honors,1,0,1,0,0
...,...,...,...,...,...,...,...
57,CSE475,Undergraduate Teaching Practicum,0,0,1,1,1
58,CSE487,Research in Computer Science,0,1,1,1,1
59,CSE488,Internship in Computer Science,0,0,1,1,1
60,CSE495,Senior Honors Research Project I,0,0,1,0,1


In [5]:
def get_sbu_cse_grad_course_offered_info(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Scrape the graduate course-offering table at ``url`` into a DataFrame.

    The page must contain a ``<table>`` whose class attribute is exactly
    ``views-table views-view-table cols-6``. Columns whose header mentions
    "Spring", "Summer" or "Fall" are encoded as 1 when the cell contains a
    check mark ("✔") and 0 otherwise; every other column keeps its stripped
    cell text.

    Args:
        url: Address of the course-offering page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with one row per table row that has data cells, using the
        table's header texts as column names.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the expected table is not present in the page.
    """
    table_class = "views-table views-view-table cols-6"
    semester_terms = ("Spring", "Summer", "Fall")

    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of parsing an error page as if it were the course list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", class_=table_class)
    if table is None:
        # find() returns None when nothing matches; report that explicitly rather
        # than letting the next attribute access raise an opaque AttributeError.
        raise ValueError(f"No table with class {table_class!r} found at {url}")

    headers = [header.text.strip() for header in table.find_all("th")]
    # Decide once per column whether it is a semester column, not once per cell.
    is_semester = [any(term in header for term in semester_terms) for header in headers]

    rows = []
    # The first <tr> is the header row.
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if not cells:
            # A row without data cells would become an all-null DataFrame row
            # and turn the 0/1 semester columns non-integer.
            continue
        columns = []
        for idx, cell in enumerate(cells):
            text = cell.text.strip()
            if is_semester[idx]:
                # A check mark means the course is offered that semester.
                columns.append(1 if "✔" in text else 0)
            else:
                columns.append(text)
        rows.append(columns)

    return pd.DataFrame(rows, columns=headers)

In [6]:
# Scrape the graduate page; the bare `df2` on the last line displays the result.
df2 = get_sbu_cse_grad_course_offered_info(url=url2)
df2

Unnamed: 0,Course Name,Course Title,Spring 2023,Fall 2023,Spring 2024,Fall 2024
0,CSE500,Patterns in Programming,0,0,0,0
1,CSE502,Computer Architecture,0,0,1,1
2,CSE504,Compiler Design,0,0,0,0
3,CSE505,Computing with Logic,0,0,1,0
4,CSE506,Operating Systems,1,0,1,0
5,CSE507,Computational Linguistics,0,0,0,0
6,CSE508,Network Security,0,0,1,0
7,CSE509,Computer System Security,1,1,0,1
8,CSE510,Hybrid Systems,0,1,0,1
9,CSE511,Brain and Memory Modeling,0,0,0,0


In [9]:
# Stack the undergraduate and graduate frames under a fresh 0..n-1 index, then
# discard the "Summer 2024" column.
df = pd.concat([df1, df2], ignore_index=True).drop(columns=["Summer 2024"])
df

Unnamed: 0,Course Name,Course Title,Spring 2023,Fall 2023,Spring 2024,Fall 2024
0,CSE101,Computer Science Principles,1,1,1,1
1,CSE102,Introduction to Web Design and Programming,0,0,0,0
2,CSE114,Introduction to Object-Oriented Programming,1,1,1,1
3,CSE150,Foundations of Computer Science: Honors,0,1,0,1
4,CSE160,Computer Science A: Honors,1,0,1,0
...,...,...,...,...,...,...
106,CSE577,Medical Imaging,1,0,0,0
107,CSE590,Advanced Topics in Computer Science,0,0,0,1
108,CSE591,Advanced Topics in Computer Science,0,0,0,0
109,CSE592,Advanced Topics in Computer Science,0,0,0,0


In [14]:
def get_sbu_cse_course_offered_info(undergrad_url: str, grad_url: str) -> pd.DataFrame:
    """Scrape both course pages and return one frame with short column names.

    The undergraduate and graduate tables are scraped in that order, stacked
    under a fresh integer index, stripped of the "Summer 2024" column, and the
    remaining columns are renamed to ``CourseNumber``, ``CourseTitle``,
    ``spring1``, ``fall1``, ``spring2`` and ``fall2``.

    Args:
        undergrad_url: Page holding the undergraduate course table.
        grad_url: Page holding the graduate course table.

    Returns:
        The combined, renamed DataFrame.
    """
    # Scraped header text -> short column name.
    column_aliases = {
        "Course Name": "CourseNumber",
        "Course Title": "CourseTitle",
        "Spring 2023": "spring1",
        "Fall 2023": "fall1",
        "Spring 2024": "spring2",
        "Fall 2024": "fall2",
    }

    undergrad_courses = get_sbu_cse_undergrad_course_offered_info(url=undergrad_url)
    grad_courses = get_sbu_cse_grad_course_offered_info(url=grad_url)

    # Undergraduate rows first, then graduate rows, renumbered from 0.
    combined = pd.concat([undergrad_courses, grad_courses], axis=0, ignore_index=True)
    combined = combined.drop(["Summer 2024"], axis=1)

    return combined.rename(columns=column_aliases)

In [16]:
# Build the combined, renamed frame from both pages and display it.
# Note: this rebinds `df`, which an earlier cell also assigns.
df = get_sbu_cse_course_offered_info(url1, url2)
df

Unnamed: 0,CourseNumber,CourseTitle,spring1,fall1,spring2,fall2
0,CSE101,Computer Science Principles,1,1,1,1
1,CSE102,Introduction to Web Design and Programming,0,0,0,0
2,CSE114,Introduction to Object-Oriented Programming,1,1,1,1
3,CSE150,Foundations of Computer Science: Honors,0,1,0,1
4,CSE160,Computer Science A: Honors,1,0,1,0
...,...,...,...,...,...,...
106,CSE577,Medical Imaging,1,0,0,0
107,CSE590,Advanced Topics in Computer Science,0,0,0,1
108,CSE591,Advanced Topics in Computer Science,0,0,0,0
109,CSE592,Advanced Topics in Computer Science,0,0,0,0


In [41]:
# Inspect the row labels of `df` as a NumPy array.
df.index.values

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110])

In [40]:
# for rows in df.iterrows():
#     print(rows[0],rows[1][0])

In [39]:
# itertuples() yields the index at position 0 and the first column at position 1,
# so this prints each row label next to its CourseNumber.
for rows in df.itertuples():
    print(rows[0],rows[1])

0 CSE101
1 CSE102
2 CSE114
3 CSE150
4 CSE160
5 CSE161
6 CSE214
7 CSE215
8 CSE216
9 CSE220
10 CSE260
11 CSE261
12 CSE300
13 CSE301
14 CSE303
15 CSE304
16 CSE305
17 CSE306
18 CSE307
19 CSE310
20 CSE311
21 CSE312
22 CSE316
23 CSE320
24 CSE323
25 CSE327
26 CSE328
27 CSE331
28 CSE332
29 CSE333
30 CSE334
31 CSE337
32 CSE346
33 CSE350
34 CSE351
35 CSE352
36 CSE353
37 CSE354
38 CSE355
39 CSE356
40 CSE357
41 CSE360
42 CSE361
43 CSE362
44 CSE363
45 CSE364
46 CSE366
47 CSE371
48 CSE373
49 CSE376
50 CSE377
51 CSE378
52 CSE380
53 CSE381
54 CSE385
55 CSE390-394
56 CSE416
57 CSE475
58 CSE487
59 CSE488
60 CSE495
61 CSE496
62 CSE500
63 CSE502
64 CSE504
65 CSE505
66 CSE506
67 CSE507
68 CSE508
69 CSE509
70 CSE510
71 CSE511
72 CSE512
73 CSE515
74 CSE518
75 CSE519
76 CSE521
77 CSE525
78 CSE526
79 CSE527
80 CSE528
81 CSE529
82 CSE530
83 CSE532
84 CSE533
85 CSE534
86 CSE535
87 CSE536
88 CSE537
89 CSE538
90 CSE540
91 CSE541
92 CSE542
93 CSE544
94 CSE545
95 CSE546
96 CSE547
97 CSE548
98 CSE549
99 CSE550
100 CS