<a href="https://colab.research.google.com/github/Anna1ia23/Anna1ia23.github.io/blob/main/SeniorProjWebScrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Python script collects course information from LTU's course catalog (MCS courses ONLY). To use it, upload a PDF of the course catalog page named `catalogEntries.pdf`.
*   This script scrapes the PDF to collect the course numbers from all course names (those starting with 'MCS')
*   These course numbers are used to access each course's description page on LTU's website
*   The information from each page is written to a text file called 'course_data.txt' (the file is overwritten on every run)
*   Save the 'course_data.txt' file and use it for further processing




In [3]:
# On a fresh runtime (e.g. Colab), install the PDF library first: %pip install PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
import requests
from bs4 import BeautifulSoup
import urllib3
import re
import PyPDF2

In [6]:
def extract_course_codes(pdf_path, subject='MCS'):
    """Collect the four-digit course numbers listed in a catalog PDF.

    The text of every page is scanned for headings of the form
    ``"<subject> #### -"`` (for example ``"MCS 1234 -"``).

    Args:
        pdf_path: Path of the catalog PDF to read.
        subject: Subject prefix to look for. Defaults to ``'MCS'``, so
            existing one-argument calls behave exactly as before.

    Returns:
        list[str]: The four-digit course numbers in the order they are
        found, including any repeats.
    """
    # Compile once instead of per page; re.escape keeps a subject that
    # contains regex metacharacters literal.
    heading_pattern = re.compile(re.escape(subject) + r' (\d{4}) -')
    course_codes = []

    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Iterate through each page
        for page in reader.pages:
            # Defensive: treat a page that yields no text as empty so the
            # regex is never handed a non-string value.
            text = page.extract_text() or ''

            # Add the course numbers found on this page to the list
            course_codes.extend(heading_pattern.findall(text))

    return course_codes

# Extract the course numbers from the uploaded catalog PDF
pdf_path = 'catalogEntries.pdf'
course_numbers = extract_course_codes(pdf_path)

# Fail fast: with no course numbers, every later cell would run without
# error and silently produce an empty course_data.txt.
if not course_numbers:
    raise ValueError(f"No 'MCS #### -' course headings were found in {pdf_path!r}")

In [7]:
# Create list of URLs needed for scraping: one detail page per course number.
# CATALOG_TERM is the term code sent as `cat_term_in`; change it here to
# scrape a different catalog term.
CATALOG_TERM = '202610'
course_prefix = (
    'https://bannerweb.ltu.edu/ssbprod/bwckctlg.p_disp_course_detail'
    f'?cat_term_in={CATALOG_TERM}&subj_code_in=MCS&crse_numb_in='
)

# A comprehension keeps the loop variable out of the notebook namespace, so it
# cannot be confused with the `course_name` title used by the scraping cell.
course_urls = [course_prefix + course_number for course_number in course_numbers]

In [8]:
# Extract data from each webpage: download every course page and pull out its
# title and detail text.
#
# SECURITY: verify=False bypasses SSL certificate verification, so responses
# could be tampered with in transit. It is kept only so the scraper behaves as
# before and should be removed in the future.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled request waits forever

# Labels applied, in order, to the blank-line separated sections of the text.
SECTION_LABELS = (
    "Description",
    "Credit and Lecture Hours",
    "Levels",
    "Schedule Types",
    "Academic Division and Department",
    "Course Attributes",
    "Prerequisites",
)


def _format_description(raw_text):
    """Label each blank-line separated section of a course's detail text.

    This is where the output file is formatted. The first section is kept as
    is; in every later section newlines are replaced by spaces. A label whose
    section is missing contributes an empty line.

    NOTE(review): get_text(strip=True) drops whitespace-only strings, so the
    text may never contain a blank line and everything may end up under
    "Description" -- confirm against a real course page.
    """
    sections = raw_text.split('\n\n')
    lines = []
    for index, label in enumerate(SECTION_LABELS):
        if index >= len(sections):
            lines.append("")
            continue
        body = sections[index]
        if index > 0:
            body = body.replace('\n', ' ')
        lines.append(f"{label}: {body}")
    return "\n".join(lines)


course_data = []
with requests.Session() as session:  # one session reuses the connection
    for url in course_urls:
        response = session.get(url, verify=False, timeout=REQUEST_TIMEOUT_SECONDS)
        # Stop on HTTP errors instead of parsing an error page as a course.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract information based on the page's CSS classes
        title_tag = soup.find(class_="nttitle")
        detail_tag = soup.find(class_="ntdefault")
        if title_tag is None or detail_tag is None:
            # find() returns None when nothing matches; report and skip the
            # page rather than crash with AttributeError part-way through.
            print(f"Skipping {url}: expected course elements not found")
            continue

        course_name = title_tag.get_text(strip=True)
        course_description = detail_tag.get_text(separator='\n', strip=True)

        course_data.append({
            'course_name': course_name,
            'course_description': _format_description(course_description)
        })



In [9]:
# Write the extracted data to a text file (course_data.txt).
# The file is opened in 'w' mode, so any previous contents are replaced.
# encoding='utf-8' is explicit so non-ASCII characters in the course text are
# written the same way on every platform instead of depending on the locale.
separator_line = "\n" + "=" * 40 + "\n\n"
with open('course_data.txt', 'w', encoding='utf-8') as output_file:
    for course in course_data:
        output_file.write(f"Course Name: {course['course_name']}\n")
        output_file.write(f"Course Description:\n {course['course_description']}\n")
        output_file.write(separator_line)

print("Course data has been written to course_data.txt")

Course data has been written to course_data.txt
