Project Overview: Build an ML system that predicts whether courses from different universities are eligible for transfer credit, using NLP and classification algorithms.

In [13]:
'''Core Environment Setup'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
import os
#Suppress every warning for the rest of the session: 'ignore' is passed with no
#category/module filter, so library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

In [14]:
'''Data Collection Environment Setup'''
import requests
from bs4 import BeautifulSoup
import time
import re
import json
from urllib.parse import urljoin, urlparse

In [15]:
'''NLP Environment Setup'''
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import nltk
import spacy

In [16]:
'''Deep Learning Environment Setup'''
from transformers import AutoTokenizer, AutoModel
import torch

In [17]:
'''Plotting'''
#Select the matplotlib style sheet, then set seaborn's default color palette.
#NOTE(review): a style sheet may define its own color cycle, so the palette call
#presumably has to stay after plt.style.use - confirm before reordering.
plt.style.use('seaborn-v0_8')
sb.set_palette("husl")

With the environment set up, the project configuration can now be defined.

In [18]:
'''Project Config'''
#Single source of truth for paths, reproducibility and scraping limits.
CONFIG = dict(
    #Folders created by the directory-creation cell
    data_dir='data/',
    output_dir='outputs/',
    models_dir='models/',
    #Seed handed to NumPy and PyTorch
    random_seed=42,
    #Universities and departments in scope for the project
    target_unis=['Purdue University', 'Cal', 'UMich', 'Georgia Tech', 'UIUC'],
    target_departs=['CS', 'ECE', 'MATH', 'PHYS', 'ENGR'],
    #Scraping limits: cap per department and minimum description length
    max_courses_per_dept=200,
    min_descript_length=20,
)

In [21]:
'''Project Directory Creation'''
#Make sure every configured folder exists; exist_ok=True keeps reruns harmless.
for config_key in ('data_dir', 'output_dir', 'models_dir'):
    os.makedirs(CONFIG[config_key], exist_ok=True)

In [22]:
'''Random Seed'''
#Read the seed once and hand the same value to both libraries.
SEED = CONFIG['random_seed']
np.random.seed(SEED)
#Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x32ff6f3d0>

In [23]:
#Echo the configured scope, one line per setting.
print(
    f"Target Universities: {CONFIG['target_unis']}",
    f"Target Departments: {CONFIG['target_departs']}",
    sep="\n",
)

Target Universities: ['Purdue University', 'Cal', 'UMich', 'Georgia Tech', 'UIUC']
Target Departments: ['CS', 'ECE', 'MATH', 'PHYS', 'ENGR']


Course Catalog Scraper Class

In [None]:
'''Catalog Scraper Class'''
class CourseCatalogScraper:
    """Scrape university course-catalog pages into tabular course records.

    All HTTP traffic goes through one ``requests.Session`` that carries a
    browser-like User-Agent header. ``delay`` seconds are slept after each
    department that was fetched successfully.
    """

    #Catalog page requested by scrape_purdue_courses.
    #NOTE(review): the URL carries no department parameter, so every department
    #request downloads the same page - confirm the intended search endpoint.
    PURDUE_CATALOG_URL = "https://catalog.purdue.edu/content.php?catoid=18&navoid=23635"

    #Heading format: "CS180 - Problem Solving and OOP (4 Credits)".
    #"[Cc]redits?" accepts both "Credits" and "credit"; the department code
    #itself stays upper-case only.
    _TITLE_PATTERN = re.compile(
        r'([A-Z]+)\s*(\d+)\s*-\s*(.+?)\s*\((\d+(?:\.\d+)?)\s*[Cc]redits?\)'
    )

    #"Prerequisite"/"Prerequisites", optional colon, then text up to the first
    #period, the word "Corequisite", or the end of the description.
    _PREREQ_PATTERN = re.compile(
        r'Prerequisites?:?\s*(.+?)(?:\.|Corequisite|$)', re.IGNORECASE
    )

    #Initialization Constructor for Web Scraping
    def __init__(self, delay=1.0):
        """Create the shared HTTP session.

        Args:
            delay: Seconds to sleep after each department is processed.
        """
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
        })
        self.delay = delay
        self.courses_collected = []

    #The constructor used to be misspelled "_init_" and therefore never ran;
    #the old name is kept as an alias for any code that called it by hand.
    _init_ = __init__

    #Purdue Scraper Method
    def scrape_purdue_courses(self, departments=('CS', 'ECE', 'MA', 'PHYS', 'ENGR')):
        """Collect Purdue courses for the requested departments.

        Args:
            departments: Iterable of department codes as they appear in the
                catalog headings (e.g. ``'MA'``).

        Returns:
            A ``pandas.DataFrame`` with one row per accepted course; empty if
            nothing could be scraped.
        """
        print("Starting Purdue course scraping....")
        courses = []
        per_dept_cap = CONFIG['max_courses_per_dept']

        for dept in departments:
            print(f"Scraping {dept} courses....")

            #Only the network call is guarded: a failed request skips this
            #department, while programming errors are allowed to surface.
            try:
                response = self.session.get(self.PURDUE_CATALOG_URL, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"Error Scraping {dept}: {e}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            #Course Block Finding
            course_blocks = soup.find_all('div', class_='searchresult')

            #Keep only this department's courses so that a page listing
            #several departments is not duplicated once per loop iteration.
            dept_courses = []
            for block in course_blocks:
                if len(dept_courses) >= per_dept_cap:
                    break
                course_data = self._extract_purdue_course_info(block)
                if course_data and course_data['department'] == dept:
                    dept_courses.append(course_data)

            courses.extend(dept_courses)
            print(f"Found {len(dept_courses)} {dept} courses")
            time.sleep(self.delay)

        print(f"Purdue Course Scraping Complete: {len(courses)} total courses")

        return pd.DataFrame(courses)

    #Heading Parser
    @staticmethod
    def _parse_title(title_text):
        """Split a course heading into ``(dept, number, title, credits)``.

        Returns ``None`` when the heading does not match the expected
        "DEPT NUMBER - Title (N Credits)" layout; credits are a float.
        """
        match = CourseCatalogScraper._TITLE_PATTERN.match(title_text)
        if not match:
            return None
        dept, number, title, credit_hours = match.groups()
        return dept, number, title.strip(), float(credit_hours)

    #Prerequisite Parser
    @staticmethod
    def _parse_prerequisites(description):
        """Return the prerequisite text found in *description*, or ``""``."""
        match = CourseCatalogScraper._PREREQ_PATTERN.search(description)
        return match.group(1).strip() if match else ""

    #Extracting Information About Courses
    def _extract_purdue_course_info(self, course_block):
        """Turn one search-result block into a course record.

        Args:
            course_block: Parsed HTML element for a single course; it is
                queried for an ``h2`` heading, a ``div.searchresult_desc``
                description and the first ``a`` link.

        Returns:
            A dict describing the course, or ``None`` when the heading is
            missing or unparseable, the description is shorter than
            ``CONFIG['min_descript_length']``, or the block is malformed.
        """
        #Read outside the try so a misconfigured key fails loudly instead of
        #silently discarding every course.
        min_length = CONFIG['min_descript_length']

        try:
            #Course Title and Code Extraction
            title_elem = course_block.find('h2')
            if title_elem is None:
                return None

            parsed = self._parse_title(title_elem.get_text(strip=True))
            if parsed is None:
                return None
            dept, number, title, credit_hours = parsed

            #Description Extraction
            desc_elem = course_block.find('div', class_='searchresult_desc')
            description = desc_elem.get_text(strip=True) if desc_elem is not None else ""
            if len(description) < min_length:
                return None

            #An anchor without an href yields an empty URL rather than an error
            link = course_block.find('a')
            url = link.get('href', "") if link is not None else ""
        except (AttributeError, TypeError) as e:
            #Best effort: one malformed block must not abort the whole scrape
            print(f"Skipping malformed course block: {e}")
            return None

        return {
            'university':'Purdue',
            'department': dept,
            'course_number': number,
            'course_code': f"{dept} {number}",
            'title': title,
            'credits': credit_hours,
            'description': description,
            'prerequisites': self._parse_prerequisites(description),
            #The leading digit of the course number is the level (number is all digits)
            'level': int(number[0]),
            'url': url
        }
