Project Overview: Build an ML system that predicts where courses from different universities are eligible for transfer credit using NLP and classification algorithms.

In [71]:
'''Core Environment Setup'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
import os
warnings.filterwarnings('ignore')

In [72]:
'''Data Collection Environment Setup'''
import requests
from bs4 import BeautifulSoup
import time
import re
import json
from urllib.parse import urljoin, urlparse

In [73]:
'''NLP Environment Setup'''
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import nltk
import spacy

In [74]:
'''Deep Learning Environment Setup'''
from transformers import AutoTokenizer, AutoModel
import torch

In [75]:
'''Plotting'''
plt.style.use('seaborn-v0_8')
sb.set_palette("husl")

Project Configuration can now be completed.

In [77]:
'''Project Config'''
CONFIG = {
    'data_dir':'data/',
    'output_dir':'outputs/',
    'models_dir':'models/',
    'random_seed':42,
    'target_unis':['Purdue University', 'Cal', 'UMich', 'Georgia Tech', 'UIUC'],
    'target_departs':['CS', 'ECE', 'MATH', 'PHYS', 'ENGR'],
    'max_courses_per_dept':200,
    'min_descript_length':20
}

In [78]:
'''Project Directory Creation'''
for dir_name in [CONFIG['data_dir'], CONFIG['output_dir'], CONFIG['models_dir']]:
    os.makedirs(dir_name, exist_ok=True)

In [79]:
'''Random Seed'''
np.random.seed(CONFIG['random_seed'])
torch.manual_seed(CONFIG['random_seed'])

<torch._C.Generator at 0x32ff6f3d0>

In [80]:
print(f"Target Universities: {CONFIG['target_unis']}")
print(f"Target Departments: {CONFIG['target_departs']}")

Target Universities: ['Purdue University', 'Cal', 'UMich', 'Georgia Tech', 'UIUC']
Target Departments: ['CS', 'ECE', 'MATH', 'PHYS', 'ENGR']


Course Catalog Scraper Class

In [83]:
PURDUE_NAVOID = {
    'CS': 2928,
    'ECE': 2930,
    'MA': 2935,
    'PHYS': 2938,
    'ENGR': 2942
}

BERKELEY_DEPTS = ['COMPSCI', 'EECS', 'MATH', 'PHYSICS']

class CourseCatalogScraper:
    def __init__(self, delay=1.0):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/116.0.0.0 Safari/537.36'
        })
        self.delay = delay

    # Purdue Scraper
    def scrape_purdue_courses(self, departments=['CS']):
        all_courses = []
        for dept in departments:
            url = f"https://catalog.purdue.edu/content.php?catoid=7&navoid={PURDUE_NAVOID[dept]}"
            print(f"Scraping Purdue {dept} courses from {url}...")
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Debug: print first 500 chars
                # print(soup.prettify()[:500])

                # Purdue courses: often in <h3> with title text, description in next <p>
                course_blocks = soup.find_all(['h3', 'p'])
                for i, block in enumerate(course_blocks):
                    text = block.get_text(strip=True)
                    # Look for pattern like "CS 180 Problem Solving and OOP 4 credits"
                    match = re.match(r'([A-Z]+)\s*(\d+[A-Z]*)\s+(.+?)\s+(\d+(?:\.\d+)?)\s*(credits|units)?', text, re.IGNORECASE)
                    if match:
                        dept_code, number, title, credits, _ = match.groups()
                        # Description: next sibling <p>
                        description = ""
                        if i + 1 < len(course_blocks):
                            desc_candidate = course_blocks[i+1]
                            description = desc_candidate.get_text(strip=True)
                            if len(description) < CONFIG['min_description_length']:
                                description = ""
                        all_courses.append({
                            'university': 'Purdue',
                            'department': dept_code,
                            'course_number': number,
                            'course_code': f"{dept_code} {number}",
                            'title': title.strip(),
                            'credits': float(credits),
                            'description': description,
                            'prerequisites': "",
                            'level': int(number[0]) if number[0].isdigit() else 0,
                            'url': url
                        })
                    if len(all_courses) >= CONFIG['max_courses_per_dept']:
                        break
                time.sleep(self.delay)
                print(f"Found {len(all_courses)} courses for {dept}")
            except Exception as e:
                print(f"Error scraping Purdue {dept}: {e}")
        return pd.DataFrame(all_courses)

    # Berkeley Scraper
    def scrape_berkeley_courses(self, departments=BERKELEY_DEPTS):
        all_courses = []
        for dept in departments:
            url = f"https://guide.berkeley.edu/courses/{dept.lower()}/"
            print(f"Scraping Berkeley {dept} courses from {url}...")
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                course_blocks = soup.find_all('div', class_='courseblock')
                for block in course_blocks[:CONFIG['max_courses_per_dept']]:
                    # Course Title
                    title_elem = block.find('p', class_='courseblocktitle')
                    desc_elem = block.find('p', class_='courseblockdesc')
                    if not title_elem:
                        continue
                    title_text = title_elem.get_text(strip=True)
                    match = re.match(r'([A-Z]+)\s*(\d+[A-Z]*)\s+(.+?)\s+(\d+(?:\.\d+)?)\s+Units?', title_text)
                    if not match:
                        continue
                    dept_code, number, title, credits = match.groups()
                    description = desc_elem.get_text(strip=True) if desc_elem else ""
                    if len(description) < CONFIG['min_description_length']:
                        description = ""
                    all_courses.append({
                        'university': 'Berkeley',
                        'department': 'CS' if dept_code == 'COMPSCI' else dept_code,
                        'course_number': number,
                        'course_code': f"{dept_code} {number}",
                        'title': title.strip(),
                        'credits': float(credits),
                        'description': description,
                        'prerequisites': "",
                        'level': int(number[0]) if number[0].isdigit() else 0,
                        'url': url
                    })
                time.sleep(self.delay)
                print(f"Found {len(all_courses)} courses for {dept}")
            except Exception as e:
                print(f"Error scraping Berkeley {dept}: {e}")
        return pd.DataFrame(all_courses)


With the Course Scraper Class completed, data collection can now be executed.

In [84]:
# Initialize scraper
scraper = CourseCatalogScraper()

print("Starting data collection process....")
print("="*50)

# 1️⃣ Scrape Purdue courses
purdue_courses = scraper.scrape_purdue_courses(['CS', 'ECE', 'MA', 'PHYS', 'ENGR'])
print(f"Purdue courses collected: {len(purdue_courses)}")

# 2️⃣ Scrape Berkeley courses
berkeley_courses = scraper.scrape_berkeley_courses(['COMPSCI', 'EECS', 'MATH', 'PHYSICS'])
print(f"Berkeley courses collected: {len(berkeley_courses)}")

# 3️⃣ Standardize columns for merging
standard_columns = ['university', 'department', 'course_number', 'course_code',
                    'title', 'credits', 'description', 'prerequisites', 'level', 'url']

# Add missing columns if necessary
for df in [purdue_courses, berkeley_courses]:
    for col in standard_columns:
        if col not in df.columns:
            df[col] = None

# 4️⃣ Merge datasets
allCourses = pd.concat([purdue_courses[standard_columns], berkeley_courses[standard_columns]],
                       ignore_index=True)

# 5️⃣ Save raw data
csv_path = f"{CONFIG['data_dir']}raw_courses.csv"
allCourses.to_csv(csv_path, index=False)
print(f"Raw data saved as csv file: {csv_path} ({len(allCourses)} total courses)")

# 6️⃣ Display sample
print("\nSample course data:")
print(allCourses[['university', 'course_code', 'title', 'credits']].head().to_string())

Starting data collection process....
Scraping CS courses...
Found 0 CS courses
Scraping ECE courses...
Found 0 ECE courses
Scraping MA courses...
Found 0 MA courses
Scraping PHYS courses...
Found 0 PHYS courses
Scraping ENGR courses...
Found 0 ENGR courses
Total courses scraped: 0
Purdue courses collected: 0
Starting Berkeley course scraping....
Scraping COMPSCI courses.....
Found 0 COMPSCI courses
Scraping EECS courses.....
Found 0 EECS courses
Scraping MATH courses.....
Found 0 MATH courses
Scraping PHYSICS courses.....
Found 0 PHYSICS courses
Berkeley course scraping complete: 0 total courses
Berkeley courses colleced: 0
Raw data saved as csv file: 0 total courses

 Sample course data.


KeyError: "None of [Index(['university', 'course_code', 'title', 'credits'], dtype='object')] are in the [columns]"