Project Overview: Build an ML system that predicts whether courses from different universities are eligible for transfer credit, using NLP and classification algorithms.

In [28]:
'''Core Environment Setup'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
import os
# Ignore every warning raised from here on (presumably to keep cell output tidy).
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence
# warnings from the ML libraries imported below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [29]:
'''NLP Environment Setup'''
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import nltk
import spacy

In [30]:
'''Deep Learning Environment Setup'''
from transformers import AutoTokenizer, AutoModel
import torch

In [31]:
'''Plotting'''
# Apply the 'seaborn-v0_8' matplotlib style sheet to subsequent figures.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default colour cycle.
# NOTE(review): keep this after plt.style.use() -- a style sheet can reset the
# colour cycle, so confirm before reordering these two calls.
sb.set_palette("husl")

In [32]:
'''NLTK Data'''
# Fetch the NLTK data packages (presumably needed by later NLP cells). As the
# logged output shows, a package that is already present is simply reported as
# up-to-date, so re-running this cell is harmless.
# NOTE(review): recent NLTK releases may load tokenizer data from 'punkt_tab'
# rather than 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
# Last bare expression in the cell: its return value (True) is what gets displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Project configuration is now complete. Data loading and exploration are the next step.

In [33]:
'''Sample Data Function'''

def create_sample_data():
    """Build small in-memory Purdue and Berkeley course tables for demo use.

    Serves as a stand-in for the real course CSV files.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(purdue_df, berkeley_df)``.
        Purdue comes first, matching the order ``load_course_data`` returns
        for the CSV-backed data, so callers can unpack both sources the same
        way. Both frames have the columns ``course_code``, ``title``,
        ``description``, ``credits``, ``department`` and ``level``.
    """
    # Sample Purdue courses
    purdue_samp = {
        'course_code': ['CS 180', 'CS 250', 'CS159'],
        'title': ['Problem Solving and Object-Oriented Programming',
                  'Computer Architecture',
                  'C Programming (Applications for Engineers)'],
        'description': ['Intro to programming using Python',
                        'Computer organization and architecture...',
                        'Intro to programming using C'],
        'credits': [4, 4, 3],
        'department': ['CS', 'CS', 'CS'],
        'level': ['Intro', 'Intermediate', 'Intro'],
    }

    # Sample Berkeley courses
    berkeley_samp = {
        'course_code': ['CS 61A', 'CS 61C', 'CS 164', 'MATH 1A', 'PHYS 7A'],
        'title': ['Structure and Interpretation of Computer Programs',
                  'Machine Structures',
                  'Programming Languages and Compilers',
                  'Calculus',
                  'Physics for Scientists and Engineers'],
        'description': ['Introduction to programming and computer science...',
                        'Machine structures, assembly language...',
                        'Survey of programming languages, compilers...',
                        'Differential and integral calculus...',
                        'Mechanics, oscillations, waves...'],
        'credits': [4, 4, 4, 4, 4],
        'department': ['CS', 'CS', 'CS', 'MATH', 'PHYS'],
        'level': ['Intro', 'Intermediate', 'Advanced', 'Intro', 'Intro'],
    }

    # Purdue first: the previous (berkeley, purdue) order silently swapped the
    # two frames for any caller that unpacks ``purdue_df, berkeley_df``.
    return pd.DataFrame(purdue_samp), pd.DataFrame(berkeley_samp)

In [34]:
'''Data Loading'''
def load_course_data(purdue_path='Course_CSV_Files/Purdue_CS_Courses_CSV.csv',
                     berkeley_path='Course_CSV_Files/UCB_Courses.csv'):
    """Load the Purdue and Berkeley course catalogues from CSV.

    Args:
        purdue_path: Location of the Purdue CS courses CSV. Defaults to the
            path this notebook has always used.
        berkeley_path: Location of the Berkeley courses CSV. Defaults to the
            path this notebook has always used.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(purdue_courses, berkeley_courses)``
        when both files are read. If either file is missing, a notice is
        printed and the result of ``create_sample_data()`` is returned
        unchanged instead.
    """
    try:
        # Only the reads can raise FileNotFoundError; a single missing file
        # triggers the sample-data fallback for both universities.
        purdueCS_courses = pd.read_csv(purdue_path)
        berkeley_courses = pd.read_csv(berkeley_path)
    except FileNotFoundError:
        print("CSV Files not found. Sample data will be created for demo.")
        return create_sample_data()
    else:
        print(f"Purdue CS courses loaded: {len(purdueCS_courses)}")
        print(f"Berkeley courses loaded: {len(berkeley_courses)}")

        return purdueCS_courses, berkeley_courses
    

In [40]:
'''Loading and Displaying Basic Info for Purdue'''
# Load both course tables, then show the same three-part summary for each:
# a banner, the rendered frame, and its shape / column names.
purdue_df, berkeley_df = load_course_data()

previews = (
    ("\n=== PURDUE COURSES PREVIEW ===", purdue_df),
    ("\n=== BERKELEY COURSES PREVIEW ===", berkeley_df),
)
for banner, course_table in previews:
    print(banner)
    display(course_table)  # rich notebook rendering of the frame
    print(f"\nShape: {course_table.shape}")
    print(f"Columns: {list(course_table.columns)}")

Purdue CS courses loaded: 1803
Berkeley courses loaded: 11406

=== PURDUE COURSES PREVIEW ===


Unnamed: 0,Id,Number,SubjectId,Title,CreditHours,Description
0,97744585-87e3-4616-8a1f-bff2ab88471b,9200,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice II,0,
1,631d471f-f14b-47b1-a43f-1efae5ec584a,9300,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice III,0,
2,5237a73f-f2db-4130-8cc8-33f03a1bab55,9400,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice IV,0,
3,1782c85f-49f5-4b08-ad99-62910a6794bd,9500,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice V,0,
4,5f26945a-ff4b-429a-8cb7-37cdba96e319,10100,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Digital Literacy,3,
...,...,...,...,...,...,...
1798,8840e6c2-6020-4eb8-8dd4-15bafaadfb24,69000,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Software Trust Management,3,
1799,1b868766-2557-40b9-b224-61976374b9aa,69000,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Git Based Data Model For Nosql,3,
1800,e02608d1-96d1-4516-85d8-f3343852e40c,69000,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Cryptography II,3,
1801,70f6cb0f-86e6-4ec4-893d-eb6d22698488,69800,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Research MS Thesis,1,



Shape: (1803, 6)
Columns: ['Id', 'Number', 'SubjectId', 'Title', 'CreditHours', 'Description']

=== BERKELEY COURSES PREVIEW ===


Unnamed: 0,Subject,Course Number,Department(s),Credits - Units - Minimum Units,Credits - Units - Maximum Units,Terms Offered,Course Description,Cross-Listed Course(s),Repeat Rules,Repeat Rule: Special Circumstances,Offering Information,Additional Offering Information
0,AEROENG,1,Mechanical Engineering,1,1,-,This is a freshman-level seminar course offere...,-,Course is not repeatable for credit.,-,Offered every fall.,-
1,AEROENG,10,Mechanical Engineering,4,4,-,This course introduces mathematical engineerin...,-,Course is not repeatable for credit.,-,-,-
2,AEROENG,100,Mechanical Engineering,4,4,-,This capstone course challenges students to in...,-,Course is not repeatable for credit.,-,-,-
3,AEROENG,193,Mechanical Engineering,1,4,-,This course covers current topics of interest ...,-,Course may be repeated for credit when topic c...,-,-,-
4,AEROENG,196,Mechanical Engineering,2,4,-,Undergraduate students in good standing who ha...,-,Course may be repeated for credit without rest...,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...
11401,YIDDISH,103,German,3,3,-,This course will trace the development of Yidd...,-,Course may be repeated for credit without rest...,-,-,-
11402,YIDDISH,104,German,4,4,-,This course will trace the development of Yidd...,-,Course is not repeatable for credit.,-,-,-
11403,YIDDISH,105,German,4,4,-,This course will trace the literary journey of...,-,Course is not repeatable for credit.,-,-,-
11404,YIDDISH,106,German,4,4,-,This course will trace the history of Yiddish ...,-,Course is not repeatable for credit.,-,-,-



Shape: (11406, 12)
Columns: ['Subject', 'Course Number', 'Department(s)', 'Credits - Units - Minimum Units', 'Credits - Units - Maximum Units', 'Terms Offered', 'Course Description', 'Cross-Listed Course(s)', 'Repeat Rules', 'Repeat Rule: Special Circumstances', 'Offering Information', 'Additional Offering Information']


Data Preprocessing and Cleaning. Course IDs and other fields need to be separated out and/or removed.

In [None]:
'''Data Cleaning'''
#Cleaning and standardizing course data
def clean_course_data(df, university_name):
    df = df.copy() #creates a copy dataframe that will not change the original

    #University Identifier
    df['university'] = university_name

    #Course codes
    df['course_code'] = df['course_code'].str.strip().str.upper()

    #Clean titles and descriptions
    df['title'] = df['title'].str.strip()
    df['description'] = df['description'].fillna("").str.strip()

    #Credit standardization

    