Project Overview: Build an ML system that predicts where courses from different universities are eligible for transfer credit using NLP and classification algorithms.

In [7]:
'''Core Environment Setup'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
import os
warnings.filterwarnings('ignore')

In [8]:
'''NLP Environment Setup'''
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import nltk
import spacy

In [9]:
'''Deep Learning Environment Setup'''
from transformers import AutoTokenizer, AutoModel
import torch

In [10]:
'''Plotting'''
plt.style.use('seaborn-v0_8')
sb.set_palette("husl")

In [11]:
'''NLTK Data'''
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Project Configuration can now be completed. Data loading and exploration is the next step.

In [19]:
'''Data Loading'''
def load_course_data():
    try:
        purdueCS_courses = pd.read_csv('Course_CSV_Files/Purdue_CS_Courses_CSV.csv')
        berkeley_courses = pd.read_csv('Course_CSV_Files/UCB_Courses.csv')

        print(f"Purdue CS courses loaded: {len(purdueCS_courses)}")
        print(f"Berkeley courses loaded: {len(berkeley_courses)}")

        return purdueCS_courses, berkeley_courses
    
    except FileNotFoundError:
        print("CSV Files not found. Sample data will be created for demo.")
        return create_sample_data()
    

In [25]:
'''Sample Data Function'''

def create_sample_data():
#Sample Purdue Courses
        purdue_samp = {'course_code': ['CS 180', 'CS 250', 'CS159'], 
               'title': ['Problem Solving and Object-Oriented Programming', 
                        'Computer Architecture',
                        'C Programming (Applications for Engineers)'], 
                'description':['Intro to programming using Python',
                               'Computer organization and architecture...',
                               'Intro to programming using C'],
                'credits':[4, 4, 3],
                'department':['CS', 'CS', 'CS'],
                'level':['Intro', 'Intermediate', 'Intro']
        }

        #Sample Berkeley Courses
        berkeley_samp = { 'course_code':['CS 61A', 'CS 61C', 'CS 164', 'MATH 1A', 'PHYS 7A'],
                'title': ['Structure and Interpretation of Computer Programs',
                 'Machine Structures', 
                 'Programming Languages and Compilers',
                 'Calculus',
                 'Physics for Scientists and Engineers'],
                'description': ['Introduction to programming and computer science...',
                       'Machine structures, assembly language...',
                       'Survey of programming languages, compilers...',
                       'Differential and integral calculus...',
                       'Mechanics, oscillations, waves...'],
                'credits':[4, 4, 4, 4, 4],
                'department':['CS', 'CS', 'CS', 'MATH', 'PHYS'],
                'level':['Intro', 'Intermediate', 'Advanced', 'Intro', 'Intro']

        }

        return pd.DataFrame(berkeley_samp), pd.DataFrame(purdue_samp)

In [27]:
'''Loading and Displaying Basic Info for Purdue'''
purdue_df = load_course_data()
berkeley_df = load_course_data()

print("\n=== PURDUE COURSES PREVIEW ===")
display(purdue_df)
#print(f"\nShape: {purdue_df.shape}")
#print(f"Columns: {list(purdue_df.columns)}")


Purdue CS courses loaded: 1803
Berkeley courses loaded: 11406
Purdue CS courses loaded: 1803
Berkeley courses loaded: 11406

=== PURDUE COURSES PREVIEW ===


(                                        Id  Number  \
 0     97744585-87e3-4616-8a1f-bff2ab88471b    9200   
 1     631d471f-f14b-47b1-a43f-1efae5ec584a    9300   
 2     5237a73f-f2db-4130-8cc8-33f03a1bab55    9400   
 3     1782c85f-49f5-4b08-ad99-62910a6794bd    9500   
 4     5f26945a-ff4b-429a-8cb7-37cdba96e319   10100   
 ...                                    ...     ...   
 1798  8840e6c2-6020-4eb8-8dd4-15bafaadfb24   69000   
 1799  1b868766-2557-40b9-b224-61976374b9aa   69000   
 1800  e02608d1-96d1-4516-85d8-f3343852e40c   69000   
 1801  70f6cb0f-86e6-4ec4-893d-eb6d22698488   69800   
 1802  a8e6c2bc-3887-435c-b43d-ec15dd477865   69900   
 
                                  SubjectId                           Title  \
 0     86ad8a59-6ddc-4067-9f6b-169c8eec86a6        Professional Practice II   
 1     86ad8a59-6ddc-4067-9f6b-169c8eec86a6       Professional Practice III   
 2     86ad8a59-6ddc-4067-9f6b-169c8eec86a6        Professional Practice IV   
 3     86ad8a59-6ddc-4