Project Overview: Build an ML system that predicts where courses from different universities are eligible for transfer credit using NLP and classification algorithms.

In [31]:
'''Core Environment Setup'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
import os
warnings.filterwarnings('ignore')

In [32]:
'''NLP Environment Setup'''
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import nltk
import spacy

In [33]:
'''Deep Learning Environment Setup'''
from transformers import AutoTokenizer, AutoModel
import torch

In [34]:
'''Plotting'''
plt.style.use('seaborn-v0_8')
sb.set_palette("husl")

In [35]:
'''NLTK Data'''
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarathivelmurugan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Project Configuration can now be completed. Data loading and exploration is the next step.

In [36]:
'''Sample Data Function'''

def create_sample_data():
#Sample Purdue Courses
        purdue_samp = {'course_code': ['CS 180', 'CS 250', 'CS159'], 
               'title': ['Problem Solving and Object-Oriented Programming', 
                        'Computer Architecture',
                        'C Programming (Applications for Engineers)'], 
                'description':['Intro to programming using Python',
                               'Computer organization and architecture...',
                               'Intro to programming using C'],
                'credits':[4, 4, 3],
                'department':['CS', 'CS', 'CS'],
                'level':['Intro', 'Intermediate', 'Intro']
        }

        #Sample Berkeley Courses
        berkeley_samp = { 'course_code':['CS 61A', 'CS 61C', 'CS 164', 'MATH 1A', 'PHYS 7A'],
                'title': ['Structure and Interpretation of Computer Programs',
                        'Machine Structures', 
                        'Programming Languages and Compilers',
                        'Calculus',
                        'Physics for Scientists and Engineers'],
                'description': ['Introduction to programming and computer science...',
                       'Machine structures, assembly language...',
                       'Survey of programming languages, compilers...',
                       'Differential and integral calculus...',
                       'Mechanics, oscillations, waves...'],
                'credits':[4, 4, 4, 4, 4],
                'department':['CS', 'CS', 'CS', 'MATH', 'PHYS'],
                'level':['Intro', 'Intermediate', 'Advanced', 'Intro', 'Intro']

        }

        return pd.DataFrame(berkeley_samp), pd.DataFrame(purdue_samp)

In [37]:
'''Data Loading'''
berkCS_courses_indices = np.arange(2001, 2099).tolist()

def load_course_data():
    try:
        purdueCS_courses = pd.read_csv('Course_CSV_Files/Purdue_CS_Courses_CSV.csv')
        berkeley_courses = pd.read_csv('Course_CSV_Files/UCB_Courses.csv', skiprows=lambda x:x not in berkCS_courses_indices)

        print(f"Purdue CS courses loaded: {len(purdueCS_courses)}")
        print(f"Berkeley courses loaded: {len(berkeley_courses)}")

        return purdueCS_courses, berkeley_courses
    
    except FileNotFoundError:
        print("CSV Files not found. Sample data will be created for demo.")
        return create_sample_data()
    

In [38]:
'''Loading and Displaying Basic Info for Purdue'''
purdue_df, berkeley_df = load_course_data()

print("\n=== PURDUE COURSES PREVIEW ===")
display(purdue_df)
print(f"\nShape: {purdue_df.shape}")
print(f"Columns: {list(purdue_df.columns)}")

print("\n=== BERKELEY COURSES PREVIEW ===")
display(berkeley_df)
print(f"\nShape: {berkeley_df.shape}")
print(f"Columns: {list(berkeley_df.columns)}")

Purdue CS courses loaded: 1803
Berkeley courses loaded: 97

=== PURDUE COURSES PREVIEW ===


Unnamed: 0,Id,Number,SubjectId,Title,CreditHours,Description
0,97744585-87e3-4616-8a1f-bff2ab88471b,9200,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice II,0,
1,631d471f-f14b-47b1-a43f-1efae5ec584a,9300,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice III,0,
2,5237a73f-f2db-4130-8cc8-33f03a1bab55,9400,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice IV,0,
3,1782c85f-49f5-4b08-ad99-62910a6794bd,9500,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Professional Practice V,0,
4,5f26945a-ff4b-429a-8cb7-37cdba96e319,10100,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Digital Literacy,3,
...,...,...,...,...,...,...
1798,8840e6c2-6020-4eb8-8dd4-15bafaadfb24,69000,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Software Trust Management,3,
1799,1b868766-2557-40b9-b224-61976374b9aa,69000,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Git Based Data Model For Nosql,3,
1800,e02608d1-96d1-4516-85d8-f3343852e40c,69000,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Cryptography II,3,
1801,70f6cb0f-86e6-4ec4-893d-eb6d22698488,69800,86ad8a59-6ddc-4067-9f6b-169c8eec86a6,Research MS Thesis,1,



Shape: (1803, 6)
Columns: ['Id', 'Number', 'SubjectId', 'Title', 'CreditHours', 'Description']

=== BERKELEY COURSES PREVIEW ===


Unnamed: 0,COMPSCI,150,Electrical Engineering and Computer Sciences,5,5.1,-,"Basic building blocks and design methods to contruct synchronous digital systems, such as general purpose processors, hardware accelerators, and application specific processors. Representations and design methodologies for digital systems. Logic design using combinatorial and sequential circuits. Digital system implementation considering hardware descriptions languages, computer-aided design tools, field-programmable gate array architectures, and CMOS logic gates and state elements. Interfaces between peripherals, processor hardware, and software. Formal hardware laboratories and substantial design project.",-.1,Course is not repeatable for credit.,-.2,-.3,-.4
0,COMPSCI,152,Electrical Engineering and Computer Sciences,4.0,4,-,"Instruction set architecture, microcoding, pip...",-,Course is not repeatable for credit.,-,-,-
1,COMPSCI,160,Electrical Engineering and Computer Sciences,4.0,4,-,"The design, implementation, and evaluation of ...",-,Course is not repeatable for credit.,-,-,-
2,COMPSCI,161,Electrical Engineering and Computer Sciences,4.0,4,-,Introduction to computer security. Cryptograph...,-,Course is not repeatable for credit.,-,-,-
3,COMPSCI,162,Electrical Engineering and Computer Sciences,4.0,4,-,Basic concepts of operating systems and system...,-,Course is not repeatable for credit.,-,-,-
4,COMPSCI,164,Electrical Engineering and Computer Sciences,4.0,4,-,Survey of programming languages. The design of...,-,Course is not repeatable for credit.,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...
92,COMPSCI,H196,Electrical Engineering and Computer Sciences,3.0,3,-,Study in-depth of several topics in computer s...,-,-,-,-,-
93,COMPSCI,H196A,Electrical Engineering and Computer Sciences,1.0,4,-,Thesis work under the supervision of a faculty...,-,Course is not repeatable for credit.,-,-,-
94,COMPSCI,H196B,Electrical Engineering and Computer Sciences,1.0,4,-,Thesis work under the supervision of a faculty...,-,Course is not repeatable for credit.,-,-,-
95,COMPSCI,W10,Electrical Engineering and Computer Sciences,4.0,4,-,This course meets the programming prerequisite...,-,Course is not repeatable for credit.,-,-,-



Shape: (97, 12)
Columns: ['COMPSCI', '150', 'Electrical Engineering and Computer Sciences', '5', '5.1', '-', 'Basic building blocks and design methods to contruct synchronous digital systems, such as general purpose processors, hardware accelerators, and application specific processors. Representations and design methodologies for digital systems. Logic design using combinatorial and sequential circuits. Digital system implementation considering hardware descriptions languages, computer-aided design tools, field-programmable gate array architectures, and CMOS logic gates and state elements. Interfaces between peripherals, processor hardware, and software. Formal hardware laboratories and substantial design project.', '-.1', 'Course is not repeatable for credit.', '-.2', '-.3', '-.4']


Data preprocessing and Cleaning Section. Course IDs and others need to be separated and/or removed.

In [39]:
'''Data Cleaning for Purdue'''
#Cleaning and standardizing course data for Purdue

purdueDfCopy = purdue_df.copy() #creates a copy dataframe that will not change the original

#University Identifier
purdueDfCopy['university'] = 'Purdue University West Lafayette'

#Course codes
purdueDfCopy['course_code'] = purdue_df['SubjectId'].str.strip().str.upper()

#Clean titles and descriptions
purdueDfCopy['title'] = purdue_df['Title'].str.strip()
purdueDfCopy['description'] = purdue_df['Description'].fillna("").str.strip()

#Credit standardization
purdueDfCopy['credits'] = pd.to_numeric(purdue_df['CreditHours'], errors='coerce')

#Combined Text for NLP
purdueDfCopy['combined_text'] = purdueDfCopy['title'] + ' ' + purdueDfCopy['description']

#Course Level/Num
purdueDfCopy['course_num'] = purdue_df['Number']

#Drop Extra Columns
purdueDfCopy = purdueDfCopy.drop(['Id', 'Number', 'SubjectId', 'Title', 'CreditHours', 'Description'], axis=1)

purdue_copy = purdueDfCopy.to_csv('Purdue CSV File With Required Columns', index=False)

display(purdueDfCopy)
    

Unnamed: 0,university,course_code,title,description,credits,combined_text,course_num
0,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Professional Practice II,,0,Professional Practice II,9200
1,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Professional Practice III,,0,Professional Practice III,9300
2,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Professional Practice IV,,0,Professional Practice IV,9400
3,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Professional Practice V,,0,Professional Practice V,9500
4,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Digital Literacy,,3,Digital Literacy,10100
...,...,...,...,...,...,...,...
1798,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Software Trust Management,,3,Software Trust Management,69000
1799,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Git Based Data Model For Nosql,,3,Git Based Data Model For Nosql,69000
1800,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Cryptography II,,3,Cryptography II,69000
1801,Purdue University West Lafayette,86AD8A59-6DDC-4067-9F6B-169C8EEC86A6,Research MS Thesis,,1,Research MS Thesis,69800


In [40]:
'''Data Cleaning for Berkeley'''
#Cleaning and standardizing course data for Berkeley CS Courses

berkDfCopy = berkeley_df.copy()

#University Identifier
berkDfCopy['University'] = 'UC Berkeley'

#Course codes
berkDfCopy['course_code'] = berkeley_df[150].str.strip().str.upper()

#Clean titles and descriptions
berkDfCopy['title'] = berkeley_df['COMPSCI'].str.strip()
berkDfCopy['description'] = berkeley_df['Basic building blocks and design methods to contruct synchronous digital systems, such as general purpose processors, hardware accelerators, and application specific processors. Representations and design methodologies for digital systems. Logic design using combinatorial and sequential circuits. Digital system implementation considering hardware descriptions languages, computer-aided design tools, field-programmable gate array architectures, and CMOS logic gates and state elements. Interfaces between peripherals, processor hardware, and software. ' \
'Formal hardware laboratories and substantial design project.'].fillna("").str.strip()

#Credit standardization
berkDfCopy['credits'] = pd.to_numeric(berk_df['CreditHours'], errors='coerce')

#Combined Text for NLP
purdueDfCopy['combined_text'] = purdueDfCopy['title'] + ' ' + purdueDfCopy['description']

#Course Level/Num
purdueDfCopy['course_num'] = purdue_df['Number']

#Drop Extra Columns
purdueDfCopy = purdueDfCopy.drop(['Id', 'Number', 'SubjectId', 'Title', 'CreditHours', 'Description'], axis=1)

purdue_copy = purdueDfCopy.to_csv('Purdue CSV File With Required Columns', index=False)

display(purdueDfCopy)
    