In [23]:
import requests
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [24]:
# pull all class subject areas
res = requests.get('https://classes.cornell.edu/api/2.0/config/subjects.json?roster=SP23').json()
res = res["data"]["subjects"]
res

[{'value': 'AAP',
  'descr': 'Architecture, Art, and Plannin',
  'descrformal': 'Architecture, Art, and Planning'},
 {'value': 'AAS',
  'descr': 'Asian American Studies',
  'descrformal': 'Asian American Studies'},
 {'value': 'AEM',
  'descr': 'Applied Economics & Management',
  'descrformal': 'Applied Economics & Management'},
 {'value': 'AEP',
  'descr': 'Applied & Engineering Physics',
  'descrformal': 'Applied & Engineering Physics'},
 {'value': 'AGSCI',
  'descr': 'Agriculture Sciences',
  'descrformal': 'Agricultural Sciences'},
 {'value': 'AIIS',
  'descr': 'American Indian & Indigenous',
  'descrformal': 'American Indian and Indigenous Studies'},
 {'value': 'AIRS',
  'descr': 'Air Force Science',
  'descrformal': 'Aerospace Studies'},
 {'value': 'ALS',
  'descr': 'Agriculture & Life Sciences',
  'descrformal': 'Agriculture & Life Sciences'},
 {'value': 'AMST',
  'descr': 'American Studies',
  'descrformal': 'American Studies'},
 {'value': 'ANSC', 'descr': 'Animal Science', 'des

In [25]:
from collections import defaultdict
import json
import os

class_data_path = 'scraped_classes.json'

subj_classes = None
if not os.path.exists(class_data_path):
    subj_codes = list(map(lambda s_dict : s_dict["value"], res))
    subj_classes = defaultdict(list)
    for subj in subj_codes:
        print(f'Scraping {subj}...')
        res = requests.get(f'https://classes.cornell.edu/api/2.0/search/classes.json?roster=SP23&subject={subj}').json()
        res = res['data']['classes']
        for cls in res:
            subj_classes[subj].append((subj, cls["catalogNbr"], cls["titleShort"], cls['description']))
        print(f'Scraped {subj}')
        
        json.dump(subj_classes, open(class_data_path, 'w'))
else:
    subj_classes = json.load(class_data_path)

Scraping AAP...
Scraped AAP
Scraping AAS...
Scraped AAS
Scraping AEM...
Scraped AEM
Scraping AEP...
Scraped AEP
Scraping AGSCI...
Scraped AGSCI
Scraping AIIS...
Scraped AIIS
Scraping AIRS...
Scraped AIRS
Scraping ALS...
Scraped ALS
Scraping AMST...
Scraped AMST
Scraping ANSC...
Scraped ANSC
Scraping ANTHR...
Scraped ANTHR
Scraping ARAB...
Scraped ARAB
Scraping ARCH...
Scraped ARCH
Scraping ARKEO...
Scraped ARKEO
Scraping ART...
Scraped ART
Scraping ARTH...
Scraped ARTH
Scraping AS...
Scraped AS
Scraping ASIAN...
Scraped ASIAN
Scraping ASL...
Scraped ASL
Scraping ASRC...
Scraped ASRC
Scraping ASTRO...
Scraped ASTRO
Scraping BANA...
Scraped BANA
Scraping BCS...
Scraped BCS
Scraping BEE...
Scraped BEE
Scraping BENGL...
Scraped BENGL
Scraping BIOAP...
Scraped BIOAP
Scraping BIOEE...
Scraped BIOEE
Scraping BIOG...
Scraped BIOG
Scraping BIOMG...
Scraped BIOMG
Scraping BIOMI...
Scraped BIOMI
Scraping BIOMS...
Scraped BIOMS
Scraping BIONB...
Scraped BIONB
Scraping BME...
Scraped BME
Scraping B

In [28]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

def clean(text):
    text = re.sub(r'\|\|\|', r' ', text)
    text = text.replace('„','')
    text = text.replace('“','')
    text = text.replace('"','')
    text = text.replace('\'','')
    text = text.replace('-','')
    text = text.lower()
    return text

def remove_stopwords(text):
    engl_stops = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in engl_stops])

def tokenize(text):
    return list(filter(lambda word : len(word) > 3, word_tokenize(text)))

tagged_descs = []
cmp_classes = []
total = 0
for i, cls_arr in enumerate(subj_classes.values()):
    print(total, end='\r')
    for dep, code, title, desc in cls_arr:
        if desc and len(desc) > 100:
            cmp_classes.append((dep, code, title, desc))
            tagged_descs.append(TaggedDocument(words=tokenize(remove_stopwords(clean(title + " " + desc))), tags=[total]))
            total += 1

4116

In [29]:
from gensim.models.callbacks import CallbackAny2Vec
import multiprocessing

class VecCallback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        self.epoch += 1
        print(f"Starting epoch {self.epoch}")
        
    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print(f'Loss after epoch {self.epoch}: {loss}')
        
cores = multiprocessing.cpu_count()
vec_model = Doc2Vec(dm=0, vector_size=50, alpha=0.025, workers=cores)
print("Initialized model")
vec_model.build_vocab(tagged_descs)
print("Built vocab")
vec_model.train(tagged_descs, total_examples=vec_model.corpus_count, epochs=50, callbacks=[VecCallback()])

Initialized model
Built vocab
Starting epoch 1
Loss after epoch 1: 0.0
Starting epoch 2
Loss after epoch 2: 0.0
Starting epoch 3
Loss after epoch 3: 0.0
Starting epoch 4
Loss after epoch 4: 0.0
Starting epoch 5
Loss after epoch 5: 0.0
Starting epoch 6
Loss after epoch 6: 0.0
Starting epoch 7
Loss after epoch 7: 0.0
Starting epoch 8
Loss after epoch 8: 0.0
Starting epoch 9
Loss after epoch 9: 0.0
Starting epoch 10
Loss after epoch 10: 0.0
Starting epoch 11
Loss after epoch 11: 0.0
Starting epoch 12
Loss after epoch 12: 0.0
Starting epoch 13
Loss after epoch 13: 0.0
Starting epoch 14
Loss after epoch 14: 0.0
Starting epoch 15
Loss after epoch 15: 0.0
Starting epoch 16
Loss after epoch 16: 0.0
Starting epoch 17
Loss after epoch 17: 0.0
Starting epoch 18
Loss after epoch 18: 0.0
Starting epoch 19
Loss after epoch 19: 0.0
Starting epoch 20
Loss after epoch 20: 0.0
Starting epoch 21
Loss after epoch 21: 0.0
Starting epoch 22
Loss after epoch 22: 0.0
Starting epoch 23
Loss after epoch 23: 0.0

In [48]:
def preprocess(text):
    return tokenize(remove_stopwords(clean(text)))

def gen_recs(inp, num_recs):
    inf = vec_model.infer_vector(preprocess(inp))
    sims = vec_model.dv.most_similar([inf], topn=100)
    titles = set()
    satisfied = 0
    for i, sim in sims:
        course = cmp_classes[i]
        if course[2] not in titles:
            titles.add(course[2])
            satisfied += 1
            print(sim)
            print(course)
            if satisfied == num_recs:
                break
    print(len(sims))

print("=" * 10)
gen_recs('generative ai', 10)
print("=" * 10)
gen_recs('puzzles and paradoxes philosophy', 10)

0.7840315103530884
('LING', '4424', 'Computational Linguistics I', 'Computational models of natural languages. Topics are drawn from: tree syntax and context free grammar, finite state generative morpho-phonology, feature structure grammars, logical semantics, tabular parsing, Hidden Markov models, categorial and minimalist grammars, text corpora, information-theoretic sentence processing, discourse relations, and pronominal coreference.')
0.753027081489563
('CS', '6785', 'Deep Generative Models', 'Generative models are a class of machine learning algorithms that define probability distributions over complex, high-dimensional objects such as images, sequences, and graphs. Recent advances in deep neural networks and optimization algorithms have significantly enhanced the capabilities of these models and renewed research interest in them. This course explores the foundational probabilistic principles of deep generative models, their learning algorithms, and popular model families, which 

In [31]:
vec_model.save('course_embeddings.model')

In [46]:
json.dump(cmp_classes, open('parsed_courses.json', 'w'))