In [134]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchtext
from torchtext.data import get_tokenizer
import spacy
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from torchtext.vocab import vocab as build_vocab
from torch.utils.data import DataLoader, TensorDataset
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [135]:
df = pd.read_csv("courses.csv")
pd.set_option('display.max_rows', None)
df.head()

Unnamed: 0,Course,ID,Department
0,Principles of Nutrition,BIOL,Biology
1,Nutrition for Fitness and Physical Activity,BIOL,Biology
2,Introduction to Human Physiology,BIOL,Biology
3,Biotechnology Management,BIOL,Biology
4,Living Biology at Brown and Beyond,BIOL,Biology


In [136]:
labels_text = df['Department']
text = df['Course']

In [137]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
tokens = [tokenizer(t) for t in text]
counter = Counter(word for seq in tokens for word in seq)
vocab = build_vocab(counter, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
print("Vocabulary size:", len(vocab))

Vocabulary size: 4489


In [138]:
sequences = [torch.tensor([vocab[token] for token in seq]) for seq in tokens]
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=vocab["<unk>"])
print(padded_sequences.shape)

torch.Size([6690, 18])


In [139]:
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels_text)
output_dim = len(label_encoder.classes_)

In [140]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels_encoded, test_size=0.2, random_state=17)
train_data = TensorDataset(X_train, torch.tensor(y_train))
test_data = TensorDataset(X_test, torch.tensor(y_test))
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [141]:
embeddings_index = {}
with open('glove.6B.100d.txt') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

In [142]:
embedding_dim = 100
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for i, word in vocab.get_stoi().items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
torch_embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)

In [None]:
class LSTMClassifier(torch.nn.Module):
    def __init__(self, vocab_size=len(vocab), embedding_dim=embedding_dim, hidden_dim=64, output_dim=output_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.embedding = torch.nn.Embedding.from_pretrained(torch_embeddings, freeze=False)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.fc = torch.nn.Linear(2*hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])
        return out

In [144]:
model = LSTMClassifier(num_layers=1)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)

LSTMClassifier(
  (embedding): Embedding(4489, 100)
  (lstm): LSTM(100, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=77, bias=True)
)


In [145]:
def train(model, train_loader, loss_fn, optimizer):
    num_batches = len(train_loader)
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        inputs, targets = batch
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(f"Batch {i+1}/{num_batches}, Loss: {loss.item():.4f}", end='\r', flush=True)

def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            inputs, targets = batch
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    accuracy = correct / total
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [146]:
num_epochs = 25
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_loader, loss_fn, optimizer)
    test(model, test_loader)

Epoch 1/25
Test Accuracy: 8.45% 3.4733
Epoch 2/25
Test Accuracy: 8.45% 3.9643
Epoch 3/25
Test Accuracy: 8.45% 3.4571
Epoch 4/25
Test Accuracy: 8.67% 4.1263
Epoch 5/25
Test Accuracy: 9.19% 3.1030
Epoch 6/25
Test Accuracy: 9.87% 2.9270
Epoch 7/25
Test Accuracy: 10.54%3.9722
Epoch 8/25
Test Accuracy: 11.66%3.0072
Epoch 9/25
Test Accuracy: 12.11%2.9849
Epoch 10/25
Test Accuracy: 12.33%2.3546
Epoch 11/25
Test Accuracy: 13.68%1.9650
Epoch 12/25
Test Accuracy: 15.77%3.4657
Epoch 13/25
Test Accuracy: 15.02%2.7489
Epoch 14/25
Test Accuracy: 17.71%2.7792
Epoch 15/25
Test Accuracy: 17.64%2.7333
Epoch 16/25
Test Accuracy: 20.70%2.1615
Epoch 17/25
Test Accuracy: 23.32%2.5497
Epoch 18/25
Test Accuracy: 24.44%1.5184
Epoch 19/25
Test Accuracy: 25.64%1.8002
Epoch 20/25
Test Accuracy: 28.62%1.3788
Epoch 21/25
Test Accuracy: 28.55%0.9237
Epoch 22/25
Test Accuracy: 31.61%2.3199
Epoch 23/25
Test Accuracy: 30.57%1.9271
Epoch 24/25
Test Accuracy: 31.39%0.9272
Epoch 25/25
Test Accuracy: 32.59%1.1600


In [150]:
predict_text = "Computational Methods in Physics"
tokens = tokenizer(predict_text)
sequence = torch.tensor([vocab[token] for token in tokens]).unsqueeze(0)
with torch.no_grad():
    model.eval()
    output = model(sequence)
    probabilities = torch.nn.functional.softmax(output, dim=1)

    top_probs, top_indices = torch.topk(probabilities, k=5)
    top_probs = top_probs.squeeze().tolist()
    top_indices = top_indices.squeeze().tolist()

    print(f"Top 5 predicted departments for '{predict_text}':")
    for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
        department = label_encoder.inverse_transform([idx])[0]
        print(f"{i+1}. {department}: {prob*100:.2f}%")

Top 5 predicted departments for 'Computational Methods in Physics':
1. Assyriology: 2.48%
2. Russian: 2.42%
3. Egyptology: 2.36%
4. Chinese: 2.19%
5. Slavic Studies: 2.18%
