In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchtext
from torchtext.data import get_tokenizer
import spacy
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from torchtext.vocab import vocab as build_vocab
from torch.utils.data import DataLoader, TensorDataset
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv("courses.csv")
pd.set_option('display.max_rows', None)
df.head()

Unnamed: 0,Course,ID,Department
0,Principles of Nutrition,BIOL,Biology
1,Nutrition for Fitness and Physical Activity,BIOL,Biology
2,Introduction to Human Physiology,BIOL,Biology
3,Biotechnology Management,BIOL,Biology
4,Living Biology at Brown and Beyond,BIOL,Biology


In [3]:
labels_text = df['Department']
text = df['Course']

In [4]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
tokens = [tokenizer(t) for t in text]
counter = Counter(word for seq in tokens for word in seq)
vocab = build_vocab(counter, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
print("Vocabulary size:", len(vocab))

Vocabulary size: 4489


In [5]:
sequences = [torch.tensor([vocab[token] for token in seq]) for seq in tokens]
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=vocab["<unk>"])
print(padded_sequences.shape)

torch.Size([6690, 18])


In [6]:
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels_text)
output_dim = len(label_encoder.classes_)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels_encoded, test_size=0.2, random_state=17)
train_data = TensorDataset(X_train, torch.tensor(y_train))
test_data = TensorDataset(X_test, torch.tensor(y_test))
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [8]:
embeddings_index = {}
with open('glove.6B.100d.txt') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

In [9]:
embedding_dim = 100
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for i, word in vocab.get_stoi().items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
torch_embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)

In [71]:
class LSTMClassifier(torch.nn.Module):
    def __init__(self, vocab_size=len(vocab), embedding_dim=embedding_dim, hidden_dim=64, output_dim=output_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = num_layers * 2 # Bidirectional LSTM doubles the layer dimension
        self.embedding = torch.nn.Embedding.from_pretrained(torch_embeddings, freeze=False)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.fc = torch.nn.Linear(2*hidden_dim, output_dim)

    def forward(self, x, h0=None, c0=None):
        if h0 is None or c0 is None:
            h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
            c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        embedded = self.embedding(x)
        lstm_out, (hn, cn) = self.lstm(embedded, (h0, c0))
        out = self.fc(lstm_out[:, -1, :])
        return out, hn, cn

In [72]:
model = LSTMClassifier(num_layers=1)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)

LSTMClassifier(
  (embedding): Embedding(4489, 100)
  (lstm): LSTM(100, 64, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=77, bias=True)
)


In [74]:
def train_loop(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=10):
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0        

        for inputs, targets in train_loader:
            
            h0 = torch.zeros(model.layer_dim, inputs.size(0), model.hidden_dim)
            c0 = torch.zeros(model.layer_dim, inputs.size(0), model.hidden_dim)

            optimizer.zero_grad()
            outputs, h0, c0 = model(inputs, h0, c0)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * targets.size(0)
            correct += (outputs.argmax(1) == targets).sum().item()
            total += targets.size(0)

        train_loss = total_loss / total
        train_acc = correct / total
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                h0 = torch.zeros(model.layer_dim, inputs.size(0), model.hidden_dim)
                c0 = torch.zeros(model.layer_dim, inputs.size(0), model.hidden_dim)
                outputs, _, _ = model(inputs, h0, c0)
                loss = loss_fn(outputs, targets)

                val_loss += loss.item() * targets.size(0)
                val_correct += (outputs.argmax(1) == targets).sum().item()
                val_total += targets.size(0)

        val_loss /= val_total
        val_acc = val_correct / val_total
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return history

In [None]:
history = train_loop(model, train_loader, test_loader, loss_fn, optimizer, num_epochs=100)

Epoch 1/25: Train Loss: 3.5077, Train Acc: 0.1041 | Val Loss: 3.6458, Val Acc: 0.0972
Epoch 2/25: Train Loss: 3.0960, Train Acc: 0.1482 | Val Loss: 3.6044, Val Acc: 0.0919
Epoch 3/25: Train Loss: 2.7523, Train Acc: 0.2244 | Val Loss: 3.5035, Val Acc: 0.1465
Epoch 4/25: Train Loss: 2.3233, Train Acc: 0.3445 | Val Loss: 3.4565, Val Acc: 0.1951


KeyboardInterrupt: 

In [52]:
predict_text = "Physics 101"
tokens = tokenizer(predict_text)
sequence = torch.tensor([vocab[token] for token in tokens]).unsqueeze(0)
with torch.no_grad():
    model.eval()
    h0 = c0 = None
    output = model(sequence, h0, c0)[0]
    probabilities = torch.nn.functional.softmax(output, dim=1)

    top_probs, top_indices = torch.topk(probabilities, k=5)
    top_probs = top_probs.squeeze().tolist()
    top_indices = top_indices.squeeze().tolist()

    print(f"Top 5 predicted departments for '{predict_text}':")
    for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
        department = label_encoder.inverse_transform([idx])[0]
        print(f"{i+1}. {department}: {prob*100:.2f}%")

Top 5 predicted departments for 'Physics 101':
1. Physics: 8.73%
2. Assyriology: 8.62%
3. Neuroscience: 4.49%
4. Ancient Greek: 3.81%
5. Egyptology: 3.74%
