# Imports

In [47]:
import pickle

import torch
from torch.utils.data import Dataset, DataLoader

from collections import Counter

import torch.nn as nn

import torch.optim as optim
import torch.nn.functional as F

from torchinfo import summary
from torchviz import make_dot
from torchview import draw_graph

# Load In Processed Class Balances

In [4]:
# load the processed data 
with open('outputs_processing/processed_data.pkl', 'rb') as f:
    data = pickle.load(f)

# unpack everything you saved
X_train_tfidf = data['X_train_tfidf']
X_test_tfidf  = data['X_test_tfidf']
y_train_enc   = data['y_train_enc']
y_test_enc    = data['y_test_enc']

X_train = data['X_train']
X_test  = data['X_test']
y_train = data['y_train']
y_test  = data['y_test']

X = data['X']
y = data['y']

print("Data loaded successfully from outputs_processing/processed_data.pkl")
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")
print(f"Example text: {X_train[0][:120]}...")

Data loaded successfully from outputs_processing/processed_data.pkl
Train samples: 9912, Test samples: 4248
Example text: complexity course concentrated last week...


In [5]:
X_train.head()

8405     much disjointed information felt absolutely cr...
6444     pain installing necessary tool course linux ma...
12444    faculty good explanation thank much coursera m...
10962    would let leave course signing due family issu...
13863                 course change way thinking thank lot
Name: processed, dtype: object

# Define Encoding for Baseline Models

In [28]:
# 1. Build vocabulary from training text
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for txt in texts:
        counter.update(txt.split())
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(X_train)
vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

# 2. Encoding function
def encode_text(text, vocab, max_len=100):
    tokens = text.split()
    ids = [vocab.get(tok, vocab['<UNK>']) for tok in tokens[:max_len]]
    if len(ids) < max_len:
        ids += [vocab['<PAD>']] * (max_len - len(ids))
    return torch.tensor(ids, dtype=torch.long)

Vocab size: 6224


# Create Dataset Class

In [29]:
class TokenizedTextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, vocab, max_len=100):
        self.texts = list(texts)
        self.labels = list(labels)
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text_tensor = encode_text(self.texts[idx], self.vocab, self.max_len)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return text_tensor, label_tensor


In [None]:
class TextDataset(Dataset):
    """
    PyTorch Dataset for text and labels.
    Works whether texts/labels are lists or numpy arrays.
    """
    def __init__(self, texts, labels, transform=None):
        # Convert to lists to ensure standard indexing
        self.texts = list(texts)
        self.labels = list(labels)
        self.transform = transform

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        if self.transform:
            text = self.transform(text)

        return text, label

In [20]:
train_dataset = TextDataset(X_train, y_train_enc)
test_dataset  = TextDataset(X_test, y_test_enc)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [25]:
for texts, labels in train_loader:
    print(texts[0][:100])
    print(labels[:5])
    break


even though course cover lot fundamental introduction algorithm course designed good expected starte
tensor([0, 2, 0, 1, 0])


In [30]:
train_dataset = TokenizedTextDataset(X_train, y_train_enc, vocab, max_len=100)
test_dataset  = TokenizedTextDataset(X_test, y_test_enc, vocab, max_len=100)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=32)

# Define Baseline LSTM Model

In [32]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

# Define Baseline GRU Model

In [33]:
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, h_n = self.gru(x)
        out = self.fc(h_n[-1])
        return out

# Define Training Loop CPU Friendly

In [36]:
device = torch.device("cpu")  

model = LSTMClassifier(vocab_size=vocab_size, num_classes=len(set(y_train_enc)))
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/3, Loss: 1.0972
Epoch 2/3, Loss: 1.0899
Epoch 3/3, Loss: 1.0813


In [37]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        preds = model(batch_X)
        predicted = torch.argmax(preds, dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

print(f"Accuracy: {100 * correct / total:.2f}%")


Accuracy: 36.16%


# Visualize Models

In [43]:
model = LSTMClassifier(vocab_size=vocab_size, num_classes=len(set(y_train_enc)))

summary(model, input_size=(100,), dtypes=[torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
LSTMClassifier                           [3]                       --
├─Embedding: 1-1                         [100, 128]                796,672
├─LSTM: 1-2                              [100, 128]                132,096
├─Linear: 1-3                            [3]                       387
Total params: 929,155
Trainable params: 929,155
Non-trainable params: 0
Total mult-adds (Units.GIGABYTES): 1.77
Input size (MB): 0.00
Forward/backward pass size (MB): 0.20
Params size (MB): 3.72
Estimated Total Size (MB): 3.92

In [46]:
x = torch.randint(0, vocab_size, (1, 100))  # dummy input
y = model(x)

dot = make_dot(y, params=dict(model.named_parameters()))
dot.format = 'png'
dot.render('model_architecture')

'model_architecture.png'

In [48]:
x = torch.randint(0, vocab_size, (1, 100))
model_graph = draw_graph(model, input_data=x, expand_nested=True)
model_graph.visual_graph.render("lstm_architecture", format="png")




'lstm_architecture.png'