In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchtext
from torchtext.data import get_tokenizer
import spacy
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from torchtext.vocab import vocab as build_vocab
from torch.utils.data import DataLoader, TensorDataset
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [24]:
df = pd.read_csv('courses.csv')
df.head()

Unnamed: 0,Course,ID,Department
0,Principles of Nutrition,BIOL,Biology
1,Nutrition for Fitness and Physical Activity,BIOL,Biology
2,Introduction to Human Physiology,BIOL,Biology
3,Biotechnology Management,BIOL,Biology
4,Living Biology at Brown and Beyond,BIOL,Biology


In [25]:
text = df['Course']
labels_text = df['Department']

In [26]:
text_list = text.tolist()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoded = tokenizer(text_list, padding=True, return_tensors='np')
sequences = encoded['input_ids']
print(sequences.shape)
max_length = sequences.shape[1]

(6690, 26)


In [27]:
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels_text)
output_dim = len(label_encoder.classes_)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(sequences, labels_encoded, test_size=0.2, random_state=17)
train_data = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
test_data = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [49]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=output_dim)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
for param in model.distilbert.parameters():
    param.requires_grad = False

In [51]:
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

In [52]:
def train_loop(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=100):
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0        

        for inputs, targets in train_loader:
            
            optimizer.zero_grad()
            outputs = model(inputs).logits
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * targets.size(0)
            correct += (outputs.argmax(1) == targets).sum().item()
            total += targets.size(0)

        train_loss = total_loss / total
        train_acc = correct / total
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        # Validation
        model.eval()
        val_loss, val_correct, val_total = 0, 0, 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs).logits
                loss = loss_fn(outputs, targets)

                val_loss += loss.item() * targets.size(0)
                val_correct += (outputs.argmax(1) == targets).sum().item()
                val_total += targets.size(0)

        val_loss /= val_total
        val_acc = val_correct / val_total
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return history

In [53]:
history = train_loop(model, train_loader, test_loader, loss_fn, optimizer, 100)
torch.save(model.state_dict(), "finetune_bert_torch.pth")

Epoch 1/100: Train Loss: 3.4866, Train Acc: 0.1388 | Val Loss: 2.9684, Val Acc: 0.2025
Epoch 2/100: Train Loss: 2.9491, Train Acc: 0.2167 | Val Loss: 2.7376, Val Acc: 0.2601
Epoch 3/100: Train Loss: 2.7096, Train Acc: 0.2780 | Val Loss: 2.5234, Val Acc: 0.3012
Epoch 4/100: Train Loss: 2.5944, Train Acc: 0.2932 | Val Loss: 2.3674, Val Acc: 0.3363
Epoch 5/100: Train Loss: 2.5186, Train Acc: 0.3154 | Val Loss: 2.3623, Val Acc: 0.3580
Epoch 6/100: Train Loss: 2.4363, Train Acc: 0.3350 | Val Loss: 2.2638, Val Acc: 0.3752
Epoch 7/100: Train Loss: 2.3754, Train Acc: 0.3416 | Val Loss: 2.2156, Val Acc: 0.3774
Epoch 8/100: Train Loss: 2.3394, Train Acc: 0.3472 | Val Loss: 2.1539, Val Acc: 0.3954
Epoch 9/100: Train Loss: 2.2700, Train Acc: 0.3602 | Val Loss: 2.1749, Val Acc: 0.3744
Epoch 10/100: Train Loss: 2.2754, Train Acc: 0.3572 | Val Loss: 2.1120, Val Acc: 0.4081
Epoch 11/100: Train Loss: 2.2366, Train Acc: 0.3780 | Val Loss: 2.0942, Val Acc: 0.4073
Epoch 12/100: Train Loss: 2.2313, Train A