In [44]:
import torch

In [45]:

if torch.backends.mps.is_available():
    torch_device = torch.device("mps:0")
    x = torch.ones(1, device=torch_device)
    print (x)
elif torch.cuda.is_available():
    torch_device = torch.device("cuda")
    print ("MPS device not found. Using CUDA.")
else:
    torch_device = torch.device("cpu")
    print ("No accelerator device found. Using CPU.")

MPS device not found. Using CUDA.


## Preload

In [46]:
from transformers import GPT2LMHeadModel

# Load the model
gpt2_small = GPT2LMHeadModel.from_pretrained(
    "openai-community/gpt2", 
).to(torch_device)


In [47]:
gpt2_small

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [48]:
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L458
gpt2_small.transformer.h[0].attn

GPT2SdpaAttention(
  (c_attn): Conv1D(nf=2304, nx=768)
  (c_proj): Conv1D(nf=768, nx=768)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

## Dataset

In [49]:
import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "../data/sms-spam-collection.zip"
extracted_path = "../data/sms-spam-collection"

data_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(
        url, zip_path, extracted_path, data_path
):
    if data_path.exists():
        print(f"Data already exists at {data_path}")
    else:
        with urllib.request.urlopen(url) as response:
            with open(zip_path, "wb") as out_file:
                out_file.write(response.read())

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extracted_path)
        
        original_file_path = Path(extracted_path) / "SMSSpamCollection"

        os.rename(original_file_path, data_path)
        print(f"Data downloaded and extracted to {data_path}")
    return


In [50]:
download_and_unzip_spam_data(url, zip_path, extracted_path, data_path)

Data already exists at ../data/sms-spam-collection/SMSSpamCollection.tsv


In [51]:
import pandas as pd

df = pd.read_csv(data_path, sep="\t", header=None, names = ["label", "text"])
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [52]:
print(df.label.value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


In [53]:
def word_count(text):
    return len(text.split())

df["word_count"] = df["text"].apply(word_count)

df.groupby("label").word_count.mean()

label
ham     14.310259
spam    23.911647
Name: word_count, dtype: float64

In [54]:
spam_df = df.where(df["label"] == "spam").dropna().reset_index(drop=True)
ham_df = df.where(df["label"] == "ham").dropna().reset_index(drop=True)
ham_sample_df = ham_df.sample(747)
balanced_df = pd.concat([spam_df, ham_sample_df])

balanced_df.label.value_counts()
train_df = balanced_df.sample(frac=0.8, random_state=1)
test_df = balanced_df.drop(train_df.index)

train_df.label.value_counts(),test_df.label.value_counts()

(label
 spam    602
 ham     593
 Name: count, dtype: int64,
 label
 spam    128
 ham     126
 Name: count, dtype: int64)

In [55]:
train_df = balanced_df.sample(frac=0.8, random_state=1)
test_df = balanced_df.drop(train_df.index)

train_df.label.value_counts(),test_df.label.value_counts()

(label
 spam    602
 ham     593
 Name: count, dtype: int64,
 label
 spam    128
 ham     126
 Name: count, dtype: int64)

In [56]:
# train_df.to_csv("../data/sms-spam-collection/train.tsv", index=False, sep="\t")
# test_df.to_csv("../data/sms-spam-collection/test.tsv", index=False, sep="\t")

In [57]:
def random_split(df, train_ratio=0.8, validation_ratio=0.1):
    # Calculate the size of each set
    train_size = int(len(df) * train_ratio)
    validation_size = int(len(df) * validation_ratio)
    
    # Shuffle the DataFrame
    df = df.sample(frac=1).reset_index(drop=True)
    
    # Split the DataFrame
    train_df = df.iloc[:train_size]
    validation_df = df.iloc[train_size:train_size + validation_size]
    test_df = df.iloc[train_size + validation_size:]
    
    return train_df, validation_df, test_df

In [58]:
train_df, validation_df, test_df = random_split(balanced_df, train_ratio=0.8, validation_ratio=0.1)
train_df.to_csv("../data/sms-spam-collection/train.tsv", index=False, sep="\t")
validation_df.to_csv("../data/sms-spam-collection/validation.tsv", index=False, sep="\t")
test_df.to_csv("../data/sms-spam-collection/test.tsv", index=False, sep="\t")

### Dataloader

In [59]:
train_df.head()

Unnamed: 0,label,text,word_count
0,spam,-PLS STOP bootydelious (32/F) is inviting you ...,24.0
1,ham,Convey my regards to him,5.0
2,spam,4mths half price Orange line rental & latest c...,26.0
3,ham,"Cool, I'll text you when I'm on the way",9.0
4,ham,Anything lor... U decide...,4.0


In [60]:
import torch
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file, sep="\t", header=0)
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["text"]
        ]
        if max_length is None:
            self.max_length = max(len(text) for text in self.encoded_texts)
        else:
            self.max_length = max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]
        self.encoded_texts = torch.tensor(self.encoded_texts)
        self.labels = torch.tensor(self.data["label"].map({"spam": 1, "ham": 0}).values)

    def __getitem__(self, idx):
        return self.encoded_texts[idx], self.labels[idx]

    def __len__(self):
        return len(self.data)


In [61]:
sample = pd.read_csv("../data/sms-spam-collection/train.tsv", sep="\t", header=0)
sample

Unnamed: 0,label,text,word_count
0,spam,-PLS STOP bootydelious (32/F) is inviting you ...,24.0
1,ham,Convey my regards to him,5.0
2,spam,4mths half price Orange line rental & latest c...,26.0
3,ham,"Cool, I'll text you when I'm on the way",9.0
4,ham,Anything lor... U decide...,4.0
...,...,...,...
1190,spam,Hi I'm sue. I am 20 years old and work as a la...,35.0
1191,ham,Thats cool. Where should i cum? On you or in y...,12.0
1192,ham,excellent. I spent &lt;#&gt; years in the Ai...,21.0
1193,ham,I thk u dun haf 2 hint in e forum already lor....,23.0


In [62]:
sample.label.describe()

count     1195
unique       2
top        ham
freq       610
Name: label, dtype: object

In [63]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
spam_dataset = SpamDataset(
    "../data/sms-spam-collection/train.tsv", 
    tokenizer, 
    max_length=1024,
    pad_token_id=50256,
)
for i in range(5):
    print(spam_dataset[i])


(tensor([   12,  6489,    50,  ..., 50256, 50256, 50256]), tensor(1))
(tensor([ 3103,  3304,   616,  ..., 50256, 50256, 50256]), tensor(0))
(tensor([   19,    76,  9998,  ..., 50256, 50256, 50256]), tensor(1))
(tensor([34530,    11,   314,  ..., 50256, 50256, 50256]), tensor(0))
(tensor([40028,   300,   273,  ..., 50256, 50256, 50256]), tensor(0))


In [64]:
NUM_WORKERS = 0
BATCH_SIZE = 8

train_dataset = SpamDataset(
    "../data/sms-spam-collection/train.tsv", 
    tokenizer, 
    max_length=1024,
    pad_token_id=50256,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=NUM_WORKERS,
)

validation_dataset = SpamDataset(
    "../data/sms-spam-collection/validation.tsv", 
    tokenizer, 
    max_length=1024,
    pad_token_id=50256,
)
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=True,
    num_workers=NUM_WORKERS,
)

test_dataset = SpamDataset(
    "../data/sms-spam-collection/test.tsv", 
    tokenizer, 
    max_length=1024,
    pad_token_id=50256,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=True,
    num_workers=NUM_WORKERS,
)


## Training Utilities

In [65]:
def generate_text_simple(model, token_ids, max_new_tokens, context_size):
    logits = None
    for i in range(max_new_tokens):
        context_token_ids = token_ids[:, -context_size:]
        with torch.no_grad():
            logits = model(context_token_ids)
        if not isinstance(logits, torch.Tensor):
            logits = logits.logits

        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        token_id_next = torch.argmax(probas, dim=1, keepdim=True)  # Pure Greed
        token_ids = torch.cat((token_ids, token_id_next), dim=1) 

    return token_ids

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special=set(['<|endoftext|>']))
    encoded_tensor = torch.tensor(encoded).unsqueeze(0).to(torch_device)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    decoded = tokenizer.decode(token_ids[0].tolist())
    return decoded


In [66]:
from torch import nn
def loss_fn(logits, targets):
    vocab_size = logits.shape[-1]
    loss = nn.CrossEntropyLoss()(logits.view(-1, vocab_size), targets.view(-1))
    return loss

In [67]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch).logits
    loss = torch.nn.functional.cross_entropy(
        logits[:, -1, :],
        target_batch,
    )
    return loss

In [68]:
def calc_loss_loader(dataloader, model, device, num_batches=None):
    total_loss = 0
    if len(dataloader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))
    for i, (input_batch, target_batch) in enumerate(dataloader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [69]:
def calc_accuracy_loader(dataloader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))
    for i, (input_batch, target_batch) in enumerate(dataloader):
        if i < num_batches:
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)
            with torch.no_grad():
                logits = model(input_batch).logits[:, -1, :]
            predicted_labels = torch.argmax(logits, dim=-1)
            correct_predictions += (predicted_labels == target_batch).sum().item()
            num_examples += predicted_labels.shape[0]
        else:
            break
    return correct_predictions / num_examples


### Preloaded model testing

In [70]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

In [71]:
sample_input = "Every effort moves you"
token_ids = generate_text_simple(
    gpt2_small, 
    text_to_token_ids(sample_input, tokenizer), 
    max_new_tokens=15, 
    context_size=GPT_CONFIG_124M["context_length"]
)

In [72]:
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


In [73]:
sample_spam_q = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner and you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)
token_ids = generate_text_simple(
    gpt2_small, 
    text_to_token_ids(sample_spam_q, tokenizer), 
    max_new_tokens=15, 
    context_size=GPT_CONFIG_124M["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))


Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner and you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or


## Fine-tuning with new head

In [74]:
model = gpt2_small.to(torch_device)

In [75]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [76]:
# Freeze all parameters except the new head
for param in model.parameters():
    param.requires_grad = False
for param in model.transformer.h[-1].parameters():
    param.requires_grad = True
for param in model.transformer.ln_f.parameters():
    param.requires_grad = True


In [77]:
from torch import nn
torch.manual_seed(123)
num_classes = 2
model.lm_head = nn.Linear(GPT_CONFIG_124M["emb_dim"], num_classes).to(torch_device)


In [78]:
sample_spam_q = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner and you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)
token_ids = generate_text_simple(
    model, 
    text_to_token_ids(sample_spam_q, tokenizer), 
    max_new_tokens=15, 
    context_size=GPT_CONFIG_124M["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))


Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner and you have been specially selected to receive $1000 cash or a $2000 award.'""""""""!""""""


In [79]:
torch.manual_seed(123)
train_accuracy = calc_accuracy_loader(train_dataloader, model, torch_device, num_batches=10)
validation_accuracy = calc_accuracy_loader(validation_dataloader, model, torch_device, num_batches=10)
test_accuracy = calc_accuracy_loader(test_dataloader, model, torch_device, num_batches=10)
print(f"Train accuracy: {train_accuracy:.4f}, Validation accuracy: {validation_accuracy:.4f}, Test accuracy: {test_accuracy:.4f}")

Train accuracy: 0.5000, Validation accuracy: 0.6500, Test accuracy: 0.5250


In [80]:
len(train_dataloader), len(validation_dataloader), len(test_dataloader)

(149, 18, 18)

In [81]:
from collections import Counter
print(Counter([label.item() for _, label in list(train_dataset)]))
print(Counter([label.item() for _, label in list(validation_dataset)]))
print(Counter([label.item() for _, label in list(test_dataset)]))

Counter({0: 610, 1: 585})
Counter({1: 89, 0: 60})
Counter({0: 77, 1: 73})


In [82]:
train_loss = calc_loss_loader(train_dataloader, model, torch_device, num_batches=10)
validation_loss = calc_loss_loader(validation_dataloader, model, torch_device, num_batches=10)
test_loss = calc_loss_loader(test_dataloader, model, torch_device, num_batches=10)
print(f"Train loss: {train_loss:.4f}, Validation loss: {validation_loss:.4f}, Test loss: {test_loss:.4f}")

Train loss: 4.6865, Validation loss: 3.1915, Test loss: 4.3300


## Train

In [83]:
def evaluate_model(model, train_dataloader, validation_dataloader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_dataloader, model, device, num_batches=eval_iter)
        validation_loss = calc_loss_loader(validation_dataloader, model, device, num_batches=eval_iter)
    model.train()
    print(f"Train loss: {train_loss:.4f}, validation loss: {validation_loss:.4f}")
    return train_loss, validation_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    token_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(model, token_ids, 50, context_size)
    decoded_text = tokenizer.decode(token_ids[0].tolist())
    print(decoded_text.replace("\n", " "))
    model.train()

In [84]:
def train_classifier_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs, eval_freq, eval_iter):
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(
                input_batch, target_batch, model, device
            )
            loss.backward()
            optimizer.step()
            examples_seen += input_batch.shape[0]
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)

        train_acc = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_acc = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)

        train_accs.append(train_acc)
        val_accs.append(val_acc)

        print(f"Training accuracy: {train_acc:.4f}")
        print(f"Validation accuracy: {val_acc:.4f}")
        print(f"Epoch {epoch+1} complete")

    return train_losses, val_losses, train_accs, val_accs, examples_seen


In [85]:
import time
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 0.00005, weight_decay = 0.1
)
num_epochs = 10
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_dataloader, validation_dataloader, optimizer, torch_device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Train loss: 5.3558, validation loss: 2.9764
Train loss: 0.8397, validation loss: 0.6396
Train loss: 0.8220, validation loss: 0.6263
Training accuracy: 0.6250
Validation accuracy: 0.6750
Epoch 1 complete
Train loss: 0.6729, validation loss: 0.6263
Train loss: 0.7317, validation loss: 0.6075
Train loss: 0.6370, validation loss: 0.6028
Training accuracy: 0.7500
Validation accuracy: 0.9000
Epoch 2 complete
Train loss: 0.6134, validation loss: 0.6003
Train loss: 0.7582, validation loss: 0.5764
Train loss: 0.6608, validation loss: 0.5650
Training accuracy: 0.5250
Validation accuracy: 0.6500
Epoch 3 complete
Train loss: 0.7180, validation loss: 0.5742
Train loss: 0.8599, validation loss: 0.5591
Train loss: 0.6453, validation loss: 0.5080
Training accuracy: 0.6500
Validation accuracy: 0.7750
Epoch 4 complete
Train loss: 0.4907, validation loss: 0.4852
Train loss: 0.7812, validation loss: 0.5434
Train loss: 1.2284, validation loss: 0.5756
Training accuracy: 0.6000
Validation accuracy: 0.6750
Ep

In [86]:
calc_accuracy_loader(test_dataloader, model, torch_device)

0.8125