In [87]:
from collections import Counter
import numpy as np
import numpy.typing as npt
from matplotlib import pyplot as plt
import pandas as pd
from typing import List, Tuple, Any, Dict

import transformers
import torch

In [103]:
# Pick the compute device: use the MPS backend when this PyTorch build reports
# it as available, otherwise fall back to None.
if torch.backends.mps.is_available():
    run_device = torch.device("mps")
else:
    run_device = None
run_device

device(type='mps')

In [69]:
# Load the pretrained "gpt2" tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that batches can be
# tokenized with `padding=True` further down.
tokenizer.pad_token = tokenizer.eos_token

In [83]:
# Tokenize a tiny two-sentence batch to inspect what the tokenizer returns:
# the shorter sentence is padded to the length of the longer one, and the
# attention mask marks which positions hold real tokens.
toy_tokenized = tokenizer(
    ["This is a brown fox, Sammmmmy!", "Nice"],
    max_length=1024,
    truncation=True,
    padding=True,
    return_tensors="pt",
)
toy_tokenized

{'input_ids': tensor([[ 1212,   318,   257,  7586, 21831,    11,  3409, 27532,  1820,     0],
        [35284, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [104]:
SPAM_HAM_CSV = "data/spam_ham_dataset.csv"


def prepare_spam_ham(
    csv_path: str,
    tokenizer,
    test_split_fraction: float = 0.2,
    rand_seed: int = 1,
    device: torch.DeviceObjType = None,
):
    """Read dataset.

    Returns:
      train_tokens, test_tokens
    """
    df = pd.read_csv(csv_path, index_col=0)
    df = df.sample(frac=1, random_state=rand_seed)
    train_size = int(len(df) * (1 - test_split_fraction))
    train_split = df.iloc[:train_size]
    test_split = df.iloc[train_size:]
    res = []

    for split_df in [train_split, test_split]:
        tokenized = tokenizer(
            split_df["text"].to_list(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
        )
        labels = split_df["label"] == "spam"
        labels_tensor = torch.tensor(labels.to_list(), dtype=torch.long)
        if device:
            labels_tensor = labels_tensor.to(device=device)
            tokenized = {k:v.to(device=device) for k, v in tokenized.items()}
        res.append(tokenized)
        res.append(labels_tensor)
    return res[0], res[1], res[2], res[3]


# Build the tokenized train/test splits (and their label tensors) from the CSV,
# placing them on the selected device.
(
    train_tokenized,
    train_labels,
    test_tokenized,
    test_labels,
) = prepare_spam_ham(csv_path=SPAM_HAM_CSV, tokenizer=tokenizer, device=run_device)

In [105]:
# Inspect the tokenized training split (token ids and attention mask).
train_tokenized

{'input_ids': tensor([[19776,    25,   644,  ..., 50256, 50256, 50256],
         [19776,    25,   289,  ..., 50256, 50256, 50256],
         [19776,    25,  8174,  ..., 50256, 50256, 50256],
         ...,
         [19776,    25,  9912,  ..., 50256, 50256, 50256],
         [19776,    25, 11149,  ...,   289,  6526,   201],
         [19776,    25,  7627,  ..., 50256, 50256, 50256]], device='mps:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0]], device='mps:0')}

In [106]:
# Inspect the training labels (1 where the CSV label is "spam", else 0).
train_labels

tensor([0, 0, 1,  ..., 0, 0, 0], device='mps:0')

In [107]:
# Load the pretrained "gpt2" checkpoint with a sequence-classification head and
# place it on the selected device; the last line displays the architecture.
classifier_model = transformers.GPT2ForSequenceClassification.from_pretrained(
    "gpt2"
).to(device=run_device)
classifier_model

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [109]:
# Tell the model which token id is padding (the tokenizer pads with EOS).
# NOTE(review): the classification head presumably uses this to find the last
# real token of each padded sequence - confirm against the Transformers docs.
classifier_model.config.pad_token_id = tokenizer.pad_token_id
classifier_model.eval()
# This is only a sanity-check forward pass on the toy batch: disable autograd
# so no graph (and the memory it holds) is kept alive on the device.
with torch.no_grad():
    res = classifier_model(
        toy_tokenized["input_ids"].to(run_device),
        attention_mask=toy_tokenized["attention_mask"].to(run_device),
    )
print("keys:", res.keys())
res["logits"]

keys: odict_keys(['logits', 'past_key_values'])


tensor([[ 8.9129, -0.2032],
        [ 1.4922, -0.1870]], device='mps:0', grad_fn=<IndexBackward0>)

In [101]:
# Re-select the device with the same availability guard used when it was first
# chosen, so a top-to-bottom run does not force "mps" on machines without it.
run_device = torch.device("mps") if torch.backends.mps.is_available() else None

# Move every tensor of the tokenized training split (not only the input ids),
# so ids and attention mask always live on the same device.
if run_device is not None:
    for name in list(train_tokenized):
        train_tokenized[name] = train_tokenized[name].to(device=run_device)

In [112]:
def train_loop(
  model: transformers.PreTrainedModel,
  tokens: Dict[str, torch.Tensor],
  labels: torch.Tensor,
  *,
  optimizer: torch.optim.Optimizer | None = None,
  lr: float = 1e-5,
  epochs: int = 1,
  batch_size: int = 8,
  device: torch.DeviceObjType | None = None,
) -> None:
  model.train()
  loss = torch.nn.CrossEntropyLoss()
  if not optimizer:
    optimizer = torch.optim.AdamW(classifier_model.parameters(), lr=lr)
  
  step = 0

  print("Starting the training loop ...")
  for epoch in range(epochs):
    total_examples = tokens["input_ids"].shape[0]
    indices = torch.randperm(total_examples, device=device)
    for batch_start in range(0, total_examples, batch_size)[:-1]:
      step += 1
      optimizer.zero_grad()
      batch_indices = indices[batch_start: batch_start + batch_size]
      batch_tokens = tokens["input_ids"][batch_indices]
      batch_attention_masks = tokens["attention_mask"][batch_indices]
      batch_labels = labels[batch_indices]
      logits = classifier_model(batch_tokens, attention_mask=batch_attention_masks)["logits"]
      output = loss(logits, batch_labels)
      output.backward()
      optimizer.step()
      print(f"Step {step}: loss={output.item()}")
    print(f"End of epoch {epoch}")


# Run one epoch of fine-tuning with the default hyperparameters.
# NOTE(review): the recorded run of this cell stopped with an MPS out-of-memory
# error; lowering `batch_size` (default 8) may reduce peak memory - confirm.
train_loop(
  model=classifier_model,
  tokens=train_tokenized,
  labels=train_labels,
  device=run_device,
)

Starting the training loop ...


RuntimeError: MPS backend out of memory (MPS allocated: 21.85 GB, other allocations: 14.39 GB, max allowed: 36.27 GB). Tried to allocate 1.50 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).