# Appendix E
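I am using the UCI Sentiment Labelled Sentences dataset instead of the UCI SMS Spam Collection dataset, so the results below are new rather than a reproduction of existing ones. The `SpamDataset` class imported from `previous_chapters` is reused as-is for this data.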

# E.2 - Preparing the dataset

In [1]:
# Importing the dataset
import pandas as pd
from previous_chapters import random_split
# Parse the tab-separated corpus. Each line is "<text>\t<label>"; splitting from the
# right keeps any tab characters that occur inside the sentence itself.
data = []
with open("../chapter-6-learn/sentiment_combined.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.rstrip('\n')
        if not line.strip():
            # Blank lines (e.g. a trailing newline at the end of the file) carry no
            # sample and would otherwise crash the two-value unpacking below.
            continue
        text, label = line.rsplit('\t', 1)
        data.append((text, label))
balanced_df = pd.DataFrame(data, columns=["Text", "Label"])

# 0.7 / 0.1 are the train and validation fractions passed to random_split;
# presumably the remaining rows form the test split -- confirm in previous_chapters.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

# index=False keeps the DataFrame row index out of the CSV files.
train_df.to_csv("../chapter-6-learn/train.csv", index=False)
validation_df.to_csv("../chapter-6-learn/validation.csv", index=False)
test_df.to_csv("../chapter-6-learn/test.csv", index=False)

In [2]:
# Instantiating PyTorch datasets
import torch
import tiktoken
from previous_chapters import SpamDataset
tokenizer = tiktoken.get_encoding("gpt2")

# The training split is built first with max_length=None (presumably the dataset then
# derives the length from its own samples -- confirm in SpamDataset).
train_dataset = SpamDataset(
    "../chapter-6-learn/train.csv",
    max_length=None,
    tokenizer=tokenizer,
)

# Validation and test reuse the training split's max_length so all three splits
# share one sequence length.
val_dataset, test_dataset = (
    SpamDataset(csv_path, max_length=train_dataset.max_length, tokenizer=tokenizer)
    for csv_path in (
        "../chapter-6-learn/validation.csv",
        "../chapter-6-learn/test.csv",
    )
)

In [3]:
# Creating PyTorch data loaders
from torch.utils.data import DataLoader
num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Training loader: shuffled, and the final incomplete batch is dropped.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

# Evaluation loaders: default (unshuffled) order, every sample kept.
val_loader, test_loader = (
    DataLoader(
        dataset=split,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )
    for split in (val_dataset, test_dataset)
)

In [4]:
# Print the batch dimensions
# The loop body is intentionally empty: it runs through every batch of the training
# loader so that, once it finishes, input_batch and target_batch remain bound to the
# last batch yielded. Those leftover names are what the two prints below inspect.
print("Train loader:")
for input_batch, target_batch in train_loader:
    pass
# train_loader was created with drop_last=True, so the last batch is a full one.
print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions", target_batch.shape)

Train loader:
Input batch dimensions: torch.Size([8, 103])
Label batch dimensions torch.Size([8])


In [5]:
# Report how many batches each data loader yields
print(len(train_loader), "training batches")
print(len(val_loader), "validation batches")
print(len(test_loader), "test batches")

262 training batches
38 validation batches
75 test batches


# E.3 - Initializing the model

In [6]:
# Loading a pretrained GPT model
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "So far, I had"

# Architecture sizes that differ between the GPT-2 variants.
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

# Settings shared by every variant, extended with the sizes of the chosen one.
BASE_CONFIG = dict(
    vocab_size=50257,
    context_length=1024,
    drop_rate=0.0,
    qkv_bias=True,
)
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# "gpt2-small (124M)" -> "124M": take the part after the last space and
# remove the surrounding parentheses.
model_size = CHOOSE_MODEL.rsplit(" ", 1)[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="../chapter-5-learn/gpt2",
)

# Build the model from the merged config, copy in the downloaded weights,
# and switch it to eval mode.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();

2025-10-28 16:21:41.407852: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


File already exists and is up-to-date: ../chapter-5-learn/gpt2/124M/checkpoint
File already exists and is up-to-date: ../chapter-5-learn/gpt2/124M/encoder.json
File already exists and is up-to-date: ../chapter-5-learn/gpt2/124M/hparams.json
File already exists and is up-to-date: ../chapter-5-learn/gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: ../chapter-5-learn/gpt2/124M/model.ckpt.index
File already exists and is up-to-date: ../chapter-5-learn/gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: ../chapter-5-learn/gpt2/124M/vocab.bpe


In [7]:
# Ensure the model generates coherent text
from previous_chapters import generate_text_simple, text_to_token_ids, token_ids_to_text
text_1 = "So far, I had"

# Encode the prompt, ask the model for 15 additional tokens, then decode and print
# the resulting sequence.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_1, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"],
)
generated_text = token_ids_to_text(token_ids, tokenizer)
print(generated_text)

So far, I had no idea what to expect.

I was so excited to see the


In [8]:
# Prepare the model for classification fine-tuning
# Seed first so the freshly created output head is initialized reproducibly.
torch.manual_seed(123)
num_classes = 2
# Replace the output head with a num_classes-way linear layer. Its input width is
# read from the loaded configuration instead of being hard-coded to 768, so the
# cell keeps working when CHOOSE_MODEL selects a different GPT-2 variant.
model.out_head = torch.nn.Linear(in_features = BASE_CONFIG["emb_dim"], out_features = num_classes)
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);

In [9]:
# Calculate initial classification accuracy
from previous_chapters import calc_accuracy_loader
torch.manual_seed(123)

# Evaluate the three loaders in order (train, validation, test). num_batches = 10 is
# passed to each call -- presumably it caps how many batches are scored; confirm in
# calc_accuracy_loader.
train_accuracy, val_accuracy, test_accuracy = (
    calc_accuracy_loader(loader, model, device, num_batches=10)
    for loader in (train_loader, val_loader, test_loader)
)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 50.00%
Validation accuracy: 48.75%
Test accuracy: 52.50%


# E.4 - Parameter-efficient fine-tuning with LoRA