In [1]:
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path
import tiktoken
from gpt_download3 import download_and_load_gpt2
import torch.nn as nn
from GPT_Model_layers import GPTModel
from Functions import generate, load_weights_into_gpt
import time

In [2]:
# Downloading Dataset
# Remote archive of the SMS Spam Collection and the local paths used when
# fetching and unpacking it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"  # where the downloaded archive is written
extracted_path = "sms_spam_collection"  # directory the archive is extracted into
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"  # final dataset file

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, verify_ssl=True):
    """Download the SMS spam archive, extract it and give the data file a .tsv suffix.

    The function is a no-op (apart from a message) when ``data_file_path``
    already exists, so the cell can be re-run safely.

    Args:
        url: Location of the zip archive.
        zip_path: Local path the archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final ``.tsv`` file.
        verify_ssl: When True (default) the server certificate is verified.
            Pass False only as a deliberate workaround for a broken local
            certificate store; the downloaded archive is then unauthenticated.

    Raises:
        urllib.error.URLError: If the download fails (including certificate
            verification failures when ``verify_ssl`` is True).
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Verify the server certificate by default: the archive is extracted to disk
    # right after download, so it should not be accepted from an unverified peer.
    if verify_ssl:
        ssl_context = ssl.create_default_context()
    else:
        ssl_context = ssl._create_unverified_context()

    # Downloading the file
    with urllib.request.urlopen(url, context=ssl_context) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The archive ships the data file without an extension; add .tsv
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Fetch the dataset; nothing is downloaded when the .tsv file is already present.
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


sms_spam_collection\SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [3]:
# Read the tab-separated file; column names are supplied explicitly via `names`.
df = pd.read_csv(data_file_path, sep="\t", names=['Label', "Text"])


In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Class distribution of the raw labels (shows how imbalanced the data is).
df['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
def create_balanced_dataset(df):
    """Undersample "ham" so that it has exactly as many rows as "spam".

    Args:
        df: DataFrame with a "Label" column holding "ham" / "spam" strings.

    Returns:
        A new DataFrame: the sampled "ham" rows followed by every "spam" row,
        with the original index values kept.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]

    # Fixed seed so the same "ham" subset is drawn on every run.
    ham_sample = ham_rows.sample(spam_rows.shape[0], random_state=123)

    return pd.concat([ham_sample, spam_rows])

# Build the balanced frame and confirm both classes now have the same count.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [7]:
# Label encoding used for classification: 0 = ham (not spam), 1 = spam.
# The encoded labels are kept in their own Series instead of being written back
# into `balanced_df`: re-running an in-place `.map` on already-encoded values
# would turn every label into NaN, and `SpamDataset` applies this same mapping
# itself to the string labels it reads from the CSV files.
label_map = {"ham": 0, "spam": 1}
encoded_labels = balanced_df["Label"].map(label_map)

In [8]:
# Rebuild the balanced frame from the raw `df` with `create_balanced_dataset`,
# which is already defined earlier in the notebook (it is not redefined here, so
# there is a single definition to maintain). Starting again from `df` guarantees
# that `balanced_df` carries the original "ham"/"spam" string labels, which is
# what `SpamDataset` expects to find in the CSV files written further down.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [9]:
def random_split(df, train_frac, validation_frac):
    """Shuffle `df` with a fixed seed and cut it into train/validation/test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training split.
        validation_frac: Fraction of rows assigned to the validation split.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``; the test split is whatever
        remains after the first two. Split sizes are truncated to whole rows.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    n_rows = len(shuffled)

    # Boundaries of the three consecutive slices.
    train_end = int(n_rows * train_frac)
    validation_end = train_end + int(n_rows * validation_frac)

    return (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:validation_end],
        shuffled.iloc[validation_end:],
    )

# 70% of the balanced rows go to training and 10% to validation.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder


In [10]:
# Report the number of rows in each split.
print(f'train_df: {len(train_df)}\nvalidation : {len(validation_df)}\ntest : {len(test_df)}')

train_df: 1045
validation : 149
test : 300


In [11]:
# Persist each split as CSV; SpamDataset later reads these files by name.
# index=None keeps the row index out of the files.
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

In [12]:
class SpamDataset(Dataset):
    """Tokenized, fixed-length view of a labelled SMS CSV file.

    The CSV must provide a "Label" column containing "ham" / "spam" and a
    "Text" column. Every text is tokenized once at construction time, truncated
    to ``max_length`` when one is given, and right-padded with ``pad_token_id``
    so that all sequences share the same length.

    Args:
        csv_file: Path of the CSV file to load.
        tokenizer: Object exposing ``encode(text) -> list[int]``.
        max_length: Target sequence length. ``None`` uses the length of the
            longest encoded text in this file (no truncation happens then).
        pad_token_id: Token id used for padding.

    Raises:
        ValueError: If the "Label" column contains anything other than
            "ham" or "spam".
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        label_map = {"ham": 0, "spam": 1}
        mapped_labels = self.data["Label"].map(label_map)
        # `.map` silently produces NaN for any value missing from `label_map`
        # (for example labels that were already converted to 0/1). Fail here
        # with a clear message instead of handing NaN labels to torch later.
        unmapped = mapped_labels.isna()
        if unmapped.any():
            unknown = sorted({str(value) for value in self.data.loc[unmapped, "Label"].unique()})
            raise ValueError(
                f"Unexpected values in 'Label' column of {csv_file}: {unknown}; "
                f"expected one of {sorted(label_map)}"
            )
        self.data["Label"] = mapped_labels

        # Pre-tokenize texts
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad every sequence up to self.max_length
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        encoded = self.encoded_texts[index]
        # Scalar lookup; avoids materialising a whole row Series per sample.
        label = self.data["Label"].iat[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        """Number of examples in the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded text, or 0 when there are no texts."""
        return max((len(encoded_text) for encoded_text in self.encoded_texts), default=0)

<div class="alert alert-block alert-info">

Step 1: Pre-tokenize texts
    
Step 2: Truncate sequences if they are longer than max_length
    
Step 3: Pad sequences to the longest sequence

</div>

In [13]:
# tiktoken's "gpt2" encoding; shared by the datasets and the generation/classification helpers.
tokenizer = tiktoken.get_encoding("gpt2")

In [14]:
# The training set decides the sequence length (max_length=None -> longest
# encoded training text).
train_dataset = SpamDataset(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer
)
# Validation and test reuse the training length, so longer texts in those
# splits are truncated and all three datasets yield tensors of the same width.
val_dataset = SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

In [15]:
num_workers = 0
batch_size = 8

# Seed before building the loaders so the training shuffle order is repeatable.
torch.manual_seed(123)

# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

# Evaluation loaders leave `shuffle` at the DataLoader default and keep the
# final partial batch so every example is evaluated.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)

In [16]:
# Number of batches each loader yields per pass.
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

130 training batches
19 validation batches
38 test batches


### Initialize model with Pretrained weights 

In [17]:
# Checkpoints to compare, plus a sample prompt.
Model_1 = "gpt2-small (124M)"
Model_2 = "gpt2-xl (1558M)"
Input_Prompt = "Wake Up Early Morning"

# Architecture hyperparameters that differ between the GPT-2 sizes.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Settings common to every GPT-2 size.
shared_settings = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

# Each model gets its own dict: shared settings first, then size-specific keys.
BASE_CONFIG_1 = {**shared_settings, **model_configs[Model_1]}
BASE_CONFIG_2 = {**shared_settings, **model_configs[Model_2]}

# The padded dataset sequences must fit into each model's context window.
for cfg in (BASE_CONFIG_1, BASE_CONFIG_2):
    assert train_dataset.max_length <= cfg["context_length"], (
        f"Dataset length {train_dataset.max_length} exceeds model's context "
        f"length {cfg['context_length']}. Reinitialize data sets with "
        f"`max_length={cfg['context_length']}`"
    )


In [18]:
# Size identifiers passed to download_and_load_gpt2 for the two checkpoints.
model_size_1 = "124M"
model_size_2 = "1558M"

# Each call returns a (settings, params) pair; files are stored under ./gpt2.
settings_1, params_1 = download_and_load_gpt2(model_size= model_size_1, models_dir= "gpt2")
settings_2, params_2 = download_and_load_gpt2(model_size= model_size_2, models_dir= "gpt2")

# Build each model from its config, copy the pretrained weights in and switch
# to eval mode. The trailing `;` suppresses the module repr in the cell output.
model_1 = GPTModel(BASE_CONFIG_1)
load_weights_into_gpt(model_1, params_1)
model_1.eval();

model_2 = GPTModel(BASE_CONFIG_2)
load_weights_into_gpt(model_2, params_2)
model_2.eval();



File already exists and is up-to-date: gpt2\124M\checkpoint




File already exists and is up-to-date: gpt2\124M\encoder.json




File already exists and is up-to-date: gpt2\124M\hparams.json




File already exists and is up-to-date: gpt2\124M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2\124M\model.ckpt.index




File already exists and is up-to-date: gpt2\124M\model.ckpt.meta




File already exists and is up-to-date: gpt2\124M\vocab.bpe




File already exists and is up-to-date: gpt2\1558M\checkpoint




File already exists and is up-to-date: gpt2\1558M\encoder.json




File already exists and is up-to-date: gpt2\1558M\hparams.json




File already exists and is up-to-date: gpt2\1558M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2\1558M\model.ckpt.index




File already exists and is up-to-date: gpt2\1558M\model.ckpt.meta




File already exists and is up-to-date: gpt2\1558M\vocab.bpe


In [19]:
def text_to_token_ids(text, tokenizer):
    """Encode `text` and return the ids as a tensor of shape (1, num_tokens).

    The `<|endoftext|>` marker is allowed as a special token during encoding.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Wrapping the id list in an outer list adds the leading batch dimension.
    return torch.tensor([token_ids])

def token_ids_to_text(token_ids, tokenizer):
    """Decode a (1, num_tokens) tensor of token ids back into a string."""
    # squeeze(dim=0) drops the batch dimension when it has size 1.
    return tokenizer.decode(token_ids.squeeze(dim=0).tolist())

In [20]:
# Sanity check of the loaded weights: let both pretrained models continue the
# same prompt and print the decoded result.
text = "Going School is"

token_ids_1 = generate(
    model = model_1,
    idx = text_to_token_ids(text, tokenizer),
    max_new_tokens = 20,
    context_size = BASE_CONFIG_1['context_length'],
)
print(token_ids_to_text(token_ids_1, tokenizer))

token_ids_2 = generate(
    model = model_2,
    idx = text_to_token_ids(text, tokenizer),
    max_new_tokens = 20,
    context_size = BASE_CONFIG_2['context_length'],
)
print(token_ids_to_text(token_ids_2, tokenizer))

Going School is a great way to get started.

I'm not sure if you've heard of it,
Going School is a free, online, self-paced, online course that will help you learn how to become a


In [21]:
# Print the module structure of both models, separated by a dashed line.
print(f'{model_1}\n --------\n{model_2}')

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [22]:
# Freeze every pretrained weight of both models.
# The loop variable is named `param` so it does not overwrite `params_1` /
# `params_2`, which hold the pretrained weight dictionaries returned by
# `download_and_load_gpt2`. If host memory is tight, release those
# dictionaries explicitly with `del params_1, params_2` instead.
for param in model_1.parameters():
    param.requires_grad = False

for param in model_2.parameters():
    param.requires_grad = False

<div class="alert alert-block alert-success">

Then, we replace the output layer (`model.out_head`), which
originally maps the layer inputs to 50,257 dimensions (the size of the vocabulary),
with a new linear layer that maps to the 2 classes (not spam / spam):
</div>

In [23]:
# Seed so the randomly initialised classification heads are repeatable.
torch.manual_seed(123)
num_classes = 2

# New heads map each model's embedding width to `num_classes` logits. Being
# freshly created nn.Linear layers, their parameters are trainable by default.
model_1.out_head = torch.nn.Linear(in_features=BASE_CONFIG_1["emb_dim"], out_features=num_classes)

model_2.out_head = torch.nn.Linear(in_features=BASE_CONFIG_2["emb_dim"], out_features=num_classes)


<div class="alert alert-block alert-success">

Additionally, we configure the last transformer block and the final LayerNorm module,
which connects this block to the output layer, to be trainable
    
</div>

In [24]:
# Re-enable gradients for the last transformer block and the final LayerNorm
# of both models; everything else keeps its current requires_grad setting.
# The loop variable is named `param` so it does not overwrite `params_1` /
# `params_2`, which hold the pretrained weight dictionaries returned by
# `download_and_load_gpt2`.
for param in model_1.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in model_1.final_norm.parameters():
    param.requires_grad = True

for param in model_2.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in model_2.final_norm.parameters():
    param.requires_grad = True

### Loss & Accuracy

In [25]:
# inputs = tokenizer.encode("Do you have time")
# inputs = torch.tensor(inputs).unsqueeze(0)
# print("Inputs:", inputs)
# print("Inputs dimensions:", inputs.shape)

In [26]:
# with torch.no_grad():
#     outputs_1 = model_1(inputs)
#     outputs_2 = model_2(inputs)
# 
# print("Outputs of model 1:\n", outputs_1)
# print("Outputs dimensions of model 1:", outputs_1.shape)
# 
# print("Outputs of model 2:\n", outputs_2)
# print("Outputs dimensions of model 2:", outputs_2.shape)

In [27]:
# probas_1 = torch.softmax(outputs_1[:, -1, :], dim=-1)
# label_1 = torch.argmax(probas_1)
# print("Class label:", label_1.item())
# 
# probas_2 = torch.softmax(outputs_2[:, -1, :], dim=-1)
# label_2 = torch.argmax(probas_2)
# print("Class label:", label_2.item())

In [28]:
# logits_1 = outputs_1[:, -1, :]
# label_1 = torch.argmax(logits_1)
# print("Class label:", label_1.item())
# 
# logits_2 = outputs_2[:, -1, :]
# label_2 = torch.argmax(logits_2)
# print("Class label:", label_2.item())

In [29]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_1.to(device)
model_2.to(device)

torch.manual_seed(123)

<torch._C.Generator at 0x23ec60b2b30>

In [30]:
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    """Classification accuracy of `model` over (part of) `data_loader`.

    The prediction for each example is the argmax over the logits of the last
    token position. The model is switched to eval mode and left in it.

    Args:
        data_loader: Iterable with ``len()`` yielding ``(input_batch, target_batch)``.
        model: Callable module returning logits indexed as ``[batch, position, class]``.
        device: Device the batches are moved to.
        num_batches: Evaluate at most this many batches; ``None`` means all.

    Returns:
        Fraction of correct predictions as a float, or ``nan`` when no example
        was evaluated (empty loader or ``num_batches`` of 0), mirroring how
        ``calc_loss_loader`` reports an empty loader.
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)

        with torch.no_grad():
            logits = model(input_batch)[:, -1, :]  # Logits of last output token
        predicted_labels = torch.argmax(logits, dim=-1)

        num_examples += predicted_labels.shape[0]
        correct_predictions += (predicted_labels == target_batch).sum().item()

    # Nothing evaluated: report "undefined" instead of dividing by zero.
    if num_examples == 0:
        return float("nan")
    return correct_predictions / num_examples

#### Accuracy

In [32]:
# Baseline accuracy of model 1 before fine-tuning, estimated on the first
# 10 batches of each loader.
train_accuracy_1 = calc_accuracy_loader(train_loader, model_1, device, num_batches=10)
val_accuracy_1 = calc_accuracy_loader(val_loader, model_1, device, num_batches=10)
test_accuracy_1 = calc_accuracy_loader(test_loader, model_1, device, num_batches=10)
print(f'Model_1\nTrain accuracy: {train_accuracy_1}\nTest accuracy: {test_accuracy_1}\nValidation accuracy: {val_accuracy_1}')
# The same evaluation for model 2 is kept below but disabled.
# train_accuracy_2 = calc_accuracy_loader(train_loader, model_2, device, num_batches=10)
# val_accuracy_2 = calc_accuracy_loader(val_loader, model_2, device, num_batches=10)
# test_accuracy_2 = calc_accuracy_loader(test_loader, model_2, device, num_batches=10)
# print(f'Model_2\nTrain Accuracy: {train_accuracy_2}\nTest Accuracy: {test_accuracy_2}\nValidation Accuracy: {val_accuracy_2}')

Model_1
Train accuracy: 0.5
Test accuracy: 0.4875
Validation accuracy: 0.45


#### Loss

In [33]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of one batch, computed from the last token's logits.

    Both tensors are moved to `device` before the forward pass. The returned
    loss is a tensor, so the caller can backpropagate through it.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)

    # Only the final position's logits are used as the class scores.
    last_token_logits = model(inputs)[:, -1, :]
    return torch.nn.functional.cross_entropy(last_token_logits, targets)

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average per-batch loss of `model` over (part of) `data_loader`.

    Args:
        data_loader: Iterable with ``len()`` yielding ``(input_batch, target_batch)``.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Average over at most this many batches; ``None`` means all.

    Returns:
        Mean of the batch losses as a float, or ``nan`` when there is nothing
        to average (empty loader, or ``num_batches`` of 0 or less).
    """
    if len(data_loader) == 0:
        return float("nan")

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        # Never ask for more batches than the loader can provide.
        num_batches = min(num_batches, len(data_loader))

    # Zero requested batches would otherwise end in a division by zero below.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

In [34]:
# Baseline losses before fine-tuning, estimated on the first 5 batches of each loader.
with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss_1 = calc_loss_loader(train_loader, model_1, device, num_batches=5)
    val_loss_1 = calc_loss_loader(val_loader, model_1, device, num_batches=5)
    test_loss_1 = calc_loss_loader(test_loader, model_1, device, num_batches=5)
    
    train_loss_2 = calc_loss_loader(train_loader, model_2, device, num_batches=5)
    val_loss_2 = calc_loss_loader(val_loader, model_2, device, num_batches=5)
    test_loss_2= calc_loss_loader(test_loader, model_2, device, num_batches=5)

# First group of lines: model 1 (GPT-2 small).
print(f"Training loss: {train_loss_1:.3f}")
print(f"Validation loss: {val_loss_1:.3f}")
print(f"Test loss: {test_loss_1:.3f}")
print('*'*10)
# Second group, after the separator: model 2 (GPT-2 XL).
print(f"Training loss: {train_loss_2:.3f}")
print(f"Validation loss: {val_loss_2:.3f}")
print(f"Test loss: {test_loss_2:.3f}")

Training loss: 1.542
Validation loss: 2.583
Test loss: 2.322
**********
Training loss: 0.754
Validation loss: 0.714
Test loss: 0.683


### Fine Tuning
<div class="alert alert-block alert-info">
    
Step 1: Set model to training mode

Step 2: Reset loss gradients from previous batch iteration

Step 3: Calculate loss gradients

Step 4: Update model weights using loss gradients

Step 5: New: track examples instead of tokens

Step 6: Optional evaluation step

Step 7: Calculate accuracy after each epoch

</div>

In [35]:
def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter):
    """Fine-tune `model` as a classifier and record loss/accuracy along the way.

    Args:
        model: Model to train.
        train_loader: Loader providing the training batches.
        val_loader: Loader used for validation loss and accuracy.
        optimizer: Optimizer stepping the model's parameters.
        device: Device used for loss and accuracy computation.
        num_epochs: Number of passes over `train_loader`.
        eval_freq: Losses are evaluated every `eval_freq` optimizer steps
            (including the very first step).
        eval_iter: Number of batches used for each loss/accuracy estimate.

    Returns:
        Tuple ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``:
        losses are recorded per evaluation, accuracies per epoch, and
        `examples_seen` counts the training examples processed in total.
    """
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []
    examples_seen = 0
    global_step = -1  # becomes 0 on the first optimizer step

    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            # Standard update: clear old gradients, backpropagate, step.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            examples_seen += input_batch.shape[0]  # examples, not tokens
            global_step += 1

            # Skip the periodic loss evaluation on all other steps.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # End of epoch: accuracy estimate on `eval_iter` batches of each loader.
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)

    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [36]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` estimated on `eval_iter` batches each.

    The model is put into eval mode for the measurement, gradients are
    disabled, and the model is switched back to training mode afterwards.
    """
    model.eval()
    with torch.no_grad():
        # Training loader first, validation loader second.
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

In [37]:
# Fine-tune model 1 and report the wall-clock time.
start_time = time.time()

torch.manual_seed(123)

# All parameters are handed to the optimizer; frozen ones receive no gradient.
optimizer = torch.optim.AdamW(model_1.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 5
# Losses are evaluated every 50 steps on 5 batches per loader.
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model_1, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed of Model 1 in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 2.153, Val loss 2.392
Ep 1 (Step 000050): Train loss 0.617, Val loss 0.637
Ep 1 (Step 000100): Train loss 0.523, Val loss 0.557
Training accuracy: 70.00% | Validation accuracy: 72.50%
Ep 2 (Step 000150): Train loss 0.561, Val loss 0.489
Ep 2 (Step 000200): Train loss 0.419, Val loss 0.397
Ep 2 (Step 000250): Train loss 0.409, Val loss 0.353
Training accuracy: 82.50% | Validation accuracy: 85.00%
Ep 3 (Step 000300): Train loss 0.333, Val loss 0.320
Ep 3 (Step 000350): Train loss 0.340, Val loss 0.306
Training accuracy: 90.00% | Validation accuracy: 90.00%
Ep 4 (Step 000400): Train loss 0.136, Val loss 0.200
Ep 4 (Step 000450): Train loss 0.153, Val loss 0.132
Ep 4 (Step 000500): Train loss 0.222, Val loss 0.137
Training accuracy: 100.00% | Validation accuracy: 97.50%
Ep 5 (Step 000550): Train loss 0.207, Val loss 0.143
Ep 5 (Step 000600): Train loss 0.083, Val loss 0.074
Training accuracy: 100.00% | Validation accuracy: 97.50%
Training completed of Model 1

In [38]:
# start_time = time.time()
# 
# torch.manual_seed(123)
# 
# optimizer_2 = torch.optim.AdamW(model_2.parameters(), lr=5e-5, weight_decay=0.1)
# 
# num_epochs = 5
# train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
#     model_2, train_loader, val_loader, optimizer_2, device,
#     num_epochs=num_epochs, eval_freq=50, eval_iter=5,
# )
# 
# end_time = time.time()
# execution_time_minutes = (end_time - start_time) / 60
# print(f"Training completed of Model 2 in {execution_time_minutes:.2f} minutes.")

<div class="alert alert-block alert-info">
    
Step 1: Prepare inputs to the model

Step 2: Truncate sequences if they are too long
    
Step 3: Pad sequences to the longest sequence

Step 4: Add batch dimension

Step 5: Model inference without gradient tracking
    
Step 6: Logits of the last output token

Step 7: Return the classified result

</div>

In [39]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify a single text as "spam" or "not spam".

    Args:
        text: Text to classify.
        tokenizer: Object exposing ``encode(text) -> list[int]``.
        model: Classifier whose ``pos_emb.weight`` has one row per supported
            position and which returns logits indexed ``[batch, position, class]``.
        device: Device the input tensor is created on.
        max_length: Length the input is truncated/padded to. ``None`` uses the
            model's supported context length. Values larger than the supported
            context length are capped to it.
        pad_token_id: Token id used for padding.

    Returns:
        ``"spam"`` when class 1 has the highest logit, otherwise ``"not spam"``.
    """
    model.eval()

    # Prepare inputs to the model
    input_ids = tokenizer.encode(text)
    supported_context_length = model.pos_emb.weight.shape[0]
    # Note: In the book, this was originally written as pos_emb.weight.shape[1] by mistake
    # It didn't break the code but would have caused unnecessary truncation (to 768 instead of 1024)

    # Resolve the effective length once so truncation and padding agree:
    # the default (None) falls back to the context length, and an oversized
    # max_length can no longer pad the input beyond what the model supports.
    if max_length is None:
        target_length = supported_context_length
    else:
        target_length = min(max_length, supported_context_length)

    # Truncate sequences if they are too long
    input_ids = input_ids[:target_length]

    # Pad sequences up to the target length
    input_ids += [pad_token_id] * (target_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0) # add batch dimension

    # Model inference
    with torch.no_grad():
        logits = model(input_tensor)[:, -1, :]  # Logits of the last output token
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classified result
    return "spam" if predicted_label == 1 else "not spam"

In [40]:
# Example message to classify with the fine-tuned model.
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)

# Pad/truncate to the same length the training data used.
print(classify_review(
    text_1, model_1, tokenizer, device, max_length=train_dataset.max_length
))

spam


In [42]:
# Save only the weights (state dict) of the fine-tuned model 1.
torch.save(model_1.state_dict(), "review_classifier.pth")

In [44]:
# Reload the saved weights into model 1.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs, and avoids executing arbitrary pickled code.
# map_location places the tensors on the device in use, so a checkpoint written
# on a GPU machine also loads on a CPU-only one.
model_state_dict = torch.load("review_classifier.pth", map_location=device, weights_only=True)
model_1.load_state_dict(model_state_dict)

  model_state_dict = torch.load("review_classifier.pth")


<All keys matched successfully>