In [2]:
import torch
import torch.nn as nn

#####################################
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec


#####################################
# Chapter 4
#####################################
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias
}

# Define model configurations in a dictionary for compactness
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Copy the base configuration and update with specific model settings
model_name = "gpt2-small (124M)"  # Example model name
MODEL_CONFIG = GPT_CONFIG_124M.copy()
MODEL_CONFIG.update(model_configs[model_name])
MODEL_CONFIG.update({"context_length": 1024, "qkv_bias": True})

model = GPTModel(MODEL_CONFIG)

In [4]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))


# Loading PreTrained GPT2 124M Model Weights
import os
import tensorflow as tf
import json
import numpy as np

def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix

        # Identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        # Recursively access or create nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params


model_dir = '../../ch05/00_ME-Experimenting/gpt2/124M/'
tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
settings = json.load(open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8"))
params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

print(settings)
print(params.keys()) # Model parameters loaded successfully!

  if not hasattr(np, "object"):


{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [5]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
    
    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])
    
load_weights_into_gpt(model, params)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

In [6]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [9]:
for params in model.parameters():
    params.requires_grad = False

for params in model.parameters():
    print(params.requires_grad)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [10]:
model.out_head = nn.Linear(in_features=768, out_features=2)

In [15]:
for params in model.final_norm.parameters():
    params.requires_grad = True

In [40]:
for params in model.out_head.parameters():
    print(params.requires_grad)

True
True


In [12]:
for params in model.trf_blocks[-1].parameters():
    params.requires_grad = True

In [32]:
for params in model.trf_blocks[-1].parameters():
    print(params.requires_grad)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [17]:
# 0 -> spam
# 1 -> not-spam

import pandas as pd
train_csv_path = 'my-dataset/train.csv'
val_csv_path = 'my-dataset/val.csv'
test_csv_path = 'my-dataset/test.csv'

train_df = pd.read_csv(train_csv_path)
val_df = pd.read_csv(val_csv_path)
test_df = pd.read_csv(test_csv_path)

In [18]:
train_df['label'].value_counts()

label
1    539
0    506
Name: count, dtype: int64

In [19]:
val_df['label'].value_counts()

label
0    171
1    143
Name: count, dtype: int64

In [20]:
test_df['label'].value_counts()

label
0    70
1    65
Name: count, dtype: int64

In [197]:
text_max_length = 0
for text in train_df['text']:
    text_len = len(text)
    if text_max_length < text_len:
        text_max_length = text_len
text_max_length

611

In [198]:
text_max_length = 0
for text in val_df['text']:
    text_len = len(text)
    if text_max_length < text_len:
        text_max_length = text_len
text_max_length

329

In [199]:
text_max_length = 0
for text in test_df['text']:
    text_len = len(text)
    if text_max_length < text_len:
        text_max_length = text_len
text_max_length

372

In [22]:
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_text_len=None, pad_id=50256):
        self.df = df
        self.x = [tokenizer.encode(text) for text in self.df['text']]
        self.max_text_len = max_text_len
        if (max_text_len is None):
            self.max_text_len = self._getMaxTextLen()
        # Padding:
        for i, x in enumerate(self.x):
            x = x[: self.max_text_len]
            x += [pad_id] * (self.max_text_len - len(x))
            self.x[i] = x
        self.x = torch.tensor(self.x)
    
    def __getitem__(self, i):
        x = self.x[i]
        y = self.df['label'].iloc[i].item()
        return (x, y)

    def __len__(self):
        return len(self.df)
    
    def _getMaxTextLen(self):
        text_max_length = 0
        for textTokens in self.x:
            ans = len(textTokens)
            if (text_max_length < ans):
                text_max_length = ans
        return text_max_length

tokenizer = tiktoken.get_encoding('gpt2')

# Train, Val, Test Dataset:
train_dataset = SpamDataset(train_df, tokenizer, max_text_len=120)
val_dataset = SpamDataset(val_df, tokenizer, max_text_len=120)
test_dataset = SpamDataset(test_df, tokenizer, max_text_len=120)

# Train, Val, Test Dataloader: 
train_dataloader = DataLoader(train_dataset, batch_size=8, drop_last=True, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=8, drop_last=True, shuffle=True, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=8, drop_last=True, shuffle=True, num_workers=0)

In [42]:
txt = "Its a not scam!!."
encoding = torch.tensor([tokenizer.encode(txt)])
model.to('cpu')
torch.argmax(model(encoding)[:, -1, :], -1)

tensor([1])

In [32]:
torch.cuda.empty_cache()

In [33]:
from tqdm import tqdm
import torch

model.to(device)
model.train()  # ðŸ‘ˆ Important: set to train mode (you had eval() before!)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
EPOCH = 5

for epoch in range(EPOCH):
    losses_train = []
    # Wrap the dataloader with tqdm for a progress bar
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCH}", leave=True)
    
    for i, (x, y) in enumerate(progress_bar):
        optimizer.zero_grad()
        x = x.to(device)
        y = y.to(device)
        logits = model(x)[:, -1, :]
        loss = torch.nn.functional.cross_entropy(logits, y)
        loss.backward()
        optimizer.step()
        losses_train.append(loss.item())
        
        # Update progress bar with current loss
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    AVG_EPOCH_LOSS = sum(losses_train) / len(losses_train)
    print(f'EPOCH {epoch}: AVG_EPOCH_LOSS = {AVG_EPOCH_LOSS:.6f}')

Epoch 1/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 130/130 [00:22<00:00,  5.69it/s, loss=0.6986]


EPOCH 0: AVG_EPOCH_LOSS = 0.895657


Epoch 2/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 130/130 [00:22<00:00,  5.74it/s, loss=0.5141]


EPOCH 1: AVG_EPOCH_LOSS = 0.609003


Epoch 3/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 130/130 [00:22<00:00,  5.72it/s, loss=0.2695]


EPOCH 2: AVG_EPOCH_LOSS = 0.569013


Epoch 4/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 130/130 [00:22<00:00,  5.70it/s, loss=0.6956]


EPOCH 3: AVG_EPOCH_LOSS = 0.532076


Epoch 5/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 130/130 [00:22<00:00,  5.69it/s, loss=0.3335]

EPOCH 4: AVG_EPOCH_LOSS = 0.483948





In [17]:
def calculate_accuracy(model, loader: DataLoader, device='cpu'):
    model.eval()
    model.to(device)
    
    correct_predictions = 0
    total_examples = 0

    # Wrap the entire dataloader with tqdm ONCE
    progress_bar = tqdm(loader, desc="Validation", leave=True)

    with torch.no_grad():
        for x, y in progress_bar:
            x = x.to(device)
            y = y.to(device)

            logits = model(x)[:, -1, :]
            predictions = logits.argmax(-1)

            correct = (predictions == y).sum().item()
            batch_size = y.size(0)  # safer than loader.batch_size

            correct_predictions += correct
            total_examples += batch_size

            # Update progress bar with current accuracy
            current_acc = (correct_predictions / total_examples) * 100
            progress_bar.set_postfix({'acc': f'{current_acc:.2f}%'})

    final_acc = (correct_predictions / total_examples) * 100
    return final_acc

In [20]:
calculate_accuracy(model, val_dataloader)

Validation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 39/39 [00:19<00:00,  1.96it/s, acc=84.94%]


84.93589743589743

In [34]:
model.eval()
model.to('cpu')
for i in range(len(train_df)):
    txt = train_df['text'].iloc[i]
    encoding = torch.tensor([tokenizer.encode(txt)])
    with torch.no_grad():
        output = model(encoding)[:, -1, :].argmax(-1).item()
    target = train_df['label'].iloc[i]
    
    print(f'Target:{target}; Output:{output}')
    if i > 20:
        break

Target:1; Output:0
Target:0; Output:0
Target:1; Output:0
Target:1; Output:0
Target:1; Output:0
Target:1; Output:0
Target:0; Output:0
Target:0; Output:0
Target:1; Output:0
Target:1; Output:0
Target:0; Output:0
Target:1; Output:0
Target:1; Output:0
Target:0; Output:0
Target:0; Output:0
Target:0; Output:0
Target:1; Output:0
Target:0; Output:0
Target:1; Output:0
Target:0; Output:0
Target:1; Output:0
Target:0; Output:0


In [None]:
PAD_ID = 50256
MAX_LEN = 120

def encode_with_padding(text, tokenizer, max_len=MAX_LEN, pad_id=PAD_ID):
    tokens = tokenizer.encode(text)
    tokens = tokens[:max_len]
    tokens += [pad_id] * (max_len - len(tokens))
    return torch.tensor(tokens)

model.eval()
model.to('cpu')
for i in range(len(val_df)):
    txt = val_df['text'].iloc[i]
    encoding = encode_with_padding(txt, tokenizer).unsqueeze(0)  # âœ… Now with padding!
    with torch.no_grad():
        output = model(encoding)[:, -1, :].argmax(-1).item()
    target = val_df['label'].iloc[i]
    
    print(f'Target:{target}; Output:{output}')
    if i > 20:
        break

In [213]:
test_df['text'].iloc[0] # spam text

"You'll not rcv any more msgs from the chat svc. For FREE Hardcore services text GO to: 69988 If u get nothing u must Age Verify with yr network & try again"

In [222]:
test_df['label'].value_counts()

label
0    70
1    65
Name: count, dtype: int64

In [18]:
calculate_accuracy(model, test_dataloader)

Validation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:07<00:00,  2.08it/s, acc=99.22%]


99.21875

In [None]:
# Checking the model:
train_df['text']

In [43]:
# Training pads every sequence to 120 tokens, so reading logits at index -1 expects that padded slot.
# Directly encoding texts without the same padding causes the head to see a different position, leading to constant predictions.
PAD_ID = 50256
MAX_LEN = 120

def encode_with_padding(text, tokenizer, max_len=MAX_LEN, pad_id=PAD_ID):
    tokens = tokenizer.encode(text)
    tokens = tokens[:max_len]
    tokens += [pad_id] * (max_len - len(tokens))
    return torch.tensor(tokens)

def predict_batch(texts, model, tokenizer, device=model.out_head.weight.device):
    model.eval()
    batch = torch.stack([encode_with_padding(t, tokenizer) for t in texts])
    batch = batch.to(device)
    with torch.no_grad():
        logits = model(batch)[:, -1, :]
    return logits.argmax(dim=-1).cpu()

sample_texts = train_df['text'].iloc[:8].tolist()
print(predict_batch(sample_texts, model, tokenizer))

tensor([1, 0, 1, 1, 1, 1, 0, 0])
