In [2]:
import json

data = []
with open("dataset/sarcasm/sarcasm_instruction_pairs.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        data.append(json.loads(line))
print(len(data))


726


In [3]:
def format_input(entry):
    instruction_text = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. 
    
### Instruction:\n{entry['instruction']}
    """

    return instruction_text

In [4]:
print(format_input(data[301]))


Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
 Do you think sharing personal information on Google+ is a good idea?
    


In [5]:
train_portion = int(len(data) * 0.85)  # 85% for training
test_portion = int(len(data) * 0.1)    # 10% for testing
val_portion = len(data) - train_portion - test_portion  # Remaining 5% for validation

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 617
Validation set length: 37
Test set length: 72


In [6]:
import torch
from torch.utils.data import Dataset


class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['response']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

In [7]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [8]:
def custom_collate_draft_2(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets
        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs to tensor and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

In [9]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    # Use PyTorch 2.9 or newer for stable mps results
    major, minor = map(int, torch.__version__.split(".")[:2])
    if (major, minor) >= (2, 9):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
else:
    device = torch.device("cpu")

print("Device:", device)

Device: cpu


In [11]:
from functools import partial

customized_collate_fn = partial(
    custom_collate_fn,
    device=device,
    allowed_max_length=1024
)

In [12]:
from torch.utils.data import DataLoader


num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

In [13]:
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [34]:
import torch
from untrained_model import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 256,        # Embedding dimension
    "n_heads": 4,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

checkpoint = torch.load("model_and_optimizer_small.pth", weights_only=True)

model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train();

In [24]:
torch.manual_seed(123)

input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
 Do you think tea tree oil and condoms are commonly placed together in stores?
    


In [31]:
from untrained_model import generate
from untrained_model import text_to_token_ids
from untrained_model import token_ids_to_text

inference_device = torch.device("cpu")

model.to(inference_device)
model.eval()

torch.manual_seed(123)

token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer).to(inference_device),
    max_new_tokens=35,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.5
)
generated_text = token_ids_to_text(token_ids, tokenizer)

response_text = (
    generated_text[len(input_text):]
    .replace("### Response:", "")
    .strip()
)
print(response_text)

# model.train();

*   “let   ;!””
I was my head until my voice would probably sticking in to inform my wife under his bra“


In [64]:
from untrained_model import (
    calc_loss_loader,
    train_model_simple
)

model.to(device)

torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 3.3913725852966308
Validation loss: 3.7617190361022947


In [56]:
import time

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 1

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 3.550, Val loss 3.812
Ep 1 (Step 000005): Train loss 3.624, Val loss 3.815
Ep 1 (Step 000010): Train loss 3.496, Val loss 3.803
Ep 1 (Step 000015): Train loss 3.579, Val loss 3.794
Ep 1 (Step 000020): Train loss 3.349, Val loss 3.793
Ep 1 (Step 000025): Train loss 3.475, Val loss 3.788
Ep 1 (Step 000030): Train loss 3.575, Val loss 3.787
Ep 1 (Step 000035): Train loss 3.435, Val loss 3.801
Ep 1 (Step 000040): Train loss 3.568, Val loss 3.789
Ep 1 (Step 000045): Train loss 3.538, Val loss 3.774
Ep 1 (Step 000050): Train loss 3.596, Val loss 3.791
Ep 1 (Step 000055): Train loss 3.350, Val loss 3.776
Ep 1 (Step 000060): Train loss 3.404, Val loss 3.764
Ep 1 (Step 000065): Train loss 3.645, Val loss 3.764
Ep 1 (Step 000070): Train loss 3.387, Val loss 3.767
Ep 1 (Step 000075): Train loss 3.390, Val loss 3.763
Below is an instruction that describes a task. Write a response that appropriately completes the request.   ### Instruction:  Do you think tea tree oil 

In [71]:
model.to(inference_device)
model.eval()

torch.manual_seed(123)
input_text = format_input({"instruction":"Do you think politics is good ?"})

token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer).to(inference_device),
    max_new_tokens=30,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=10,
    temperature=0.7
)
generated_text = token_ids_to_text(token_ids, tokenizer)

response_text = (
    generated_text[len(input_text):]
    .replace("### Response:", "")
    .replace("###", "")
    .replace("Response:", "")
    .replace("Response", "")
    .replace("<|endoftext|>", "")
    .strip()
)
print(response_text)

I don't not sure it's all they were no right to know why, or the other


In [55]:
file_name = f"sarcasm_finetuned_v1.pth"
torch.save(model.state_dict(), file_name)
print(f"Model saved as {file_name}")

# Load model via
# model.load_state_dict(torch.load("sarcasm_finetuned_v1.pth"))

Model saved as sarcasm_finetuned_v1.pth
