In [3]:
import torch
import torch.nn as nn

In [14]:
class GPTModel(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )
    
    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

class TransformerBlock(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
    
    def forward(self, x):
        # Shortcut connection for attnetion block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x) # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut # Add the original input back

        return x

In [15]:
class TransformerBlock(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
    
    def forward(self, x):
        # Shortcut connection for attnetion block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x) # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut # Add the original input back

        return x

In [16]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"
        
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )
    
    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)
        
        # implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec
    
    

In [17]:
class FeedForward(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"])
        )
    
    def forward(self, x):
        return self.layers(x)

In [18]:
class GELU(nn.Module):

    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [19]:
class LayerNorm(nn.Module):

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [20]:
import tiktoken
import time


GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortended context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

model = GPTModel(GPT_CONFIG_124M)

In [11]:
def generate(model, idx, max_new_tokens, context_size, tokenizer, text_to_token_ids, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
        
        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1) # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1)
        
        # Otherwise, same as before: get the idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch_size, 1)
        
        if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break
        
        # if idx_next == text_to_token_ids(".", tokenizer):
        if idx_next == "tensor([[13]])":
            # idx_next = idx_next + text_to_token_ids("Meow.", tokenizer)
            print("\nperiod\n")
        
        # if idx_next == text_to_token_ids("?", tokenizer):
        if idx_next == "tensor([[30]])":
            # idx_next = idx_next + text_to_token_ids("Meow.", tokenizer)
            print("\nperiod\n")
        
        # if idx_next == text_to_token_ids("!", tokenizer):
        if idx_next == "tensor([[0]])":
            # idx_next = idx_next + text_to_token_ids("Meow.", tokenizer)
            print("\nperiod\n")
        
        # print(idx_next)
        # print("----")
        # print(idx_next + text_to_token_ids("Meow.", tokenizer))
        # test = idx_next + text_to_token_ids("Meow.", tokenizer)
        # print("------")
        # print(token_ids_to_text(idx_next, tokenizer))
        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1)
    
    return idx

In [21]:
print(text_to_token_ids("!", tokenizer))

NameError: name 'text_to_token_ids' is not defined

In [22]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [23]:
def train_model(model, train_loader, val_loader, optimizer, device,
                n_epochs, eval_freq, eval_iter, start_context, tokenizer,
                warmup_steps, initial_lr=3e-05, min_lr=1e-6):
    
    train_losses, val_losses, track_tokens_seen, track_lrs = [], [], [], []
    tokens_seen, global_step = 0, -1

    # Retrieve the maximum learning rate from the optimizer
    peak_lr = optimizer.param_groups[0]["lr"]

    # Calculate the total number of iterations in the training process
    total_training_steps = len(train_loader) * n_epochs

    # Calculate the learning rate increment during the warmup phase
    lr_increment = (peak_lr - initial_lr) / warmup_steps

    for epoch in range(n_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            global_step += 1

            # Adjust the learning rate based on the current phase (warmup or cosine annealing)
            if global_step < warmup_steps:
                # Linear warmup
                lr = initial_lr + global_step * lr_increment
            else:
                # Cosine annealing after warmup
                progress = ((global_step - warmup_steps) /
                            (total_training_steps - warmup_steps))
                lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))
            
            # Apply the calculated learning rate to the optimizer
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
            track_lrs.append(lr) # Store the current learning rate

            # Calculate and backpropagate the loss
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()

            # Apply gradient clipping after the warmup phase to avoid exploding gradients
            if global_step > warmup_steps:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            tokens_seen += input_batch.numel()

            # Periodically evaluate the model on the training and validation sets
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader,
                    device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                # Print the current losses
                print(f"Ep {epoch+1} (Iter {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, "
                      f"Val loss {val_loss:.3f}"
                )

        # Generate and print a sample from the model to monitor progress
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
    
    return train_losses, val_losses, track_tokens_seen, track_lrs

In [24]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2") # A - Initalize the tokenizer
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) # B - Create dataset
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last, # C - drop_last=True drops the last batch if it is shorter than the specified batch_size to prevent loss spikes during training
        num_workers=0 # D - The number of CPU processes to use for preprocessing
    )

    return dataloader

In [25]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt) # A

        for i in range(0, len(token_ids) - max_length, stride): # B
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i +max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [26]:
tokenizer = tiktoken.get_encoding("gpt2")

print(text_to_token_ids("Meow.", tokenizer))
print(text_to_token_ids(".", tokenizer))

tensor([[5308,  322,   13]])
tensor([[13]])


In [27]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " ")) # Compact print format
    model.train()

In [28]:
import math

In [29]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [30]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        idx_cond = idx[:, -context_size:]

        # get the predictions
        with torch.no_grad():
            logits = model(idx_cond)
        
        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # apply softmax to get the probabilities
        probas = torch.softmax(logits, dim=-1) # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True) # (batch, 1)

        # if idx_next == text_to_token_ids(".", tokenizer):
        #     idx_next = idx_next + text_to_token_ids("Meow.", tokenizer)
        
        # if idx_next == text_to_token_ids("?", tokenizer):
        #     idx_next = idx_next + text_to_token_ids("Meow.", tokenizer)
        
        # if idx_next == text_to_token_ids("!", tokenizer):
        #     idx_next = idx_next + text_to_token_ids("Meow.", tokenizer)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (batch , n_tokens+1)

    return idx

In [31]:
import math

missingLinks = []
for PG in range(1, 74439):

    if PG < 10001:
        if PG not in (10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 30, 31, 32, 33, 35, 36, 40, 41, 42, 43, 44, 45, 46, 47, 51, 52, 54, 55, 57, 58, 59, 60, 61, 62, 64, 68, 70,71,72,73, 
74, 
76, 
77, 
78, 
81, 
82, 
83, 
84, 
85, 
86, 
90, 
91, 
92, 
93, 
94, 
95, 
96, 
97, 
98, 
99, 
114, 
129, 
182, 
183, 
184, 
185, 
186, 
187, 
188, 
189, 
190, 
191, 
192, 
193, 
194, 
195, 
196, 
197, 
198, 
199, 
239, 
256, 
462, 
628, 
629, 
630, 
631, 
632, 
635, 
664, 
745, 
758, 
900, 
928, 
1070, 
1071, 
1072, 
1073, 
1255, 
1316, 
1344, 
1460, 
1647, 
1648, 
1691, 
1723, 
1766, 
1767, 
1789, 
1856, 
1914, 
1939, 
1964, 
1984, 
2001, 
2091, 
2200, 
2201, 
2202, 
2203, 
2204, 
2205, 
2206, 
2207, 
2208, 
2209, 
2210, 
2211, 
2212, 
2213, 
2214, 
2215, 
2216, 
2217, 
2218, 
2219, 
2220, 
2221, 
2222, 
2223, 
2224, 
2367, 
2623, 
2624, 
2625, 
2626, 
2738, 
2770, 
2774, 
2877, 
2879, 
2994, 
3002, 
3018, 
3184, 
3185, 
3201, 
3445, 
3446, 
3448, 
3449, 
3450, 
3501, 
3502, 
3503, 
3504, 
3505, 
3506, 
3507, 
3508, 
3509, 
3510, 
3511, 
3513, 
3514, 
3515, 
3516, 
3517, 
3518, 
3519, 
3520, 
3521, 
3522, 
3523, 
3524, 
3651, 
3803, 
3926, 
4002, 
4107, 
4274, 
4405, 
4699, 
4749, 
4750, 
4751, 
4935, 
4936, 
4949, 
4950, 
4951, 
5001, 
5124, 
5188, 
5189, 
5190, 
5212, 
5213, 
5214, 
5215, 
5216, 
5373, 
5613, 
5627, 
5634, 
5635, 
5714, 
5740, 
5884, 
5885, 
5886, 
6084, 
6191, 
6302, 
6532, 
6533, 
6534, 
6535, 
6536, 
6537, 
6538, 
6539, 
6540, 
6541, 
6542, 
6543, 
6544, 
6546, 
6547, 
6548, 
6550, 
6551, 
6552, 
6553, 
6554, 
6555, 
6556, 
6557, 
6620, 
6871, 
6951, 
7092, 
7093, 
7094, 
7507, 
7536, 
7684, 
7825, 
7869, 
7872, 
7873, 
7874, 
8204, 
8205, 
8227, 
8608, 
8609, 
8610, 
8611, 
8612, 
8613, 
8614, 
8615, 
8616, 
8617, 
8618, 
8619, 
8620, 
8621, 
8622, 
8623, 
8624, 
8625, 
8626, 
8627, 
8628, 
8629, 
8630, 
8631, 
8632, 
8633, 
8634, 
8635, 
8636, 
8637, 
8746, 
8748, 
8749, 
8750, 
8751, 
8752, 
8753, 
8754, 
8755, 
8756, 
8757, 
8758, 
8759, 
8760, 
8761, 
8762, 
8763, 
8764, 
8765, 
8766, 
8767, 
8768, 
8769, 
8806, 
8807, 
8808, 
8809, 
8810, 
8811, 
8812, 
8816, 
8817, 
8818, 
8958, 
8959, 
8960, 
8962, 
8963, 
8965, 
8966, 
8967, 
8968, 
8969, 
8970, 
8971, 
8972, 
8973, 
8974, 
8975, 
8976, 
8977, 
8978, 
8979, 
8980, 
8981, 
8982, 
8983, 
8984, 
8985, 
8986, 
8987, 
8988, 
8989, 
8990, 
9001, 
9002, 
9003, 
9004, 
9005, 
9006, 
9007, 
9008, 
9009, 
9010, 
9011, 
9012, 
9013, 
9014, 
9015, 
9016, 
9017, 
9018, 
9019, 
9020, 
9021, 
9022, 
9023, 
9024, 
9025, 
9026, 
9027, 
9028, 
9029, 
9030, 
9031, 
9032, 
9033, 
9034, 
9035, 
9036, 
9037, 
9038, 
9039, 
9040, 
9041, 
9042, 
9056, 
9113, 
9114, 
9115, 
9116, 
9117, 
9118, 
9119, 
9120, 
9121, 
9122, 
9123, 
9124, 
9125, 
9126, 
9127, 
9128, 
9129, 
9130, 
9131, 
9132, 
9133, 
9134, 
9135, 
9136, 
9137, 
9138, 
9139, 
9140, 
9141, 
9142, 
9143, 
9144, 
9145, 
9146, 
9147, 
9255, 
9268, 
9269, 
9270, 
9271, 
9272, 
9273, 
9274, 
9275, 
9276, 
9277, 
9278, 
9279, 
9280, 
9281, 
9282, 
9283, 
9284, 
9285, 
9286, 
9287, 
9288, 
9289, 
9290, 
9291, 
9292, 
9293, 
9336, 
9337, 
9338, 
9339, 
9340, 
9341, 
9342, 
9343, 
9344, 
9345, 
9346, 
9347, 
9348, 
9349, 
9350, 
9351, 
9352, 
9353, 
9354, 
9355, 
9356, 
9357, 
9358, 
9359, 
9360, 
9361, 
9392, 
9416, 
9417, 
9418, 
9419, 
9420, 
9421, 
9422, 
9423, 
9424, 
9425, 
9426, 
9427, 
9428, 
9429, 
9430, 
9431, 
9432, 
9433, 
9434, 
9435, 
9436, 
9437, 
9438, 
9451, 
9452, 
9510, 
9511, 
9512, 
9513, 
9514, 
9515, 
9516, 
9517, 
9518, 
9519, 
9520, 
9521, 
9522, 
9523, 
9524, 
9525, 
9526, 9527, 9528, 9529, 9530, 9531, 9532, 9533, 9534, 9535, 9536, 9537, 9538, 9539, 9540, 9541,9551,9552, 9553, 
9554, 9555, 
9556, 
9557, 
9558, 
9671, 
9672, 
9673, 9674, 9675, 9676, 9677, 9678, 9679, 9680, 
9681, 
9682, 9683, 9684, 9685, 9686, 9687, 9688, 9689, 9690, 9691, 9692, 9693, 9694, 9695, 9696, 9697, 9698, 9699, 9702, 9703, 9704, 9705, 9706, 9707, 9708, 9709, 9710, 9711, 9712, 9713, 9714, 9715, 9716, 9717, 9718, 9719, 9720, 9721, 9722, 9723, 9724, 9725, 9726, 9727, 9728, 9729, 
9730, 9731, 9732, 9733, 9734, 9735, 9736, 9737, 9738, 9739, 9740, 9741, 9742, 9743, 9744, 9830, 9930, 9933, 9934, 9942,):

            # print("--------------------")
            # print(f"PG {PG}")
            import os
            import urllib.request

            file_path = f"text/PG{PG}_text.txt"
            url = "https://huggingface.co/datasets/KittyCat00/CatGPTDataset/raw/main/PG" + str(PG) + "_text.txt"

            # # # if not os.path.exists(file_path):
            # # #     with urllib.request.urlopen(url) as response:
            # # #         text_data = response.read().decode('utf-8')
            # # #     with open(file_path, "w", encoding="utf-8") as file:
            # # #         file.write(text_data)
            # # # else:
            # # #     with open(file_path, "r", encoding="utf-8") as file:
            # # #         text_data = file.read()
            # with urllib.request.urlopen(url) as response:
            #     text_data = response.read().decode('utf-8')
            # with open(file_path, "w", encoding="utf-8") as file:
            #     file.write(text_data)

            missingLinks = []


            # import module
            from urllib.request import urlopen
            from urllib.error import *
 
            # try block to read URL
            try:
                html = urlopen(url)
     
            # except block to catch
            # exception
            # and identify error
            except HTTPError as e:
                # print("HTTP error", e)
                print(str(PG) + ", ")
                missingLinks.append(PG)
                
            except URLError as e:
                # print("Opps ! Page not found!", e)
                print(str(PG) + ", ")
                missingLinks.append(PG)
                
            
           
print(missingLinks)

In [324]:
for PG in range(1, 74439):

    if PG < 10001 and PG > 24 :
        if PG not in (10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 30, 31, 32, 33, 35, 36, 40, 41, 42, 43, 44, 45, 46, 47, 51, 52, 54, 55, 57, 58, 59, 60, 61, 62, 64, 68):

            print("--------------------")
            print(f"PG {PG}")

            model.to("cpu")
            model.eval()

            tokenizer = tiktoken.get_encoding("gpt2")

            torch.manual_seed(123)

            token_ids = generate(
                model=model,
                idx=text_to_token_ids("Every effort moves you", tokenizer),
                max_new_tokens=15,
                context_size=GPT_CONFIG_124M["context_length"],
                top_k=25,
                temperature=1.4,
                text_to_token_ids=text_to_token_ids,
                tokenizer=tokenizer
            )

            print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


            torch.manual_seed(123)

            token_ids = generate(
                model=model,
                idx=text_to_token_ids("Every effort moves you", tokenizer),
                max_new_tokens=15,
                context_size=GPT_CONFIG_124M["context_length"],
                top_k=25,
                temperature=1.4,
                text_to_token_ids=text_to_token_ids,
                tokenizer=tokenizer
            )

            print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


            optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)


            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                },
                "model_and_optimizer.pth"
            )


            checkpoint = torch.load("model_and_optimizer.pth", weights_only=True)

            model = GPTModel(GPT_CONFIG_124M)
            model.load_state_dict(checkpoint["model_state_dict"])

            optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.1)
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            model.train()


            from importlib.metadata import version


            print("torch version:", version("torch"))

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            if torch.cuda.is_available():
                device = torch.device("cuda")
            elif torch.backends.mps.is_available():
                device = torch.device("mps")
            else:
                device = torch.device("cpu")

            print(f"Using {device} device.")

            torch.manual_seed(123)
            model = GPTModel(GPT_CONFIG_124M)
            model.eval()


            import os
            import urllib.request

            file_path = f"text/PG{PG}_text.txt"
            url = "https://huggingface.co/datasets/KittyCat00/CatGPTDataset/raw/main/PG" + str(PG) + "_text.txt"

            # if not os.path.exists(file_path):
            #     with urllib.request.urlopen(url) as response:
            #         text_data = response.read().decode('utf-8')
            #     with open(file_path, "w", encoding="utf-8") as file:
            #         file.write(text_data)
            # else:
            #     with open(file_path, "r", encoding="utf-8") as file:
            #         text_data = file.read()
            with urllib.request.urlopen(url) as response:
                text_data = response.read().decode('utf-8')
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(text_data)


            # Train/validation ratio
            train_ratio = 0.90
            split_idx = int(train_ratio * len(text_data))


            torch.manual_seed(123)

            train_loader = create_dataloader_v1(
                text_data[:split_idx],
                batch_size=2,
                max_length=GPT_CONFIG_124M["context_length"],
                stride=GPT_CONFIG_124M["context_length"],
                drop_last=True,
                shuffle=True,
                num_workers=0
            )

            val_loader = create_dataloader_v1(
                text_data[split_idx:],
                batch_size=2,
                max_length=GPT_CONFIG_124M["context_length"],
                stride=GPT_CONFIG_124M["context_length"],
                drop_last=False,
                shuffle=False,
                num_workers=0
            )


            n_epochs = 15
            initial_lr = 0.0001
            peak_lr = 0.01


            total_steps = len(train_loader) * n_epochs
            warmup_steps = int(0.2 * total_steps) # 20% warmup
            print(warmup_steps)


            import tiktoken
            import time

            start_time = time.time()

            torch.manual_seed(123)
            model = GPTModel(GPT_CONFIG_124M)
            model.to(device)

            peak_lr = 5e-4
            optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.1)
            tokenizer = tiktoken.get_encoding("gpt2")

            n_epochs = 15
            train_losses, val_losses, tokens_seen, lrs = train_model(
                model, train_loader, val_loader, optimizer, device, n_epochs=n_epochs,
                eval_freq=5, eval_iter=1, start_context="Every effort moves you",
                tokenizer=tokenizer, warmup_steps=warmup_steps,
                initial_lr=1e-5, min_lr=1e-5
            )

            end_time = time.time()
            execution_time_minutes = (end_time - start_time) / 60
            print(f"Training completed in {execution_time_minutes:.2f} minutes.")

            

--------------------
PG 25
Output text:
 Every effort moves you, IU, UN.935%, US$1,


Output text:
 Every effort moves you, IU, UN.935%, US$1,




KeyboardInterrupt: 

In [40]:
model.to("cpu")
model.eval()

tokenizer = tiktoken.get_encoding("gpt2")

# torch.manual_seed(123)

token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    text_to_token_ids=text_to_token_ids,
    top_k=25,
    tokenizer=tokenizer,
    temperature=1.4
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you), minimized Windsorarian Engelicone dose proven signalingfly Finn screenshots bald EXT charm


In [None]:
torch.manual_seed(123)

token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

In [90]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

In [91]:
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    },
    "model_and_optimizer.pth"
)

In [None]:
checkpoint = torch.load("model_and_optimizer.pth", weights_only=True)

model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train()

In [94]:
from importlib.metadata import version

In [None]:
print("torch version:", version("torch"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

In [98]:
import os
import urllib.request

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/AlanikREDAWN/CatGPT/main/the-verdict.txt"

if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

In [None]:
# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))


torch.manual_seed(123)

train_loader = create_dataloader_v1(
    text_data[:split_idx],
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    text_data[split_idx:],
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [None]:
n_epochs = 15
initial_lr = 0.0001
peak_lr = 0.01

In [None]:
total_steps = len(train_loader) * n_epochs
warmup_steps = int(0.2 * total_steps) # 20% warmup
print(warmup_steps)

In [None]:
import tiktoken
import time

start_time = time.time()

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)

peak_lr = 5e-4
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.1)
tokenizer = tiktoken.get_encoding("gpt2")

n_epochs = 15
train_losses, val_losses, tokens_seen, lrs = train_model(
    model, train_loader, val_loader, optimizer, device, n_epochs=n_epochs,
    eval_freq=5, eval_iter=1, start_context="Every effort moves you",
    tokenizer=tokenizer, warmup_steps=warmup_steps,
    initial_lr=1e-5, min_lr=1e-5
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")