<a href="https://www.kaggle.com/code/evanupham/gpt-tiny-story?scriptVersionId=187567065" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import DataLoader, Dataset, Subset
from datasets import load_dataset
from tqdm import tqdm
from transformers import GPT2Tokenizer, get_linear_schedule_with_warmup
import numpy as np
# Define the custom dataset class
class TinyStoriesDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = [text for text in texts if text.strip() != '']  # Filter out empty sequences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        input_ids = tokens.input_ids.squeeze(0)  # Ensure the correct dimension
        attention_mask = tokens.attention_mask.squeeze(0)  # Ensure the correct dimension
        return input_ids, attention_mask

# Load the dataset
dataset = load_dataset('roneneldan/TinyStories')

# Initialize the GPT tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Prepare the dataset
max_length = 1000
train_texts = dataset['train']['text']
train_dataset = TinyStoriesDataset(train_texts, tokenizer, max_length)

# Create data loader
batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Define the GPT-2 model with dropout
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]
class GroupedQueryAttention(nn.Module):
    def __init__(self, d_model, num_heads, num_groups=2, dropout=0.1):
        super(GroupedQueryAttention, self).__init__()
        self.num_heads = num_heads
        self.num_groups = num_groups
        self.d_model = d_model

        assert d_model % (num_heads * num_groups) == 0

        self.depth = d_model // (num_heads * num_groups)

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_groups, self.num_heads, self.depth)
        return x.permute(0, 2, 3, 1, 4)  # (batch_size, num_groups, num_heads, seq_len, depth)

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.depth)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(1)  # (batch_size, 1, 1, seq_len, seq_len)
            scores = scores.masked_fill(mask == 0, -1e9)

        attention_weights = torch.nn.functional.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, v)

        output = output.permute(0, 3, 1, 2, 4).contiguous().view(batch_size, -1, self.d_model)
        output = self.dense(output)

        return output, attention_weights

    
class MoE(nn.Module):
    def __init__(self, d_model, d_ff, n_experts=4, dropout=0.3, temperature=1.2):
        super(MoE, self).__init__()
        self.n_experts = n_experts
        self.temperature = temperature
        self.experts = nn.ModuleList([nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout)
        ) for _ in range(n_experts)])
        
        self.gating_network = nn.Linear(d_model, n_experts)
        
    def forward(self, x):
        # Compute the gating weights
        gate_logits = self.gating_network(x)
        gate_outputs = F.gumbel_softmax(gate_logits, tau=self.temperature, hard=False)
        
        # Compute the expert outputs
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-1)
        
        # Combine expert outputs weighted by the gating network
        output = torch.einsum('bld,blnd->bln', gate_outputs, expert_outputs)
        
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, n_experts=5, dropout=0.3, temperature=0.8):
        super(FeedForward, self).__init__()
        self.moe_layer = MoE(d_model, d_ff, n_experts, dropout, temperature)
        
    def forward(self, x):
        return self.moe_layer(x)


class GPTBlock(nn.Module):
    def __init__(self, d_model, num_heads, num_groups, d_ff, dropout=0.3):
        super(GPTBlock, self).__init__()
        self.attention = GroupedQueryAttention(d_model, num_heads, num_groups, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff, n_experts=4, dropout=dropout)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        attn_output, _ = self.attention(x, x, x, mask)
        out1 = self.norm1(x + attn_output)
        ffn_output = self.ffn(out1)
        out2 = self.norm2(out1 + ffn_output)
        return out2

class GPT2(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_groups, d_ff, num_layers, max_len=5000, dropout=0.3):
        super(GPT2, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([GPTBlock(d_model, num_heads, num_groups, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, mask)
        x = self.norm(x)
        x = self.dropout(x)
        return self.fc(x)

def create_future_mask(size):
    mask = torch.tril(torch.ones(size, size)).unsqueeze(0)
    return mask  # (1, size, size)

vocab_size = len(tokenizer)
d_model = 1536  # GPT-2 small model size
num_heads = 6
d_ff = 3072
num_layers = 12
max_len = 1024
num_groups = 4
model = GPT2(vocab_size, d_model, num_heads, num_groups, d_ff, num_layers, max_len)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
import os
model_path = "/kaggle/working/model_weights_1536.pth"
# # Load the model weights if they exist
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path))
    print(f"Model weights loaded from {model_path}")

# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Training loop with progress bar
model.train()

# Freeze all layers initially
for param in model.parameters():
    param.requires_grad = False

def contains_repeated_ngram(seq, n):
    ngrams = set()
    for i in range(len(seq) - n + 1):
        ngram = tuple(seq[i:i+n].tolist())
        if ngram in ngrams:
            return True
        ngrams.add(ngram)
    return False

def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, min_p=0.0):
    """Filter a distribution of logits using top-k, top-p (nucleus), and min-p filtering"""
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = -float('Inf')

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        sorted_indices_to_remove = cumulative_probs > top_p
        if min_p > 0.0:
            sorted_indices_to_remove &= (sorted_logits < min_p).cumsum(dim=-1).bool()

        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = -float('Inf')
        
    if min_p > 0.0:
        logits[logits < min_p] = -float('Inf')

    return logits

def apply_repetition_penalty(logits, seq, repetition_penalty):
    """Apply a penalty to the logits to discourage repetition"""
    for token_id in seq:
        logits[0, token_id] /= repetition_penalty
    return logits

import torch
import torch.nn.functional as F
from collections import defaultdict
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu(reference, hypothesis):
    reference = [reference]  # BLEU expects a list of references
    smoothie = SmoothingFunction().method4
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothie)

def contains_repeated_ngram(seq, n):
    ngrams = set()
    for i in range(len(seq) - n + 1):
        ngram = tuple(seq[i:i+n].tolist())
        if ngram in ngrams:
            return True
        ngrams.add(ngram)
    return False

def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, min_p=0.0):
    """Filter a distribution of logits using top-k, top-p (nucleus), and min-p filtering"""
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = -float('Inf')

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        sorted_indices_to_remove = cumulative_probs > top_p
        if min_p > 0.0:
            sorted_indices_to_remove &= (sorted_logits < min_p).cumsum(dim=-1).bool()

        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = -float('Inf')
        
    if min_p > 0.0:
        logits[logits < min_p] = -float('Inf')

    return logits

def apply_repetition_penalty(logits, seq, repetition_penalty):
    """Apply a penalty to the logits to discourage repetition"""
    for token_id in seq:
        logits[0, token_id] /= repetition_penalty
    return logits

def beam_search(model, tokenizer, input_text, beam_width=5, max_len=100, length_penalty=1.2, no_repeat_ngram_size=3, top_k=70, top_p=0.7, min_p=0.1, temperature=0.8, repetition_penalty=1.2, diversity_rate=0.3):
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    input_ids = input_ids[:, :-1]  # Remove the last token for autoregressive generation

    beam = [(input_ids, 0, [])]  # (input_ids, score, generated tokens)
    completed_sequences = []
    diversity_penalty = defaultdict(lambda: 0)

    for step in range(max_len):
        new_beam = []
        for seq, score, generated_tokens in beam:
            with torch.no_grad():
                outputs = model(seq, create_future_mask(seq.size(1)).to(device))
            logits = outputs[:, -1, :]  # Get the logits for the last token
            logits = logits / temperature
            logits = apply_repetition_penalty(logits, seq[0], repetition_penalty)
            logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p, min_p=min_p)
            probs = F.log_softmax(logits, dim=-1)
            topk_probs, topk_ids = probs.topk(beam_width)

            for i in range(beam_width):
                next_seq = torch.cat([seq, topk_ids[:, i:i+1]], dim=-1)
                new_score = score + topk_probs[0, i].item()
                new_generated_tokens = generated_tokens + [topk_ids[0, i].item()]

                if no_repeat_ngram_size > 0 and contains_repeated_ngram(next_seq[0], no_repeat_ngram_size):
                    continue  # Skip sequences with repeated n-grams

                # Diversity penalty
                diversity_penalty[tuple(map(tuple, next_seq.tolist()))] += diversity_rate * step
                new_score -= diversity_penalty[tuple(map(tuple, next_seq.tolist()))]

                new_beam.append((next_seq, new_score, new_generated_tokens))

        if not new_beam:
            break  # Break the loop if no new sequences are generated

        beam = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]

        # Check for completed sequences (sequences that have the end token)
        for seq, score, generated_tokens in beam:
            if seq[0, -1] == tokenizer.eos_token_id:
                length_normalized_score = score / (seq.size(1) ** length_penalty)
                completed_sequences.append((seq, length_normalized_score, generated_tokens))

        # Keep only the sequences that are not completed
        beam = [b for b in beam if b[0][0, -1] != tokenizer.eos_token_id]

        # Early stopping if all sequences are completed
        if not beam:
            break

    if completed_sequences:
        best_seq = sorted(completed_sequences, key=lambda x: x[1], reverse=True)[0]
    else:
        if beam:
            best_seq = beam[0]  # Fallback to the best beam
        else:
            return ""  # Return an empty string if no valid sequence is found

    best_seq_tokens = best_seq[2]
    reference = tokenizer.encode(input_text)  # Use the input text as the reference
    bleu_score = calculate_bleu(reference, best_seq_tokens)

    output_text = tokenizer.decode(best_seq[0].squeeze(), skip_special_tokens=True)
    return output_text, bleu_score

def set_requires_grad(model, layer_idx, requires_grad):
    for i, layer in enumerate(model.layers):
        for param in layer.parameters():
            param.requires_grad = (i == layer_idx) and requires_grad

def get_custom_training_sequence(num_layers):
        return [1, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 12]

num_layers = len(model.layers)
training_sequence = get_custom_training_sequence(num_layers)

num_epochs = len(training_sequence)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=total_steps // 10, num_training_steps=total_steps)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
dataset_size = 1000  # Number of samples per epoch

def parabolic_scaling(epoch, num_epochs):
    mid_epoch = num_epochs // 2
    return -4 * ((epoch - mid_epoch) ** 2) / (num_epochs ** 2) + 1
import textstat

def parabolic_scale_readability_score(text, target_grade, grade_range):
    # Calculate the Flesch-Kincaid Grade Level
    fk_grade = textstat.flesch_kincaid_grade(text)
    
    # Parabolic scaling
    scaled_score = 1 - ((fk_grade - target_grade) / grade_range) ** 2
    
    # Clip the scaled score to be within the range [-1, 1]
    scaled_score = max(min(scaled_score, 1), -1)
    
    return scaled_score
from rouge_score import rouge_scorer

# Define the ROUGE score calculation function
def calculate_rouge(reference, hypothesis, tokenizer):
    reference_text = tokenizer.decode(reference, skip_special_tokens=True)
    hypothesis_text = tokenizer.decode(hypothesis, skip_special_tokens=True)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text, hypothesis_text)
    return scores

# Define the BLEU score calculation function
def calculate_bleu(reference, hypothesis):
    reference = [reference]  # BLEU expects a list of references
    smoothie = SmoothingFunction().method4
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothie)

def z_loss(logits, beta=1e-4):
    """Z-Loss regularizes logits to prevent extreme values."""
    log_z = torch.logsumexp(logits, dim=-1)
    return beta * log_z.pow(2).mean()
start_epoch = 0
for epoch in range(start_epoch, num_epochs):
    # Determine which layer to unfreeze according to the training sequence
    layer_to_unfreeze = training_sequence[epoch] - 1
    set_requires_grad(model, layer_to_unfreeze, True)
    
    # Create a new subset of the dataset
    indices = np.random.choice(len(train_dataset), dataset_size, replace=False)
    subset = Subset(train_dataset, indices)
    train_loader = DataLoader(subset, batch_size=batch_size, shuffle=True)
        
    total_loss = 0
    total_bleu_score = 0
    total_rouge_score = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", postfix={"Loss": 0.0000, "Perplexity": 0.0000, "BLEU": 0.0000})
    
    for batch in progress_bar:
        input_ids, attention_mask = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        # Shift the input for the next token prediction
        labels = input_ids[:, 1:].contiguous()
        input_ids = input_ids[:, :-1].contiguous()
        
        # Create future mask
        seq_length = input_ids.size(1)
        mask = create_future_mask(seq_length).to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, mask)
        
        # Compute loss
        loss = criterion(outputs.view(-1, vocab_size), labels.view(-1))
        
        # Generate sequences for reward calculation
        generated_ids = outputs.argmax(dim=-1).cpu().numpy()
        references = labels.cpu().numpy()
        batch_bleu_score = 0
        batch_rouge_score = 0
        rewards = []
        for ref, gen in zip(references, generated_ids):
            ref_tokens = ref.tolist()
            gen_tokens = gen.tolist()
            bleu_score = calculate_bleu(ref_tokens, gen_tokens)
            rouge_score = calculate_rouge(ref_tokens, gen_tokens, tokenizer)
            rouge_l_score = rouge_score['rougeL'].fmeasure  # Using ROUGE-L F1-score as the reward
            rewards.append(rouge_l_score)
            batch_bleu_score += bleu_score
            batch_rouge_score += rouge_l_score
        
        avg_bleu_score = batch_bleu_score / len(references)
        avg_rouge_score = batch_rouge_score / len(references)
        
        # REINFORCE algorithm
        rewards = torch.tensor(rewards, dtype=torch.float).to(device)
        log_probs = F.log_softmax(outputs, dim=-1)
        log_probs = log_probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)
        policy_loss = -log_probs * rewards.unsqueeze(-1)
        policy_loss = policy_loss.mean() * parabolic_scaling(epoch, num_epochs)
        zloss_value = z_loss(outputs)
        total_loss_with_reward = loss + zloss_value + policy_loss
        total_loss_with_reward.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += total_loss_with_reward.item()
        total_bleu_score += avg_bleu_score
        total_rouge_score += avg_rouge_score
        avg_loss = total_loss / len(progress_bar)
        perplexity = torch.exp(torch.tensor(loss)).item()
        progress_bar.set_postfix(Loss=f"{loss.item():.4f}", Perplexity=f"{perplexity:.4f}", BLEU=f"{avg_bleu_score:.4f}", ROUGE=f"{avg_rouge_score:.4f}")
    
    avg_loss = total_loss / len(train_loader)
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    avg_bleu_score = total_bleu_score / len(train_loader)
    avg_rouge_score = total_rouge_score / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss_with_reward.item():.4f}, Perplexity: {perplexity:.4f}, Avg BLEU: {avg_bleu_score:.4f}, Avg ROUGE: {avg_rouge_score:.4f}")
    generated_text, bleu_score = beam_search(model, tokenizer, "Once upon a time", beam_width=5, max_len=50)
    print(f"Generated text: {generated_text}")
    print(f"BLEU score: {bleu_score:.4f}")  
    # Save model weights
    torch.save(model.state_dict(), model_path)
    print(f"Model weights saved to {model_path}")
    
    # Freeze the previously unfrozen layer
    set_requires_grad(model, layer_to_unfreeze, False)

print("Training complete.")

Repo card metadata block was not found. Setting CardData to empty.


Model weights loaded from /kaggle/working/model_weights_1536.pth


Epoch 1/24:   0%|          | 0/500 [00:00<?, ?it/s, BLEU=0, Loss=0, Perplexity=0]2024-07-08 17:50:35.209892: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 17:50:35.209953: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 17:50:35.211682: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  perplexity = torch.exp(torch.tensor(loss)).item()
Epoch 1/24: 100%|██████████| 500/500 [10:13<00:00,  1.23s/it, BLEU=0.5198, Loss=7.7639, Perplexity=2354.0771, ROUGE=0.1572]


Epoch 1/24, Loss: 7.7823, Perplexity: 1957.1715, Avg BLEU: 0.7994, Avg ROUGE: 0.2291
Generated text: Once upon a time, there was a little girl. She was very to the park. He was so the park and said, ".


" and he was so her to the little girl, " and said. The to her mom was a big
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 2/24: 100%|██████████| 500/500 [09:49<00:00,  1.18s/it, BLEU=0.8439, Loss=7.7248, Perplexity=2263.6941, ROUGE=0.2355]


Epoch 2/24, Loss: 7.7969, Perplexity: 2052.9524, Avg BLEU: 0.7953, Avg ROUGE: 0.2289
Generated text: Once upon a time, there was a little girl. She was very to the little and the park. He was so her to the park, " and said, he was very.

". She said, but it was a big and said. The
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 3/24: 100%|██████████| 500/500 [10:07<00:00,  1.22s/it, BLEU=0.8187, Loss=7.4183, Perplexity=1666.2378, ROUGE=0.2162]


Epoch 3/24, Loss: 7.5413, Perplexity: 2108.8716, Avg BLEU: 0.8036, Avg ROUGE: 0.2311
Generated text: Once upon a time, there was a little girl. He was so to the park. She was very to the little and said, " day, ".

The the park, but her and he was a big and said. She had a big to
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 4/24: 100%|██████████| 500/500 [09:28<00:00,  1.14s/it, BLEU=0.8264, Loss=7.2186, Perplexity=1364.5940, ROUGE=0.2000]


Epoch 4/24, Loss: 7.3619, Perplexity: 2282.2429, Avg BLEU: 0.8008, Avg ROUGE: 0.2248
Generated text: Once upon a time, there was a little girl. She was very to the park. He was so the park and said, " to her mom and he was so to the little.

", but she was a big. She said, but it
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 5/24: 100%|██████████| 500/500 [09:48<00:00,  1.18s/it, BLEU=0.7541, Loss=7.8442, Perplexity=2550.8879, ROUGE=0.1994]


Epoch 5/24, Loss: 8.0695, Perplexity: 2347.3743, Avg BLEU: 0.7972, Avg ROUGE: 0.2244
Generated text: Once upon a time, there was a little. She was very to the

The. He was so the park. She said, " and said, but she was so to the park and he was a big. The to the little girl, " her
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 6/24: 100%|██████████| 500/500 [09:09<00:00,  1.10s/it, BLEU=0.8420, Loss=7.3535, Perplexity=1561.6176, ROUGE=0.2494]


Epoch 6/24, Loss: 7.5960, Perplexity: 2411.4050, Avg BLEU: 0.7983, Avg ROUGE: 0.2243
Generated text: Once upon a time, there was a little girl. She was very to the park.

The, " the park and said, but it was so he was a big and said. He was so her to the little. She said, " and the
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 7/24: 100%|██████████| 500/500 [09:28<00:00,  1.14s/it, BLEU=0.7075, Loss=7.1329, Perplexity=1252.5162, ROUGE=0.2636]


Epoch 7/24, Loss: 7.5618, Perplexity: 2527.5212, Avg BLEU: 0.8006, Avg ROUGE: 0.2256
Generated text: Once upon a time, there was a little girl. She was very to the park. He was so the park and said, " her.


The was a big to the little. He said, but he was so her to the" and said
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 8/24: 100%|██████████| 500/500 [08:48<00:00,  1.06s/it, BLEU=0.7615, Loss=7.9554, Perplexity=2851.0002, ROUGE=0.2286]


Epoch 8/24, Loss: 8.3601, Perplexity: 2536.5933, Avg BLEU: 0.8035, Avg ROUGE: 0.2283
Generated text: Once upon a time, there was a little girl. He was so to the park. She was very to the

The and said, " the big and said. They day, but he was so her mom. The to play with the park and it
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 9/24: 100%|██████████| 500/500 [09:09<00:00,  1.10s/it, BLEU=0.6268, Loss=7.5740, Perplexity=1946.9353, ROUGE=0.2340]


Epoch 9/24, Loss: 8.1117, Perplexity: 2594.6650, Avg BLEU: 0.7984, Avg ROUGE: 0.2270
Generated text: Once upon a time, there was a little girl. She was very her to the park. He was so the park and said, " to the


The. She had a big, " and he was so to play, but it. The and
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 10/24: 100%|██████████| 500/500 [08:30<00:00,  1.02s/it, BLEU=0.8465, Loss=7.3757, Perplexity=1596.6635, ROUGE=0.2321]


Epoch 10/24, Loss: 7.6725, Perplexity: 2641.0681, Avg BLEU: 0.7985, Avg ROUGE: 0.2276
Generated text: Once upon a time, there was a little. He was so was very to the park.

The and said, ". She was so the park to the big, but the day, " her and he was a big and said. She said,
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 11/24: 100%|██████████| 500/500 [08:50<00:00,  1.06s/it, BLEU=0.6905, Loss=7.3480, Perplexity=1553.0466, ROUGE=0.2494]


Epoch 11/24, Loss: 7.8930, Perplexity: 2714.9321, Avg BLEU: 0.7961, Avg ROUGE: 0.2254
Generated text: Once upon a time, there was a little girl. She was very to the park. He was so her to the
The and said, ", but he was a big and said.

". The to the little girl, " day, "
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 12/24: 100%|██████████| 500/500 [08:10<00:00,  1.02it/s, BLEU=0.7672, Loss=7.4378, Perplexity=1698.9619, ROUGE=0.1852]


Epoch 12/24, Loss: 7.8170, Perplexity: 2720.5591, Avg BLEU: 0.7993, Avg ROUGE: 0.2284
Generated text: Once upon a time, there was a little girl. She was very to the park. He was so a big, " and said, but the.

The to the little girl and he was so the park and said. She had a big to her
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 13/24: 100%|██████████| 500/500 [08:29<00:00,  1.02s/it, BLEU=0.8648, Loss=7.5344, Perplexity=1871.3489, ROUGE=0.2293]


Epoch 13/24, Loss: 7.8324, Perplexity: 2732.2610, Avg BLEU: 0.8002, Avg ROUGE: 0.2288
Generated text: Once upon a time, there was a little girl. She was so to the park. He was very to the little and said, ".

The was so her and he had a big, but the park, " it. He said, she and
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 14/24: 100%|██████████| 500/500 [07:50<00:00,  1.06it/s, BLEU=0.7907, Loss=7.3777, Perplexity=1599.9222, ROUGE=0.2264]


Epoch 14/24, Loss: 7.7449, Perplexity: 2658.0115, Avg BLEU: 0.7978, Avg ROUGE: 0.2297
Generated text: Once upon a time, there was a little girl. She was so he was very to the park. He was a big and said, ".

The and the park, but her to her to the little. She said, but it was so the
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 15/24: 100%|██████████| 500/500 [08:10<00:00,  1.02it/s, BLEU=0.7474, Loss=7.3866, Perplexity=1614.2255, ROUGE=0.2374]


Epoch 15/24, Loss: 7.8598, Perplexity: 2688.3726, Avg BLEU: 0.7988, Avg ROUGE: 0.2313
Generated text: Once upon a time, there was a little girl. She was very to the park. He was so the park and said, " and he was so she was a big to the

". She said, but her. They had to play and the
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 16/24: 100%|██████████| 500/500 [07:31<00:00,  1.11it/s, BLEU=0.8096, Loss=8.1361, Perplexity=3415.7236, ROUGE=0.2008]


Epoch 16/24, Loss: 8.4751, Perplexity: 2616.9038, Avg BLEU: 0.7983, Avg ROUGE: 0.2287
Generated text: Once upon a time, there was a little girl. She was so to the park. He was very and said, " day, he was very to the big.

" and the park, " her and said. They had a big to her mom
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 17/24: 100%|██████████| 500/500 [07:50<00:00,  1.06it/s, BLEU=0.8004, Loss=7.7077, Perplexity=2225.3813, ROUGE=0.2315]


Epoch 17/24, Loss: 8.0796, Perplexity: 2594.4214, Avg BLEU: 0.7990, Avg ROUGE: 0.2287
Generated text: Once upon a time, there was a little. She was very to the park. He was so the
The and said, " day, but the park and he was a big to play.
" and said. They her to the little girl, "
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 18/24: 100%|██████████| 500/500 [07:10<00:00,  1.16it/s, BLEU=0.8300, Loss=7.5270, Perplexity=1857.4589, ROUGE=0.2849]


Epoch 18/24, Loss: 7.8898, Perplexity: 2461.3306, Avg BLEU: 0.8022, Avg ROUGE: 0.2322
Generated text: Once upon a time, there was a little. He was very to the park. She was so he was very a big and said, " to the little girl.


The 
". She said, but the park and she was so her
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 19/24: 100%|██████████| 500/500 [07:29<00:00,  1.11it/s, BLEU=0.8977, Loss=7.2638, Perplexity=1427.6962, ROUGE=0.2492]


Epoch 19/24, Loss: 7.4451, Perplexity: 2391.1748, Avg BLEU: 0.8074, Avg ROUGE: 0.2323
Generated text: Once upon a time, there was a little girl. She was so to the park. He was very to the

The and said, " day, but he was a big. The and said. She had to her mom and the park, " her
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 20/24: 100%|██████████| 500/500 [06:52<00:00,  1.21it/s, BLEU=0.8082, Loss=7.5846, Perplexity=1967.6162, ROUGE=0.2029]


Epoch 20/24, Loss: 7.8214, Perplexity: 2367.7271, Avg BLEU: 0.7945, Avg ROUGE: 0.2308
Generated text: Once upon a time, there was a little. He was so to the park.

The. She was so the park and said, " day, " and he was very to the little girl and said. She had a big, but he was so
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 21/24: 100%|██████████| 500/500 [07:13<00:00,  1.15it/s, BLEU=0.8705, Loss=7.1803, Perplexity=1313.3145, ROUGE=0.2731]


Epoch 21/24, Loss: 7.3675, Perplexity: 2240.8469, Avg BLEU: 0.7924, Avg ROUGE: 0.2291
Generated text: Once upon a time, there was a little girl.
The, " was so to the park. She was very to her and said, but the little girl and said. He was a big to play. She had a big and he was so the park
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 22/24: 100%|██████████| 500/500 [06:31<00:00,  1.28it/s, BLEU=0.7563, Loss=7.4825, Perplexity=1776.6440, ROUGE=0.2330]


Epoch 22/24, Loss: 7.7124, Perplexity: 2083.7578, Avg BLEU: 0.8002, Avg ROUGE: 0.2337
Generated text: Once upon a time, there was a little girl. He was very to the park. She was so he was a big to play and said, ".
The day, but the park and he was so her mom and said. She had a big and
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 23/24: 100%|██████████| 500/500 [06:51<00:00,  1.21it/s, BLEU=0.8355, Loss=7.4757, Perplexity=1764.6826, ROUGE=0.2295]


Epoch 23/24, Loss: 7.5937, Perplexity: 1992.4585, Avg BLEU: 0.8024, Avg ROUGE: 0.2349
Generated text: Once upon a time, there was a little girl. She was very to the park. He was so the park and said, " her.

", but he was a big and said. She had a big to play with the little girl, "
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth


Epoch 24/24: 100%|██████████| 500/500 [06:32<00:00,  1.28it/s, BLEU=0.8283, Loss=7.1977, Perplexity=1336.3148, ROUGE=0.2311]


Epoch 24/24, Loss: 7.2683, Perplexity: 1877.3755, Avg BLEU: 0.7973, Avg ROUGE: 0.2332
Generated text: Once upon a time, there was a little girl. She was so to the park.

The, " said, but the park and said. He was so he was very to the little girl and saw a big and said, " her to play.
BLEU score: 0.1881
Model weights saved to /kaggle/working/model_weights_1536.pth
Training complete.


In [None]:
!pip install rouge-score


In [None]:
!pip install textstat

In [None]:
import random

def get_custom_training_sequence(num_layers, mode='forwards'):
    if mode == 'forwards':
        return [1, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 12]

    elif mode == 'backwards':
        return [12, 11, 12, 10, 11, 9, 10, 8, 9, 7, 8, 6, 7, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1]

    elif mode == 'random':
        layers = list(range(1, num_layers + 1))
        sequence = layers + layers
        random.shuffle(sequence)
        return sequence

    else:
        raise ValueError("Mode must be one of 'forwards', 'backwards', or 'random'.")

# Example usage:
num_layers = 12
forwards_sequence = get_custom_training_sequence(num_layers, mode='forwards')
backwards_sequence = get_custom_training_sequence(num_layers, mode='backwards')
random_sequence = get_custom_training_sequence(num_layers, mode='random')

forwards_sequence, backwards_sequence, random_sequence


In [None]:
model_path = "model_weights.pth"
torch.save(model.state_dict(), model_path)
print(f"Model weights saved to {model_path}")

In [None]:
import torch
import torch.nn.functional as F
from collections import defaultdict

def create_future_mask(size):
    mask = torch.tril(torch.ones(size, size)).unsqueeze(0).unsqueeze(0)
    return mask  # (1, 1, size, size)


import torch
import torch.nn.functional as F
from collections import defaultdict
import nltk
class GroupedQueryAttention(nn.Module):
    def __init__(self, d_model, num_heads, num_groups=2, dropout=0.1):
        super(GroupedQueryAttention, self).__init__()
        self.num_heads = num_heads
        self.num_groups = num_groups
        self.d_model = d_model

        assert d_model % (num_heads * num_groups) == 0

        self.depth = d_model // (num_heads * num_groups)

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_groups, self.num_heads, self.depth)
        return x.permute(0, 2, 3, 1, 4)  # (batch_size, num_groups, num_heads, seq_len, depth)

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.depth)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2).unsqueeze(3)  # (batch_size, 1, 1, 1, seq_len)
            scores = scores.masked_fill(mask == 0, -1e9)

        attention_weights = torch.nn.functional.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, v)

        output = output.permute(0, 3, 1, 2, 4).contiguous().view(batch_size, -1, self.d_model)
        output = self.dense(output)

        return output, attention_weights

def calculate_bleu(reference, hypothesis):
    reference = [reference]  # BLEU expects a list of references
    smoothie = SmoothingFunction().method4
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothie)

def contains_repeated_ngram(seq, n):
    ngrams = set()
    for i in range(len(seq) - n + 1):
        ngram = tuple(seq[i:i+n].tolist())
        if ngram in ngrams:
            return True
        ngrams.add(ngram)
    return False

def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, min_p=0.0):
    """Filter a distribution of logits using top-k, top-p (nucleus), and min-p filtering"""
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = -float('Inf')

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        sorted_indices_to_remove = cumulative_probs > top_p
        if min_p > 0.0:
            sorted_indices_to_remove &= (sorted_logits < min_p).cumsum(dim=-1).bool()

        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = -float('Inf')
        
    if min_p > 0.0:
        logits[logits < min_p] = -float('Inf')

    return logits

def apply_repetition_penalty(logits, seq, repetition_penalty):
    """Apply a penalty to the logits to discourage repetition"""
    for token_id in seq:
        logits[0, token_id] /= repetition_penalty
    return logits

def beam_search(model, tokenizer, input_text, beam_width=5, max_len=100, length_penalty=1.2, no_repeat_ngram_size=3, top_k=70, top_p=0.7, min_p=0.1, temperature=0.8, repetition_penalty=1.2, diversity_rate=0.3):
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    input_ids = input_ids[:, :-1]  # Remove the last token for autoregressive generation

    beam = [(input_ids, 0, [])]  # (input_ids, score, generated tokens)
    completed_sequences = []
    diversity_penalty = defaultdict(lambda: 0)

    for step in range(max_len):
        new_beam = []
        for seq, score, generated_tokens in beam:
            with torch.no_grad():
                outputs, _ = model(seq, create_future_mask(seq.size(1)).to(device))
            logits = outputs[:, -1, :]  # Get the logits for the last token
            logits = logits / temperature
            logits = apply_repetition_penalty(logits, seq[0], repetition_penalty)
            logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p, min_p=min_p)
            probs = F.log_softmax(logits, dim=-1)
            topk_probs, topk_ids = probs.topk(beam_width)

            for i in range(beam_width):
                next_seq = torch.cat([seq, topk_ids[:, i:i+1]], dim=-1)
                new_score = score + topk_probs[0, i].item()
                new_generated_tokens = generated_tokens + [topk_ids[0, i].item()]

                if no_repeat_ngram_size > 0 and contains_repeated_ngram(next_seq[0], no_repeat_ngram_size):
                    continue  # Skip sequences with repeated n-grams

                # Diversity penalty
                diversity_penalty[tuple(map(tuple, next_seq.tolist()))] += diversity_rate * step
                new_score -= diversity_penalty[tuple(map(tuple, next_seq.tolist()))]

                new_beam.append((next_seq, new_score, new_generated_tokens))

        if not new_beam:
            break  # Break the loop if no new sequences are generated

        beam = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]

        # Check for completed sequences (sequences that have the end token)
        for seq, score, generated_tokens in beam:
            if seq[0, -1] == tokenizer.eos_token_id:
                length_normalized_score = score / (seq.size(1) ** length_penalty)
                completed_sequences.append((seq, length_normalized_score, generated_tokens))

        # Keep only the sequences that are not completed
        beam = [b for b in beam if b[0][0, -1] != tokenizer.eos_token_id]

        # Early stopping if all sequences are completed
        if not beam:
            break

    if completed_sequences:
        best_seq = sorted(completed_sequences, key=lambda x: x[1], reverse=True)[0]
    else:
        if beam:
            best_seq = beam[0]  # Fallback to the best beam
        else:
            return ""  # Return an empty string if no valid sequence is found

    best_seq_tokens = best_seq[2]
    reference = tokenizer.encode(input_text)  # Use the input text as the reference
    bleu_score = calculate_bleu(reference, best_seq_tokens)

    output_text = tokenizer.decode(best_seq[0].squeeze(), skip_special_tokens=True)
    return output_text, bleu_score





generated_text, bleu_score = beam_search(model, tokenizer, "Once upon a time", beam_width=5, max_len=50)
print(f"Generated text: {generated_text}")
print(f"BLEU score: {bleu_score:.4f}")


In [None]:
!pip install einops