In [2]:
# Initial dataset for the first training stage. After this stage the model should be able to complete (continue) a given text.

In [3]:
import os

from datasets import load_dataset

from torch.utils.data import Dataset, DataLoader

from preprocess.sequencing import create_sequences
from preprocess.tokenizer import BPETokenizer

from transformer.DecoderLayer import DecoderLayer

from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Special tokens passed to the BPE tokenizer when it is constructed.
SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
# File the fitted tokenizer is saved to and later loaded from.
TEXT_COMPLETION_PATH = os.path.join("data", "text_completion.json")

# Load the first 2% of the CNN/DailyMail ("3.0.0") training split.
train_set = load_dataset("abisee/cnn_dailymail", "3.0.0", split="train[:2%]")

# Load the first 2% of the validation split.
valid_set = load_dataset("abisee/cnn_dailymail", "3.0.0", split="validation[:2%]")

print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(valid_set)}")

In [56]:
# Corpus used to fit the tokenizer: both text columns of the training split.
# The columns are materialised as plain lists so that `+` below is list
# concatenation whatever container type column access returns.
train_articles = list(train_set["article"])
train_highlights = list(train_set["highlights"])


tokenizer = BPETokenizer(
    vocab_size=30000, min_frequency=2, special_tokens=SPECIAL_TOKENS
)

# Fit once and cache the result on disk; later runs load the cached file.
# NOTE: an existing file is loaded as-is, even if the settings above changed.
# Delete TEXT_COMPLETION_PATH to force a refit.
if not os.path.exists(TEXT_COMPLETION_PATH):
    tokenizer.fit(
        train_articles + train_highlights,
    )
    # Create the target directory first so saving works on a fresh checkout.
    cache_dir = os.path.dirname(TEXT_COMPLETION_PATH)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    tokenizer.save(TEXT_COMPLETION_PATH)
else:
    tokenizer.load(TEXT_COMPLETION_PATH)

In [57]:
def _non_null_articles(split, desc):
    """Return the "article" field of every row in `split` whose article is not None.

    Rows are visited in order behind a tqdm progress bar labelled `desc`.
    """
    return [row["article"] for row in tqdm(split, desc=desc) if row["article"] is not None]


train_articles = _non_null_articles(train_set, "Extracting Train Articles")
valid_articles = _non_null_articles(valid_set, "Extracting Valid Articles")

def encode_article(article):
    """Encode a single article with the notebook-level `tokenizer`.

    This is the callable that `parallel_encode` hands to its process pool.

    Parameters:
    - article: The article text to encode.

    Returns:
    - Whatever `tokenizer.encode` returns for `article`.
    """
    return tokenizer.encode(article)

def parallel_encode(articles, desc, chunksize=16):
    """
    Encode articles in worker processes, returning results in input order.

    Results are collected with `executor.map`, which yields them in the same
    order as `articles`. Collecting futures with `as_completed` instead would
    order them by completion time, so the resulting token stream would change
    from run to run.

    Parameters:
    - articles: Iterable of article texts to encode.
    - desc: Label shown on the tqdm progress bar.
    - chunksize: Number of articles sent to a worker per task (default 16);
      larger values reduce inter-process overhead.

    Returns:
    - encoded_articles: List with one `encode_article` result per input
      article, aligned with `articles`.
    """
    articles = list(articles)
    if not articles:
        # Nothing to encode: avoid starting a process pool at all.
        return []

    with ProcessPoolExecutor() as executor:
        encoded_iter = executor.map(encode_article, articles, chunksize=chunksize)
        # Consume inside the `with` block so the workers are still alive.
        return list(tqdm(encoded_iter, total=len(articles), desc=desc))

# Encode both splits; each result holds one encoding per article.
train_set_encoded = parallel_encode(train_articles, "Encoding Train Set")
valid_set_encoded = parallel_encode(valid_articles, "Encoding Valid Set")

Extracting Train Articles:   0%|          | 0/5742 [00:00<?, ?it/s]

Extracting Train Articles: 100%|██████████| 5742/5742 [00:00<00:00, 60478.82it/s]
Extracting Valid Articles: 100%|██████████| 267/267 [00:00<00:00, 58543.53it/s]
Encoding Train Set: 100%|██████████| 5742/5742 [00:01<00:00, 4240.84it/s]
Encoding Valid Set: 100%|██████████| 267/267 [00:00<00:00, 3353.84it/s]


In [58]:
def extract_token_ids(encoded_data):
    """
    Flatten a collection of encodings into one list of token IDs.

    Iterates `encoded_data` in order behind a tqdm progress bar and
    concatenates the `.ids` of every item.

    Parameters:
    - encoded_data: Iterable of objects exposing an `ids` sequence.

    Returns:
    - A single flat list containing all token IDs, in input order.
    """
    return [
        token_id
        for encoding in tqdm(encoded_data, desc="Extracting Token IDs")
        for token_id in encoding.ids
    ]

# Flatten each split's encodings into one continuous list of token IDs
# (a progress bar is shown for each split).
train_token_ids = extract_token_ids(train_set_encoded)
valid_token_ids = extract_token_ids(valid_set_encoded)


Extracting Token IDs: 100%|██████████| 5742/5742 [00:00<00:00, 48834.17it/s]
Extracting Token IDs: 100%|██████████| 267/267 [00:00<00:00, 62854.53it/s]


In [59]:
# Number of context tokens requested per example (passed as max_context_length).
# NOTE(review): "_ELN" looks like a typo for "_LEN"; the names are kept because
# a later cell reads CONTEXT_ELN.
CONTEXT_ELN = 50
# Number of target tokens requested per example (passed as max_target_length).
TARGET_ELN = 1

# Build (context, target) pairs from the flattened training token stream.
# NOTE(review): judging by the preview output in the next cell, skip_processed=True
# appears to advance from one window to the next without overlapping contexts —
# confirm in preprocess.sequencing.
train_seq = create_sequences(
    tokenized_data=train_token_ids, 
    max_context_length=CONTEXT_ELN,
    max_target_length=TARGET_ELN,
    skip_processed=True,
)

# Same settings for the validation token stream.
valid_seq = create_sequences(
    tokenized_data=valid_token_ids,
    max_context_length=CONTEXT_ELN,
    max_target_length=TARGET_ELN,
    skip_processed=True,
)

In [60]:
# Sanity check: number of training pairs, then a preview of the first six.
print(len(train_seq))
for i, (context, target) in enumerate(train_seq):
    # First ten context IDs plus the full lengths of context and target.
    print(f"Context: {context[:10]}... (Total: {len(context)} tokens)") 
    print(f"Target: {target} (Total: {len(target)} token)") 
    # Last ten characters of the decoded context, followed by the decoded target.
    print(f"Decoded: ...{tokenizer.decode(context)[-10:]}")
    print(f"Decoded: {tokenizer.decode(target)}")
    # Stop after the sixth pair (indices 0..5).
    if i == 5:
        break


89900
Context: [4507, 16, 1844, 388, 9322, 5054, 13, 289, 5959, 8373]... (Total: 50 tokens)
Target: [261] (Total: 1 token)
Decoded: ... Radcliffe
Decoded:  as
Context: [261, 5959, 8373, 175, 200, 15463, 8373, 185, 155, 15629]... (Total: 50 tokens)
Target: [45] (Total: 1 token)
Decoded: ...parties. "
Decoded: I
Context: [45, 749, 393, 795, 173, 199, 442, 180, 766, 417]... (Total: 50 tokens)
Target: [27108] (Total: 1 token)
Decoded: ...rticularly
Decoded:  extravagant
Context: [27108, 18, 200, 472, 1415, 213, 609, 7125, 286, 1415]... (Total: 50 tokens)
Target: [30] (Total: 1 token)
Decoded: ...lm "Hostel
Decoded: :
Context: [30, 6751, 3702, 272, 3710, 1323, 3646, 4506, 301, 1342]... (Total: 50 tokens)
Target: [180] (Total: 1 token)
Decoded: ... some sort
Decoded:  of
Context: [180, 1907, 272, 207, 227, 175, 254, 1740, 18, 200]... (Total: 50 tokens)
Target: [27514] (Total: 1 token)
Decoded: ...g fame and
Decoded:  riches


In [61]:
class TextCompletionDataset(Dataset):
    """Map-style dataset over pre-built (context, target) token-ID pairs."""

    def __init__(self, sequences):
        # Only a reference is stored; tensors are created per item on access.
        self.sequences = sequences

    def __len__(self):
        """Number of (context, target) pairs."""
        return len(self.sequences)

    @staticmethod
    def _to_long(token_ids):
        """Build a new int64 tensor from a sequence of token IDs."""
        return torch.tensor(token_ids, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the pair stored at `idx` as two int64 tensors: (context, target)."""
        context_ids, target_ids = self.sequences[idx]
        return self._to_long(context_ids), self._to_long(target_ids)

    
# Wrap the (context, target) pairs of each split in a Dataset.
train_dataset = TextCompletionDataset(train_seq)
valid_dataset = TextCompletionDataset(valid_seq)

In [62]:
# Batches of 64 examples. Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

In [63]:
# Number of passes the training loop makes over the training loader.
EPOCHS = 10
# Vocabulary size reported by the tokenizer; passed to the model as `vocab_size`.
VOC_SIZE = tokenizer.get_vocab_size()
# Passed to the model as `max_len`; set equal to the context window length.
MAX_LEN = CONTEXT_ELN
# Passed to the model as `embed_dim`.
D_MODEL = 512
# Passed to the model as `ff_dim`.
FFN_HIDDEN = 2048
# Passed to the model as `num_heads`.
N_HEAD = 8
# Passed to the model as `num_layers`.
N_LAYERS = 6
# Passed to the model as `dropout`.
DROP_PROB = 0.1

In [64]:
class TransformerModel(nn.Module):
    """Thin `nn.Module` wrapper that owns one `DecoderLayer` and forwards to it.

    Parameters:
    - vocab_size, embed_dim, num_layers, num_heads, ff_dim, max_len, dropout:
      Handed to `DecoderLayer` positionally, in exactly this order.
      `max_len` defaults to 5000 and `dropout` to 0.1.
    """
    def __init__(
        self,
        vocab_size,
        embed_dim,
        num_layers,
        num_heads,
        ff_dim,
        max_len=5000,
        dropout=0.1,
    ):
        super().__init__()
        # NOTE(review): despite its name, the printed module structure suggests
        # `DecoderLayer` is the whole stack (embedding, positional encoding,
        # decoder blocks, output projection) — confirm in transformer.DecoderLayer.
        self.decoder = DecoderLayer(
            vocab_size, embed_dim, num_layers, num_heads, ff_dim, max_len, dropout
        )

    def forward(self, x, mask=None):
        """Return `self.decoder(x, mask)` unchanged.

        Parameters:
        - x: Input passed straight to the decoder (token-ID tensors in this notebook).
        - mask: Optional mask passed straight to the decoder.

        The training and generation code indexes the result as `[:, -1, :]`,
        i.e. it is presumably (batch, seq_len, vocab_size) — confirm against DecoderLayer.
        """
        return self.decoder(x, mask)


In [65]:
# Build the model from the hyperparameter constants and move it to DEVICE.
model = TransformerModel(
    vocab_size=VOC_SIZE,
    embed_dim=D_MODEL,
    num_layers=N_LAYERS,
    num_heads=N_HEAD,
    ff_dim=FFN_HIDDEN,
    max_len=MAX_LEN,
    dropout=DROP_PROB
).to(DEVICE)

# Report the number of trainable parameters, then the module structure.
print("Parameters: ", sum(p.numel() for p in model.parameters() if p.requires_grad))
print(model)

Parameters:  49664304
TransformerModel(
  (decoder): DecoderLayer(
    (embedding): InputEmbeddings(
      (embed): Embedding(30000, 512)
    )
    (positional_encoding): PositionalEncoding()
    (layers): ModuleList(
      (0-5): 6 x DecoderBlock(
        (attention): MultiHeadAttention(
          (qkv_proj): Linear(in_features=512, out_features=1536, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (feed_forward): FeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (fc_out): Linear(in_features=512, out_features=30000, bias=True)
  )
)


In [66]:
# Cross-entropy over the vocabulary logits, with default settings.
criterion = nn.CrossEntropyLoss()
# AdamW over all model parameters with a fixed learning rate and no schedule.
# NOTE(review): lr=0.002 is hard-coded here rather than kept with the other hyperparameters.
optimizer = optim.AdamW(model.parameters(), lr=0.002)

In [67]:
def look_ahead_mask(seq_len):
    """Build an additive causal mask of shape (seq_len, seq_len).

    Entries on and below the main diagonal are 0.0; entries strictly above it
    are -inf. The result is a float32 tensor.
    """
    blocked = torch.full((seq_len, seq_len), float("-inf"), dtype=torch.float32)
    # Keep -inf only strictly above the diagonal; triu zeroes everything else.
    return torch.triu(blocked, diagonal=1)

In [68]:
def train_epoch(model, loader, criterion, optimizer):
    """
    Trains the model for one epoch.

    Parameters:
    - model: The model being trained; called as `model(context, mask)`.
    - loader: Iterable yielding `(context, target)` batches.
    - criterion: The loss function. It is expected to return the mean loss of
      the batch (the default for `nn.CrossEntropyLoss`).
    - optimizer: The optimizer used to update the model weights.

    Returns:
    - average_loss: The per-example average training loss over the epoch.
      Batch losses are weighted by batch size, so a smaller final batch does
      not distort the average.

    Raises:
    - ValueError: If `loader` yields no examples.
    """
    model.train()
    total_loss = 0.0
    total_examples = 0
    mask = None  # Look-ahead mask, rebuilt only when the sequence length changes.

    for context, target in tqdm(loader, desc="Training"):
        context = context.to(DEVICE)
        # Drop the trailing target dimension so targets line up with the logits.
        target = target.to(DEVICE).squeeze(-1)

        optimizer.zero_grad()

        seq_len = context.size(1)
        if mask is None or mask.size(0) != seq_len:
            mask = look_ahead_mask(seq_len).to(DEVICE)

        # Only the prediction at the last context position is trained on.
        logits = model(context, mask)[:, -1, :]
        loss = criterion(logits, target)

        loss.backward()
        optimizer.step()

        batch_size = context.size(0)
        total_loss += loss.item() * batch_size
        total_examples += batch_size

    if total_examples == 0:
        raise ValueError("train_epoch received an empty loader")

    average_loss = total_loss / total_examples
    return average_loss


def validate_epoch(model, loader, criterion):
    """
    Validates the model for one epoch (no gradient tracking, no weight updates).

    Parameters:
    - model: The model being validated; called as `model(context, mask)`.
    - loader: Iterable yielding `(context, target)` batches.
    - criterion: The loss function. It is expected to return the mean loss of
      the batch (the default for `nn.CrossEntropyLoss`).

    Returns:
    - average_loss: The per-example average validation loss over the epoch.
      Batch losses are weighted by batch size, so a smaller final batch does
      not distort the average.

    Raises:
    - ValueError: If `loader` yields no examples.
    """
    model.eval()
    total_loss = 0.0
    total_examples = 0
    mask = None  # Look-ahead mask, rebuilt only when the sequence length changes.

    with torch.no_grad():
        for context, target in tqdm(loader, desc="Validation"):
            context = context.to(DEVICE)
            # Drop the trailing target dimension so targets line up with the logits.
            target = target.to(DEVICE).squeeze(-1)

            seq_len = context.size(1)
            if mask is None or mask.size(0) != seq_len:
                mask = look_ahead_mask(seq_len).to(DEVICE)

            # Only the prediction at the last context position is scored.
            logits = model(context, mask)[:, -1, :]
            loss = criterion(logits, target)

            batch_size = context.size(0)
            total_loss += loss.item() * batch_size
            total_examples += batch_size

    if total_examples == 0:
        raise ValueError("validate_epoch received an empty loader")

    average_loss = total_loss / total_examples
    return average_loss


In [1]:
# Loss history: one entry per epoch for each split.
losses = {"train": [], "valid": []}
# NOTE: this cell relies on EPOCHS, model, the loaders, criterion and optimizer
# from earlier cells. The saved NameError output comes from running it as the
# first cell of a kernel (In [1]), before EPOCHS was defined.
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}")
    # One full training pass, then one validation pass.
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    valid_loss = validate_epoch(model, valid_loader, criterion)
    print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")
    losses["train"].append(train_loss)
    losses["valid"].append(valid_loss)

NameError: name 'EPOCHS' is not defined

In [72]:
import torch
import torch.nn.functional as F

def top_k_logits(logits, k):
    """
    Keeps only the top k logits along the last dimension and sets the rest to -inf.

    Parameters:
    - logits: Tensor of logits of any rank; filtering is applied along the
      last dimension.
    - k: The number of top logits to keep. Values <= 0 disable filtering;
      values larger than the last dimension are clamped to it.

    Returns:
    - logits: The input tensor itself when filtering is disabled, otherwise a
      new tensor in which every entry below the k-th largest value of its row
      is -inf. Entries tied with the k-th value are kept.
    """
    if k <= 0:
        return logits
    # torch.topk raises when k exceeds the dimension size, so clamp first.
    k = min(k, logits.size(-1))
    # Smallest of the k largest values, kept as a trailing dim for broadcasting.
    kth_value = torch.topk(logits, k, dim=-1).values[..., -1:]
    return logits.masked_fill(logits < kth_value, float("-inf"))

def generate_text(model, tokenizer, initial_text, n_words=10, device='cuda', temperature=1.0, top_k=0, max_context_len=None):
    """
    Generates text using a given model and tokenizer with temperature scaling and top-k sampling.

    Parameters:
    - model: The trained model used for text generation; called as `model(context, mask)`.
    - tokenizer: The tokenizer used to encode and decode text.
    - initial_text: The initial context text to start the generation.
    - n_words: The number of tokens to generate (default is 10). One token is
      sampled per step, so this counts tokens rather than whole words.
    - device: The device to run the model on (default is 'cuda').
    - temperature: Temperature scaling factor for randomness control (default is 1.0). Must be > 0.
    - top_k: Number of top logits to keep for sampling (default is 0, which keeps all).
    - max_context_len: If given, only the last `max_context_len` tokens are fed
      to the model at each step, so generation can continue past the model's
      maximum sequence length. The default (None) feeds the whole context.

    Returns:
    - decoded: The decoded text (prompt followed by the generated tokens).

    Raises:
    - ValueError: If `temperature` is not positive, `max_context_len` is
      smaller than 1, or `initial_text` encodes to no tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")
    if max_context_len is not None and max_context_len < 1:
        raise ValueError("max_context_len must be at least 1 when given")

    # Set the model to evaluation mode
    model.eval()

    # Encode the prompt and add a batch dimension of size 1.
    encoding = tokenizer.encode(initial_text)
    context = torch.tensor(encoding.ids, dtype=torch.long).unsqueeze(0).to(device)
    if context.size(1) == 0:
        raise ValueError("initial_text produced no tokens")

    with torch.no_grad():
        for _ in range(n_words):
            # Optionally crop to the most recent tokens the model can attend to.
            window = context if max_context_len is None else context[:, -max_context_len:]
            mask = look_ahead_mask(window.size(1)).to(device)

            # Logits for the last position, with temperature and top-k applied.
            logits = model(window, mask)[:, -1, :] / temperature
            logits = top_k_logits(logits, top_k)

            # Sample one token; the (1, 1) result is appended along the sequence dim.
            probabilities = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)
            context = torch.cat([context, next_token], dim=1)

    # Index the batch dimension explicitly: squeeze() would turn a one-token
    # context into a scalar, and tolist() would then return an int, not a list.
    decoded = tokenizer.decode(context[0].tolist())
    return decoded

# Example usage:
initial_text = "The quick brown fox jumps over the lazy dog"
generated_text = generate_text(
    model, 
    tokenizer, 
    initial_text, 
    n_words=10, 
    device=DEVICE, 
    temperature=0.7,  # Adjust temperature here
    top_k=10          # Adjust top-k here
)
print(generated_text)


 The quick brown fox jumps over the lazy dogorship retailer Horman catalyst Smokey Sub Farrah valuable upcoming Republican
