In [1]:
# Install KerasNLP if not already installed
# !pip install keras-nlp

import os
import warnings

import keras

In [2]:
# Data
BATCH_SIZE = 64
SEQ_LEN = 128
MIN_TRAINING_SEQ_LEN = 450

# Model
EMBED_DIM = 4096
FEED_FORWARD_DIM = 16384
NUM_HEADS = 32  # Aumentado para acomodar la nueva dimensión de embedding
NUM_LAYERS = 48 
VOCAB_SIZE = 5000  # Limits parameters in model

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert embed_dim % num_heads == 0, "embed_dim debe ser divisible por num_heads"
        self.num_heads = num_heads
        self.depth = embed_dim // num_heads

        self.wq = nn.Linear(embed_dim, embed_dim)
        self.wk = nn.Linear(embed_dim, embed_dim)
        self.wv = nn.Linear(embed_dim, embed_dim)

        self.dense = nn.Linear(embed_dim, embed_dim)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)

    def forward(self, x):
        batch_size = x.size(0)

        query = self.split_heads(self.wq(x), batch_size)
        key = self.split_heads(self.wk(x), batch_size)
        value = self.split_heads(self.wv(x), batch_size)

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.depth)
        attention_weights = F.softmax(scores, dim=-1)

        out = torch.matmul(attention_weights, value)
        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.depth)
        out = self.dense(out)

        return out

In [5]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, feed_forward_dim):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.linear1 = nn.Linear(embed_dim, feed_forward_dim)
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(feed_forward_dim, embed_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)
        
    def forward(self, x, memory, tgt_mask=None, memory_mask=None):
        x2 = self.self_attn(x, x, x, attn_mask=tgt_mask)[0]
        x = x + self.dropout1(x2)
        x = self.norm1(x)
        x2 = self.linear2(self.dropout(F.relu(self.linear1(x))))
        x = x + self.dropout2(x2)
        x = self.norm2(x)
        return x

In [10]:
import tensorflow as tf

In [11]:
# Download and extract the dataset
keras.utils.get_file(
    origin="https://storage.googleapis.com/asl-public/text/data/simplebooks.zip",
    extract=True,
)
data_dir = os.path.expanduser("~/.keras/datasets/simplebooks/")

# Load and filter the training dataset
raw_train_ds = (
    tf.data.TextLineDataset(data_dir + "simplebooks-92-raw/train.txt")
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=256)
)

# Load and filter the validation dataset
raw_val_ds = (
    tf.data.TextLineDataset(data_dir + "simplebooks-92-raw/valid.txt")
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(BATCH_SIZE)
)

In [23]:
import torch
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_dataset(dataset, max_length=512):
    """
    Tokenizes a dataset of text examples.

    Args:
    - dataset (list of str): List of text examples to tokenize.
    - max_length (int): Maximum length of the tokenized sequences.

    Returns:
    - dict: Dictionary containing 'input_ids' and 'attention_mask' tensors.
    """
    # Tokenize the dataset
    encoding = tokenizer(
        dataset,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    return encoding

# Tokenize the training dataset
def tokenize(dataset):
    return dataset.map(lambda x: tokenizer(x))

#train_ds = tokenize_dataset(raw_train_ds)
#val_ds = tokenize_dataset(raw_val_ds)

In [None]:
from torchinfo import summary

In [17]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, feed_forward_dim, num_layers, seq_len):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_len, embed_dim))
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(embed_dim, num_heads, feed_forward_dim)
            for _ in range(num_layers)
        ])
        self.linear = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x, tgt_mask=None):
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        for layer in self.layers:
            x = layer(x, x, tgt_mask)
        x = self.linear(x)
        return x

# Instantiate the model
model = TransformerModel(VOCAB_SIZE, EMBED_DIM, NUM_HEADS, FEED_FORWARD_DIM, NUM_LAYERS, SEQ_LEN)

In [26]:
raw_train_ds = [
    "Hello, how are you?",
    "I am fine, thank you!",
    "What are you doing today?"
]
raw_val_ds = [
    "Hi, how's it going?",
    "I'm good, what about you?"
]


In [27]:
train_encodings = tokenizer(raw_train_ds, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(raw_val_ds, padding=True, truncation=True, return_tensors="pt")

# Create TensorDataset and DataLoader
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_encodings["input_ids"])
val_dataset = TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], val_encodings["input_ids"])

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
model.train()
for epoch in range(10):  # Number of epochs
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        # Move data to the appropriate device (CPU/GPU)
        input_ids = input_ids.to(model.device)
        attention_mask = attention_mask.to(model.device)
        labels = labels.to(model.device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        # Compute loss
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}')

NameError: name 'TensorDataset' is not defined

: 