In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import time


In [11]:
# Dataset class
class ItalianLyricsDataset(Dataset):
    def __init__(self, lyrics, tokenizer, max_length):
        self.lyrics = lyrics
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.lyrics)

    def __getitem__(self, idx):
        lyric = self.lyrics[idx]
        encoding = self.tokenizer(lyric, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()

# RNN model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        return self.fc(output)

# LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        return self.fc(output)

# Function to train models
def train_model(model, dataloader, criterion, optimizer, device, epochs, is_transformer=False):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            inputs, masks = batch
            inputs, masks = inputs.to(device), masks.to(device)
            optimizer.zero_grad()
            
            if is_transformer:
                outputs = model(inputs, attention_mask=masks, labels=inputs)
                loss = outputs.loss
            else:
                outputs = model(inputs)
                loss = criterion(outputs.view(-1, outputs.size(-1)), inputs.view(-1))
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

# Custom generation function for RNN and LSTM models
def generate_custom(model, tokenizer, seed_text, max_length=50):
    model.eval()
    device = next(model.parameters()).device
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(device)
    generated = input_ids[0].tolist()

    with torch.no_grad():
        for _ in range(max_length - len(generated)):
            inputs = torch.tensor([generated]).to(device)
            outputs = model(inputs)
            next_token_logits = outputs[0, -1, :]
            next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), num_samples=1).item()
            generated.append(next_token)
            if next_token == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated, skip_special_tokens=True)

# Generation function for transformer model
def generate_transformer(model, tokenizer, seed_text, max_length=50):
    model.eval()
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(model.device)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.7)
    return tokenizer.decode(output[0], skip_special_tokens=True)



In [19]:
# Main function
def main():
    # Load and preprocess data
    with open('./data/italian_lyrics.txt', 'r', encoding='utf-8') as f:
        lyrics = f.readlines()

    # Use a subset of the data for faster training
    num_songs = 1000  # Adjust this number as needed
    lyrics = lyrics[:num_songs]
    
    # Initialize tokenizer and models
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    
    vocab_size = tokenizer.vocab_size  #len(tokenizer)
    embedding_dim = 256
    hidden_dim = 512
    max_length = 128
    batch_size = 32
    epochs = 3
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device = torch.device('cpu')  # Explicitly use CPU

    # Create dataset and dataloader
    dataset = ItalianLyricsDataset(lyrics, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize models
    rnn_model = RNNModel(vocab_size, embedding_dim, hidden_dim).to(device)
    lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim).to(device)
    transformer_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
    transformer_model.resize_token_embeddings(len(tokenizer))

    # Train models
    criterion = nn.CrossEntropyLoss()
    rnn_optimizer = optim.Adam(rnn_model.parameters())
    lstm_optimizer = optim.Adam(lstm_model.parameters())
    transformer_optimizer = optim.Adam(transformer_model.parameters())

    # print("Training RNN model...")
    # train_model(rnn_model, dataloader, criterion, rnn_optimizer, device, epochs)

    # print("Training LSTM model...")
    # train_model(lstm_model, dataloader, criterion, lstm_optimizer, device, epochs)

    # print("Training Transformer model...")
    # train_model(transformer_model, dataloader, criterion, transformer_optimizer, device, epochs, is_transformer=True)
    models = [
        ("RNN", rnn_model, rnn_optimizer),
        ("LSTM", lstm_model, lstm_optimizer),
        ("Transformer", transformer_model, transformer_optimizer)
    ]

    for model_name, model, optimizer in models:
        print(f"Training {model_name} model...")
        start_time = time.time()
        train_model(model, dataloader, criterion, optimizer, device, epochs, 
                    is_transformer=(model_name == "Transformer"))
        end_time = time.time()
        print(f"{model_name} training took {(end_time - start_time) / 60:.2f} minutes")

    # Generate sample lyrics
    print("\nGenerating sample lyrics:")
    seed_text = "Laura sei la più"
    print("RNN:", generate_custom(rnn_model, tokenizer, seed_text))
    print("LSTM:", generate_custom(lstm_model, tokenizer, seed_text))
    print("Transformer:", generate_transformer(transformer_model, tokenizer, seed_text))

if __name__ == "__main__":
    main()

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Training RNN model...
Epoch 1/3, Loss: 5.3852
Epoch 2/3, Loss: 1.4757
Epoch 3/3, Loss: 0.5397
RNN training took 5.04 minutes
Training LSTM model...
Epoch 1/3, Loss: 6.6381
Epoch 2/3, Loss: 3.2004
Epoch 3/3, Loss: 1.5834
LSTM training took 5.26 minutes
Training Transformer model...
Epoch 1/3, Loss: 5.6811
Epoch 2/3, Loss: 3.7114
Epoch 3/3, Loss: 3.2860
Transformer training took 102.26 minutes

Generating sample lyrics:
RNN: Laura sei la più� cavaliusiusiusiusiusiusemaamamamamamamamamamamamamamamamamamamamamamamamamamamamamamamamamamam


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LSTM: Laura sei la più��� david pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens pens saogl rend putus folk quiverver turb abl st st st st st
Transformer: Laura sei la più lontano essere più lontano essere più lontano essere più lontano essere più lontano essere pi


In [21]:

# Load and preprocess data
with open('./data/italian_lyrics.txt', 'r', encoding='utf-8') as f:
    lyrics = f.readlines()

# Use a subset of the data for faster training
# num_songs = 1000  # Adjust this number as needed
# lyrics = lyrics[:num_songs]

# Initialize tokenizer and models
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

vocab_size = tokenizer.vocab_size  #len(tokenizer)
embedding_dim = 256
hidden_dim = 512
max_length = 128
batch_size = 32
epochs = 3
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')  # Explicitly use CPU

# Create dataset and dataloader
dataset = ItalianLyricsDataset(lyrics, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize models
rnn_model = RNNModel(vocab_size, embedding_dim, hidden_dim).to(device)
lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim).to(device)
transformer_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
transformer_model.resize_token_embeddings(len(tokenizer))

# Train models
criterion = nn.CrossEntropyLoss()
rnn_optimizer = optim.Adam(rnn_model.parameters())
lstm_optimizer = optim.Adam(lstm_model.parameters())
transformer_optimizer = optim.Adam(transformer_model.parameters())

models = [
    ("RNN", rnn_model, rnn_optimizer),
    ("LSTM", lstm_model, lstm_optimizer),
    ("Transformer", transformer_model, transformer_optimizer)
]

for model_name, model, optimizer in models:
    print(f"Training {model_name} model...")
    start_time = time.time()
    train_model(model, dataloader, criterion, optimizer, device, epochs, 
                is_transformer=(model_name == "Transformer"))
    end_time = time.time()
    print(f"{model_name} training took {(end_time - start_time) / 60:.2f} minutes")



You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Training RNN model...
Epoch 1/3, Loss: 0.9600
Epoch 2/3, Loss: 0.0404
Epoch 3/3, Loss: 0.0098
RNN training took 153.38 minutes
Training LSTM model...
Epoch 1/3, Loss: 1.6617
Epoch 2/3, Loss: 0.1141
Epoch 3/3, Loss: 0.0416
LSTM training took 240.32 minutes
Training Transformer model...
Epoch 1/3, Loss: 3.5817
Epoch 2/3, Loss: 2.9421
Epoch 3/3, Loss: 2.7812
Transformer training took 992.20 minutes


In [27]:
# Generate sample lyrics
print("\nGenerating sample lyrics:")
seed_text = "Stamattina il sole"
print("RNN:", generate_custom(rnn_model, tokenizer, seed_text))
print("LSTM:", generate_custom(lstm_model, tokenizer, seed_text))
print("Transformer:", generate_transformer(transformer_model, tokenizer, seed_text))


Generating sample lyrics:
RNN: Stamattina il sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LSTM: Stamattina il sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole sole
Transformer: Stamattina il sole volere volere volere volere volere volere volere volere volere volere volere volere volere volere vole


The repetitive output you're seeing is a common issue in language models, often referred to as "repetition collapse" or "degeneration". This usually happens when the model hasn't learned enough variety or when the sampling strategy isn't diverse enough.

Key changes in the updated script below:

We've implemented nucleus sampling (top-k and top-p) for the RNN and LSTM models in the generate_custom function.
For the transformer model, we've added top-k, top-p, and repetition penalty parameters in the generate_transformer function.
We've added gradient clipping in the train_model function to prevent exploding gradients.
We've adjusted the learning rates for each model type.
We're now passing the attention mask to the RNN and LSTM models during training.

These changes should help reduce the repetitive outputs and improve the quality of the generated lyrics. Here's a brief explanation of the new parameters:

temperature: Controls the randomness of predictions. Lower values make the model more confident but also more repetitive.
top_k: Limits the sampling to the k most likely next words.
top_p (nucleus sampling): Limits the sampling to the smallest set of words whose cumulative probability exceeds p.
repetition_penalty: Penalizes repetitions in the transformer model.

In [29]:
# Updated custom generation function for RNN and LSTM models
def generate_custom(model, tokenizer, seed_text, max_length=50, temperature=0.7):
    model.eval()
    device = next(model.parameters()).device
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(device)
    attention_mask = torch.ones_like(input_ids).to(device)
    generated = input_ids[0].tolist()

    with torch.no_grad():
        for _ in range(max_length - len(generated)):
            inputs = torch.tensor([generated]).to(device)
            masks = attention_mask[:, :inputs.shape[1]]
            outputs = model(inputs, attention_mask=masks)
            next_token_logits = outputs[0, -1, :] / temperature
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=0.9)
            next_token = torch.multinomial(torch.softmax(filtered_logits, dim=-1), num_samples=1).item()
            generated.append(next_token)
            attention_mask = torch.cat([attention_mask, torch.ones((1, 1)).to(device)], dim=1)
            if next_token == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated, skip_special_tokens=True)

# Updated generation function for transformer model
def generate_transformer(model, tokenizer, seed_text, max_length=50):
    model.eval()
    input_ids = tokenizer.encode(seed_text, return_tensors='pt').to(model.device)
    attention_mask = torch.ones_like(input_ids).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids, 
            max_length=max_length, 
            num_return_sequences=1, 
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            repetition_penalty=1.2,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Helper function for nucleus sampling
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

# Updated train_model function
def train_model(model, dataloader, criterion, optimizer, device, epochs, is_transformer=False):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            inputs, masks = batch
            inputs, masks = inputs.to(device), masks.to(device)
            optimizer.zero_grad()
            
            if is_transformer:
                outputs = model(inputs, attention_mask=masks, labels=inputs)
                loss = outputs.loss
            else:
                outputs = model(inputs, attention_mask=masks)
                loss = criterion(outputs.view(-1, outputs.size(-1)), inputs.view(-1))
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")


In [30]:
# Use a subset of the data for faster training
num_songs = 10  # Adjust this number as needed
lyrics = lyrics[:num_songs]

# Initialize tokenizer and models
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

vocab_size = tokenizer.vocab_size  #len(tokenizer)
embedding_dim = 256
hidden_dim = 512
max_length = 128
batch_size = 32
epochs = 5
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')  # Explicitly use CPU

# Create dataset and dataloader
dataset = ItalianLyricsDataset(lyrics, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize models
rnn_model = RNNModel(vocab_size, embedding_dim, hidden_dim).to(device)
lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim).to(device)
transformer_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
transformer_model.resize_token_embeddings(len(tokenizer))

# Train models
criterion = nn.CrossEntropyLoss()
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
transformer_optimizer = optim.Adam(transformer_model.parameters(), lr=5e-5)

models = [
    ("RNN", rnn_model, rnn_optimizer),
    ("LSTM", lstm_model, lstm_optimizer),
    ("Transformer", transformer_model, transformer_optimizer)
]

for model_name, model, optimizer in models:
    print(f"Training {model_name} model...")
    start_time = time.time()
    train_model(model, dataloader, criterion, optimizer, device, epochs, 
                is_transformer=(model_name == "Transformer"))
    end_time = time.time()
    print(f"{model_name} training took {(end_time - start_time) / 60:.2f} minutes")


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Training RNN model...


TypeError: forward() got an unexpected keyword argument 'attention_mask'