In [1]:

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import requests

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)
text = response.text  # This is the entire text data

In [2]:
# Step 2: Prepare the dataset
sequence_length = 50
# Create a character mapping to integers
chars = sorted(list(set(text)))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

# Create sequences and targets
sequences = []
targets = []
for i in range(0, len(encoded_text) - sequence_length):
    seq = encoded_text[i:i+sequence_length]
    target = encoded_text[i+sequence_length]
    sequences.append(seq)
    targets.append(target)

# Convert lists to PyTorch tensors
sequences = torch.tensor(sequences, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

In [3]:
# Create a dataset class
class CharDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        return self.sequences[index], self.targets[index]

# Defining the Transformer model
class CharTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        transformer_output = self.transformer_encoder(embedded)
        output = self.fc(transformer_output[:, -1, :])  # Get the output of the last Transformer block
        return output

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str):
    model.eval()
    with torch.no_grad():
        initial_input = torch.tensor([char_to_ix[c] for c in initial_str[-sequence_length:]], dtype=torch.long).unsqueeze(0).to(device)
        prediction = model(initial_input)
        predicted_index = torch.argmax(prediction, dim=1).item()
        return ix_to_char[predicted_index]


In [4]:
dataset = CharDataset(sequences, targets)

# Step 4: Create data loaders
batch_size = 128
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


In [5]:

# Hyperparameters
hidden_size = 48
num_layers = 4
nhead = 2
learning_rate = 0.001
epochs = 15

# Model, loss, and optimizer
model = CharTransformer(len(chars), hidden_size, len(chars), num_layers, nhead).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training the model
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()  # Zero the gradients

            # Forward pass
            outputs = model(inputs)
            
            # Compute the loss
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

    # Validate every 10 epochs
    if (epoch + 1) % 1 == 0:
        model.eval()  # Set the model to evaluation mode
        total_correct = 0
        total_samples = 0
        val_running_loss = 0.0
        
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                total_correct += (predicted == labels).sum().item()
                total_samples += labels.size(0)
                
                # Compute validation loss
                val_loss = criterion(outputs, labels)
                val_running_loss += val_loss.item()

            val_accuracy = total_correct / total_samples
            avg_val_loss = val_running_loss / len(test_loader)
        print(f'Epoch {epoch+1}, Loss: {running_loss}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')


Epoch 1, Loss: 17359.12953042984, Validation Loss: 2.413601113131726, Validation Accuracy: 0.28636879171915414
Epoch 2, Loss: 16767.854868412018, Validation Loss: 2.3687573570119054, Validation Accuracy: 0.2955901537192528
Epoch 3, Loss: 16560.807185053825, Validation Loss: 2.346677472382117, Validation Accuracy: 0.30290627563668643
Epoch 4, Loss: 16421.835232257843, Validation Loss: 2.3262947903100746, Validation Accuracy: 0.3074295397388252
Epoch 5, Loss: 16322.433349967003, Validation Loss: 2.3134264492537193, Validation Accuracy: 0.3120424621977953
Epoch 6, Loss: 16254.139179587364, Validation Loss: 2.3023124344377632, Validation Accuracy: 0.314342199050518
Epoch 7, Loss: 16197.234177350998, Validation Loss: 2.2969512450441982, Validation Accuracy: 0.3156243135532055
Epoch 8, Loss: 16149.93190586567, Validation Loss: 2.290733229204354, Validation Accuracy: 0.31803612335196735
Epoch 9, Loss: 16107.74669110775, Validation Loss: 2.2830721935320364, Validation Accuracy: 0.3190940919625