In [None]:
!pip install wget



In [None]:
import os

import wget

# Dataset URL
dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Local file name that the following cells open
dataset_path = "input.txt"

# Download only when the file is missing. wget.download does not overwrite an existing
# file; it stores a renamed copy (e.g. 'input (1).txt') instead, so re-running this cell
# unguarded would pile up duplicates while the notebook keeps reading 'input.txt'.
if not os.path.exists(dataset_path):
    wget.download(dataset_url, out=dataset_path)


'input (1).txt'

In [None]:
# Load the whole corpus from 'input.txt' into memory as one UTF-8 decoded string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()


In [None]:
# Report the size of the corpus, counted in characters
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [None]:
# Collect the distinct characters of the corpus, in sorted order
unique_characters = sorted(set(text))

# The model is character-level, so the vocabulary size is the number of distinct characters
vocab_size = len(unique_characters)

# Show the vocabulary as a single string, followed by its size
vocabulary_string = ''.join(unique_characters)
print('Unique Characters:', vocabulary_string)
print('Vocabulary Size:', vocab_size)


Unique Characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary Size: 65


In [None]:
# Lookup tables between characters and integer ids; a character's id is its position
# in the sorted vocabulary
# stoi: character to integer mapping
stoi = {ch: i for i, ch in enumerate(unique_characters)}
# itos: integer to character mapping
itos = {i: ch for i, ch in enumerate(unique_characters)}


def encode(s):
    """Convert a string into a list of integer ids.

    Args:
        s (str): Text made only of characters present in `stoi`.

    Returns:
        list[int]: One id per character, in the order the characters appear.

    Raises:
        KeyError: If `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Convert a sequence of integer ids back into a string.

    Args:
        l (Iterable[int]): Ids that are keys of `itos`.

    Returns:
        str: The characters for the given ids, joined in order.

    Raises:
        KeyError: If an id is not a key of `itos`.
    """
    return ''.join([itos[i] for i in l])


# Round-trip a short sample through the encoder and decoder
encoded_text = encode("hii there")
decoded_text = decode(encoded_text)

# Print the encoded and decoded results
print("Encoded:", encoded_text)
print("Decoded:", decoded_text)


Encoded: [46, 47, 47, 1, 58, 46, 43, 56, 43]
Decoded: hii there


In [None]:
# PyTorch provides the tensor type used from here on
import torch

# Turn the whole corpus into a 1-D tensor of 64-bit integer ids
token_ids = encode(text)
encoded_data = torch.tensor(token_ids, dtype=torch.long)

# Inspect the resulting tensor
print("Encoded Data Shape:", encoded_data.shape)
print("Encoded Data Type:", encoded_data.dtype)


Encoded Data Shape: torch.Size([1115394])
Encoded Data Type: torch.int64


In [None]:
# Index separating the two splits: everything before it (the first 90%) is training data
n = int(0.9 * len(encoded_data))

# First 90% of the ids for training, remaining 10% for validation
train_data, val_data = encoded_data[:n], encoded_data[n:]


In [None]:
# Define the block size: the number of input tokens in one sequence
block_size = 8

# Take block_size + 1 tokens from the beginning of the training data
# (one more than the block size, so the last input position also has a following token)
sequence = train_data[:block_size + 1]


In [None]:
# Number of input tokens in the demonstrated sequence
block_size = 8

# Inputs are the first block_size tokens; targets are the same window shifted by one token
x = train_data[:block_size]
y = train_data[1 : block_size + 1]

# A single window holds block_size training examples: every prefix of x predicts
# the token that follows it
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"When input is {context}, the target is: {target}")


When input is tensor([18]), the target is: 47
When input is tensor([18, 47]), the target is: 56
When input is tensor([18, 47, 56]), the target is: 57
When input is tensor([18, 47, 56, 57]), the target is: 58
When input is tensor([18, 47, 56, 57, 58]), the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


In [None]:
# Seed PyTorch's random number generator so the randomly sampled batches are reproducible
torch.manual_seed(1337)

# batch_size: number of independent sequences processed in parallel
# block_size: number of tokens per input sequence (maximum context length)
batch_size = 4
block_size = 8

# Function to get a batch of data
def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split (str): 'train' selects the training data; any other value selects
            the validation data.

    Returns:
        tuple: (x, y) where both hold `batch_size` windows of `block_size` tokens
        and y is x shifted by one position in the underlying data.
    """
    source = val_data if split != 'train' else train_data

    # One random start position per sequence; the upper bound leaves room for the
    # targets, which reach one token past the end of each input window
    starts = torch.randint(len(source) - block_size, (batch_size,))

    # Broadcast (batch_size, 1) starts against (block_size,) offsets so that all
    # windows are gathered with a single indexing operation
    window = starts.unsqueeze(1) + torch.arange(block_size)

    return source[window], source[window + 1]

# Draw one batch from the training split
xb, yb = get_batch('train')

# Show the shape and contents of the inputs, then of the targets
for heading, batch in (('inputs:', xb), ('targets:', yb)):
    print(heading)
    print(batch.shape)
    print(batch)

print('----')

# Spell out every (context, target) example contained in the batch
for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"When input is {context.tolist()}, the target is: {target}")


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
When input is [24], the target is: 43
When input is [24, 43], the target is: 58
When input is [24, 43, 58], the target is: 5
When input is [24, 43, 58, 5], the target is: 57
When input is [24, 43, 58, 5, 57], the target is: 1
When input is [24, 43, 58, 5, 57, 1], the target is: 46
When input is [24, 43, 58, 5, 57, 1, 46], the target is: 43
When input is [24, 43, 58, 5, 57, 1, 46, 43], the target is: 39
When input is [44], the target is: 53
When input is [44, 53], the target is: 56
When input is [44, 53, 56], the target is: 1
When input is [44, 53, 56, 1], the target is: 58
When input is [44, 53,

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Re-seed PyTorch's random number generator so the model initialization and the
# sampling in this cell are reproducible
torch.manual_seed(1337)

# Define the Bigram Language Model class
class BigramLanguageModel(nn.Module):
    """Bigram language model: the logits for the next token depend only on the current token."""

    def __init__(self, vocab_size):
        """
        Args:
            vocab_size (int): Number of distinct tokens; also the size of each logits row.
        """
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx (torch.Tensor): (B, T) tensor of integer token ids.
            targets (torch.Tensor, optional): (B, T) tensor of integer token ids.

        Returns:
            tuple: (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            return logits, None

        # F.cross_entropy is called with (N, C) logits and (N,) targets, so flatten
        # the batch and time dimensions together
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by sampling `max_new_tokens` tokens.

        Args:
            idx (torch.Tensor): (B, T) tensor of token ids forming the current context.
            max_new_tokens (int): Number of tokens to sample and append.

        Returns:
            torch.Tensor: (B, T + max_new_tokens) tensor: the context followed by
            the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, _ = self(idx)
            # Focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Build the bigram model for the current vocabulary
m = BigramLanguageModel(vocab_size)

# Run the batch sampled earlier through the model and report logits shape and loss
logits, loss = m(xb, yb)
print("Logits Shape:", logits.shape)
print("Loss:", loss)

# Sample 100 tokens, starting from a context that holds the single token id 0
start_context = torch.zeros((1, 1), dtype=torch.long)
generated_ids = m.generate(idx=start_context, max_new_tokens=100)[0].tolist()
generated_text = decode(generated_ids)
print("Generated Text:", generated_text)


Logits Shape: torch.Size([32, 65])
Loss: tensor(4.8786, grad_fn=<NllLossBackward0>)
Generated Text: 
Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [None]:
# Create an AdamW optimizer over all parameters of the model `m`, with learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)


In [None]:
# Use larger batches for training than for the demonstration above
batch_size = 32

# Number of optimization steps
num_steps = 100  # Increase the number of steps for better results...

for _ in range(num_steps):
    # Draw a fresh random training batch
    xb, yb = get_batch('train')

    # Forward pass, then one optimizer update
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # drop gradients left over from the previous step
    loss.backward()  # backpropagate the loss
    optimizer.step()  # apply the parameter update

# Loss of the last training batch (a single noisy sample, not an average)
print("Final Loss:", loss.item())


Final Loss: 4.587916374206543


In [None]:
# Sample 500 tokens from the trained model, starting from a context that holds the single token id 0
start_context = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(idx=start_context, max_new_tokens=500)[0]
generated_text = decode(sampled_ids.tolist())

# Show the sampled text
print("Generated Text:")
print(generated_text)


Generated Text:

xiKi-RJ:CgqVuUa!U?qMH.uk!sCuMXvv!CJFfx;LgRyJknOEti.?I&-gPlLyulId?XlaInQ'q,lT$
3Q&sGlvHQ?mqSq-eON
x?SP fUAfCAuCX:bOlgiRQWN:Mphaw
tRLKuYXEaAXxrcq-gCUzeh3w!AcyaylgYWjmJM?Uzw:inaY,:C&OECW:vmGGJAn3onAuMgia!ms$Vb q-gCOcPcUhOnxJGUGSPJWT:.?ujmJFoiNL&A'DxY,prZ?qdT;hoo'dHooXXlxf'WkHK&u3Q?rqUi.kz;?Yx?C&u3Qbfzxlyh'Vl:zyxjKXgC?
lv'QKFiBeviNxO'm!Upm$srm&TqViqiBD3HBP!juEOpmZJyF$Fwfy!PlvWPFC
&WDdP!Ko,px
x
tREOE;AJ.BeXkylOVD3KHp$e?nD,.SFbWWI'ubcL!q-tU;aXmJ&uGXHxJXI&Z!gHRpajj;l.
pTErIBjx;JKIgoCnLGXrJSP!AU-AcbczR?


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# Hyperparameters for training and model architecture

# Batch size: Number of independent sequences to process in parallel during training
batch_size = 16

# Block size: Maximum context length for predictions
block_size = 32

# Maximum number of training iterations
max_iters = 5000

# Interval, in training iterations, between evaluations of the train/val loss
eval_interval = 100

# Learning rate for optimizer
learning_rate = 1e-3

# Device for training (CPU or CUDA GPU if available)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Number of random batches averaged per split each time the loss is estimated
eval_iters = 200

# Model architecture hyperparameters
n_embd = 64     # Embedding dimension
n_head = 4      # Number of attention heads
n_layer = 4     # Number of layers in the model
dropout = 0.0   # Dropout rate

# Seed PyTorch's random number generator for reproducibility
torch.manual_seed(1337)

<torch._C.Generator at 0x7f1c901611b0>

In [None]:
# Load the whole corpus from 'input.txt' into memory as one UTF-8 decoded string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Distinct characters of the corpus, in sorted order
chars = sorted(set(text))

# Calculate the vocabulary size, which is the total number of unique characters
vocab_size = len(chars)

# Lookup tables between characters and integer ids; a character's id is its position in `chars`
# stoi: character-to-integer mapping
stoi = {ch: i for i, ch in enumerate(chars)}
# itos: integer-to-character mapping
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string into a list of integer ids.

    Args:
        s (str): Text made only of characters present in `stoi`.

    Returns:
        list[int]: One id per character, in the order the characters appear.

    Raises:
        KeyError: If `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Convert a sequence of integer ids back into a string.

    Args:
        l (Iterable[int]): Ids that are keys of `itos`.

    Returns:
        str: The characters for the given ids, joined in order.

    Raises:
        KeyError: If an id is not a key of `itos`.
    """
    return ''.join([itos[i] for i in l])


In [None]:
# Encode the corpus once, then split it into train and validation parts
data = torch.tensor(encode(text), dtype=torch.long)

# Boundary index: the first 90% is training data, the rest is validation data
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]


In [None]:
def get_batch(split):
    """
    Sample a random batch of input windows (x) and next-token targets (y).

    Args:
        split (str): 'train' selects the training data; any other value selects
            the validation data.

    Returns:
        x (torch.Tensor): `batch_size` windows of `block_size` tokens, moved to `device`.
        y (torch.Tensor): The same windows shifted by one token, moved to `device`.

    """
    source = val_data if split != 'train' else train_data

    # One random start position per sequence; the upper bound leaves room for the
    # targets, which reach one token past the end of each input window
    starts = torch.randint(len(source) - block_size, (batch_size,))

    # Broadcast (batch_size, 1) starts against (block_size,) offsets so that all
    # windows are gathered with a single indexing operation
    window = starts.unsqueeze(1) + torch.arange(block_size)

    # Move both tensors to the configured device (CPU or GPU)
    inputs = source[window].to(device)
    targets = source[window + 1].to(device)

    return inputs, targets


In [None]:
@torch.no_grad()  # gradients are not needed while measuring the loss
def estimate_loss():
    """
    Average the model's loss over `eval_iters` random batches of each split.

    Returns:
        out (dict): Maps 'train' and 'val' to the mean loss (a 0-dim tensor) of that split.
    """
    # Evaluation mode changes the behaviour of layers such as dropout; gradient
    # tracking is turned off by the decorator, not by this call
    model.eval()

    averages = {}
    for split in ('train', 'val'):
        # Loss of one freshly sampled batch per evaluation iteration
        batch_losses = torch.tensor(
            [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
        )
        averages[split] = batch_losses.mean()

    # Restore training mode before handing control back to the training loop
    model.train()
    return averages


In [None]:
class Head(nn.Module):
    """ One head of causal (masked) self-attention. """

    def __init__(self, head_size):
        """
        Args:
            head_size (int): Output dimension of the key, query and value projections.
        """
        super().__init__()

        # Linear transformations for key, query, and value
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower triangular mask (a buffer, not a parameter) used to hide future positions
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x (torch.Tensor): Input of shape (B, T, C); T must not exceed `block_size`,
                the size of the registered mask.

        Returns:
            torch.Tensor: Attention output of shape (B, T, head_size).
        """
        B, T, C = x.shape
        k = self.key(x)   # Key projection (B, T, head_size)
        q = self.query(x) # Query projection (B, T, head_size)

        # Attention scores ("affinities") between queries and keys. Scaled dot-product
        # attention divides by sqrt(d_k), the dimension of the key vectors (head_size),
        # not by the square root of the embedding width C.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, T)

        # Apply a lower triangular mask to prevent attending to future positions
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)

        # Apply softmax to obtain attention weights
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # Apply dropout to attention weights
        wei = self.dropout(wei)

        # Perform weighted aggregation of values
        v = self.value(x) # Value projection (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

        return out


In [None]:
class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention in parallel. """

    def __init__(self, num_heads, head_size):
        """
        Args:
            num_heads (int): Number of attention heads run side by side.
            head_size (int): Output dimension of each head.
        """
        super().__init__()

        # Create a list of attention heads
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

        # Projection back to the embedding width. Its input is the concatenation of all
        # head outputs, i.e. num_heads * head_size features; that only equals n_embd
        # when n_embd is divisible by num_heads, so the width is computed explicitly.
        self.proj = nn.Linear(num_heads * head_size, n_embd)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run all heads on `x`, concatenate their outputs and project them.

        Args:
            x (torch.Tensor): Input tensor passed unchanged to every head.

        Returns:
            torch.Tensor: Projected output whose last dimension is `n_embd`.
        """
        # Apply each attention head and concatenate the results along the feature axis
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, num_heads * head_size)

        # Apply linear projection
        out = self.dropout(self.proj(out)) # (B, T, n_embd)

        return out


In [None]:
class FeedForward(nn.Module):
    """ Two-layer MLP: expand to 4 * n_embd, apply ReLU, project back, then dropout. """

    def __init__(self, n_embd):
        """
        Args:
            n_embd (int): Size of the input and output features.
        """
        super().__init__()

        hidden_dim = 4 * n_embd  # width of the hidden layer
        layers = [
            nn.Linear(n_embd, hidden_dim),  # expansion
            nn.ReLU(),                      # non-linearity
            nn.Linear(hidden_dim, n_embd),  # projection back to the embedding dimension
            nn.Dropout(dropout),            # regularization
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of `x`."""
        return self.net(x)


In [None]:
class Block(nn.Module):
    """ Transformer block: communication (attention) followed by computation (MLP). """

    def __init__(self, n_embd, n_head):
        """
        Build the sub-layers of one Transformer block.

        Args:
            n_embd (int): Embedding dimension.
            n_head (int): Number of attention heads.
        """
        super().__init__()

        # Each head gets an equal share of the embedding dimension (integer division)
        per_head_dim = n_embd // n_head

        # Sub-layers: multi-head self-attention and the feed-forward network
        self.sa = MultiHeadAttention(n_head, per_head_dim)
        self.ffwd = FeedForward(n_embd)

        # One layer norm in front of each sub-layer
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """
        Run the block on `x`.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after processing through the block.
        """
        # Pre-norm residual connections: each sub-layer sees the normalized input and
        # its output is added back onto the un-normalized stream
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


In [None]:

class BigramLanguageModel(nn.Module):
    """Transformer language model built from token/position embeddings and a stack of Blocks.

    The class keeps the name of the earlier bigram model, but its predictions depend
    on up to `block_size` tokens of context.
    """

    def __init__(self):
        super().__init__()

        # Token embedding table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

        # Position embedding table (one vector per position inside a block)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # Stack of transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])

        # Layer normalization for the final output
        self.ln_f = nn.LayerNorm(n_embd)  # Final layer norm

        # Linear layer for language modeling prediction
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx (torch.Tensor): (B, T) tensor of integer token ids; T must not exceed
                `block_size`, the size of the position embedding table.
            targets (torch.Tensor, optional): (B, T) tensor of integer token ids.

        Returns:
            tuple: (logits, loss). Without targets, logits has shape (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to (B*T, vocab_size)
            and loss is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        # Token embeddings
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)

        # Position embeddings; the indices are created on the device of the input so the
        # model does not depend on the global `device` matching where it actually lives
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T,C)

        # Add token and position embeddings (pos_emb broadcasts over the batch dimension)
        x = tok_emb + pos_emb  # (B,T,C)

        # Apply the stack of transformer blocks
        x = self.blocks(x)  # (B,T,C)

        # Apply layer normalization to the output
        x = self.ln_f(x)  # (B,T,C)

        # Generate logits for language modeling
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy is called with (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """
        Generate new tokens that continue the given initial context.

        Args:
            idx (torch.Tensor): Initial context as a tensor of indices (B, T).
            max_new_tokens (int): Number of new tokens to generate.

        Returns:
            torch.Tensor: Tensor of generated indices (B, T+max_new_tokens).

        """
        for _ in range(max_new_tokens):
            # Crop the context to the last 'block_size' tokens, the most the
            # position embedding table can represent
            idx_cond = idx[:, -block_size:]

            # Get predictions from the model for the current context
            logits, _ = self(idx_cond)

            # Focus only on the last time step
            logits = logits[:, -1, :]  # Shape: (B, C)

            # Apply softmax to obtain probabilities
            probs = F.softmax(logits, dim=-1)  # Shape: (B, C)

            # Sample from the probability distribution to get the next token
            idx_next = torch.multinomial(probs, num_samples=1)  # Shape: (B, 1)

            # Append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # Shape: (B, T+1)

        return idx



In [None]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` rather than `iter`: a global called `iter` would
# shadow the built-in iter() for every cell run afterwards in this kernel
for step in range(max_iters):

    # every eval_interval steps, and on the last step, evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss and take one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



0.209729 M parameters
step 0: train loss 4.4116, val loss 4.4022
step 100: train loss 2.6568, val loss 2.6670
step 200: train loss 2.5090, val loss 2.5059
step 300: train loss 2.4196, val loss 2.4338
step 400: train loss 2.3503, val loss 2.3565
step 500: train loss 2.2966, val loss 2.3129
step 600: train loss 2.2410, val loss 2.2500
step 700: train loss 2.2051, val loss 2.2191
step 800: train loss 2.1640, val loss 2.1874
step 900: train loss 2.1251, val loss 2.1515
step 1000: train loss 2.1023, val loss 2.1291
step 1100: train loss 2.0699, val loss 2.1192
step 1200: train loss 2.0375, val loss 2.0797
step 1300: train loss 2.0259, val loss 2.0647
step 1400: train loss 1.9924, val loss 2.0362
step 1500: train loss 1.9700, val loss 2.0304
step 1600: train loss 1.9631, val loss 2.0476
step 1700: train loss 1.9412, val loss 2.0131
step 1800: train loss 1.9097, val loss 1.9960
step 1900: train loss 1.9101, val loss 1.9882
step 2000: train loss 1.8867, val loss 1.9976
step 2100: train loss 1.

In [None]:
# Sample 2000 tokens from the trained model, starting from a context that holds
# the single token id 0, and print them as text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = m.generate(context, max_new_tokens=2000)[0].tolist()
print(decode(generated_ids))


And they bride will to lay be madie;
Thou but take O-dam the change:
Warth full him tother dilth ane away, my fears,
You have was them of is heart mile,
You, and if ensmy contlatist, drov the does me now that
just, lesing that.
His my now, you up; and the tyby love.
In Bodiet, and whom
that demperakenous, so what evily well my
Murtus censurence of him the reshep and thrust for to imper my monte in Mont,
To fight? gry of thy hourb! stiddy as
ards bearing her broint must are no Runnts
Infortuce will me not be arm.
You contrantymes have myse.-
And fortwerle madam them may in son, live body.

Think you:
It stay might. 
CLAMENCE:
My whilesse everew in movet, if Cassce of's counted;
How what make you fear tals: the gold my sun?
What, loudy forgor man our him.
I will were but with some. Povinly Ford the welcont.

QUEEN FIDILIZ:
No?
Their him the not.

POLIXENENE:
But to me, God no now the summe wip.

GROMPEO:
Conguit, bruke this belike, on so han the bodiet.

CORIOLANUS:
Till the;
you wellse