<a href="https://colab.research.google.com/github/Bhaanupriyaranjit/DLProject/blob/main/Shakespearellm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

# Report whether a CUDA device is visible to PyTorch and, if so, its name.
# get_device_name(0) raises on a CPU-only runtime, so only call it when a
# device is actually available; this keeps "Restart & Run All" working
# without a GPU.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; running on CPU")

True
Tesla T4


In [None]:
!pip install torch torchvision matplotlib
import torch
import torch.nn as nn
from torch.nn import functional as F
import urllib.request
import matplotlib.pyplot as plt
print("GPU available:", torch.cuda.is_available())


GPU available: True


In [None]:
#Dataset prep

# Download the text corpus and decode it as UTF-8.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The `with` block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(url) as response:
    data = response.read().decode("utf-8")
print(f"Dataset length: {len(data):,} characters")
print(data[:500])  #to check few lines


Dataset length: 1,115,394 characters
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [None]:
#tokenization step
from torch.utils.data import Dataset

# Hyperparameter container: batch size B and block size N, both 128 by default.
class Config:
    """Stores the batch size and the block size (characters per training window)."""

    def __init__(self, batch_size=128, block_size=128):
        # Keep both values as plain attributes so other code can read them directly.
        self.batch_size, self.block_size = batch_size, block_size

class CharDataset(Dataset):
    """
    Character-level dataset that serves (input, target) windows of indices.

    Each item is a pair of ``block_size``-long ``torch.long`` tensors cut from
    ``data``; the target is the input shifted one character to the right.

    Adapted from "https://github.com/karpathy/minGPT".

    Args:
        config: object exposing ``block_size`` and ``batch_size`` attributes.
        data: the raw text to cut windows from.
        chars: optional iterable of characters defining the vocabulary. When
            omitted, the vocabulary is the sorted set of characters found in
            ``data``. Pass the training vocabulary (e.g. ``train.stoi``) when
            building a validation/test split so that every split maps a given
            character to the same index.

    Raises:
        ValueError: if ``chars`` is given and ``data`` contains a character
            that is not part of it.
    """

    def __init__(self, config, data, chars=None):

        self.data = data
        self.block_size = config.block_size
        self.batch_size = config.batch_size
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        if chars is None:
            # default: derive the vocabulary from the characters in the input data
            vocab = set(self.data)
        else:
            vocab = set(chars)
            unknown = set(self.data) - vocab
            if unknown:
                raise ValueError(
                    f"data contains characters missing from the supplied vocabulary: {sorted(unknown)!r}"
                )
        chars = sorted(vocab)
        self.stoi = { ch:i for i,ch in enumerate(chars) } # map characters to integer indices
        self.itos = { i:ch for i,ch in enumerate(chars) }  # map integer indices to characters
        self.vocab_size = len(chars)
        print(f"Vocab size: {self.vocab_size} unique characters")


    def get_vocab_size(self):
        """Return the number of distinct characters in the vocabulary."""
        return self.vocab_size

    def __len__(self):
        """Return how many (x, y) training samples can be cut from the text.

        Every sample needs ``block_size + 1`` consecutive characters, so the
        count is ``len(data) - block_size``; it is clamped to 0 for text that
        is too short, because ``len()`` must never be negative.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair as tensors on ``self.device``.

        Negative indices count from the end. Raises ``IndexError`` when ``idx``
        is out of range, rather than silently returning a truncated window.
        """
        # int() also accepts 0-dim tensors and avoids mutating the caller's index
        idx = int(idx)
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError("CharDataset index out of range")

        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        chunk = self.data[idx : idx + self.block_size + 1]
        encoded = torch.tensor([self.stoi[c] for c in chunk], dtype=torch.long)
        x = encoded[:-1]
        y = encoded[1:]
        return x.to(self.device), y.to(self.device)

In [6]:
#to verify tokenization part

#for training and validation-train set and validation set
n = int(0.9 * len(data))
train_text = data[:n]
val_text   = data[n:]

cfg = Config(batch_size=128, block_size=128)
train_dataset = CharDataset(cfg, train_text)
val_dataset   = CharDataset(cfg, val_text)

# Each CharDataset builds its vocabulary from its own text, so the validation
# split (fewer unique characters) would otherwise assign different integer ids
# to the same characters. Reuse the training mapping for validation so that
# both splits encode text identically.
unseen_chars = set(val_text) - set(train_dataset.stoi)
assert not unseen_chars, f"validation text has characters unseen in training: {sorted(unseen_chars)!r}"
val_dataset.stoi = train_dataset.stoi
val_dataset.itos = train_dataset.itos
val_dataset.vocab_size = train_dataset.vocab_size

# Sanity check: one sample, with the target being the input shifted by one character.
x, y = train_dataset[0]
print("x shape:", x.shape)
print("y shape:", y.shape)

itos = train_dataset.itos
print("Input  (x):", ''.join(itos[i] for i in x[:60].tolist()))
print("Target (y):", ''.join(itos[i] for i in y[:60].tolist()))


#to simulate a batch
def get_batch(dataset, batch_size=cfg.batch_size):
    """Draw ``batch_size`` random samples from ``dataset`` and stack them.

    Args:
        dataset: an indexable dataset whose items are ``(x, y)`` tensor pairs
            and which supports ``len()``.
        batch_size: number of samples to draw (with replacement).

    Returns:
        A tuple ``(x, y)`` of tensors, each stacking the corresponding
        element of the drawn samples along a new first dimension.
    """
    ix = torch.randint(len(dataset), (batch_size,))
    # Fetch every sample exactly once; indexing the dataset separately for x
    # and for y would encode (and transfer) each window twice.
    samples = [dataset[i] for i in ix.tolist()]
    x = torch.stack([sample[0] for sample in samples])
    y = torch.stack([sample[1] for sample in samples])
    return x, y

xb, yb = get_batch(train_dataset)
print("Batch shapes:", xb.shape, yb.shape)



Vocab size: 65 unique characters
Vocab size: 61 unique characters
x shape: torch.Size([128])
y shape: torch.Size([128])
Input  (x): First Citizen:
Before we proceed any further, hear me speak.
Target (y): irst Citizen:
Before we proceed any further, hear me speak.

Batch shapes: torch.Size([128, 128]) torch.Size([128, 128])
