In [2]:
#imports
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import trange

In [108]:
# hyperparameters
n_embd = 64        # embedding width used by the attention / feed-forward modules
n_blocks = 4
block_size = 128   # context length: number of tokens in each training sequence
batch_size = 32    # number of sequences sampled per batch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 100   # batches averaged per split when estimating the loss
dropout = 0.1

# Seeding is intentionally left disabled; uncomment for reproducible runs.
# torch.manual_seed(1337)

# get the data (the context manager guarantees the file handle is closed)
with open('../data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(set(text))

# character <-> integer lookup tables
stoi = {s: i for i, s in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they encode."""
    return ''.join(itos[i] for i in l)


# Train and test splits: first 90% of the tokens for training, the rest for validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # one token per character, so len(data) == len(text)
train_data = data[:n]
val_data = data[n:]


# Data loading: get_batch(split) returns a batch of inputs x and targets y
def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        Tuple `(x, y)` of tensors of shape `(batch_size, block_size)` moved to
        `device`. `y` holds the same windows as `x` shifted one position ahead.
    """
    data = train_data if split == 'train' else val_data
    # `size` must be a tuple: `(batch_size)` is a plain int and makes
    # torch.randint raise a TypeError.
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+1+block_size] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# estimate_loss(): runs under @torch.no_grad(); switches to model.eval() for evaluation and back to model.train() afterwards
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, `eval_iters` batches are drawn with `get_batch` and the
    loss returned by `model(X, Y)` is averaged. The model is put in eval mode
    for the measurement and switched back to train mode afterwards.

    Returns:
        Dict mapping 'train' and 'val' to a scalar tensor with the mean loss.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore training mode even if evaluation fails part-way through.
        model.train()
    return out

# def Head module class
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: Output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()

        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular mask kept as a buffer (module state, not a parameter).
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # calculate attention scores ("affinities"), scaled by 1/sqrt(head_size)
        # (the key/query width) rather than by the embedding width
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
        # Slice the mask to the actual sequence length so inputs shorter than
        # block_size work; positions after the current one are blocked.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, head_size)
        return out
        
        

# def multihead attention module class
class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel.

    Args:
        num_heads: Number of `Head` modules run side by side.
        head_size: Output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs have width num_heads * head_size; this
        # equals n_embd in the usual configuration but is not required to.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x`, concatenate on the last dim, project, apply dropout.

        Returns:
            Tensor whose last dimension is n_embd.
        """
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # These are submodules, so they must be reached through `self`; the bare
        # names are undefined (`proj`) or the global float rate (`dropout`).
        out = self.proj(out)
        out = self.dropout(out)
        return out
        
    
# def feedforward module class
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4 * n_embd, ReLU, project back to n_embd, dropout.

    Args:
        n_embd: Width of the input and output features.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Pass `x` through the layer stack and return the result."""
        transformed = self.net(x)
        return transformed

# def block module class

# def n-gram model class

# instantiate n-gram model : set it to device

# print the number of parameters in the model


# create a PyTorch optimizer

# for loop to train the model


tensor([[57,  1, 58,  ..., 58, 46,  1],
        [31, 46, 39,  ..., 56, 43, 57],
        [56,  0, 27,  ...,  1, 45, 53],
        ...,
        [53, 51, 43,  ..., 45, 43, 52],
        [39, 50,  1,  ..., 58,  6,  0],
        [43,  6,  1,  ..., 52,  6,  0]], device='cuda:0')

In [7]:
# estimate loss()


In [8]:
# generate from the model


In [None]:
# # hyperparameters
# batch_size = 32
# block_size = 128
# max_iters = 5000
# eval_interval = 100
# learning_rate = 1e-3
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embd = 128
# n_head = 4
# n_layer = 8
# dropout = 0.1
# #-----------