In [113]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, DistributedSampler

In [21]:
# Location of the corpus, resolved relative to the parent of the current
# working directory. NOTE: despite its name, `data_dir` holds the path of the
# text file itself, not of a directory.
data_dir = os.path.join(
    os.path.dirname(os.getcwd()),
    "Data/Tiny shakespeare/input.txt",
)

In [22]:
# Read the entire corpus into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (which differs between operating systems).
with open(data_dir, "r", encoding="utf-8") as f:
    text = f.read()

In [102]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(text))
# Derived from `vocab` directly instead of rebuilding the sorted set a second time.
vocab_size = len(vocab)

# Hyperparameters
batch_size = 4  # B: number of sequences drawn per batch
max_tokens = 10  # T: number of tokens in each sequence
emb_dim = 32  # C: size of each token embedding vector

# Select the compute device: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    # `torch.has_mps` is deprecated and only says PyTorch was *built* with MPS;
    # `is_available()` also checks that this machine can actually use it.
    device = "mps"
else:
    device = "cpu"


In [103]:
# Lookup tables between characters and integer ids; a character's id is its
# position in `vocab`.
token_encodings = {token: i for i, token in enumerate(vocab)}  # char -> id
token_decodings = {i: token for i, token in enumerate(vocab)}  # id -> char

In [112]:
def encode(txt):
    """Translate a string into a list of integer token ids.

    Each character of ``txt`` is looked up in ``token_encodings``; the ids are
    returned in the same order as the characters.

    Raises:
        KeyError: if ``txt`` contains a character missing from ``token_encodings``.
    """
    ids = []
    for ch in txt:
        ids.append(token_encodings[ch])
    return ids

def decode(enc_tokens):
    """Translate an iterable of token ids back into characters.

    Each id is looked up in ``token_decodings``. The result is a *list* of
    single characters in input order, not a joined string; callers that want
    a string must join it themselves.

    Raises:
        KeyError: if an id is not a key of ``token_decodings``.
    """
    chars = []
    for token_id in enc_tokens:
        chars.append(token_decodings[token_id])
    return chars

def generate_batch(batch_size):
    """Sample a random batch of (input, target) sequences from the corpus.

    ``batch_size`` start offsets are drawn uniformly over every position of
    ``text`` that leaves room for a full window. For each offset the input is
    the next ``max_tokens`` characters and the target is the same span shifted
    one character to the right, both encoded with ``encode``.

    Args:
        batch_size: number of sequences to draw.

    Returns:
        A ``(data, targets)`` pair of integer tensors on ``device``, each of
        shape ``(batch_size, max_tokens)``.

    Raises:
        ValueError: if ``text`` is too short to hold a single window.
    """
    # A window is max_tokens input characters plus one extra character so the
    # targets can be the inputs shifted by one.
    window = max_tokens + 1

    # Offsets must range over the *corpus* length. Bounding them by vocab_size
    # (as before) restricted every batch to the first few dozen characters.
    n_starts = len(text) - window + 1
    if n_starts <= 0:
        raise ValueError(
            f"text has {len(text)} characters; need at least {window} to build a batch"
        )

    # torch.randint's upper bound is exclusive, so the last valid start offset
    # (n_starts - 1) is included. tolist() yields plain ints for slicing.
    starts = torch.randint(0, n_starts, (batch_size,)).tolist()

    # Encode each window once and derive both views from it.
    windows = [encode(text[s : s + window]) for s in starts]
    data = torch.tensor([w[:-1] for w in windows], device=device)
    targets = torch.tensor([w[1:] for w in windows], device=device)
    return data, targets

# Embedding table with one `emb_dim`-sized vector per vocabulary entry,
# created on `device`.
token_emb_table = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=emb_dim,
    device=device,
)

In [111]:
# Draw one batch: `data` holds the input sequences, `targets` the same
# sequences shifted one character ahead.
data, targets = generate_batch(batch_size=batch_size)