In [65]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [3]:
dataset = open('dataset.txt', 'r', encoding='utf-8').read()
print(len(dataset))
print(dataset[:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Find tokens

chars = sorted(set(dataset))
dictionary = ''.join(chars)
print(f"Dataset character level dictionary is: {dictionary}")
print(f"Length of the dictionary is: {len(dictionary)}")

Dataset character level dictionary is: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length of the dictionary is: 65


In [5]:
# Tokenizer
char_id = {s:i for i, s in enumerate(dictionary)}
id_char = {i:s for i, s in enumerate(char_id)}
encoder = lambda s: [char_id[c] for c in s]
decode = lambda i: ''.join([id_char[c] for c in i])

In [12]:
encoded_dataset = torch.tensor(encoder(dataset), dtype=torch.long)
print(encoded_dataset.shape, encoded_dataset.dtype)
print(encoded_dataset[:100])


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [21]:
# Split
train_length = int(0.9 * len(encoded_dataset))
print(train_length)
train_dataset = encoded_dataset[:train_length]
validation_dataset = encoded_dataset[train_length:]
print(train_dataset.shape)
print(validation_dataset.shape)

1003854
torch.Size([1003854])
torch.Size([111540])


In [49]:
SEQUENCE_LENGTH = 8
EMBEDDING_DIM = 8
BATCH_SIZE = 16
VOCAB_SIZE = len(dictionary)

In [47]:
def make_data(data):
    x = []
    y = []
    d = data.tolist()
    for i in range(len(d) // SEQUENCE_LENGTH):
        x.append(d[i*8 : (i+1)*8])
        y.append(d[i*8 +1 : (i+1)*8 + 1])
    #return torch.tensor(x), torch.tensor(y)
    return torch.tensor(x).reshape(-1, BATCH_SIZE, SEQUENCE_LENGTH), torch.tensor(y).reshape(-1, BATCH_SIZE, SEQUENCE_LENGTH)

X_train, y_train = make_data(encoded_dataset)
print(f"Size of the training data is: {X_train.shape}")
print(f"Size of the training label is: {y_train.shape}")

Size of the training data is: torch.Size([8714, 16, 8])
Size of the training label is: torch.Size([8714, 16, 8])


In [66]:
# make bigram model

class BiGram(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x):
        return self.token_embedding_table(x)
    
    def calc_loss(self, x, target):
        logits = self(x)
        F.cross_entropy(logits, target)
    
bigram_model = BiGram(VOCAB_SIZE)
# logits = bigram_model((X_train[0]))
# bigram_model.calc_loss(X_train[0], y_train[0])


RuntimeError: Expected target size [16, 65], got [16, 8]

In [23]:
# train bigarm