Experimentin with text file

In [5]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(len(text))




252022


In [6]:
print(text[:200])

The Project Gutenberg eBook of Dorothy and the Wizard in Oz
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no rest


In [7]:
chars = sorted(set(text))
print(chars)
vocabulary_size = len(chars)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '•', '™', '\ufeff']


In [8]:
print(len(chars))

92


## character level Tokenizer

In [9]:
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

print(encode('hello'))

[65, 62, 69, 69, 72]


In [10]:
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


Types of tokenizer = word level character level subword tokenizer

# Tensor Instead of Arrays

In [11]:
import torch

In [12]:
data = torch.tensor(encode(text), dtype = torch.long)
print(data[:100])

tensor([91, 48, 65, 62,  1, 44, 75, 72, 67, 62, 60, 77,  1, 35, 78, 77, 62, 71,
        59, 62, 75, 64,  1, 62, 30, 72, 72, 68,  1, 72, 63,  1, 32, 72, 75, 72,
        77, 65, 82,  1, 58, 71, 61,  1, 77, 65, 62,  1, 51, 66, 83, 58, 75, 61,
         1, 66, 71,  1, 43, 83,  0,  1,  1,  1,  1,  0, 48, 65, 66, 76,  1, 62,
        59, 72, 72, 68,  1, 66, 76,  1, 63, 72, 75,  1, 77, 65, 62,  1, 78, 76,
        62,  1, 72, 63,  1, 58, 71, 82, 72, 71])


## Train and Validation

Train = 80%  Validation = 20%

In [13]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[:n]

Input and Targets Implementations

In [14]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[1]
    print('when input is', context, 'target is', target)

when input is tensor([91]) target is tensor(65)
when input is tensor([91, 48]) target is tensor(65)
when input is tensor([91, 48, 65]) target is tensor(65)
when input is tensor([91, 48, 65, 62]) target is tensor(65)
when input is tensor([91, 48, 65, 62,  1]) target is tensor(65)
when input is tensor([91, 48, 65, 62,  1, 44]) target is tensor(65)
when input is tensor([91, 48, 65, 62,  1, 44, 75]) target is tensor(65)
when input is tensor([91, 48, 65, 62,  1, 44, 75, 72]) target is tensor(65)


switch from CPU to cuda

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4

cpu


In [2]:
import torch.nn as nn



In [3]:
class CharModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden


In [20]:
chars = sorted(set(text))
vocab_size = len(chars)


In [21]:
print("Vocabulary Size:", vocab_size)


Vocabulary Size: 92


In [23]:
import torch.optim as optim


In [24]:
embed_size = 64
hidden_size = 128
num_layers = 2
batch_size = 4
block_size = 8
epochs = 10
learning_rate = 0.001

model = CharModel(vocab_size, embed_size, hidden_size, num_layers).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

def get_batch(split):
    data = train_data if split == 'train' else val_data
    idx = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in idx]).to(device)
    y = torch.stack([data[i+1:i+block_size+1] for i in idx]).to(device)
    return x, y

In [26]:
for epoch in range(epochs):
    model.train()
    for i in range(1000):  
        x, y = get_batch('train')
        optimizer.zero_grad()
        
        hidden = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
                  torch.zeros(num_layers, batch_size, hidden_size).to(device))  

        output, hidden = model(x, hidden)  
        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Epoch 1, Loss: 2.2120585441589355
Epoch 2, Loss: 2.3547379970550537
Epoch 3, Loss: 2.0613508224487305
Epoch 4, Loss: 1.9459351301193237
Epoch 5, Loss: 2.18825101852417
Epoch 6, Loss: 2.267282485961914
Epoch 7, Loss: 1.9814447164535522
Epoch 8, Loss: 1.4452241659164429
Epoch 9, Loss: 1.5366137027740479
Epoch 10, Loss: 1.9444795846939087


In [27]:
model.eval()
def generate_text(start_string, length=200):
    input_eval = torch.tensor([string_to_int[c] for c in start_string], dtype=torch.long).unsqueeze(0).to(device)
    hidden = (torch.zeros(num_layers, 1, hidden_size).to(device),
              torch.zeros(num_layers, 1, hidden_size).to(device))
    
    result = start_string
    for _ in range(length):
        output, hidden = model(input_eval, hidden)
        next_char = torch.argmax(output[:, -1, :], dim=-1).item()
        result += int_to_string[next_char]
        input_eval = torch.tensor([[next_char]], dtype=torch.long).to(device)
    return result

print(generate_text("Once upon a time"))


Once upon a time the child the child the child the child the child the child the child the child the child the child the child the child the child the child the child the child the child the child the child the child
