In [246]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import pickle 
import random 
from collections import Counter
import matplotlib.pyplot as plt

random.seed(0)
torch.manual_seed(220);

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'{device = }')

device = 'cuda'


In [3]:
with open('words_250000_train.txt', 'r', encoding='utf-8') as f:
    text = f.read().splitlines()

In [5]:
random.shuffle(text)
text[:10]

['thyreohyal',
 'verdancy',
 'nonbolshevism',
 'norml',
 'consecrating',
 'ginhound',
 'yaffed',
 'beats',
 'squarechinned',
 'skiagraphic']

In [6]:
maxlen = max(len(words) for words in text)
print(f'{maxlen = }')

maxlen = 29


In [7]:
charset = sorted(list(set(''.join(text))))
stoi = {c:idx+1 for idx,c in enumerate(charset)}
itos = {idx+1:c for idx,c in enumerate(charset)}
stoi['.'] = 0
itos[0] = '.'
encode = lambda x: [stoi[i] for i in x] 
decode = lambda x: ''.join([itos[i] for i in x])
vocab_size = len(stoi)
print(f'{vocab_size = }') 

vocab_size = 27


In [8]:
word = random.sample(text, 1)[0]
print(f'{word = }')

word = 'subgape'


In [9]:
encoded = encode(word)
print(f'{encoded = }')

encoded = [19, 21, 2, 7, 1, 16, 5]


In [10]:
decoded = decode(encoded)
print(f'{decoded = }')

decoded = 'subgape'


In [184]:
def build_dataset(words):
    x = []
    y = []
    for word in words:
        encoded = np.array(encode(word))
        input = np.zeros_like(encoded)
        x.append(input) 
        sorted_letter_count = Counter(word).most_common()
        g = [0]*26
        prev = x[-1].copy()
        while not np.all(prev != 0):
            prev = x[-1].copy()
            for letter, _ in sorted_letter_count:
                idx = ord(letter) - ord('a')
                if g[idx] != 1:
                    g[idx] = 1
                    g_in = stoi[letter]
                    break
            y.append(g_in)
            idxs = np.where(encoded == g_in)[0]
            prev[idxs] = g_in 
            if np.any(prev == 0):
                x.append(prev)
            
    # adding paddding to every input 
    return x , y 

random.shuffle(text)
n1 = int(0.9*len(text))
n2 = int(0.95*len(text))

Xtr, Ytr = build_dataset(text[:n1])
Xval, Yval = build_dataset(text[n1:n2])
Xts, Yts  = build_dataset(text[n2:])

# Xtr, Ytr = load_split('train')
# Xval, Yval = load_split('val')
# Xts, Yts = load_split('test')

In [186]:
print(f'Train Shape : {len(Xtr)} {len(Ytr)}')
print(f'Val Shape : {len(Xval)} {len(Yval)}')
print(f'Test Shape : {len(Xts)} {len(Yts)}')

Train Shape : 1513133 1513133
Val Shape : 83936 83936
Test Shape : 84140 84140


In [228]:
# hyperparemters 
learning_rate = 1e-3
n_embd = 32
hidden_size = 128
epochs = 1000

In [191]:
def get_batch(split):
    data  = {'train': (Xtr, Ytr),
            'val' : (Xval, Yval)}[split]
    ix = np.random.randint(0, len(Xtr), (32,))
    xb, yb = Xtr[ix], Ytr[ix]
    return xb, yb


@torch.no_grad()
def split_loss():
    model.eval()
    out = {}
    for split in ['train','val']:
        losses = torch.zeros(200)
        for k in range(200):
            xb, yb = get_batch(split)
            logits, loss = model(xb,yb)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [328]:
class LSTM(nn.Module):
    def __init__(self, n_embd, hidden_size, vocab_size):
        super().__init__()
        self.hidden_size = hidden_size 
        self.tok_embeddings = nn.Embedding(vocab_size, n_embd)
        self.i2h = nn.Linear(n_embd + hidden_size, hidden_size)
        self.i2m = nn.Linear(n_embd + hidden_size, hidden_size)
        self.u_gate = nn.Linear(n_embd + hidden_size, 1)
        self.f_gate = nn.Linear(n_embd + hidden_size, 1)
        self.o_gate = nn.Linear(n_embd + hidden_size, 1)
        self.i2o = nn.Linear(n_embd + hidden_size, vocab_size)

    def forward(self, x_t, hidden_state, memory_state):
        x = self.tok_embeddings(x_t).view(1,-1)
        x = torch.cat((hidden_state, x), dim=-1)
        memory_update = torch.tanh(self.i2m(x))
        update = torch.sigmoid(self.u_gate(x))
        forget = torch.sigmoid(self.f_gate(x))
        output = torch.sigmoid(self.o_gate(x))
        memory_state = update * memory_update + forget * memory_state
        hidden_state = output * torch.tanh(memory_state)
        y_pred = self.i2o(x)
        return y_pred, hidden_state, memory_state

    def init_hidden_memory(self):
        return torch.zeros(1, self.hidden_size), torch.zeros(1, self.hidden_size)

    def guessletter(self, context, h, memory_stat, guessed_letters):
        for t in range(context.size()[0]):
            y_pred, h, m = lstm(context[t], h, m)
        
        probs = F.softmax(y_pred, dim=1)
        ix = torch.multinomial(probs, num_samples=1).item()
        while guessed_letters[ix-2] == 1:
            ix = torch.multinomial(probs, num_samples=1).item()

        return ix
        

In [334]:
lstm = LSTM(n_embd, hidden_size, vocab_size)
# model = lstm"
optimizer = torch.optim.AdamW(lstm.parameters(), lr=1e-3)
lossi = []

In [335]:
num_params = sum(p.numel() for p in lstm.parameters(recurse=True))
num_params

46910

In [337]:
for step in range(10):
    h, m = lstm.init_hidden_memory()
    loss = 0
    for x, y in zip(Xtr[:10000], Ytr):
        x = torch.tensor(x)
        y = torch.tensor(y)
        seq_len = len(x)
        for t in range(seq_len):
            y_pred, h, m = lstm(x[t], h, m)
            
        probs = F.softmax(y_pred, dim=1)
        loss += -probs[:, y].log10()
    print(f'{i} / {10} : {loss.item()}')
    lossi.append(loss.item())
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


KeyboardInterrupt



In [None]:
plt.plot(lossi)

In [318]:
def play(word):
    context = [0] * len(word)
    encoded = encode(word)
    chances = 12
    guessed_letters = [0]*26 
    context = torch.tensor(context).to(device)
    while chances > 0:
        ix = model.guessletter(context, h, m, guessed_letters)
        guessed_letters[ix-2] = 1 
        idx = np.where(np.array(encoded) == ix)[0]
        if idx.size != 0:
            context[idx] = ix
        else:
            chances -= 1
        w = np.array(context).cpu()
        print(f'Guessed --- {itos[ix]} for Context {decode(w)}') 
        if decode(w) == word:
            break

In [307]:
play('football')

Guessed --- o for Context .oo.....
Guessed --- r for Context .oo.....
Guessed --- r for Context .oo.....
Guessed --- s for Context .oo.....
Guessed --- r for Context .oo.....
Guessed --- i for Context .oo.....
Guessed --- o for Context .oo.....
Guessed --- r for Context .oo.....
Guessed --- i for Context .oo.....
Guessed --- m for Context .oo.....
Guessed --- r for Context .oo.....
Guessed --- t for Context .oot....
Guessed --- o for Context .oot....
Guessed --- e for Context .oot....
Guessed --- p for Context .oot....
Guessed --- m for Context .oot....


In [308]:
import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

# Sample sequences with variable lengths
sequences = [torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6, 7, 8, 9])]

# Pad sequences
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

# Sort sequences by length
sorted_lengths, sorted_indices = torch.sort(torch.tensor([len(seq) for seq in sequences]), descending=True)
sorted_sequences = padded_sequences[sorted_indices]

# Pack padded sequences
packed_sequences = pack_padded_sequence(sorted_sequences, lengths=sorted_lengths, batch_first=True)

# Now 'packed_sequences' can be used in a model
