In [43]:
import torch
from torchtext.data import Field, TabularDataset, BucketIterator
from mlg.settings import BASE_DIR
import tqdm
from torch import nn, optim
import torch.nn.functional as F

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

cleaned_data_path = f'{BASE_DIR}/data/subtitles/cleaned_test.txt'
data_path = f'{BASE_DIR}/data/subtitles'

batch_size = 16
seq_len = 25

pad_tkn = '~'
unk_tkn = '*'
eos_tkn = '\n'
init_tkn = '>'

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
TEXT = Field(sequential=True, tokenize=list, fix_length=seq_len, unk_token=unk_tkn, pad_first=False,
             pad_token=pad_tkn, eos_token=eos_tkn, init_token=init_tkn)

train_dataset, test_dataset = TabularDataset.splits(
    path=data_path,
    train='cleaned.txt', test='cleaned_test.txt',
    format='csv',
    skip_header=False,
    fields=[("text", TEXT)])

TEXT.build_vocab(train_dataset)
vocab_size = len(TEXT.vocab.itos)

train_iter, test_iter = BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_sizes=(batch_size, batch_size),
    device=device,
    sort_key=lambda txt: len(txt.text),
    sort_within_batch=False,
    repeat=True
)



In [6]:
sample = None
for x in test_iter:
    sample = x.text
    break
sample



tensor([[ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2],
        [15,  8, 10, 40,  8,  8,  7, 10, 10, 10, 12, 12, 12, 12, 10, 18],
        [12, 12,  7, 37, 12, 20, 12,  7,  7,  7,  5,  5,  5,  5,  7,  7],
        [ 3,  3,  3,  3,  3,  3,  3, 30, 30, 16, 18, 18, 18, 18, 20, 15],
        [ 1,  1,  1,  1,  1,  1,  1,  3,  3,  3,  3,  3,  3,  3,  3,  3],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 1,  1,  1,  1,  1,  1,  1,  

In [46]:
class NextCharModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size

        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embed_size
        )

        self.rnn = nn.RNN(
            input_size=self.embed_size,
            hidden_size=self.hidden_size,
            nonlinearity='relu'
        )

        self.y = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, x):
        y = F.relu(self.embed(x))
        y, _ = self.rnn(y)
        return F.softmax(self.y(y), 2)

model = NextCharModel(vocab_size, 512, 512).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

In [51]:
epochs = 1
model.train()
for epoch in range(epochs):
    i = 0
    losses = 0
    for batch in train_iter:
        x_batch = batch.text
        y_batch = x_batch[1:]
        x_batch = x_batch[:-1]

        y_pred = model(x_batch)
        loss = loss_fn(y_pred.view((-1, vocab_size)), y_batch.flatten())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses += loss.item()

        i+=1
        if i % 50 == 0:
            print(losses)
            losses = 0



170.95746231079102
170.65003657341003
170.13964200019836
170.6318187713623
169.82480645179749
168.93178415298462
169.41843247413635
169.08238005638123
168.2544150352478
169.40612816810608
168.2716019153595
168.09404468536377
167.9272747039795
168.22805786132812
167.7569501399994
167.63531255722046
167.8082673549652
167.57655572891235
168.75072288513184
167.62026596069336
167.40412020683289
167.66362261772156
167.3826024532318
167.3676564693451
167.75201654434204
167.59945511817932
167.66371273994446
167.78736424446106


KeyboardInterrupt: 

In [78]:
def predict(sentence):
    terminal_chars = [eos_tkn, '\n', pad_tkn]
    max_len = 50
    next_char = 0
    model.eval()
    with torch.no_grad():
        while next_char not in terminal_chars and len(sentence) < max_len:
            seq = torch.LongTensor([TEXT.vocab[s] or TEXT.vocab[unk_tkn] for s in list(sentence.lower())]).view((-1, 1))
            preds = model(seq)
            m = int(preds[-1][0].argmax())
            next_char = TEXT.vocab.itos[m]
            sentence = sentence + next_char
    print(f'"{sentence}"')
    return sentence


test_sentence = "Hey, what's u"

predict(test_sentence)

"Hey, what's us to to to to to to to to to to to to"


TypeError: 'Vocab' object is not callable