In [4]:
!pip install torchtext==0.8.0
!pip freeze | grep torchtext

Collecting torchtext==0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/23/23/8499af6d9c22b29b01f66a2c11d38ce71cd1cafa2655913c29818ed4a00f/torchtext-0.8.0-cp36-cp36m-manylinux1_x86_64.whl (6.9MB)
[K     |████████████████████████████████| 6.9MB 10.9MB/s 
Installing collected packages: torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed torchtext-0.8.0
torchtext==0.8.0


In [5]:
!rm -rf *
!git config --global user.name "Akhilez"
!git config --global user.email "akhild18@yahoo.com"
!git clone https://github.com/Akhilez/ml_gallery.git
%cd ml_gallery/ml_py

Cloning into 'ml_gallery'...
remote: Enumerating objects: 787, done.[K
remote: Counting objects: 100% (787/787), done.[K
remote: Compressing objects: 100% (516/516), done.[K
remote: Total 3962 (delta 471), reused 553 (delta 256), pack-reused 3175[K
Receiving objects: 100% (3962/3962), 41.09 MiB | 40.81 MiB/s, done.
Resolving deltas: 100% (2428/2428), done.
/content/ml_gallery/ml_py


In [23]:
from google.colab import drive
drive.mount('/content/gdrive')
models_path = '/content/gdrive/MyDrive/Projects/ML/next_char'

Mounted at /content/gdrive


In [6]:
import os
os.environ['SECRET_KEY'] = '1234'
from mlg.settings import BASE_DIR
os.environ['BASE'] = BASE_DIR
%mkdir -p ${BASE}/data/subtitles
%mkdir -p ${BASE}/models
!wget -O ${BASE}/data/subtitles/cleaned.txt https://storage.googleapis.com/akhilez/datasets/marvel_subtitles/cleaned.txt
!wget -O ${BASE}/data/subtitles/cleaned_test.txt https://storage.googleapis.com/akhilez/datasets/marvel_subtitles/cleaned_test.txt

--2020-12-06 05:01:40--  https://storage.googleapis.com/akhilez/datasets/marvel_subtitles/cleaned.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.15.80, 172.253.122.128, 172.217.7.144, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.15.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 349823 (342K) [text/plain]
Saving to: ‘/content/ml_gallery/ml_py/data/subtitles/cleaned.txt’


2020-12-06 05:01:40 (152 MB/s) - ‘/content/ml_gallery/ml_py/data/subtitles/cleaned.txt’ saved [349823/349823]

--2020-12-06 05:01:40--  https://storage.googleapis.com/akhilez/datasets/marvel_subtitles/cleaned_test.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.5.240, 172.217.15.80, 172.253.63.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.5.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6726 (6.6K) [text/plain]
Saving to: ‘/content/ml_gallery/ml_py

In [7]:
import torch
from torchtext.data import Field, TabularDataset, BucketIterator
from mlg.settings import BASE_DIR
from tqdm import tqdm
from torch import nn, optim
import torch.nn.functional as F
from datetime import datetime

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cleaned_data_path = f'{BASE_DIR}/data/subtitles/cleaned_test.txt'
data_path = f'{BASE_DIR}/data/subtitles'

batch_size = 64
seq_len = 25

pad_tkn = '~'
unk_tkn = '*'
eos_tkn = '\n'
init_tkn = '>'

cuda


In [9]:
TEXT = Field(sequential=True, tokenize=list, fix_length=seq_len, unk_token=unk_tkn, pad_first=False,
             pad_token=pad_tkn, eos_token=eos_tkn, init_token=init_tkn)

train_dataset, test_dataset = TabularDataset.splits(
    path=data_path,
    train='cleaned.txt', test='cleaned_test.txt',
    format='csv',
    skip_header=False,
    fields=[("text", TEXT)])

TEXT.build_vocab(train_dataset)
vocab_size = len(TEXT.vocab.itos)

train_iter, test_iter = BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_sizes=(batch_size, batch_size),
    device=device,
    sort_key=lambda txt: len(txt.text),
    sort_within_batch=False,
    repeat=True
)




In [44]:
class NextCharModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size

        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embed_size
        )

        self.rnn = nn.RNN(
            input_size=self.embed_size,
            hidden_size=self.hidden_size,
            nonlinearity='relu'
        )

        self.y = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, x):
        y = F.relu(self.embed(x))
        y, _ = self.rnn(y)
        return F.softmax(self.y(y), 2)


In [45]:
def load_model(latest=True, name=None):
    if latest:
        model_name = max(os.listdir(models_path))
        model = NextCharModel(vocab_size, 512, 512)
        model.load_state_dict(torch.load(f'{models_path}/{model_name}')).to(device)
        return model

In [46]:
model = NextCharModel(vocab_size, 32, 128).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
epochs = 5
model.train()
for epoch in range(epochs):
    i = 0
    losses = 0
    print(f'Epoch: {epoch}')
    for batch in train_iter:
        x_batch = batch.text
        y_batch = x_batch[1:]
        x_batch = x_batch[:-1]

        y_pred = model(x_batch)
        loss = loss_fn(y_pred.view((-1, vocab_size)), y_batch.flatten())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses += loss.item()

        i+=1
        if i % 3000 == 0:
            print(i / 3000, losses / 3000)
            losses = 0

        if i % 30000 == 0:
            test_sentence = "Hey, wha"
            pred = predict(test_sentence)
            print(f'"{pred}"')
            model.train()

        if i % 300000 == 0:
            save_model(model, f'epoch{epoch}_batch{i}')

    save_model(model) 

Epoch: 0




1.0 3.5008288300832113
2.0 3.4522635898590086
3.0 3.4447192271550495
4.0 3.435407151858012
5.0 3.415821606953939
6.0 3.4096887852350872
7.0 3.40635427292188
8.0 3.405636769294739
9.0 3.4036244746049245
10.0 3.4031432440280915
"Hey, wha the te the te the te the te the te the te"
11.0 3.402107408285141
12.0 3.401642345905304
13.0 3.400917260567347
14.0 3.400035128513972
15.0 3.397051036755244


In [11]:

def predict(sentence):
    terminal_chars = [eos_tkn, '\n', pad_tkn]
    max_len = 50
    next_char = 0
    model.eval()
    with torch.no_grad():
        while next_char not in terminal_chars and len(sentence) < max_len:
            seq = torch.tensor([TEXT.vocab[s] or TEXT.vocab[unk_tkn] for s in list(sentence.lower())], device=device, dtype=torch.long).view((-1, 1))
            preds = model(seq)
            m = int(preds[-1][0].argmax())
            next_char = TEXT.vocab.itos[m]
            sentence = sentence + next_char
    return sentence


test_sentence = "Hey, what's u"

pred = predict(test_sentence)
print(f'"{pred}"')


"Hey, what's u~"


In [36]:
last_saved_model_path = ''
def save_model(model, message):
    last_saved_model_path = f'{models_path}/model_{int(datetime.now().timestamp())}{f"_{message}" if message else ""}.pt'
    torch.save(model.state_dict, last_saved_model_path)
save_model(model, message='test')