In [1]:
import torch
import torch.nn as nn
import torch.distributions as dist
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
from utils import ARDataset, AutoRegressiveNetwork, device
from data_rnn import load_toy
from tqdm import tqdm

def sample(lnprobs, temperature=1.0):
    """Draw one index from a vector of unnormalised log-probabilities.

    Args:
        lnprobs: 1-D tensor of scores; softmax is taken over dimension 0.
        temperature: Divides the scores before the softmax. A value of
            exactly ``0.0`` switches to greedy selection (argmax).

    Returns:
        A 0-d integer tensor holding the chosen index.
    """
    greedy = temperature == 0.0
    if greedy:
        return lnprobs.argmax()

    scaled_scores = lnprobs / temperature
    probabilities = F.softmax(scaled_scores, dim=0)
    categorical = dist.Categorical(probabilities)
    return categorical.sample()

def norm(model: nn.Module):
    """Return the global L2 norm of the gradients currently stored on ``model``.

    The result is the square root of the summed squared per-parameter
    gradient 2-norms, i.e. the norm of all gradients viewed as one vector.

    Args:
        model: Module whose parameters are inspected.

    Returns:
        The gradient norm as a Python float; ``0.0`` when no parameter
        holds a gradient.
    """
    squared_sum = 0.0
    for p in model.parameters():
        # Frozen parameters, or ones not reached by the last backward pass,
        # have no gradient; they contribute nothing instead of raising.
        if p.grad is None:
            continue
        squared_sum += p.grad.detach().norm(2).item() ** 2
    return squared_sum ** 0.5

In [2]:
# load_toy returns the training sequences together with a pair of vocabulary
# mappings; the pair is unpacked into i2w and w2i.
x_train, vocab_maps = load_toy(n=150_000)
i2w, w2i = vocab_maps

In [3]:
# Baseline run: unweighted cross-entropy, logged under runs/lang.
model = AutoRegressiveNetwork(w2i, emb=64, h=128).to(device)
# Only parameters that require gradients are handed to the optimizer.
trainable_params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = Adam(trainable_params, lr=3e-4, weight_decay=1e-4)
dl = ARDataset(x_train, w2i, bs=8, maxsize=300)
criterion = nn.CrossEntropyLoss()
sw = SummaryWriter('runs/lang')

In [4]:
# Baseline training: 10 epochs without gradient clipping.
for epoch in range(10):
    model.train()
    dl.shuffle()
    total_loss, c = 0, 0  # running loss sum and batch counter for this epoch
    for x, y in tqdm(dl.dataloader()):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        # Gradients are only cleared at the next zero_grad(), so they can
        # still be measured after the optimizer step.
        sw.add_scalar(f'Norm/epoch{epoch}', norm(model), c)
        total_loss += loss.item()
        c += 1
    # Mean of the per-batch losses seen in this epoch.
    epoch_loss = total_loss / c
    sw.add_scalar('Loss/train', epoch_loss, epoch)
    print(f'Epoch {epoch}, Train Loss: {epoch_loss:.2f}')
sw.flush()

2744it [00:51, 53.29it/s]


Epoch 0, Train Loss: 0.55


2744it [00:47, 57.25it/s]


Epoch 1, Train Loss: 0.35


2742it [00:43, 63.27it/s]


Epoch 2, Train Loss: 0.34


2745it [00:51, 53.81it/s]


Epoch 3, Train Loss: 0.34


2744it [00:58, 47.11it/s]


Epoch 4, Train Loss: 0.34


2745it [00:55, 49.57it/s]


Epoch 5, Train Loss: 0.34


2746it [00:54, 50.13it/s]


Epoch 6, Train Loss: 0.34


2744it [00:54, 50.18it/s]


Epoch 7, Train Loss: 0.34


2745it [00:53, 50.88it/s]


Epoch 8, Train Loss: 0.34


2742it [00:53, 50.83it/s]

Epoch 9, Train Loss: 0.34





In [5]:
# Second run: fresh model, per-class loss weights, logged under runs/lang_w_clip.
model = AutoRegressiveNetwork(w2i, emb=64, h=128).to(device)
optimizer = Adam([p for p in model.parameters() if p.requires_grad], lr=3e-4, weight_decay=1e-4)
dl = ARDataset(x_train, w2i, bs=8, maxsize=300)
# Weight 0 for the '.pad' class and 1 for every other class, so padded
# targets contribute zero loss. The weights are placed on the same `device`
# as the model rather than a hard-coded 'cuda', so the cell also runs on
# CPU-only machines.
class_weights = torch.tensor(
    [0 if i2w[i] == '.pad' else 1 for i in range(len(i2w))],
    dtype=torch.float,
).to(device)
# reduction='none' returns unreduced losses; the training loop reduces them.
criterion = nn.CrossEntropyLoss(weight=class_weights, reduction='none')
sw = SummaryWriter('runs/lang_w_clip')

In [6]:
# Training with gradient clipping: 30 epochs, gradient norm clipped to 0.3.
for epoch in range(30):
    model.train()
    dl.shuffle()
    total_loss, c = 0, 0  # running loss sum and batch counter for this epoch
    for x, y in tqdm(dl.dataloader()):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        # The criterion returns unreduced losses; average them into a scalar.
        per_element_loss = criterion(model(x), y)
        loss = per_element_loss.mean()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 0.3)
        optimizer.step()
        # norm(model) is read after clip_grad_norm_, so the logged value is
        # the post-clipping gradient norm.
        sw.add_scalar(f'Norm/epoch{epoch}', norm(model), c)
        total_loss += loss.item()
        c += 1
    # Mean of the per-batch losses seen in this epoch.
    epoch_loss = total_loss / c
    sw.add_scalar('Loss/train', epoch_loss, epoch)
    print(f'Epoch {epoch}, Train Loss: {epoch_loss:.2f}')
sw.flush()

2744it [01:08, 39.83it/s]


Epoch 0, Train Loss: 0.52


2745it [01:01, 44.71it/s]


Epoch 1, Train Loss: 0.34


2744it [00:51, 53.28it/s]


Epoch 2, Train Loss: 0.34


2742it [01:04, 42.54it/s]


Epoch 3, Train Loss: 0.34


2744it [01:04, 42.57it/s]


Epoch 4, Train Loss: 0.34


2745it [01:03, 42.95it/s]


Epoch 5, Train Loss: 0.33


2740it [01:00, 45.56it/s]


Epoch 6, Train Loss: 0.34


2743it [00:56, 48.21it/s]


Epoch 7, Train Loss: 0.33


2745it [00:57, 47.79it/s]


Epoch 8, Train Loss: 0.33


2742it [00:57, 47.72it/s]


Epoch 9, Train Loss: 0.33


2741it [00:57, 47.99it/s]


Epoch 10, Train Loss: 0.33


2744it [00:58, 47.09it/s]


Epoch 11, Train Loss: 0.33


2745it [00:57, 48.07it/s]


Epoch 12, Train Loss: 0.33


2746it [00:56, 48.91it/s]


Epoch 13, Train Loss: 0.33


2745it [00:53, 51.03it/s]


Epoch 14, Train Loss: 0.33


2742it [00:46, 59.40it/s]


Epoch 15, Train Loss: 0.33


2743it [00:46, 59.00it/s]


Epoch 16, Train Loss: 0.33


2745it [00:46, 58.80it/s]


Epoch 17, Train Loss: 0.33


2741it [00:55, 49.11it/s]


Epoch 18, Train Loss: 0.33


2743it [00:50, 54.54it/s]


Epoch 19, Train Loss: 0.33


2744it [00:48, 56.79it/s]


Epoch 20, Train Loss: 0.33


2744it [00:48, 56.31it/s]


Epoch 21, Train Loss: 0.33


2742it [00:45, 59.82it/s]


Epoch 22, Train Loss: 0.33


2743it [00:48, 56.99it/s]


Epoch 23, Train Loss: 0.33


2745it [00:47, 57.68it/s]


Epoch 24, Train Loss: 0.33


2743it [00:49, 55.74it/s]


Epoch 25, Train Loss: 0.33


2747it [00:46, 59.27it/s]


Epoch 26, Train Loss: 0.33


2743it [00:45, 60.57it/s]


Epoch 27, Train Loss: 0.33


2741it [00:45, 59.86it/s]


Epoch 28, Train Loss: 0.33


2741it [00:49, 55.78it/s]

Epoch 29, Train Loss: 0.33





In [7]:
def generate_seq(model, n=10, temperature=1.0, max_len=100):
    """Sample ``n`` sequences from ``model`` and print each on its own line.

    Every sequence starts with the '.start' token and is extended one sampled
    token at a time until '.end' is produced or the sequence holds
    ``max_len`` tokens. Tokens are joined without a separator for printing.

    Generation runs in eval mode with autograd disabled; the model's previous
    train/eval mode is restored afterwards.

    Args:
        model: Network called on a ``(1, len(seq))`` long tensor of token ids.
        n: Number of sequences to generate.
        temperature: Sampling temperature forwarded to ``sample``
            (``0.0`` means greedy decoding).
        max_len: Maximum number of tokens per sequence, including '.start'.
    """
    start_id, end_id = w2i['.start'], w2i['.end']
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for sampling; avoid building autograd graphs.
        with torch.no_grad():
            for _ in range(n):
                seq = [start_id]
                while seq[-1] != end_id and len(seq) < max_len:
                    inputs = torch.tensor([seq], dtype=torch.long, device=device)
                    # NOTE(review): assumes the last row of the model output
                    # holds the next-token scores - confirm against the model.
                    next_token = sample(model(inputs)[-1, :], temperature)
                    # Store plain ints so comparisons and list indexing do
                    # not operate on tensors.
                    seq.append(int(next_token))
                print('\t', ''.join([i2w[i] for i in seq]))
    finally:
        model.train(was_training)

generate_seq(model)

	 .startthe mouse ran while the person walks ( while a man runs ( with a nice woman ) ).end
	 .startthe woman goes but a short mouse runs.end
	 .starta mouse runs quickly.end
	 .startthe gorgeous man runs while the mouse walkes.end
	 .starta cat runs to the quick woman.end
	 .starttce bunny runs ( while a quick man walks ( while a nice bunny ran on a bunny ) ).end
	 .starta short cat runs while the gorgeous person walked quickly.end
	 .startthe busy mouse walks to a short mouse.end
	 .startthe person ran but a mouse walks ( with a nice dog ).end
	 .starta person walks ( but the busy bunny runs ( while the cat runs while the woman walks ( to the short 
