In [2]:
import os
import random

import torch
import torch.nn as nn
from torch import optim

from models.persona_extractor import PersonaExtractor
from dataset.msc_summary import MSC_Turns, extra_tokens
from dataset.vocab import Vocab, PAD_TOKEN, START_TOKEN


In [2]:
# Report whether the MPS backend can be used on this machine. Two checks, printed in order:
#   is_available -> False when macOS is older than 12.3 and/or no MPS-capable device is present
#   is_built     -> False when this PyTorch installation was built without MPS support
mps_checks = (
    torch.backends.mps.is_available,
    torch.backends.mps.is_built,
)
for mps_check in mps_checks:
    print(mps_check())

True
True


In [5]:
# Smoke test: one forward and backward pass of a tiny linear model on the "mps" device.
# NOTE: `model`, `criterion` and `loss` are throwaway values here; later cells rebind
# the same names for the real model, loss function and training loss.
X = torch.rand(16, 10).to("mps")
y = torch.rand(16, 1).to("mps")
model = nn.Linear(10, 1).to("mps")
criterion = nn.L1Loss()  # alternative that was tried: nn.KLDivLoss()
prediction = model(X)
loss = criterion(prediction, y)
loss.backward()

In [3]:
# Run configuration: every tunable value used by the cells below lives in this dict.
args = {
    # Root data directory. Set the PEX_DATADIR environment variable to point at your own
    # copy of the data; the fallback is the original author's local checkout.
    # os.path.join(..., '') guarantees the trailing separator that the later
    # `args['datadir'] + args['traindata']` concatenation relies on.
    'datadir': os.path.join(
        os.environ.get('PEX_DATADIR', '/Users/FrankVerhoef/Programming/PEX/data/'), ''
    ),
    'traindata': 'msc/msc_personasummary/session_1/train.txt',  # relative to 'datadir'
    'train_samples': 1000,   # passed to the dataset as max_samples
    'vocab_size': 2000,      # passed to the vocabulary as max_tokens
    'embedding_size': 16,
    'hidden_size': 32,
    # NOTE(review): 'cpu' looks like a device name rather than an aggregation method --
    # confirm against the values the encoder actually accepts.
    'aggregate_method': 'cpu',
    'encoder': 'lstm',       # also selects the decoder hidden size ("mean", "lstm", "bilstm", "poolbilstm")
    'decoder': 'lstm',
    'device': 'mps',         # "mps" and "cuda" are checked for availability before training
    'batch_size': 8,
    'learning_rate': 0.01,
    'epochs': 1,
    'log_interval': 10       # number of training steps between loss reports
}

In [4]:
# --- Vocabulary and training data ---------------------------------------------
# The dataset is handed the vocab's text2vec method up front; the vocabulary itself
# is only filled afterwards, from the dataset's corpus, and then capped.
vocab = Vocab()
train_path = args['datadir'] + args['traindata']
traindata = MSC_Turns(train_path, vocab.text2vec, len_context=2, max_samples=args['train_samples'])
vocab.add_special_tokens(extra_tokens)
vocab.add_to_vocab(traindata.corpus())
vocab.cut_vocab(max_tokens=args['vocab_size'])

# --- Model ---------------------------------------------------------------------
# The vocabulary size is read only after cut_vocab, so it reflects the final vocabulary.
num_tokens = len(vocab)
encoder_opts = dict(
    input_size=num_tokens,
    embedding_size=args['embedding_size'],
    hidden_size=args['hidden_size'],
    aggregate_method=args['aggregate_method'],
)
# The decoder's hidden size is chosen per encoder type -- presumably so that it matches
# the width of the state the encoder hands over (TODO confirm in PersonaExtractor).
decoder_hidden_sizes = {
    "mean": args['embedding_size'],
    "lstm": args['hidden_size'],
    "bilstm": args['hidden_size'] * 2,
    "poolbilstm": args['hidden_size'] * 2,
}
decoder_opts = dict(
    input_size=num_tokens,
    embedding_size=args['embedding_size'],
    hidden_size=decoder_hidden_sizes[args['encoder']],
    output_size=num_tokens,
)
model = PersonaExtractor(
    args['encoder'], encoder_opts,
    args['decoder'], decoder_opts,
    start_token=vocab.tok2ind[START_TOKEN],
)

# --- Device sanity check ---------------------------------------------------------
# Fail here, with a readable message, when the requested accelerator cannot be used.
if args['device'] == "cuda":
    assert torch.cuda.is_available(), "Cuda not available"
elif args['device'] == "mps":
    assert torch.backends.mps.is_available(), "Device 'mps' not available"
    assert torch.backends.mps.is_built(), "PyTorch installation was not built with MPS activated"

# --- Training objects ------------------------------------------------------------
train_loader = torch.utils.data.DataLoader(
    dataset=traindata,
    batch_size=args['batch_size'],
    shuffle=True,
    collate_fn=traindata.batchify,
)
optimizer = optim.SGD(model.parameters(), lr=args['learning_rate'])
# Targets equal to the PAD token index are excluded from the loss.
criterion = nn.NLLLoss(ignore_index=vocab.tok2ind[PAD_TOKEN], reduction='mean')
device = args['device']


100%|██████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3913.19it/s]

Added 2930 tokens to vocabulary
Reduced vocab to 2000 tokens, covering 97.0% of corpus





In [5]:
# Move the model to the configured device. As the last expression of the cell, the
# returned module is also displayed, which gives a quick view of the architecture.
model.to(device)

PersonaExtractor(
  (encoder): UniLSTM(
    (embed): Embedding(2004, 16)
    (lstm): LSTM(16, 32, batch_first=True)
  )
  (decoder): LSTM(
    (embed): Embedding(2004, 16)
    (lstm): LSTM(16, 32, batch_first=True)
    (out): Linear(in_features=32, out_features=2004, bias=True)
    (softmax): LogSoftmax(dim=-1)
  )
)

In [7]:
# Manual dry run of the first half of a training step: fetch one batch, run the
# forward pass with teacher forcing and compute the loss. The backward pass and the
# optimizer update are issued from separate cells.
batch = next(iter(train_loader))
xs, ys, xs_len, ys_len = batch
xs, ys = xs.to(device), ys.to(device)
optimizer.zero_grad()

output = model(xs, xs_len, teacher_forcing=True, ys=ys)
# Dims 1 and 2 are swapped before the loss -- presumably (batch, seq, vocab) ->
# (batch, vocab, seq), the layout NLLLoss expects for sequence targets; TODO confirm.
loss = criterion(output.transpose(1, 2), ys)

In [8]:
# Show the loss tensor produced by the dry-run forward pass.
# NOTE(review): the stray `nonzero_finite_vals = ...` line in this cell's output looks
# like a truncated warning raised while formatting the MPS tensor -- confirm; printing
# loss.item() would presumably avoid it.
print(loss)

tensor(7.6256, device='mps:0', grad_fn=<NllLoss2DBackward0>)


  nonzero_finite_vals = torch.masked_select(


In [None]:
# Backpropagate the dry-run loss computed in the forward-pass cell.
loss.backward()


In [None]:
# Apply one SGD update using the gradients produced by the backward pass.
optimizer.step()

In [None]:
# Run the full training loop with the configured device, epoch count and log interval.
# NOTE(review): `train` is defined in a later cell of this notebook, so on a fresh
# kernel that cell has to be executed before this one.
# `train` returns the model object it was given, so `best_model` refers to the same
# object as `model`; `train_stats["train_loss"]` holds the per-step losses.
best_model, train_stats = train(
    model, train_loader, optimizer, criterion,
    device=args['device'], epochs=args['epochs'], log_interval=args['log_interval']
)


In [None]:
# Preview the first few training samples.
# `msc_turns` is not defined anywhere in this notebook (NameError on a fresh kernel);
# `traindata` is the MSC_Turns dataset built in the setup cell, so it is used instead.
# Each sample is indexed as a pair -- presumably (input turns, target summary); TODO confirm.
for i in range(min(5, len(traindata))):
    sample = traindata[i]
    print(sample[0])
    print(sample[1])
    print('-'*40)

In [None]:
def train_step(batch, model, optimizer, criterion, device):
    """Run a single optimisation step on one batch and return its loss.

    Args:
        batch: 4-tuple ``(xs, ys, xs_len, ys_len)``. ``xs`` and ``ys`` are moved to
            ``device``; ``xs_len`` is passed to the model as-is; ``ys_len`` is unused.
        model: callable invoked as ``model(xs, xs_len, teacher_forcing=True, ys=ys)``.
        optimizer: optimizer whose gradients are cleared before, and stepped after,
            the backward pass.
        criterion: loss function called with the model output (dims 1 and 2 swapped)
            and the targets.
        device: device the input and target tensors are moved to.

    Returns:
        The loss of this batch as a Python number (``loss.item()``).
    """
    inputs, targets, input_lengths, _ = batch
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()

    # Teacher forcing: the targets are also fed to the model during the forward pass.
    predictions = model(inputs, input_lengths, teacher_forcing=True, ys=targets)
    # Dims 1 and 2 are swapped before the loss -- presumably to move the class dimension
    # to the position the criterion expects; TODO confirm against the model's output layout.
    step_loss = criterion(predictions.transpose(1, 2), targets)

    step_loss.backward()
    optimizer.step()

    return step_loss.item()

def train(model, dataloader, optimizer, criterion, device="cpu", epochs=1, log_interval=1000, log_fn=None):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Every batch is handed to ``train_step`` and the loss it returns is recorded.
    Each time the step counter of the current epoch reaches a multiple of
    ``log_interval``, the mean of the most recent ``log_interval`` losses is printed
    and, when a logging function is available, reported as ``{"train_loss": mean}``.

    Args:
        model: module to train; it is moved to ``device`` and put in training mode.
        dataloader: iterable yielding batches in the form ``train_step`` expects.
        optimizer: forwarded unchanged to ``train_step``.
        criterion: forwarded unchanged to ``train_step``.
        device: device the model (and, inside ``train_step``, each batch) is moved to.
        epochs: number of passes over ``dataloader``.
        log_interval: number of steps between loss reports; ``None`` or a value
            ``<= 0`` disables periodic reporting.
        log_fn: optional callable that receives the metrics dict. When omitted,
            ``wandb.log`` is used if wandb can be imported and a run is active;
            otherwise the loss is only printed.

    Returns:
        ``(model, {"train_loss": losses})``: the same model object that was passed
        in (no checkpoint selection takes place) and the list of per-step losses.
    """
    if log_fn is None:
        # wandb is optional: nothing in this notebook imports it, and wandb.log fails
        # when no run has been initialised, so it is only used when both hold.
        try:
            import wandb
        except ImportError:
            wandb = None
        if wandb is not None and getattr(wandb, "run", None) is not None:
            log_fn = wandb.log

    # A non-positive interval would otherwise raise ZeroDivisionError in the modulo below.
    logging_enabled = bool(log_interval) and log_interval > 0

    losses = []
    model.to(device)
    model.train()  # make sure training-only layers (e.g. dropout) are active

    for epoch in range(epochs):
        for step, batch in enumerate(dataloader):

            loss = train_step(batch, model, optimizer, criterion, device)
            losses.append(loss)

            if logging_enabled and (step + 1) % log_interval == 0:
                # `losses` spans all epochs, so the window is always the latest
                # `log_interval` steps.
                loss_avg = sum(losses[-log_interval:]) / log_interval
                if log_fn is not None:
                    log_fn({
                        "train_loss": loss_avg
                    })
                print("Epoch {}, step {}: loss={}".format(epoch, step+1, loss_avg))

    return model, {"train_loss": losses}