In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
import json
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import copy

import pandas as pd

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [2]:
class SlotIntentDataset(Dataset):
    
    def __init__(self, datapath):
        self.data = []
        with open(datapath, 'r') as jsonl_file:
            for line in jsonl_file:
                self.data.append(json.loads(line))
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (self.data[idx]['input'], ", ".join(self.data[idx]['user_contacts']), self.data[idx]['output'])

def dl_collate_fn(batch):
    return list(batch)

## Training

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_path = '/kaggle/input/col772a3-data/A3'

In [37]:
train_ds = SlotIntentDataset(f'{data_path}/train.jsonl')
val_ds = SlotIntentDataset(f'{data_path}/dev.jsonl')

In [38]:
DEBUG = True
if DEBUG:
    train_ds.data = train_ds.data[:128]
    val_ds.data = val_ds.data[:512]

In [39]:
train_dl = DataLoader(train_ds, batch_size=16, num_workers=2, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=16, num_workers=2, shuffle=False)

In [6]:
model = torch.load('/kaggle/input/intent-slot-gpt2/intent-slot-gpt2.pt', map_location=device)# transformers.GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# optimizer = optim.Adam(model.parameters(), lr=5e-5)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
def process_batch(batch, tokenizer):
    encoder_strs = [f'[{b}] {a}: {c}' for a,b,c in zip(batch[0], batch[1], batch[2])]

    return tokenizer(encoder_strs, return_tensors="pt", padding=True, truncation=True).to(device)

In [None]:
def train(model, tokenizer, train_dl, val_dl, optimizer, scheduler=None, max_epochs=20, patience_lim=2):

    best_model = None
    best_val_loss = 10000
    val_losses = []
    train_losses = []
    patience = 0

    for epoch in range(max_epochs):

        print(f'Epoch {epoch}:')
        train_loss = torch.tensor(0, dtype=torch.float, device=device)
        model.train()
        for batch in tqdm(train_dl):
            proc_batch = process_batch(batch, tokenizer)
            
            optimizer.zero_grad()
            loss = model(**proc_batch, labels=proc_batch['input_ids']).loss
            loss.backward()
            optimizer.step()

            train_loss += loss.detach()
        
        if scheduler:
            scheduler.step()

        train_loss = train_loss.cpu()
        train_loss /= len(train_dl)
        print(f' Train Loss: {train_loss}')
        train_losses.append(train_loss)

        val_loss = torch.tensor(0, dtype=torch.float, device=device)
        true_labels = []
        pred_labels = []
        model.eval()
        for batch in tqdm(val_dl):
            proc_batch = process_batch(batch, tokenizer)
            
            loss = model(**proc_batch, labels=proc_batch['input_ids']).loss

            val_loss += loss.detach()
            
        val_loss = val_loss.cpu()
        val_loss /= len(val_dl)
        val_losses.append(val_loss)

        print(f' Val Loss: {val_loss}')
        print('')

        # early stopping
        if val_loss >= best_val_loss:
            if patience >= patience_lim:
                break
            else:
                patience += 1
        else:
            patience = 0
            best_val_loss = val_loss
            best_model = copy.deepcopy(model)
            best_model = best_model.cpu()
            print(f'best model: {epoch}')
    
    return best_model, (train_losses, val_losses)

In [None]:
best_model, (train_losses, val_losses) = train(model, tokenizer, train_dl, val_dl, optimizer)
torch.save(best_model, 'intent-slot-gpt2.pt')

In [None]:
import matplotlib.pyplot as plt
plt.plot(train_losses)
plt.plot(val_losses)

In [25]:
def generate(model, tokenizer, dl):

    pred_gens = []
    gold_gens = []
    
    model.eval()
    for batch in tqdm(dl):
        encoder_toks = process_batch_eval(batch, tokenizer)
        enc_len = encoder_toks['input_ids'].size(1)

        # beam search generations for syntax rather than nucleus sample
        gen = model.generate(
            **encoder_toks,
            num_beams=5,
            max_new_tokens=100,
        )
        
        pred_gens += tokenizer.batch_decode(gen[:,enc_len:], skip_special_tokens=True)
        gold_gens += batch[2]
    
    return pred_gens, gold_gens

In [26]:
eval_tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2", padding_side='left')
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [27]:
def process_batch_eval(batch, tokenizer):
    encoder_strs = [f'[{b}] {a}: \xa0' for a,b,c in zip(batch[0], batch[1], batch[2])]

    return tokenizer(encoder_strs, return_tensors="pt", padding=True, truncation=True).to(device)

In [40]:
pred_gens, gold_gens = generate(model, eval_tokenizer, val_dl)

  0%|          | 0/32 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [41]:
def matches(y1, y2):
    return ("".join(y1.split()) == "".join(y2.split()))

def exact_match_metric(gold, pred):
    cnt_correct = 0
    for i in range(len(gold)):
        if(matches(gold[i], pred[i])):
            
            cnt_correct += 1
    return cnt_correct/len(gold)

In [42]:
exact_match_metric(gold_gens, pred_gens)

0.712890625