In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
import json
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import copy

import pandas as pd

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [2]:
class SlotIntentDataset(Dataset):
    
    def __init__(self, datapath):
        self.data = []
        with open(datapath, 'r') as jsonl_file:
            for line in jsonl_file:
                self.data.append(json.loads(line))
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (self.data[idx]['input'], ", ".join(self.data[idx]['user_contacts']), self.data[idx]['output'])

def dl_collate_fn(batch):
    return list(batch)

## Eval

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_path = '/kaggle/input/col772a3-data/A3'

In [4]:
train_ds = SlotIntentDataset(f'{data_path}/train.jsonl')
val_ds = SlotIntentDataset(f'{data_path}/dev.jsonl')

In [5]:
DEBUG = False
if DEBUG:
    train_ds.data = train_ds.data[:128]
    val_ds.data = val_ds.data[:128]

In [6]:
# train_dl = DataLoader(train_ds, batch_size=64, num_workers=2, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32, num_workers=2, shuffle=False)

In [7]:
model = torch.load('/kaggle/input/intent-slot-gpt2-model/intent-slot-gpt2.pt', map_location=device)
tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2", padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
def process_batch_eval(batch, tokenizer):
    encoder_strs = [f'[{b}] {a}: \xa0' for a,b in zip(batch[0], batch[1])]

    return tokenizer(encoder_strs, return_tensors="pt", padding=True, truncation=True).to(device)

In [9]:
def generate(model, tokenizer, dl):

    pred_gens = []
    gold_gens = []
    
    model.eval()
    for batch in tqdm(dl):
        encoder_toks = process_batch_eval(batch, tokenizer)
        enc_len = encoder_toks['input_ids'].size(1)

        # beam search generations for syntax rather than nucleus sample
        gen = model.generate(
            **encoder_toks,
            num_beams=5,
            max_new_tokens=100,
        )
        
        pred_gens += [{
                'prompt': a,
                'gen': b
            } for a, b in zip(batch[0], tokenizer.batch_decode(gen[:,enc_len:], skip_special_tokens=True))]
        
        gold_gens += [{
                'prompt': a,
                'gen': b
            } for a, b in zip(batch[0], batch[2])]
    
    return pred_gens, gold_gens

In [10]:
pred_gens, gold_gens = generate(model, tokenizer, val_dl)

  0%|          | 0/290 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [11]:
def matches(y1, y2):
    return ("".join(y1.split()) == "".join(y2.split()))

def exact_match_metric(gold, pred):
    cnt_correct = 0
    for i in range(len(gold)):
        if(matches(gold[i]['gen'], pred[i]['gen'])):
            cnt_correct += 1
#         else:
#             print(f'GOLD: {gold[i]}')
#             print(f'PRED: {pred[i]}')
#             print()
    return cnt_correct/len(gold)

In [12]:
exact_match_metric(gold_gens, pred_gens)

0.7577653149266609

In [13]:
json.dump(pred_gens, open('pred_gens.csv', 'w'))
json.dump(gold_gens, open('gold_gens.csv', 'w'))