In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
import json
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import copy

import pandas as pd

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [2]:
class SlotIntentDataset(Dataset):
    """Dataset backed by a JSON Lines file, one JSON object per line.

    Every parsed object is kept in ``self.data``. Items are exposed as a
    3-tuple built from the ``'input'``, ``'user_contacts'`` and ``'output'``
    keys of the record.
    """

    def __init__(self, datapath):
        """Read and parse every non-blank line of ``datapath``.

        Args:
            datapath: path of the ``.jsonl`` file to load.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        self.data = []
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(datapath, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record and would make json.loads raise.
                if not line.strip():
                    continue
                self.data.append(json.loads(line))

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input, contacts, output)`` for record ``idx``.

        The ``'user_contacts'`` entries are joined into one string with
        ``", "`` as separator.
        """
        record = self.data[idx]
        return (record['input'], ", ".join(record['user_contacts']), record['output'])

def dl_collate_fn(batch):
    """Collate hook that performs no merging: the samples come back as a new list."""
    return [sample for sample in batch]

## Eval

In [3]:
# Use the GPU when CUDA reports one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Directory holding the train/dev JSON Lines files (hard-coded absolute path).
data_path = '/kaggle/input/col772a3-data/A3'

In [4]:
# Load the training and development splits from their JSON Lines files.
train_ds = SlotIntentDataset('{}/train.jsonl'.format(data_path))
val_ds = SlotIntentDataset('{}/dev.jsonl'.format(data_path))

In [5]:
# Filler tokens that remove_disfluency drops from an utterance
# (compared after lower-casing and stripping one trailing comma).
disfluent_words = [
    'uh', 'uhh',
    'um', 'umm',
    'uhm', 'uhmm',
    'oh', 'ah', 'er',
    'ummm', 'err',
]

def trim(s):
    """Normalise a token for comparison.

    Lower-cases ``s`` and removes a single trailing comma if there is one.
    An empty string is returned unchanged (no IndexError).

    Args:
        s: the token to normalise.

    Returns:
        The lower-cased token without its final comma.
    """
    s = s.lower()
    # endswith is safe on '' whereas indexing s[-1] is not.
    if s.endswith(','):
        s = s[:-1]
    return s

def remove_disfluency(ds):
    """Strip filler words and immediate repetitions from every record's 'input'.

    For each record in ``ds.data`` the input is split on single spaces and a
    token is dropped when its normalised form (see ``trim``) is

    * listed in ``disfluent_words``, or
    * equal to the normalised form of either of the two most recently kept
      tokens.

    Surviving tokens keep their original spelling and are re-joined with a
    single space. The records are modified in place.

    Args:
        ds: object exposing a ``data`` list of dicts with an ``'input'`` string.

    Returns:
        The same ``ds`` object, after modification.
    """
    for record in ds.data:
        previous = ''
        before_previous = ''
        kept = []
        for token in record['input'].split(' '):
            # Consecutive spaces yield empty tokens; they are simply discarded.
            if not token:
                continue
            normalized = trim(token)
            if normalized in disfluent_words:
                continue
            if normalized == previous or normalized == before_previous:
                continue
            # Only kept tokens advance the two-token history window.
            before_previous, previous = previous, normalized
            kept.append(token)
        record['input'] = " ".join(kept)
    return ds

In [6]:
# val_ds = remove_disfluency(val_ds)

In [7]:
# Set DEBUG to True to run the notebook on a small slice of the data.
DEBUG = False
import random
# Fixed seed so the debug subset of the dev split is the same on every run.
random.seed(42)
if DEBUG:
    # Keep only the first 128 training records.
    train_ds.data = train_ds.data[:128]
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the size of the dev split.
    val_ds.data = random.sample(val_ds.data, min(1024, len(val_ds.data)))

In [8]:
# train_dl = DataLoader(train_ds, batch_size=64, num_workers=2, shuffle=True)
# Loader over the dev split; shuffle=False keeps the dataset order.
val_dl = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
    num_workers=2,
)

In [9]:
# NOTE(review): torch.load unpickles the stored object, which can execute
# arbitrary code — only load checkpoints from a trusted source.
model = torch.load(
    '/kaggle/input/intent-slot-gpt2-model/intent-slot-gpt2.pt',
    map_location=device,
)
# Left padding keeps every prompt flush against the tokens generated after it.
tokenizer = transformers.GPT2Tokenizer.from_pretrained(
    "gpt2",
    padding_side='left',
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
def process_batch_eval(batch, tokenizer):
    """Build the evaluation prompts for one batch and tokenize them.

    ``batch[0]``, ``batch[1]`` and ``batch[2]`` are read as parallel sequences
    of strings. The first space-separated token of each stripped ``batch[2]``
    entry is used as a label inside the prompt, which has the form
    ``[<batch[1] item>] <batch[0] item> <<label>>: `` followed by a
    non-breaking space.

    Args:
        batch: indexable holding the three parallel string sequences.
        tokenizer: callable tokenizer; its result must support ``.to(device)``.

    Returns:
        ``(prompts, encoded)`` — the prompt strings and the tokenizer output
        moved to the module-level ``device``.
    """
    utterances, contact_strs, targets = batch[0], batch[1], batch[2]
    labels = [target.strip().split(' ')[0] for target in targets]

    prompts = []
    for utterance, contacts, label in zip(utterances, contact_strs, labels):
        prompts.append(f'[{contacts}] {utterance} <{label}>: \xa0')

    encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    return prompts, encoded.to(device)

In [11]:
def generate(model, tokenizer, dl):
    """Run beam-search generation over every batch of ``dl``.

    The model is switched to eval mode and all work happens under
    ``torch.no_grad()``.

    Args:
        model: model exposing ``eval()`` and ``generate(...)``.
        tokenizer: tokenizer used to build the prompts and decode the output.
        dl: iterable of batches accepted by ``process_batch_eval``; ``batch[2]``
            supplies the gold strings.

    Returns:
        A list with one dict per example, holding the keys ``'prompt'``,
        ``'pred'`` (decoded continuation) and ``'gold'``.
    """
    gens = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(dl):
            encoder_strs, encoder_toks = process_batch_eval(batch, tokenizer)
            # Width of the padded prompt; everything past it in the generated
            # ids is treated as the continuation.
            enc_len = encoder_toks['input_ids'].size(1)

            # beam search generations for syntax rather than nucleus sample
            gen = model.generate(
                **encoder_toks,
                num_beams=5,
                max_new_tokens=100,
                # Passing the pad id explicitly avoids the per-batch
                # "Setting `pad_token_id` to `eos_token_id`" warning.
                pad_token_id=tokenizer.pad_token_id,
            )

            preds = tokenizer.batch_decode(gen[:, enc_len:], skip_special_tokens=True)
            gens += [
                {'prompt': prompt, 'pred': pred, 'gold': gold}
                for prompt, pred, gold in zip(encoder_strs, preds, batch[2])
            ]

    return gens

In [12]:
# generate() already runs under torch.no_grad(), so no outer context is needed.
gens = generate(model, tokenizer, val_dl)

  0%|          | 0/290 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [13]:
def matches(y1, y2):
    """Return True when the two strings are equal once all whitespace is removed."""
    def squash(text):
        # split() with no argument cuts on any run of whitespace.
        return "".join(text.split())

    return squash(y1) == squash(y2)

def exact_match_metric(gens):
    """Compute whitespace-insensitive exact-match accuracy.

    Args:
        gens: list of dicts, each with a ``'gold'`` and a ``'pred'`` string.

    Returns:
        ``(accuracy, errors)`` where ``accuracy`` is the fraction of entries
        whose gold and pred strings satisfy ``matches`` and ``errors`` is the
        list of entries that do not, in their original order. An empty
        ``gens`` gives ``(0.0, [])`` instead of dividing by zero.
    """
    if not gens:
        return 0.0, []

    errors = [gen for gen in gens if not matches(gen['gold'], gen['pred'])]
    cnt_correct = len(gens) - len(errors)

    return cnt_correct / len(gens), errors

In [14]:
# Score the generations: exact-match accuracy plus the list of mismatching records.
accuracy, errors = exact_match_metric(gens)

In [15]:
# Write all generations and the mismatching ones to disk; the context managers
# make sure each file is flushed and closed.
with open('gens.json', 'w') as gens_file:
    json.dump(gens, gens_file, indent=4)
# json.dump(gold_gens, open('gold_gens.json', 'w'))
with open('errors.json', 'w') as errors_file:
    json.dump(errors, errors_file, indent=4)

In [16]:
# NOTE(review): 0.97 is an unexplained scaling factor. The prompts built in
# process_batch_eval take their label from the gold output, so this presumably
# accounts for the accuracy of a separate label predictor — confirm the source
# of the value and give it a named constant.
accuracy*0.97

0.7908973252804142

In [17]:
# print(errors)