In [1]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule
from pytorch_lightning import Trainer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from undecorated import undecorated
from types import MethodType
import wandb
from pytorch_lightning.loggers import WandbLogger

In [2]:
dataset = load_dataset("qanastek/MASSIVE", "en-US", data_dir='MASSIVE')

_INTENTS = ['audio_volume_other', 'play_music', 'iot_hue_lighton', 'general_greet', 'calendar_set', 'audio_volume_down', 'social_query', 'audio_volume_mute', 'iot_wemo_on', 'iot_hue_lightup', 'audio_volume_up', 'iot_coffee', 'takeaway_query', 'qa_maths', 'play_game', 'cooking_query', 'iot_hue_lightdim', 'iot_wemo_off', 'music_settings', 'weather_query', 'news_query', 'alarm_remove', 'social_post', 'recommendation_events', 'transport_taxi', 'takeaway_order', 'music_query', 'calendar_query', 'lists_query', 'qa_currency', 'recommendation_movies',
            'general_joke', 'recommendation_locations', 'email_querycontact', 'lists_remove', 'play_audiobook', 'email_addcontact', 'lists_createoradd', 'play_radio', 'qa_stock', 'alarm_query', 'email_sendemail', 'general_quirky', 'music_likeness', 'cooking_recipe', 'email_query', 'datetime_query', 'transport_traffic', 'play_podcasts', 'iot_hue_lightchange', 'calendar_remove', 'transport_query', 'transport_ticket', 'qa_factoid', 'iot_cleaning', 'alarm_set', 'datetime_convert', 'iot_hue_lightoff', 'qa_definition', 'music_dislikeness']
_TAGS = ['O', 'B-food_type', 'B-movie_type', 'B-person', 'B-change_amount', 'I-relation', 'I-game_name', 'B-date', 'B-movie_name', 'I-person', 'I-place_name', 'I-podcast_descriptor', 'I-audiobook_name', 'B-email_folder', 'B-coffee_type', 'B-app_name', 'I-time', 'I-coffee_type', 'B-transport_agency', 'B-podcast_descriptor', 'I-playlist_name', 'B-media_type', 'B-song_name', 'I-music_descriptor', 'I-song_name', 'B-event_name', 'I-timeofday', 'B-alarm_type', 'B-cooking_type', 'I-business_name', 'I-color_type', 'B-podcast_name', 'I-personal_info', 'B-weather_descriptor', 'I-list_name', 'B-transport_descriptor', 'I-game_type', 'I-date', 'B-place_name', 'B-color_type', 'B-game_name', 'I-artist_name', 'I-drink_type', 'B-business_name', 'B-timeofday', 'B-sport_type', 'I-player_setting', 'I-transport_agency', 'B-game_type', 'B-player_setting', 'I-music_album', 'I-event_name', 'I-general_frequency', 'I-podcast_name', 'I-cooking_type', 'I-radio_name', 'I-joke_type',
         'I-meal_type', 'I-transport_type', 'B-joke_type', 'B-time', 'B-order_type', 'B-business_type', 'B-general_frequency', 'I-food_type', 'I-time_zone', 'B-currency_name', 'B-time_zone', 'B-ingredient', 'B-house_place', 'B-audiobook_name', 'I-ingredient', 'I-media_type', 'I-news_topic', 'B-music_genre', 'I-definition_word', 'B-list_name', 'B-playlist_name', 'B-email_address', 'I-currency_name', 'I-movie_name', 'I-device_type', 'I-weather_descriptor', 'B-audiobook_author', 'I-audiobook_author', 'I-app_name', 'I-order_type', 'I-transport_name', 'B-radio_name', 'I-business_type', 'B-definition_word', 'B-artist_name', 'I-movie_type', 'B-transport_name', 'I-email_folder', 'B-music_album', 'I-house_place', 'I-music_genre', 'B-drink_type', 'I-alarm_type', 'B-music_descriptor', 'B-news_topic', 'B-meal_type', 'I-transport_descriptor', 'I-email_address', 'I-change_amount', 'B-device_type', 'B-transport_type', 'B-relation', 'I-sport_type', 'B-personal_info']


def index_to_intent(index):
    return _INTENTS[index]


def index_to_tag(index):
    return _TAGS[index]

Using custom data configuration en-US-data_dir=MASSIVE
Reusing dataset massive (C:\Users\andre\.cache\huggingface\datasets\qanastek___massive\en-US-data_dir=MASSIVE\1.0.0\31cdffab94ac97bfe5a394b1e96344c96f0ad847e1d796c7562d8c8b449e22e6)
100%|██████████| 3/3 [00:00<00:00, 221.13it/s]


In [3]:
def get_intent_slots():
    intent_slots = {}
    for data in dataset['train']:
        intent = data['intent']
        if intent not in intent_slots:
            intent_slots[intent] = set(data['ner_tags'])
        else:
            intent_slots[intent] = intent_slots[intent].union(
                set(data['ner_tags']))

    named_data = {}
    for intent, slots in intent_slots.items():
        named_data[intent] = 'slots: ' + ", ".join(set(
            [index_to_tag(slot)[2:] if index_to_tag(slot) != 'O' else 'O' for slot in slots]))
    return named_data


def get_data_per_intent():
    data_per_intent = {}
    for data in dataset['train']:
        intent = data['intent']
        if intent not in data_per_intent:
            data_per_intent[intent] = [data]
        else:
            data_per_intent[intent].append(data)
    return data_per_intent

In [4]:
class T5GenerationFineTune(Dataset):
    def __init__(self, dataset, intent_slots, tokenizer, data_per_intent, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.intent_slots = intent_slots
        self.data_per_intent = data_per_intent

    def __len__(self):
        return len(self.dataset)

    # format of data (intent, slots_for_intent, example: text), all_examples_except_current
    def __getitem__(self, index):
        data = self.dataset[index]
        intent = data['intent']
        slots_for_intent = self.intent_slots[intent]
        text = data['text']
        all_examples_except_current = self.data_per_intent[intent][:index] + \
            self.data_per_intent[intent][index+1:]
        all_examples_except_current = [example['text']
                                       for example in all_examples_except_current]

        input_text = f"intent: {index_to_intent(intent)}\nslots: {slots_for_intent}\nexample: {text}"
        tokenized_text = self.tokenizer(
            input_text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        tokenized_examples = [self.tokenizer(example, max_length=self.max_length, padding='max_length',
                                             truncation=True, return_tensors='pt') for example in all_examples_except_current]

        return tokenized_text.input_ids, tokenized_text.attention_mask, torch.Tensor(tokenized_examples)

In [5]:
class T5GenerationDataModule(LightningDataModule):
    def __init__(self, dataset, tokenizer, batch_size=8, max_length=512):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length

    def setup(self, stage=None):
        self.intent_slots = get_intent_slots()
        self.data_per_intent = get_data_per_intent()
        self.train_dataset = T5GenerationFineTune(
            self.dataset['train'], self.intent_slots, self.tokenizer, self.data_per_intent, self.max_length)
        self.val_dataset = T5GenerationFineTune(
            self.dataset['validation'], self.intent_slots, self.tokenizer, self.data_per_intent, self.max_length)
        self.test_dataset = T5GenerationFineTune(
            self.dataset['test'], self.intent_slots, self.tokenizer, self.data_per_intent, self.max_length)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4, pin_memory=True)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4, pin_memory=True)

In [6]:
class T5DataGenerator(pl.LightningModule):
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.model = T5ForConditionalGeneration.from_pretrained(
            self.args['model_name'])
        self.tokenizer = T5Tokenizer.from_pretrained(
            self.args['tokenizer_name'])

        self.loss = torch.nn.CrossEntropyLoss()

        # allow grad in generate
        generate_with_grad = undecorated(self.model.generate)
        self.model.generate_with_grad = MethodType(
            generate_with_grad, self.model)

    # have the model generate multiple unique candidate outputs for a given input

    def forward(self, input_ids, attention_mask):
        return self.model.generate_with_grad(
            input_ids=input_ids,
            attention_mask=attention_mask,
            do_sample=True,
            max_length=50,
            top_k=self.args['top_k'],
            top_p=0.95,
            num_return_sequences=self.args['num_return_sequences'],
        )

    # for a given training step, generate a batch of candidate outputs for a given input
    # input_ids are the tokenized examples of the data we want to generate
    # the labels are all of the potential outputs the model could generate for a given intent
    # model out shape (bs * num_return_sequences, max_seq_len)
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, all_but_example = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask)
        # for each output sequence, calculate the least loss relative to all of teh examples in labels
        outputs = outputs.view(
            len(input_ids), self.args['num_return_sequences'], -1)

        loss = torch.sum([torch.min([self.loss(gen, data)
                         for xs in outputs for gen in xs]) for data in all_but_example])

        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, all_but_example = batch
        outputs = self(input_ids=input_ids, attention_mask=attention_mask)
        # for each output sequence, calculate the least loss relative to all of teh examples in labels
        outputs = outputs.view(
            len(input_ids), self.args['num_return_sequences'], -1)

        print(outputs.shape)
        print('calculating loss')
        loss = torch.sum([torch.min([self.loss(gen, data)
                         for xs in outputs for gen in xs]) for data in all_but_example])
        print('loss calculated')
        self.log('validation_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-4)

In [7]:
args = {
    'model_name': 't5-small',
    'tokenizer_name': 't5-small',
    'top_k': 5,
    'num_return_sequences': 5,
}

model = T5DataGenerator(args)
tokenizer = T5Tokenizer.from_pretrained('t5-small')
data_module = T5GenerationDataModule(dataset, tokenizer, batch_size=8)

logger = WandbLogger(project='t5-data-generator', name='sanity_check')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
trainer = pl.Trainer(accelerator='gpu', devices=1, max_epochs=1)
trainer.fit(model, data_module)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: c:\Users\andre\Documents\Coding Things\NLU data gen\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
1 | loss  | CrossEntropyLoss           | 0     
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]