In [1]:
# If you are using Colab, run these commands
""" 
!wget -P dataset/ATIS https://raw.githubusercontent.com/BrownFortress/IntentSlotDatasets/main/ATIS/test.json
!wget -P dataset/ATIS https://raw.githubusercontent.com/BrownFortress/IntentSlotDatasets/main/ATIS/train.json
!wget https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/conll.py """

' \n!wget -P dataset/ATIS https://raw.githubusercontent.com/BrownFortress/IntentSlotDatasets/main/ATIS/test.json\n!wget -P dataset/ATIS https://raw.githubusercontent.com/BrownFortress/IntentSlotDatasets/main/ATIS/train.json\n!wget https://raw.githubusercontent.com/BrownFortress/NLU-2024-Labs/main/labs/conll.py '

In [2]:
# Global variables
#from models import MultiModel1
import sys
import os
DEVICE = 'cuda:0'  # cuda:0 means we are using the GPU with id 0, if you have multiple GPU
EPOCHS = 200
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # Used to report errors on CUDA side
os.environ["TORCH_USE_CUDA_DSA"] ="1"
PAD_TOKEN = 0
sys.path.append('../src/')

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import numpy as np
from torch.optim.optimizer import Optimizer, required
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import BertTokenizer, BertModel
from pprint import pprint


class MultiModel1(nn.Module):

    def __init__(self, hid_size, out_slot, out_int, emb_size, vocab_len, n_layer=1,
                 pad_index=0, bidirectional=False,
                 emb_dropout=0.1, n_layers=1, dropout=False, model_type="LSTM"):
        super(MultiModel1, self).__init__()
        # hid_size = Hidden size
        # out_slot = number of slots (output size for slot filling)
        # out_int = number of intents (output size for intent class)
        # emb_size = word embedding size
        assert model_type in [
            "LSTM", "BERT"], (f"Model Type {model_type} not Recognized")
        self.lstm_model = (model_type == "LSTM")
        self.dropout_flag = dropout
        if self.lstm_model:
            self.embedding = nn.Embedding(
                vocab_len, emb_size, padding_idx=pad_index)
        if (dropout):
            self.drop1 = nn.Dropout(emb_dropout)
        if model_type == "LSTM":
            self.utt_encoder = nn.LSTM(
                emb_size, hid_size, n_layer, bidirectional=bidirectional, batch_first=True)
        elif model_type == "BERT":
            self.utt_encoder = BertModel.from_pretrained("bert-base-uncased")
            self.utt_encoder.resize_token_embeddings(out_int)

        # we have 2 linear layers because we have 2 tasks with different layers
        if bidirectional and model_type == "LSTM":
            output_hidden_dim = hid_size*2*n_layers
        elif model_type == "LSTM":
            output_hidden_dim = hid_size*n_layers
        else:
            output_hidden_dim = 768

        self.slot_out = nn.Linear(output_hidden_dim, out_slot)
        self.intent_out = nn.Linear(output_hidden_dim, out_int)

    def forward(self, utterance, seq_lengths, attention_masks=None, token_types=None):
        # utterance.size() = batch_size X seq_len
        # we are vectorizing the inputs
        # utt_emb.size() = batch_size X seq_len X emb_size
        if (self.lstm_model):
            utt_emb = self.embedding(utterance)

        # pack_padded_sequence avoid computation over pad tokens reducing the computational cost
        if self.lstm_model and self.dropout_flag:
            utt_emb = self.drop1(utt_emb)
        if self.lstm_model:
            packed_input = pack_padded_sequence(
                utt_emb, seq_lengths.cpu().numpy(), batch_first=True)
        else:
            print("Pollo")
            """ packed_input = pack_padded_sequence(
        utterance, seq_lengths.cpu().numpy(), batch_first=True)  """
        # Process the batch
        if self.lstm_model:
            packed_output, (last_hidden, cell) = self.utt_encoder(packed_input)
        else:
            print("attention_masks:", attention_masks.size(), "utterance",
                  utterance.size(), "token_types", token_types.size())
            last_hidden = self.utt_encoder(
                utterance, token_types, attention_masks)

        # Unpack the sequence the encoder is the LSTM and we give it each time a token and see the hidden state for each token
        if self.lstm_model:
            utt_encoded, input_sizes = pad_packed_sequence(
                packed_output)

        # Get the last hidden state
        if self.lstm_model and self.utt_encoder.bidirectional:
            last_hidden = torch.cat(
                (last_hidden[-2, :, :], last_hidden[-1, :, :]), dim=1)
        elif self.lstm_model:
            last_hidden = last_hidden[-1, :, :]

        # Is this another possible way to get the last hiddent state? (Why?)
        # we use the last token since the network has seen the all sequence before the last
        # utt_encoded.permute(1,0,2)[-1]

        # Compute slot logits
        if self.lstm_model:
            slots = self.slot_out(utt_encoded)
            # Compute intent logits
            intent = self.intent_out(last_hidden)
        else:
            print(last_hidden[0][:, 0, :].size())
            print(last_hidden[0][:, 1:, :].size())

            slots = self.slot_out(last_hidden[0][:, 1:, :])
            intent = self.intent_out(last_hidden[0][:, 0, :])
        print("clear slots", slots.size())
        # Slot size: batch_size, seq_len, classes
        slots = slots.permute(0, 2, 1)  # We need this for computing the loss
        # Slot size: batch_size, classes, seq_len
        return slots, intent

In [4]:
import json
from pprint import pprint


def load_data(path):
    '''
        input: path/to/data
        output: json 
    '''
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset


tmp_train_raw = load_data(os.path.join('dataset', 'ATIS', 'train.json'))
# we are producing the set to compare against
test_raw = load_data(os.path.join('dataset', 'ATIS', 'test.json'))
print('Train samples:', len(tmp_train_raw))
print('Test samples:', len(test_raw))

pprint(tmp_train_raw[0])

Train samples: 4978
Test samples: 893
{'intent': 'flight',
 'slots': 'O O O O O B-fromloc.city_name O B-depart_time.time '
          'I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O '
          'B-arrive_time.period_of_day',
 'utterance': 'i want to fly from boston at 838 am and arrive in denver at '
              '1110 in the morning'}


In [5]:
import random
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter

# First we get the 10% of the training set, then we compute the percentage of these examples
# We are only splitting on intention but we should even split on labels of token

portion = 0.10

intents = [x['intent'] for x in tmp_train_raw]  # We stratify on intents
count_y = Counter(intents)

labels = []
inputs = []
mini_train = []

for id_y, y in enumerate(intents):
    if count_y[y] > 1:  # If some intents occurs only once, we put them in training
        inputs.append(tmp_train_raw[id_y])
        labels.append(y)
    else:
        mini_train.append(tmp_train_raw[id_y])
# Random Stratify
X_train, X_dev, y_train, y_dev = train_test_split(inputs, labels, test_size=portion,
                                                  random_state=42,
                                                  shuffle=True,
                                                  stratify=labels)
X_train.extend(mini_train)
train_raw = X_train
dev_raw = X_dev

y_test = [x['intent'] for x in test_raw]

# Intent distributions
print('Train:')
pprint({k: round(v/len(y_train), 3)*100 for k,
       v in sorted(Counter(y_train).items())})
print('Dev:'),
pprint({k: round(v/len(y_dev), 3)*100 for k,
       v in sorted(Counter(y_dev).items())})
print('Test:')
pprint({k: round(v/len(y_test), 3)*100 for k,
       v in sorted(Counter(y_test).items())})
print('='*89)
# Dataset size
print('TRAIN size:', len(train_raw))
print('DEV size:', len(dev_raw))
print('TEST size:', len(test_raw))

Train:
{'abbreviation': 2.9000000000000004,
 'aircraft': 1.6,
 'airfare': 8.5,
 'airline': 3.2,
 'airline+flight_no': 0.0,
 'airport': 0.4,
 'capacity': 0.3,
 'city': 0.4,
 'distance': 0.4,
 'flight': 73.7,
 'flight+airfare': 0.4,
 'flight_no': 0.2,
 'flight_time': 1.0999999999999999,
 'ground_fare': 0.4,
 'ground_service': 5.1,
 'meal': 0.1,
 'quantity': 1.0,
 'restriction': 0.1}
Dev:
{'abbreviation': 3.0,
 'aircraft': 1.6,
 'airfare': 8.4,
 'airline': 3.2,
 'airport': 0.4,
 'capacity': 0.4,
 'city': 0.4,
 'distance': 0.4,
 'flight': 73.7,
 'flight+airfare': 0.4,
 'flight_no': 0.2,
 'flight_time': 1.0,
 'ground_fare': 0.4,
 'ground_service': 5.0,
 'meal': 0.2,
 'quantity': 1.0,
 'restriction': 0.2}
Test:
{'abbreviation': 3.6999999999999997,
 'aircraft': 1.0,
 'airfare': 5.4,
 'airfare+flight': 0.1,
 'airline': 4.3,
 'airport': 2.0,
 'capacity': 2.4,
 'city': 0.7000000000000001,
 'day_name': 0.2,
 'distance': 1.0999999999999999,
 'flight': 70.8,
 'flight+airfare': 1.3,
 'flight+airline

In [6]:
from collections import Counter

w2id = {'pad': PAD_TOKEN, 'unk': 1}
slot2id = {'pad': PAD_TOKEN}
intent2id = {}
# Map the words only from the train set
# Map slot and intent labels of train, dev and test set. 'unk' is not needed.
for example in train_raw:
    for w in example['utterance'].split():
        if w not in w2id:
            w2id[w] = len(w2id)
    for slot in example['slots'].split():
        if slot not in slot2id:
            slot2id[slot] = len(slot2id)
    if example['intent'] not in intent2id:
        intent2id[example['intent']] = len(intent2id)

for example in dev_raw:
    for slot in example['slots'].split():
        if slot not in slot2id:
            slot2id[slot] = len(slot2id)
    if example['intent'] not in intent2id:
        intent2id[example['intent']] = len(intent2id)

for example in test_raw:
    for slot in example['slots'].split():
        if slot not in slot2id:
            slot2id[slot] = len(slot2id)
    if example['intent'] not in intent2id:
        intent2id[example['intent']] = len(intent2id)

print('# Vocab:', len(w2id)-2)  # we remove pad and unk from the count
print('# Slots:', len(slot2id)-1)
print('# Intent:', len(intent2id))

# Vocab: 864
# Slots: 129
# Intent: 26


In [7]:
from collections import Counter


class Lang():
    def __init__(self, words, intents, slots, cutoff=0):
        self.word2id = self.w2id(words, cutoff=cutoff, unk=True)
        self.slot2id = self.lab2id(slots)
        self.intent2id = self.lab2id(intents, pad=False)
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.id2slot = {v: k for k, v in self.slot2id.items()}
        self.id2intent = {v: k for k, v in self.intent2id.items()}

    def w2id(self, elements, cutoff=None, unk=True):
        vocab = {'pad': PAD_TOKEN}
        if unk:
            vocab['unk'] = len(vocab)
        count = Counter(elements)
        for k, v in count.items():
            if v > cutoff:
                vocab[k] = len(vocab)
        return vocab

    def lab2id(self, elements, pad=True):
        vocab = {}
        if pad:
            vocab['pad'] = PAD_TOKEN
        for elem in elements:
            vocab[elem] = len(vocab)
        return vocab

In [8]:
words = sum([x['utterance'].split() for x in train_raw],
            [])  # No set() since we want to compute
# the cutoff
corpus = train_raw + dev_raw + test_raw  # We do not wat unk labels,
# however this depends on the research purpose
slots = set(sum([line['slots'].split() for line in corpus], []))
intents = set([line['intent'] for line in corpus])

lang = Lang(words, intents, slots, cutoff=0)

In [9]:
import torch
import torch.utils.data as data
# we are defining a data loader


class IntentsAndSlots (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, lang, unk='unk'):
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk

        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])

        self.utt_ids = self.mapping_seq(self.utterances, lang.word2id)
        self.slot_ids = self.mapping_seq(self.slots, lang.slot2id)
        self.intent_ids = self.mapping_lab(self.intents, lang.intent2id)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):  # it will be called for each element of the batch
        utt = torch.Tensor(self.utt_ids[idx])
        slots = torch.Tensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        sample = {'utterance': utt, 'slots': slots, 'intent': intent}
        return sample

    # Auxiliary methods

    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]

    def mapping_seq(self, data, mapper):  # Map sequences to number
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res

In [10]:
# split them by white space
from pprint import pprint

sequences = ['I saw a man with a telescope',
             'book me a flight',
             'I want to see the flights from Milan to Ibiza']
splitted = [seq.split() for seq in sequences]
max_len = max([len(seq) for seq in splitted])
padded_seq = []
for seq in splitted:
    diff = max_len - len(seq)
    if diff != 0:
        # Depending on the side of the addition we can get left or right padding
        padded_seq.append(seq + ['PAD'] * diff)
    else:
        padded_seq.append(seq)
pprint(padded_seq)

[['I', 'saw', 'a', 'man', 'with', 'a', 'telescope', 'PAD', 'PAD', 'PAD'],
 ['book', 'me', 'a', 'flight', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'],
 ['I', 'want', 'to', 'see', 'the', 'flights', 'from', 'Milan', 'to', 'Ibiza']]


In [11]:
from torch.utils.data import DataLoader


def collate_fn(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len 
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths) == 0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(
            len(sequences), max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            # We copy each sequence into the matrix so that the padding are the empty elements of the matrix
            padded_seqs[i, :end] = seq
        # print(padded_seqs)
        # We remove these tensors from the computational graph
        padded_seqs = padded_seqs.detach()
        return padded_seqs, lengths
    # Sort data by seq lengths
    data.sort(key=lambda x: len(x['utterance']), reverse=True)
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]

    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    y_slots, y_lengths = merge(new_item["slots"])
    intent = torch.LongTensor(new_item["intent"])

    src_utt = src_utt.to(DEVICE)  # We load the Tensor on our selected device
    y_slots = y_slots.to(DEVICE)
    intent = intent.to(DEVICE)
    y_lengths = torch.LongTensor(y_lengths).to(DEVICE)

    new_item["utterances"] = src_utt
    new_item["intents"] = intent
    new_item["y_slots"] = y_slots
    new_item["slots_len"] = y_lengths
    # the resulting matrix has dimension batch*max_token_sequence_of_the_batch
    return new_item

In [12]:
# Create our datasets
train_dataset_stock = IntentsAndSlots(train_raw, lang)
dev_dataset_stock = IntentsAndSlots(dev_raw, lang)
test_dataset_stock = IntentsAndSlots(test_raw, lang)


# Dataloader instantiations
train_loader_stock = DataLoader(train_dataset_stock, batch_size=128,
                                collate_fn=collate_fn,  shuffle=True)
dev_loader_stock = DataLoader(
    dev_dataset_stock, batch_size=64, collate_fn=collate_fn)
test_loader_stock = DataLoader(
    test_dataset_stock, batch_size=64, collate_fn=collate_fn)

In [13]:
import torch
import torch.utils.data as data
from transformers import BertTokenizer, BertModel

# we are defining a data loader


class IntentsAndSlots_BERTized (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, tokenizer,slot_dict, unk='unk'):

        self.tokenizer = tokenizer
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk

        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])
        #For slot filling, we feed the final hidden states of other tokens h2, . . . , hT into a softmax layer corresponding to the first sub-token as input to the
        """         slot_ids = []
        for slot_utt in slots:
            slot_utt_ids = []
            for slot in slot_utt.split(" "):
                slot_utt_ids.append(slot_dict[slot])
            slot_ids.append(slot_utt_ids) """


        whole_utt = self.tokenizer(
            self.utterances, return_tensors="pt", padding=True)

        self.utt_ids = whole_utt["input_ids"].tolist()
        self.attention_masks = whole_utt["attention_mask"].tolist()
        self.token_type = whole_utt["token_type_ids"].tolist()

        self.vocab = self.tokenizer.get_vocab()
        self.slot_ids= []
        for slot_utt in slots:
            slot_utt_ids = []
            print(slot_utt.split(" "))
            for slot in slot_utt.split(" "):
                slot_utt_ids.append(self.tokenizer(
                    slot, return_tensors="pt", padding=True)["input_ids"].tolist()[0][1])
            self.slot_ids.append(slot_utt_ids)

        """         self.slot_ids = self.tokenizer(
            self.slots, return_tensors="pt", padding=True)["input_ids"].tolist() """
        self.intent_ids = [self.vocab[int] for int in self.intents]

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):  # it will be called for each element of the batch
        utt = torch.LongTensor(self.utt_ids[idx])
        slots = torch.LongTensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        attention_mask = torch.LongTensor(self.attention_masks[idx])
        token_type = torch.LongTensor(self.token_type[idx])


        sample = {'utterance': utt, 'slots': slots, 'intent': intent,
                  "attention_mask": attention_mask, "token_type": token_type}
        return sample

    # Auxiliary methods
    def get_vocab_bert(self):
        return self.tokenizer.get_vocab()
    
    def get_vocab_slots(self):
        return self.slot_dict

    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]

    def mapping_seq(self, data, mapper):  # Map sequences to number
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res

In [14]:
def collate_fn_bertized(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len 
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths) == 0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape
        # batch_size X maximum length of a sequence
        # LONG NORMAL TENSOR IS MABE SUFFICIENT
        padded_seqs = torch.LongTensor(
            len(sequences), max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            # We copy each sequence into the matrix so that the padding are the empty elements of the matrix
            padded_seqs[i, :end] = seq
        # print(padded_seqs)
        # We remove these tensors from the computational graph
        padded_seqs = padded_seqs.detach()
        return padded_seqs, lengths
    # Sort data by seq lengths

    data.sort(key=lambda x: len(x['utterance']), reverse=True)
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]

    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    src_mask, _ = merge(new_item['attention_mask'])
    src_type, _ = merge(new_item['token_type'])
    intent = torch.LongTensor(new_item["intent"])

    y_slots, y_lengths = merge(new_item["slots"])


    y_lengths = torch.LongTensor(y_lengths)

    new_item["utterances"] = src_utt.to(DEVICE)
    new_item["masks"] = src_mask.to(DEVICE)
    new_item["types"] = src_type.to(DEVICE)

    new_item["intents"] = intent.to(DEVICE)
    new_item["y_slots"] = y_slots.to(DEVICE)
    new_item["slots_len"] = y_lengths.to(DEVICE)
    # the resulting matrix has dimension batch*max_token_sequence_of_the_batch
    return new_item

In [15]:
slots_set = []
for slot_string in slots:
    for slot in slot_string.split(" "):
        slots_set.append(slot)
slots_set = set(slots_set)

slot_dict= {word:idx for idx, word in enumerate(slots_set,1)}
slot_dict["[PAD]"] = PAD_TOKEN



In [16]:
# Bert tokenized Dataset
from pprint import pprint
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased")
tokenizer.add_tokens(list(intents))

train_dataset_bert = IntentsAndSlots_BERTized(train_raw, tokenizer, slot_dict)
dev_dataset_bert = IntentsAndSlots_BERTized(dev_raw, tokenizer, slot_dict)
test_dataset_bert = IntentsAndSlots_BERTized(test_raw, tokenizer, slot_dict)


# Dataloader instantiations
train_loader_bert = DataLoader(train_dataset_bert, batch_size=128,
                               shuffle=True, collate_fn=collate_fn_bertized)
dev_loader_bert = DataLoader(
    dev_dataset_bert, batch_size=64, collate_fn=collate_fn_bertized)
test_loader_bert = DataLoader(
    test_dataset_bert, batch_size=64, collate_fn=collate_fn_bertized)  # , collate_fn=collate_fn

['B-flight']
['B-arrive_date.month_name']
['I-depart_date.day_number']
['B-depart_time.end_time']
['B-aircraft_code']
['B-time']
['I-meal_description']
['B-meal_description']
['I-meal_code']
['B-day_name']
['I-today_relative']
['B-fromloc.airport_code']
['B-fare_basis_code']
['B-arrive_date.day_name']
['B-depart_date.day_number']
['B-restriction_code']
['B-depart_date.date_relative']
['B-depart_date.today_relative']
['B-toloc.city_name']
['I-depart_date.day_name']
['I-arrive_time.start_time']
['I-flight_stop']
['O']
['B-round_trip']
['B-arrive_date.today_relative']
['I-restriction_code']
['B-arrive_date.date_relative']
['B-toloc.state_code']
['I-time']
['B-toloc.airport_name']
['B-day_number']
['I-cost_relative']
['I-arrive_date.day_number']
['B-cost_relative']
['I-economy']
['B-today_relative']
['B-state_code']
['B-meal_code']
['B-arrive_time.time_relative']
['I-depart_date.today_relative']
['B-toloc.state_name']
['B-flight_number']
['B-state_name']
['I-return_date.day_number']
['B-de

In [17]:
import torch.nn as nn


def init_weights(mat):
    for m in mat.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.xavier_uniform_(
                            param[idx*mul:(idx+1)*mul])
                elif 'weight_hh' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.orthogonal_(param[idx*mul:(idx+1)*mul])
                elif 'bias' in name:
                    param.data.fill_(0)
        else:
            if type(m) in [nn.Linear]:
                torch.nn.init.uniform_(m.weight, -0.01, 0.01)
                if m.bias != None:
                    m.bias.data.fill_(0.01)

In [18]:
from torch.utils.tensorboard import SummaryWriter


def log_values(writer, step, f1, acc, loss, prefix):
    if f1 != None:
        writer.add_scalar(f"{prefix}/f1", f1, step)
        writer.add_scalar(f"{prefix}/acc", acc, step)
    writer.add_scalar(f"{prefix}/loss", loss, step)

In [19]:
from conll import evaluate  # evaluates at chunk level
from sklearn.metrics import classification_report


def train_loop(data, optimizer, criterion_slots, criterion_intents, model, clip=5):
    model.train()
    loss_array = []
    for sample in data:

        optimizer.zero_grad()  # Zeroing the gradient

        slots, intent = model(
            sample['utterances'], sample['slots_len'], sample["masks"], sample["types"])
        print("intent_print",intent.size(), sample['intents'].size())
        print("slot_print", slots.size(), sample['y_slots'].size())

        loss_intent = criterion_intents(intent, sample['intents'])
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss = loss_intent + loss_slot  # In joint training we sum the losses.
        # Is there another way to do that?
        # an example is to have an hyper parameter such that loss = alpha loss_intent + alpha-1 loss_slot
        loss_array.append(loss.item())
        loss.backward()  # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()  # Update the weights
    return loss_array


def eval_loop(data, criterion_slots, criterion_intents, model, lang):
    model.eval()
    loss_array = []

    ref_intents = []
    hyp_intents = []

    ref_slots = []
    hyp_slots = []
    # softmax = nn.Softmax(dim=1) # Use Softmax if you need the actual probability
    with torch.no_grad():  # It used to avoid the creation of computational graph
        for sample in data:
            slots, intents = model(
                sample['utterances'], sample['slots_len'], sample["masks"], sample["token_type"])
            loss_intent = criterion_intents(intents, sample['intents'])
            loss_slot = criterion_slots(slots, sample['y_slots'])
            loss = loss_intent + loss_slot
            loss_array.append(loss.item())
            # Intent inference
            # Get the highest probable class
            out_intents = [lang.id2intent[x]
                           for x in torch.argmax(intents, dim=1).tolist()]
            gt_intents = [lang.id2intent[x]
                          for x in sample['intents'].tolist()]
            ref_intents.extend(gt_intents)
            hyp_intents.extend(out_intents)

            # Slot inference
            output_slots = torch.argmax(slots, dim=1)
            for id_seq, seq in enumerate(output_slots):
                length = sample['slots_len'].tolist()[id_seq]
                utt_ids = sample['utterance'][id_seq][:length].tolist()
                gt_ids = sample['y_slots'][id_seq].tolist()
                gt_slots = [lang.id2slot[elem] for elem in gt_ids[:length]]
                utterance = [lang.id2word[elem] for elem in utt_ids]
                to_decode = seq[:length].tolist()
                ref_slots.append([(utterance[id_el], elem)
                                 for id_el, elem in enumerate(gt_slots)])
                tmp_seq = []
                for id_el, elem in enumerate(to_decode):
                    tmp_seq.append((utterance[id_el], lang.id2slot[elem]))
                hyp_slots.append(tmp_seq)
    try:
        results = evaluate(ref_slots, hyp_slots)
    except Exception as ex:
        # Sometimes the model predicts a class that is not in REF
        print("Warning:", ex)
        ref_s = set([x[1] for x in ref_slots])
        hyp_s = set([x[1] for x in hyp_slots])
        print(hyp_s.difference(ref_s))
        results = {"total": {"f": 0}}

    report_intent = classification_report(ref_intents, hyp_intents,
                                          zero_division=False, output_dict=True)
    return results, report_intent, loss_array

In [20]:
import matplotlib.pyplot as plt
from tqdm import tqdm


def train_model(param_string, exp_name,
                n_epochs, patience_set, model, optimizer, model_type):
    losses_train = []
    losses_dev = []
    sampled_epochs = []
    best_f1 = 0
    patience = patience_set
    writer = SummaryWriter(log_dir=f"runs/{param_string}")
    criterion_slots = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
    criterion_intents = nn.CrossEntropyLoss()
    if model_type == "BERT":
        train_loader = train_loader_bert
        dev_loader = dev_loader_bert
        test_loader = test_loader_bert
    else:
        train_loader = train_loader_stock
        dev_loader = dev_loader_stock
        test_loader = test_loader_stock

    for x in tqdm(range(1, n_epochs)):
        loss = train_loop(train_loader, optimizer, criterion_slots,
                          criterion_intents, model)
        if x % 5 == 0:  # We check the performance every 5 epochs
            sampled_epochs.append(x)
            losses_train.append(np.asarray(loss).mean())
            results_dev, intent_res, loss_dev = eval_loop(dev_loader, criterion_slots,
                                                          criterion_intents, model, lang)
            losses_dev.append(np.asarray(loss_dev).mean())

            f1 = results_dev['total']['f']
            log_values(writer, x,  f1,
                       intent_res['accuracy'], np.asarray(loss_dev).mean(), exp_name)
            # For decreasing the patience you can also use the average between slot f1 and intent accuracy
            if f1 > best_f1:
                best_f1 = f1
                # Here you should save the model
                patience = patience_set
            else:
                patience -= 1
            if patience <= 0:  # Early stopping with patience
                break  # Not nice but it keeps the code clean
        else:
            log_values(writer, x, None, None,
                       np.asarray(loss).mean(), exp_name)

    results_test, intent_test, _ = eval_loop(test_loader, criterion_slots,
                                             criterion_intents, model, lang)

    print('Slot F1: ', results_test['total']['f'])
    print('Intent Accuracy:', intent_test['accuracy'])
    return intent_test['accuracy'], model

In [21]:
import itertools


def train_grid_search(lr_list, hid_size_list, emb_size_list, weight_decay_list, patience_list, emb_drop_list, out_drop_list, vocab_len, out_slot, out_int, model_type="None", bidirectional=False, dropout=False, optimizer=None, n_epochs=100, exp_name="Pollo"):
    best_model = None
    best_accuracy = 0
    best_params = {}
    optimizer_class = optimizer
    model = None
    # Iterate over all combinations of the parameter lists
    for lr, hid_size, emb_size, weight_decay, patience, emb_drop in itertools.product(lr_list, hid_size_list, emb_size_list, weight_decay_list, patience_list, emb_drop_list):

        print(
            f"Training with parameters: lr={lr}, hid_size={hid_size}, emb_size={emb_size}, weight_decay={weight_decay}, patience={patience}, emb_drop={emb_drop}")
        # Initialize the model with current parameters

        model = MultiModel1(emb_size=emb_size, hid_size=hid_size, vocab_len=vocab_len, model_type=model_type, out_slot=out_slot,
                            pad_index=PAD_TOKEN, dropout=dropout, out_int=out_int,
                            emb_dropout=emb_drop, bidirectional=bidirectional).to(DEVICE)

        model.apply(init_weights)

        # Set up the optimizer with current parameters
        optimizer_obj = optimizer_class(model.parameters(), lr=lr,
                                        weight_decay=weight_decay)

        # Train the model
        accuracy, model = train_model(exp_name+"/"+f"LR={lr}, HS={hid_size}, ES={emb_size}, WD={weight_decay}, P={patience}, E_D={emb_drop},M_T={model_type},D={dropout}", model=model,
                                      optimizer=optimizer_obj, patience_set=patience,
                                      n_epochs=n_epochs, exp_name=exp_name, model_type=model_type)

        # Check if the current model is the best so far
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_params = {
                'lr': lr,
                'hid_size': hid_size,
                'emb_size': emb_size,
                'weight_decay': weight_decay,
                'patience': patience,
                'emb_drop': emb_drop,
            }

            torch.save({
                'model_state_dict': model.state_dict(),
                "best_params": best_params,
                "exp": exp_name
            }, "models/"+exp_name+".model")

    print(f"Best model parameters: {best_params}")
    print(f"Best model accuracy: {best_accuracy}")
    return best_model, best_params

In [22]:
from torch import optim

out_slot = len(tokenizer.get_vocab())
out_int = len(dev_dataset_bert.get_vocab_bert())
vocab_len = len(dev_dataset_bert.get_vocab_bert())
print("out_slot:", out_slot, "out_int:", out_int, "vocab_len:",vocab_len)
pprint(dev_dataset_bert.__getitem__(0))

out_slot: 30537 out_int: 30537 vocab_len: 30537
{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'intent': 3462,
 'slots': tensor([1038]),
 'token_type': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'utterance': tensor([  101,  2265,  2033, 25493,  3462,  1055,  2013,  5631,  2000,  2047,
         2259,  2007,  2873,  2465, 27092,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])}


In [23]:
lr_list = [.001, .0005, .0001]
hid_size_list = [200, 300, 500]
emb_size_list = [200, 300, 500]
weight_decay_list = [0.0001, 0.001]
patience_list = [3]
emb_drop_list = [.01, .1, .15]
out_drop_list = [0]
best_model, best_params = train_grid_search(
    lr_list, hid_size_list, emb_size_list,
    weight_decay_list, patience_list, emb_drop_list,
    out_drop_list, vocab_len=vocab_len, out_slot=out_slot, out_int=out_int, exp_name="BERT_STOCK", model_type="BERT", optimizer=optim.Adam, dropout=False, n_epochs=EPOCHS)

Training with parameters: lr=0.001, hid_size=200, emb_size=200, weight_decay=0.0001, patience=3, emb_drop=0.01


  0%|          | 0/199 [00:00<?, ?it/s]


IndexError: list index out of range

In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs --host localhost --port 8088