In [None]:
%%capture
!pip install transformers
!pip install TorchCRF
!pip install seqeval

In [None]:
# importing dependencies
import re
import os
import logging
import random
from tqdm import tqdm, trange
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, DistilBertConfig, AlbertConfig
from transformers import BertTokenizer, DistilBertTokenizer, AlbertTokenizer
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel, BertConfig
from transformers.models.distilbert.modeling_distilbert import DistilBertPreTrainedModel, DistilBertModel, DistilBertConfig
from transformers.models.albert.modeling_albert import AlbertPreTrainedModel, AlbertModel, AlbertConfig
from TorchCRF import CRF
from seqeval.metrics import precision_score, recall_score, f1_score

In [None]:
def readIOB(filename, drop_list):
    """Parse an ATIS-style ``.iob`` file into vocabularies and per-sentence records.

    Every non-blank line is split on whitespace and read as
    ``BOS w1 ... wn EOS <tag> ... <tag> <intent>``:

    * tokens before ``EOS`` (except ``BOS``) are the words; the characters
      matched by the punctuation pattern are removed and the word is lower-cased;
    * tokens after ``EOS`` up to, but excluding, the last token are the slot tags;
    * the last token is the intent; for ``a#b`` style multi-intents only the
      first intent is kept.

    Sentences whose intent is in ``drop_list`` are skipped, but their words still
    end up in the vocabulary (they are collected before the intent is known).

    Args:
        filename: path of the ``.iob`` file.
        drop_list: collection of intent names to discard.

    Returns:
        Tuple ``(intent_dict, slot_dict, word_dict, words_list, tags_list,
        intents_list, sorted_vocab, data)`` where the three dicts map a sorted
        label/word to its index (``word_dict`` reserves 0 for ``'UNK'`` and 1 for
        ``'PAD'``) and ``data`` is a list of per-sentence dicts with the keys
        ``'words'``, ``'iob_tags'``, ``'length'`` and ``'intent'``.
    """
    words_list = []     # list of word lists, one per kept sentence
    tags_list = []      # list of tag lists, one per kept sentence
    intents_list = []   # one intent per kept sentence
    data = []           # same information as structured dicts
    vocabs = set()      # every cleaned word seen in the file

    with open(filename, encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip().split()
            if not line:
                # Blank lines carry no sentence; indexing line[-1] would raise.
                continue

            eos_index = None
            words = []
            for i, word in enumerate(line[:-1]):
                if word == 'EOS':
                    eos_index = i
                    break
                if word != 'BOS':
                    word = re.sub(r"[?|'|!|.]", "", word)  # removing punctuations
                    word = word.lower()
                    words.append(word)
                    vocabs.add(word)

            if eos_index is None:
                # No EOS marker: keep the legacy fallback (everything but the intent).
                tags = line[:-1]
            else:
                # Start AFTER the EOS marker; slicing from the marker itself made the
                # literal 'EOS' the first "tag" of every sentence.
                # NOTE(review): assumes the file carries one tag for BOS, so tags[0]
                # belongs to BOS and len(tags) == len(words) + 1 - confirm on the data.
                tags = line[eos_index + 1:-1]

            # Multi-intent entries ("a#b") keep only the first intent.
            intent = line[-1].split('#')[0]

            if intent not in drop_list:
                words_list.append(words)
                tags_list.append(tags)
                intents_list.append(intent)
                data.append({
                    'words': words,
                    'iob_tags': tags,
                    'length': len(words),
                    'intent': intent})

    sorted_vocab = sorted(vocabs)

    word_dict = {'UNK': 0, 'PAD': 1}
    for i, item in enumerate(sorted_vocab):
        word_dict[item] = i + 2

    unique_tags = {tag for sentence_tags in tags_list for tag in sentence_tags}
    slot_dict = {tag: i for i, tag in enumerate(sorted(unique_tags))}
    intent_dict = {intent: i for i, intent in enumerate(sorted(set(intents_list)))}

    return intent_dict, slot_dict, word_dict, words_list, tags_list, intents_list, sorted_vocab, data


def dataStatistics(tags_list, intents_list, vocabs):
    """Print dataset statistics, plot the intent histogram and return the frequency tables.

    Frequencies are accumulated in a single pass and kept in first-seen order,
    so the printed lines and the bar chart are reproducible between runs.

    Args:
        tags_list: list of per-sentence tag lists.
        intents_list: list with one intent per sentence.
        vocabs: collection of vocabulary words (only its length is used).

    Returns:
        ``(intents_freq, tags_freq)``: two dicts mapping intent / tag to its count.
    """
    num_rows = len(intents_list)
    print('dataset vocab size:', len(vocabs))
    print('# of dataset rows:', num_rows)
    print('# of dataset unique intents:', len(set(intents_list)))
    print('# of dataset unique IOB tags:', len(set([tag for item in tags_list for tag in item])))

    print('-' * 35 + '\nintents distribution:')
    intents_freq = dict()
    for intent in intents_list:
        intents_freq[intent] = intents_freq.get(intent, 0) + 1
    for key, value in intents_freq.items():
        print('%s: %d (%.2f%%),' % (key, value, value / num_rows * 100))
    print()
    plt.figure(figsize=(24, 8))
    plt.bar(list(intents_freq.keys()), list(intents_freq.values()))
    plt.show()

    print('\n' + '-' * 35 + '\ntags distribution:')
    tags_freq = dict()
    count = 0  # total number of tags over all sentences
    for item in tags_list:
        for tag in item:
            tags_freq[tag] = tags_freq.get(tag, 0) + 1
            count += 1
    for key, value in tags_freq.items():
        print('%s: %d (%.2f%%),' % (key, value, value / count * 100))

    return intents_freq, tags_freq

In [None]:
# Intents removed from both splits by readIOB.
drop_list = [
    'atis_cheapest',
    'atis_city',
    'atis_restriction',
    'atis_meal',
    'atis_distance',
    'atis_airport',
    'atis_capacity',
    'atis_ground_fare',
    'atis_flight_no',
]

In [None]:
# Parse the training split, then print/plot its statistics.
(train_intent_dict, train_tag_dict, train_word_dict,
 train_words, train_tags, train_intents,
 train_vocabs, train_data) = readIOB('atis-train-final-w-intent.iob', drop_list=drop_list)
train_intents_freq, train_tags_freq = dataStatistics(train_tags, train_intents, train_vocabs)

In [None]:
# Parse the test split, then print/plot its statistics.
(test_intent_dict, test_tag_dict, test_word_dict,
 test_words, test_tags, test_intents,
 test_vocabs, test_data) = readIOB('atis-test-final-w-intent.iob', drop_list=drop_list)
test_intents_freq, test_tags_freq = dataStatistics(test_tags, test_intents, test_vocabs)

In [None]:
class IntentClassifier(nn.Module):
    """Dropout followed by a single linear layer producing intent logits."""

    def __init__(self, input_dim, num_intent_labels, dropout_rate=0.):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, num_intent_labels)

    def forward(self, x):
        """Return ``linear(dropout(x))``; the last dimension becomes ``num_intent_labels``."""
        return self.linear(self.dropout(x))


class SlotClassifier(nn.Module):
    """Dropout followed by a single linear layer producing slot-tag logits."""

    def __init__(self, input_dim, num_slot_labels, dropout_rate=0.):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, num_slot_labels)

    def forward(self, x):
        """Return ``linear(dropout(x))``; the last dimension becomes ``num_slot_labels``."""
        return self.linear(self.dropout(x))

In [None]:
class JointBERT(BertPreTrainedModel):
    """BERT encoder with an intent head on the pooled output and a slot head on the token outputs.

    Args:
        config: model configuration passed to ``BertPreTrainedModel`` and ``BertModel``.
        args: run options; reads ``dropout_rate``, ``use_crf``, ``ignore_index``
            and ``slot_loss_coef``.
        intent_label_lst: intent label names (only the length is used).
        slot_label_lst: slot label names (only the length is used).
    """

    def __init__(self, config, args, intent_label_lst, slot_label_lst):
        super(JointBERT, self).__init__(config)
        self.args = args
        self.num_intent_labels = len(intent_label_lst)
        self.num_slot_labels = len(slot_label_lst)
        # Built from the config only; pretrained weights are expected to be filled in
        # by calling `from_pretrained` on this class.
        self.bert = BertModel(config=config)

        self.intent_classifier = IntentClassifier(config.hidden_size, self.num_intent_labels, args.dropout_rate)
        self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args.dropout_rate)

        if args.use_crf:
            # NOTE(review): these keyword arguments (and the `decode` call made during
            # evaluation) look like the `pytorch-crf` API; confirm that the installed
            # `TorchCRF` package accepts them.
            self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, intent_label_ids=None, slot_labels_ids=None):
        """Run the encoder and both heads; add up the losses for whichever labels are given.

        Labels (and the mask) are optional so the model can be called for inference
        without dummy arguments.

        Returns:
            Tuple ``(total_loss, (intent_logits, slot_logits), *extra)`` where ``extra``
            is whatever the encoder returned after its first two outputs.
            ``total_loss`` is the plain integer 0 when no labels are supplied.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids)  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = outputs[1]  # [CLS]

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        total_loss = 0
        # 1. Intent loss: regression when there is a single output, classification otherwise
        if intent_label_ids is not None:
            if self.num_intent_labels == 1:
                intent_loss_fct = nn.MSELoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1), intent_label_ids.view(-1))
            else:
                intent_loss_fct = nn.CrossEntropyLoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label_ids.view(-1))
            total_loss += intent_loss

        # 2. Slot loss: CRF negative log-likelihood or token-level cross entropy
        if slot_labels_ids is not None:
            if self.args.use_crf:
                crf_mask = attention_mask.byte() if attention_mask is not None else None
                slot_loss = self.crf(slot_logits, slot_labels_ids, mask=crf_mask, reduction='mean')
                slot_loss = -1 * slot_loss  # negative log-likelihood
            else:
                slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args.ignore_index)
                if attention_mask is not None:
                    # Only positions with attention_mask == 1 contribute to the loss
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
                    active_labels = slot_labels_ids.view(-1)[active_loss]
                    slot_loss = slot_loss_fct(active_logits, active_labels)
                else:
                    slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels), slot_labels_ids.view(-1))
            total_loss += self.args.slot_loss_coef * slot_loss

        outputs = ((intent_logits, slot_logits),) + outputs[2:]  # add hidden states and attention if they are here

        outputs = (total_loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions) # Logits is a tuple of intent and slot logits

In [None]:
class JointDistilBERT(DistilBertPreTrainedModel):
    """DistilBERT encoder with an intent head on the first token and a slot head on all tokens.

    Args:
        config: model configuration passed to ``DistilBertPreTrainedModel`` and ``DistilBertModel``.
        args: run options; reads ``dropout_rate``, ``use_crf``, ``ignore_index``
            and ``slot_loss_coef``.
        intent_label_lst: intent label names (only the length is used).
        slot_label_lst: slot label names (only the length is used).
    """

    def __init__(self, config, args, intent_label_lst, slot_label_lst):
        super(JointDistilBERT, self).__init__(config)
        self.args = args
        self.num_intent_labels = len(intent_label_lst)
        self.num_slot_labels = len(slot_label_lst)
        # Built from the config only; pretrained weights are expected to be filled in
        # by calling `from_pretrained` on this class.
        self.distilbert = DistilBertModel(config=config)

        self.intent_classifier = IntentClassifier(config.hidden_size, self.num_intent_labels, args.dropout_rate)
        self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args.dropout_rate)

        if args.use_crf:
            # NOTE(review): these keyword arguments (and the `decode` call made during
            # evaluation) look like the `pytorch-crf` API; confirm that the installed
            # `TorchCRF` package accepts them.
            self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, intent_label_ids=None, slot_labels_ids=None):
        """Run the encoder and both heads; add up the losses for whichever labels are given.

        Labels (and the mask) are optional so the model can be called for inference
        without dummy arguments. This variant takes no ``token_type_ids``.

        Returns:
            Tuple ``(total_loss, (intent_logits, slot_logits), *extra)`` where ``extra``
            is whatever the encoder returned after its first output.
            ``total_loss`` is the plain integer 0 when no labels are supplied.
        """
        outputs = self.distilbert(input_ids, attention_mask=attention_mask)  # last-layer hidden-state, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = sequence_output[:, 0]  # [CLS]

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        total_loss = 0
        # 1. Intent loss: regression when there is a single output, classification otherwise
        if intent_label_ids is not None:
            if self.num_intent_labels == 1:
                intent_loss_fct = nn.MSELoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1), intent_label_ids.view(-1))
            else:
                intent_loss_fct = nn.CrossEntropyLoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label_ids.view(-1))
            total_loss += intent_loss

        # 2. Slot loss: CRF negative log-likelihood or token-level cross entropy
        if slot_labels_ids is not None:
            if self.args.use_crf:
                crf_mask = attention_mask.byte() if attention_mask is not None else None
                slot_loss = self.crf(slot_logits, slot_labels_ids, mask=crf_mask, reduction='mean')
                slot_loss = -1 * slot_loss  # negative log-likelihood
            else:
                slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args.ignore_index)
                if attention_mask is not None:
                    # Only positions with attention_mask == 1 contribute to the loss
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
                    active_labels = slot_labels_ids.view(-1)[active_loss]
                    slot_loss = slot_loss_fct(active_logits, active_labels)
                else:
                    slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels), slot_labels_ids.view(-1))
            total_loss += self.args.slot_loss_coef * slot_loss

        outputs = ((intent_logits, slot_logits),) + outputs[1:]  # add hidden states and attention if they are here

        outputs = (total_loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions) # Logits is a tuple of intent and slot logits

In [None]:

class JointAlbert(AlbertPreTrainedModel):
    """ALBERT encoder with an intent head on the pooled output and a slot head on the token outputs.

    Args:
        config: model configuration passed to ``AlbertPreTrainedModel`` and ``AlbertModel``.
        args: run options; reads ``dropout_rate``, ``use_crf``, ``ignore_index``
            and ``slot_loss_coef``.
        intent_label_lst: intent label names (only the length is used).
        slot_label_lst: slot label names (only the length is used).
    """

    def __init__(self, config, args, intent_label_lst, slot_label_lst):
        super(JointAlbert, self).__init__(config)
        self.args = args
        self.num_intent_labels = len(intent_label_lst)
        self.num_slot_labels = len(slot_label_lst)
        # Built from the config only; pretrained weights are expected to be filled in
        # by calling `from_pretrained` on this class.
        self.albert = AlbertModel(config=config)

        self.intent_classifier = IntentClassifier(config.hidden_size, self.num_intent_labels, args.dropout_rate)
        self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args.dropout_rate)

        if args.use_crf:
            # NOTE(review): these keyword arguments (and the `decode` call made during
            # evaluation) look like the `pytorch-crf` API; confirm that the installed
            # `TorchCRF` package accepts them.
            self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, intent_label_ids=None, slot_labels_ids=None):
        """Run the encoder and both heads; add up the losses for whichever labels are given.

        Labels (and the mask) are optional so the model can be called for inference
        without dummy arguments.

        Returns:
            Tuple ``(total_loss, (intent_logits, slot_logits), *extra)`` where ``extra``
            is whatever the encoder returned after its first two outputs.
            ``total_loss`` is the plain integer 0 when no labels are supplied.
        """
        outputs = self.albert(input_ids, attention_mask=attention_mask,
                              token_type_ids=token_type_ids)  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = outputs[1]  # [CLS]

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        total_loss = 0
        # 1. Intent loss: regression when there is a single output, classification otherwise
        if intent_label_ids is not None:
            if self.num_intent_labels == 1:
                intent_loss_fct = nn.MSELoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1), intent_label_ids.view(-1))
            else:
                intent_loss_fct = nn.CrossEntropyLoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label_ids.view(-1))
            total_loss += intent_loss

        # 2. Slot loss: CRF negative log-likelihood or token-level cross entropy
        if slot_labels_ids is not None:
            if self.args.use_crf:
                crf_mask = attention_mask.byte() if attention_mask is not None else None
                slot_loss = self.crf(slot_logits, slot_labels_ids, mask=crf_mask, reduction='mean')
                slot_loss = -1 * slot_loss  # negative log-likelihood
            else:
                slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args.ignore_index)
                if attention_mask is not None:
                    # Only positions with attention_mask == 1 contribute to the loss
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
                    active_labels = slot_labels_ids.view(-1)[active_loss]
                    slot_loss = slot_loss_fct(active_logits, active_labels)
                else:
                    slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels), slot_labels_ids.view(-1))
            total_loss += self.args.slot_loss_coef * slot_loss

        outputs = ((intent_logits, slot_logits),) + outputs[2:]  # add hidden states and attention if they are here

        outputs = (total_loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions) # Logits is a tuple of intent and slot logits

In [None]:
# Registry keyed by model type: (config class, joint model class, tokenizer class).
MODEL_CLASSES = dict(
    bert=(BertConfig, JointBERT, BertTokenizer),
    distilbert=(DistilBertConfig, JointDistilBERT, DistilBertTokenizer),
    albert=(AlbertConfig, JointAlbert, AlbertTokenizer),
)

# Checkpoint name handed to `from_pretrained` for each model type.
MODEL_PATH_MAP = dict(
    bert='bert-base-uncased',
    distilbert='distilbert-base-uncased',
    albert='albert-xxlarge-v1',
)

# Module-level logger shared by the training and evaluation code.
logger = logging.getLogger(__name__)

In [None]:
# The intent inventory must cover both splits, so write the sorted union of their
# label sets, one label per line.
intent_vocab = sorted(set(train_intent_dict) | set(test_intent_dict))
# The context manager closes the file even if a write fails.
with open('vocab.intents', 'w') as f:
    f.writelines(item + '\n' for item in intent_vocab)

In [None]:
# The slot-tag inventory must cover both splits, so write the sorted union of their
# tag sets, one tag per line.
tag_vocab = sorted(set(train_tag_dict) | set(test_tag_dict))
# The context manager closes the file even if a write fails.
with open('vocab.tags', 'w') as f:
    f.writelines(item + '\n' for item in tag_vocab)

In [None]:
def get_intent_labels():
    """Return the intent labels stored in ``vocab.intents``, one stripped line per label."""
    with open('vocab.intents') as vocab_file:
        return [entry.strip() for entry in vocab_file]


def get_slot_labels():
    """Return the slot tags stored in ``vocab.tags``, one stripped line per tag."""
    with open('vocab.tags') as vocab_file:
        return [entry.strip() for entry in vocab_file]


def load_tokenizer(args):
    """Instantiate the tokenizer class registered for ``args.model_type`` from ``args.model_name_or_path``."""
    tokenizer_class = MODEL_CLASSES[args.model_type][2]
    return tokenizer_class.from_pretrained(args.model_name_or_path)


def init_logger():
    """Configure root logging at INFO level with a timestamped message format."""
    message_format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s'
    date_format = '%m/%d/%Y %H:%M:%S'
    logging.basicConfig(format=message_format, datefmt=date_format, level=logging.INFO)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch (plus all CUDA devices when CUDA is enabled and available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(args.seed)

    cuda_requested = not args.no_cuda
    if cuda_requested and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)


def compute_metrics(intent_preds, intent_labels, slot_preds, slot_labels):
    """Merge intent accuracy, slot metrics and sentence-frame accuracy into one dict.

    All four inputs must have the same length (one entry per sentence).
    """
    assert len(intent_preds) == len(intent_labels) == len(slot_preds) == len(slot_labels)

    metric_groups = (
        get_intent_acc(intent_preds, intent_labels),
        get_slot_metrics(slot_preds, slot_labels),
        get_sentence_frame_acc(intent_preds, intent_labels, slot_preds, slot_labels),
    )

    results = {}
    for group in metric_groups:
        results.update(group)
    return results


def get_chunks(labels):
    """Extract labelled chunks from a tag sequence framed by one sentinel tag on each side.

    Only positions ``1 .. len(labels) - 2`` are inspected; the first and last entries
    serve as left/right context, which is why callers wrap the sequence in ``'O'``.

    Args:
        labels: sequence of tag strings such as ``'B-city'``, ``'I-city'`` or ``'O'``.

    Returns:
        List of ``(start_idx, end_idx, chunk_type)`` tuples, indices referring to ``labels``.
    """
    outside = ('O', '<pad>', '<unk>', '<s>', '</s>', '<STOP>', '<START>')

    def split(label):
        # Tags are read as "<prefix>-<type>": first character and everything after index 2.
        if label in outside:
            return 'O', 'O'
        return label[:1], label[2:]

    chunks = []
    start_idx = 0
    for idx in range(1, len(labels) - 1):
        prevTag, prevType = split(labels[idx - 1])
        Tag, Type = split(labels[idx])
        nextTag, nextType = split(labels[idx + 1])

        chunkStart = (
            (Tag == 'B' and prevTag in ('B', 'I', 'O'))
            or (prevTag, Tag) in (('O', 'I'), ('E', 'E'), ('E', 'I'), ('O', 'E'))
            or (Tag != 'O' and prevType != Type)
        )
        chunkEnd = (
            (Tag in ('B', 'I') and nextTag in ('B', 'O'))
            or (Tag == 'E' and nextTag in ('E', 'I', 'O'))
            or (Tag != 'O' and Type != nextType)
        )

        if chunkStart:
            start_idx = idx
        if chunkEnd:
            chunks.append((start_idx, idx, Type))
            start_idx = 0
    return chunks


def get_slot_metrics(preds, labels):
    """Chunk-level precision, recall and F1 over all sentences.

    A predicted chunk counts as a true positive only if a gold chunk with the same
    start, end and type exists in the same sentence. False negatives are accumulated
    per sentence (previously only the last sentence contributed, which inflated
    recall and F1). A metric whose denominator is zero is reported as 0.0.

    Args:
        preds: list of predicted tag lists, one per sentence.
        labels: list of gold tag lists, one per sentence.

    Returns:
        Dict with the keys ``slot_precision``, ``slot_recall`` and ``slot_f1``.
    """
    assert len(preds) == len(labels)

    TP, FP, FN = 0, 0, 0
    for sentence_preds, sentence_labels in zip(preds, labels):
        label_chunks = set(get_chunks(['O'] + list(sentence_labels) + ['O']))
        pred_chunks = set(get_chunks(['O'] + list(sentence_preds) + ['O']))

        matched = len(pred_chunks & label_chunks)
        TP += matched
        FP += len(pred_chunks) - matched
        FN += len(label_chunks) - matched

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    return {
        "slot_precision": ratio(TP, TP + FP),
        "slot_recall": ratio(TP, TP + FN),
        "slot_f1": ratio(2 * TP, 2 * TP + FN + FP)
    }


def get_intent_acc(preds, labels):
    """Fraction of positions where the predicted intent id equals the gold intent id.

    Inputs are converted with ``np.asarray`` so plain lists work as well as arrays.

    Returns:
        Dict with the single key ``intent_acc``.
    """
    acc = (np.asarray(preds) == np.asarray(labels)).mean()
    return {
        "intent_acc": acc
    }


def read_prediction_text(args):
    """Read ``args.pred_input_file`` from ``args.pred_dir`` and return its lines, stripped.

    The file is opened as UTF-8 inside a context manager so the handle is closed
    as soon as the lines have been read.
    """
    path = os.path.join(args.pred_dir, args.pred_input_file)
    with open(path, 'r', encoding='utf-8') as f:
        return [text.strip() for text in f]


def get_sentence_frame_acc(intent_preds, intent_labels, slot_preds, slot_labels):
    """Share of sentences whose intent and every slot tag are both predicted correctly."""
    intent_hits = (intent_preds == intent_labels)

    def all_slots_match(pred_seq, gold_seq):
        # A sentence counts only when no position disagrees.
        assert len(pred_seq) == len(gold_seq)
        return not any(p != g for p, g in zip(pred_seq, gold_seq))

    slot_hits = np.array([all_slots_match(pred_seq, gold_seq)
                          for pred_seq, gold_seq in zip(slot_preds, slot_labels)])

    # Element-wise product combines both conditions per sentence.
    frame_acc = np.multiply(intent_hits, slot_hits).mean()
    return {
        "sementic_frame_acc": frame_acc
    }

In [None]:
# Fixed length that tags2index / words2index pad their outputs to.
max_len = 50

# Rebuild the tag -> index mapping from the merged tag vocabulary file
# (line number is the index).
train_tag_dict = {}
with open('vocab.tags') as f:
    train_tag_dict.update((line.strip(), i) for i, line in enumerate(f))

# Rebuild the intent -> index mapping from the merged intent vocabulary file.
train_intent_dict = {}
with open('vocab.intents') as f:
    train_intent_dict.update((line.strip(), i) for i, line in enumerate(f))

def intent2index(intent, intent_dict=train_intent_dict):
    """Look up the integer id of ``intent``; raises ``KeyError`` for an unknown intent."""
    index = intent_dict[intent]
    return index

def tags2index(tags, tags_dict=train_tag_dict):
    """Map tags to ids and right-pad with the id of ``'O'`` up to the global ``max_len``.

    Sequences already at least ``max_len`` long are returned unpadded (and untruncated).
    """
    indices = [tags_dict[tag] for tag in tags]

    missing = max_len - len(indices)
    if missing > 0:
        indices.extend([tags_dict['O']] * missing)

    return indices

def words2index(words, word_dict=train_word_dict):
    """Map words to ids (``'UNK'`` for unknown words) and right-pad with ``'PAD'`` up to ``max_len``.

    Sequences already at least ``max_len`` long are returned unpadded (and untruncated).
    """
    indices = [word_dict[word] if word in word_dict else word_dict['UNK']
               for word in words]

    missing = max_len - len(indices)
    if missing > 0:
        indices.extend([word_dict['PAD']] * missing)

    return indices


In [None]:
def convert_to_tensor_dataset(data,
                              args,
                              tokenizer,
                              pad_token_label_id,
                              cls_token_segment_id=0,
                              pad_token_segment_id=0,
                              sequence_a_segment_id=0,
                              mask_padding_with_zero=True):
    """Turn parsed sentences into a ``TensorDataset`` of model inputs and labels.

    Args:
        data: iterable of dicts with the keys ``'words'``, ``'iob_tags'`` and ``'intent'``.
        args: run options; only ``max_seq_len`` is read here.
        tokenizer: tokenizer providing ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize`` and ``convert_tokens_to_ids``.
        pad_token_label_id: value used to build ``slot_label_mask``.
        cls_token_segment_id, pad_token_segment_id, sequence_a_segment_id:
            segment ids for [CLS], padding and the sentence tokens.
        mask_padding_with_zero: if true, real tokens get attention 1 and padding 0;
            otherwise inverted.

    Returns:
        ``TensorDataset(input_ids, attention_mask, token_type_ids, intent_label, slot_label)``.
        The slot label mask is computed but is NOT part of the returned dataset.
    """
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []
    all_slot_label_mask = []
    all_slot_label = []
    all_intent_label = []
    for item in data:
        # Labels come straight from the word-level tag list; tags2index pads them to the
        # global max_len.
        # NOTE(review): tokens below are word pieces plus [CLS]/[SEP], so label positions
        # only line up with token positions when every word yields exactly one piece and
        # the tag list accounts for [CLS] - confirm the intended alignment.
        # NOTE(review): tags2index pads to `max_len`, inputs to `args.max_seq_len`; the two
        # presumably have to be equal - verify.
        all_slot_label.append(tags2index(item['iob_tags']))
        all_intent_label.append(intent2index(item['intent']))
        words = item['words']
        tokens = []
        slot_label_mask = []
        for word in words:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                word_tokens = [unk_token]  # For handling the bad-encoded word
            tokens.extend(word_tokens)
            # Mark the first piece of each word with pad_token_label_id + 1 and the remaining pieces with pad_token_label_id
            slot_label_mask.extend([pad_token_label_id + 1] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP]
        special_tokens_count = 2
        if len(tokens) > args.max_seq_len - special_tokens_count:
            tokens = tokens[: (args.max_seq_len - special_tokens_count)]
            slot_label_mask = slot_label_mask[:(args.max_seq_len - special_tokens_count)]

        # Add [SEP] token
        tokens += [sep_token]
        token_type_ids = [sequence_a_segment_id] * len(tokens)
        slot_label_mask += [pad_token_label_id]

        # Add [CLS] token
        tokens = [cls_token] + tokens
        token_type_ids = [cls_token_segment_id] + token_type_ids
        slot_label_mask = [pad_token_label_id] + slot_label_mask

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = args.max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_label_mask = slot_label_mask + ([pad_token_label_id] * padding_length)

        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_token_type_ids.append(token_type_ids)
        all_slot_label_mask.append(slot_label_mask)

    # Change to Tensor
    # The whole dataset is placed on the GPU whenever one is available (args.no_cuda is not consulted here).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    all_input_ids = torch.tensor(all_input_ids, dtype=torch.long).to(device)
    all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long).to(device)
    all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long).to(device)
    all_slot_label_mask = torch.tensor(all_slot_label_mask, dtype=torch.long).to(device)
    all_intent_label = torch.tensor(all_intent_label, dtype=torch.long).to(device)
    all_slot_label = torch.tensor(all_slot_label, dtype=torch.long).to(device)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_intent_label, all_slot_label)  # all_slot_label_mask is intentionally left out

    return dataset

In [None]:
class Trainer(object):
    """Builds the joint intent/slot model and runs training, evaluation, saving and loading.

    Args:
        args: run options (model type and path, batch sizes, optimizer settings,
            logging/save intervals, ``model_dir``, ...).
        train_dataset: dataset used by ``train``.
        dev_dataset: dataset used by ``evaluate("dev")``.
        test_dataset: dataset used by ``evaluate("test")``.

    Each dataset item is expected as
    ``(input_ids, attention_mask, token_type_ids, intent_label, slot_label)``.
    """

    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
        self.args = args
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset

        self.intent_label_lst = get_intent_labels()
        self.slot_label_lst = get_slot_labels()
        # Label id that evaluate() treats as "skip this position"
        self.pad_token_label_id = args.ignore_index

        self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]
        self.config = self.config_class.from_pretrained(args.model_name_or_path, finetuning_task=args.task)
        self.model = self.model_class.from_pretrained(args.model_name_or_path,
                                                      config=self.config,
                                                      args=args,
                                                      intent_label_lst=self.intent_label_lst,
                                                      slot_label_lst=self.slot_label_lst)

        # GPU or CPU
        self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        self.model.to(self.device)

    def _build_inputs(self, batch):
        """Map a dataset batch to the keyword arguments of the model's ``forward``."""
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'intent_label_ids': batch[3],
                  'slot_labels_ids': batch[4]}
        # The DistilBERT variant takes no token_type_ids
        if self.args.model_type != 'distilbert':
            inputs['token_type_ids'] = batch[2]
        return inputs

    def train(self):
        """Run the training loop.

        Returns:
            ``(global_step, average_loss)``; the average is 0.0 when no optimizer
            step was performed.
        """
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.args.num_train_epochs)
        logger.info("  Total train batch size = %d", self.args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.args.logging_steps)
        logger.info("  Save steps = %d", self.args.save_steps)

        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
        for _ in train_iterator:

            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                # Keep the batch on the model's device (as evaluate() does); the dataset
                # may live on a different device than self.device.
                batch = tuple(t.to(self.device) for t in batch)

                outputs = self.model(**self._build_inputs(batch))
                loss = outputs[0]

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                        self.evaluate("test")

                    if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                        self.save_model()

                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args.max_steps < global_step:
                train_iterator.close()
                break

        # Avoid ZeroDivisionError when no optimizer step was taken (e.g. empty dataset)
        average_loss = tr_loss / global_step if global_step else 0.0
        return global_step, average_loss

    def evaluate(self, mode):
        """Evaluate on the ``'test'`` or ``'dev'`` dataset and return a dict of metrics.

        Raises:
            Exception: for an unknown ``mode`` or when the requested dataset was not provided.
        """
        if mode == 'test':
            dataset = self.test_dataset
        elif mode == 'dev':
            dataset = self.dev_dataset
        else:
            raise Exception("Only dev and test dataset available")
        if dataset is None:
            raise Exception("No %s dataset was provided to the Trainer" % mode)

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        intent_preds = None
        slot_preds = None
        out_intent_label_ids = None
        out_slot_labels_ids = None

        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = self._build_inputs(batch)
                outputs = self.model(**inputs)
                tmp_eval_loss, (intent_logits, slot_logits) = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            # Intent prediction
            if intent_preds is None:
                intent_preds = intent_logits.detach().cpu().numpy()
                out_intent_label_ids = inputs['intent_label_ids'].detach().cpu().numpy()
            else:
                intent_preds = np.append(intent_preds, intent_logits.detach().cpu().numpy(), axis=0)
                out_intent_label_ids = np.append(
                    out_intent_label_ids, inputs['intent_label_ids'].detach().cpu().numpy(), axis=0)

            # Slot prediction
            if slot_preds is None:
                if self.args.use_crf:
                    # decode() in `torchcrf` returns list with best index directly
                    slot_preds = np.array(self.model.crf.decode(slot_logits))
                else:
                    slot_preds = slot_logits.detach().cpu().numpy()

                out_slot_labels_ids = inputs["slot_labels_ids"].detach().cpu().numpy()
            else:
                if self.args.use_crf:
                    slot_preds = np.append(slot_preds, np.array(self.model.crf.decode(slot_logits)), axis=0)
                else:
                    slot_preds = np.append(slot_preds, slot_logits.detach().cpu().numpy(), axis=0)

                out_slot_labels_ids = np.append(out_slot_labels_ids, inputs["slot_labels_ids"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        results = {
            "loss": eval_loss
        }

        # Intent result
        intent_preds = np.argmax(intent_preds, axis=1)

        # Slot result
        if not self.args.use_crf:
            slot_preds = np.argmax(slot_preds, axis=2)
        slot_label_map = {i: label for i, label in enumerate(self.slot_label_lst)}
        out_slot_label_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
        slot_preds_list = [[] for _ in range(out_slot_labels_ids.shape[0])]

        # Keep every position whose gold label differs from the padding label id.
        # NOTE(review): positions are filtered by label id only, not by attention mask -
        # confirm that padded positions are excluded as intended.
        for i in range(out_slot_labels_ids.shape[0]):
            for j in range(out_slot_labels_ids.shape[1]):
                if out_slot_labels_ids[i, j] != self.pad_token_label_id:
                    out_slot_label_list[i].append(slot_label_map[out_slot_labels_ids[i][j]])
                    slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])

        total_result = compute_metrics(intent_preds, out_intent_label_ids, slot_preds_list, out_slot_label_list)
        results.update(total_result)

        logger.info("***** Eval results *****")
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))

        return results

    def save_model(self):
        """Save the model into ``args.model_dir`` (overwriting), creating the directory if needed."""
        if not os.path.exists(self.args.model_dir):
            os.makedirs(self.args.model_dir)
        # Unwrap a wrapper that exposes the real model as `.module`, if present
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(self.args.model_dir)

    def load_model(self):
        """Replace ``self.model`` with the checkpoint stored in ``args.model_dir``.

        Raises:
            Exception: if the directory does not exist or loading fails; the original
                error is attached as the cause.
        """
        # Check whether model exists
        if not os.path.exists(self.args.model_dir):
            raise Exception("Model doesn't exists! Train first!")

        try:
            self.model = self.model_class.from_pretrained(self.args.model_dir,
                                                          args=self.args,
                                                          intent_label_lst=self.intent_label_lst,
                                                          slot_label_lst=self.slot_label_lst)
            self.model.to(self.device)
            logger.info("***** Model Loaded *****")
        except Exception as err:
            # Chain the cause instead of hiding it; KeyboardInterrupt is no longer swallowed
            raise Exception("Some model files might be missing...") from err

In [None]:
class dotdict(dict):
    """A ``dict`` whose entries can also be read, set and removed as attributes.

    Reading an attribute that is not a key follows ``dict.get`` semantics and
    yields ``None`` instead of raising ``AttributeError``. Removing an
    attribute that is not a key raises ``KeyError``.
    """

    def __getattr__(self, name, default=None):
        # Only reached when regular attribute lookup fails, so real dict
        # methods such as ``update`` or ``items`` are never shadowed by keys.
        return dict.get(self, name, default)

    def __setattr__(self, name, value):
        # Attribute assignment is stored as a dictionary item.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the dictionary item of the same name.
        dict.__delitem__(self, name)

In [None]:
# Run configuration handed to set_seed, load_tokenizer,
# convert_to_tensor_dataset and Trainer in the next cell.
# Entries tagged `#?` were marked as unconfirmed by the original author.
args = dotdict(seed=1, model_type='bert')

# Resolved from the model type chosen above.
args.model_name_or_path = MODEL_PATH_MAP[args.model_type]

args.update(
    dropout_rate=0.1,
    do_train=True,
    do_eval=True,
    ignore_index=0,  #?
    train_batch_size=32,
    max_steps=-1,
    task='atis',  #?
    no_cuda=False,
    weight_decay=0,
    num_train_epochs=50,
    gradient_accumulation_steps=1,  #?
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,  #?
    logging_steps=200,
    save_steps=200,
    max_grad_norm=1,
    eval_batch_size=64,
    use_crf=False,  #?
    model_dir='atis_model',
    slot_loss_coef=1,
    max_seq_len=50,
)

In [42]:
# Set up logging and seeding, then build the tokenizer used for encoding.
init_logger()
set_seed(args)
tokenizer = load_tokenizer(args)

# Encode both splits with identical settings (train first, then test).
train_dataset, test_dataset = [
    convert_to_tensor_dataset(split, args, tokenizer, args.ignore_index)
    for split in (train_data, test_data)
]

# The third positional argument is left as None; only the train and test
# datasets are supplied to the trainer.
trainer = Trainer(args, train_dataset, None, test_dataset)

if args.do_train:
    trainer.train()

if args.do_eval:
    # Evaluation runs on the model reloaded via load_model(), not on the
    # in-memory model left over from training.
    trainer.load_model()
    trainer.evaluate("test")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration:  58%|█████▊    | 87/150 [00:49<00:35,  1.77it/s][A
Iteration:  59%|█████▊    | 88/150 [00:49<00:34,  1.77it/s][A
Iteration:  59%|█████▉    | 89/150 [00:50<00:34,  1.75it/s][A
Iteration:  60%|██████    | 90/150 [00:50<00:34,  1.75it/s][A
Iteration:  61%|██████    | 91/150 [00:51<00:33,  1.76it/s][A
Iteration:  61%|██████▏   | 92/150 [00:52<00:32,  1.76it/s][A
Iteration:  62%|██████▏   | 93/150 [00:52<00:32,  1.77it/s][A
Iteration:  63%|██████▎   | 94/150 [00:53<00:31,  1.77it/s][A
Iteration:  63%|██████▎   | 95/150 [00:53<00:31,  1.75it/s][A
Iteration:  64%|██████▍   | 96/150 [00:54<00:30,  1.76it/s][A
Iteration:  65%|██████▍   | 97/150 [00:54<00:29,  1.77it/s][A
Iteration:  65%|██████▌   | 98/150 [00:55<00:29,  1.76it/s][A
Iteration:  66%|██████▌   | 99/150 [00:56<00:28,  1.77it/s][A09/08/2021 11:07:14 - INFO - __main__ -   ***** Running evaluation on test dataset *****
09/08/2021 11:07:14 - INFO -