In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 12.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 37.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

## Read Dataset

In [2]:
import json 
import os

In [3]:
class readData(object):

  def __init__(self, id, intent, positions, slots, text): 
    self.id = id
    self.intent = intent
    self.positions = positions
    self.slots = slots
    self.text = text

  def __repr__(self): 
    return str(json.dumps(self.__dict__, indent=2))

In [4]:
def readJson(filename): 
  if os.path.exists(filename): 
    intents = []

    with open(filename, "r", encoding="utf-8") as json_file: 
      data = json.load(json_file)

      for k in data.keys(): 
        intent = data[k]["intent"]
        positions = data[k]["positions"]
        slots = data[k]["slots"]
        text = data[k]["text"]

        temp = readData(k, intent, positions, slots, text)
        intents.append(temp)

    return intents
  else:
      raise FileNotFoundError("No file found with that path!")

In [7]:
train_data = readJson("train.json")

In [8]:
train_data[0]

{
  "id": "0",
  "intent": "AddToPlaylist",
  "positions": {
    "music_item": [
      6,
      9
    ],
    "playlist_owner": [
      14,
      15
    ],
    "playlist": [
      17,
      32
    ]
  },
  "slots": {
    "music_item": "tune",
    "playlist_owner": "my",
    "playlist": "elrow Guest List"
  },
  "text": "Add a tune to my elrow Guest List"
}

## Preprocess data

In [9]:
import tensorflow as tf
from transformers import BertTokenizer

In [10]:
model_name = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
def encode_texts(tokenizer, texts):
  return tokenizer(texts, padding=True, truncation=True, return_tensors="tf")

In [12]:
texts = [d.text for d in train_data]
tds = encode_texts(tokenizer, texts)
tds.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
encoded_texts = tds

### Encode Labels

In [14]:
intents = [d.intent for d in train_data]
intent_names = list(set(intents))
intent_names

['RateBook',
 'BookRestaurant',
 'GetWeather',
 'SearchCreativeWork',
 'PlayMusic',
 'AddToPlaylist',
 'SearchScreeningEvent']

In [15]:
intent_map = dict()
for idx, ui in enumerate(intent_names): 
  intent_map[ui] = idx

intent_map
  

{'AddToPlaylist': 5,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'PlayMusic': 4,
 'RateBook': 0,
 'SearchCreativeWork': 3,
 'SearchScreeningEvent': 6}

In [16]:
def encode_intents(intents, intent_map): 
  encoded = []
  for i in intents: 
    encoded.append(intent_map[i])

  return tf.convert_to_tensor(encoded, dtype="int32")
    

In [17]:
encoded_intents = encode_intents(intents, intent_map)

### Handle Slots

In [18]:
slot_names = set()
for td in train_data: 
  slots = td.slots
  for slot in slots: 
    slot_names.add(slot)

slot_names = list(slot_names)
slot_names.insert(0, "<PAD>")
slot_names

['<PAD>',
 'service',
 'year',
 'poi',
 'rating_value',
 'music_item',
 'object_select',
 'rating_unit',
 'object_name',
 'genre',
 'party_size_description',
 'location_name',
 'party_size_number',
 'object_location_type',
 'city',
 'geographic_poi',
 'served_dish',
 'condition_description',
 'object_type',
 'playlist_owner',
 'object_part_of_series_type',
 'restaurant_type',
 'state',
 'current_location',
 'track',
 'best_rating',
 'movie_type',
 'artist',
 'entity_name',
 'sort',
 'spatial_relation',
 'condition_temperature',
 'cuisine',
 'playlist',
 'album',
 'facility',
 'movie_name',
 'timeRange',
 'country',
 'restaurant_name']

In [19]:
train_data[0].slots


{'music_item': 'tune', 'playlist': 'elrow Guest List', 'playlist_owner': 'my'}

In [20]:
slot_map = dict() # slot -> index
for idx, us in enumerate(slot_names):
    slot_map[us] = idx
slot_map

{'<PAD>': 0,
 'album': 34,
 'artist': 27,
 'best_rating': 25,
 'city': 14,
 'condition_description': 17,
 'condition_temperature': 31,
 'country': 38,
 'cuisine': 32,
 'current_location': 23,
 'entity_name': 28,
 'facility': 35,
 'genre': 9,
 'geographic_poi': 15,
 'location_name': 11,
 'movie_name': 36,
 'movie_type': 26,
 'music_item': 5,
 'object_location_type': 13,
 'object_name': 8,
 'object_part_of_series_type': 20,
 'object_select': 6,
 'object_type': 18,
 'party_size_description': 10,
 'party_size_number': 12,
 'playlist': 33,
 'playlist_owner': 19,
 'poi': 3,
 'rating_unit': 7,
 'rating_value': 4,
 'restaurant_name': 39,
 'restaurant_type': 21,
 'served_dish': 16,
 'service': 1,
 'sort': 29,
 'spatial_relation': 30,
 'state': 22,
 'timeRange': 37,
 'track': 24,
 'year': 2}

In [21]:
def get_slot_from_word(word, slot_dict):
    for slot_label,value in slot_dict.items():
        if word in value.split():
            return slot_label
    return None

print(train_data[0].text)
print(train_data[0].slots)
print("slot_name for my is : ", get_slot_from_word("my", train_data[0].slots))

Add a tune to my elrow Guest List
{'music_item': 'tune', 'playlist_owner': 'my', 'playlist': 'elrow Guest List'}
slot_name for my is :  playlist_owner


In [22]:
import numpy as np

max_len = len(encoded_texts["input_ids"][0])

def encode_slots(all_slots, all_texts, 
                 toknizer, slot_map, max_len=max_len):
    encoded_slots = np.zeros(shape=(len(all_texts), max_len), dtype=np.int32)
    
    for idx, text in enumerate(all_texts):
        enc = [] 
        
        slot_names = all_slots[idx]
        
        
        raw_tokens = text.split()

       
        for rt in raw_tokens:
            
            bert_tokens = tokenizer.tokenize(rt)
            
            
            rt_slot_name = get_slot_from_word(rt, slot_names)
            if rt_slot_name is not None:
                
                enc.append(slot_map[rt_slot_name])
                enc.extend([slot_map[rt_slot_name]] * (len(bert_tokens) - 1))

            else:
              
                enc.append(0)

        
        
        encoded_slots[idx, 1:len(enc)+1] = enc
    
    return encoded_slots
    

In [23]:
all_slots = [td.slots for td in train_data]
all_texts = [td.text for td in train_data]

In [24]:
encoded_slots = encode_slots(all_slots, all_texts, tokenizer, slot_map)

In [25]:
encoded_slots[0]

array([ 0,  0,  0,  5,  0, 19, 33, 33, 33, 33,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

### Define Model 

In [26]:
from transformers import TFBertModel
from tensorflow.keras.layers import Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

class JointIntentAndSlotFillingModel(tf.keras.Model):

    def __init__(self, intent_num_labels=None, slot_num_labels=None,
                 model_name=model_name, dropout_prob=0.1):
        super().__init__(name="joint_intent_slot")
        self.bert = TFBertModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout_prob)
        self.intent_classifier = Dense(intent_num_labels,
                                       name="intent_classifier")
        self.slot_classifier = Dense(slot_num_labels,
                                     name="slot_classifier")

    def call(self, inputs, **kwargs):
        # two outputs from BERT
        trained_bert = self.bert(inputs, **kwargs)
        pooled_output = trained_bert.pooler_output
        sequence_output = trained_bert.last_hidden_state
        
        # sequence_output will be used for slot_filling / classification
        sequence_output = self.dropout(sequence_output,
                                       training=kwargs.get("training", False))
        slot_logits = self.slot_classifier(sequence_output)

        # pooled_output for intent classification
        pooled_output = self.dropout(pooled_output,
                                     training=kwargs.get("training", False))
        intent_logits = self.intent_classifier(pooled_output)

        return slot_logits, intent_logits

In [27]:
joint_model = JointIntentAndSlotFillingModel(
    intent_num_labels=len(intent_map), slot_num_labels=len(slot_map))

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [28]:
opt = Adam(learning_rate=3e-5, epsilon=1e-08)

# two outputs, one for slots, another for intents
# we have to fine tune for both
losses = [SparseCategoricalCrossentropy(from_logits=True),
          SparseCategoricalCrossentropy(from_logits=True)]

metrics = [SparseCategoricalAccuracy("accuracy")]
# compile model
joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

In [29]:
x = {"input_ids": encoded_texts["input_ids"], "token_type_ids": encoded_texts["token_type_ids"],  "attention_mask": encoded_texts["attention_mask"]}

history = joint_model.fit(
    x, (encoded_slots, encoded_intents), epochs=2, batch_size=32, shuffle=True)

Epoch 1/2
Epoch 2/2


### Inference 

In [30]:
def nlu(text, tokenizer, model, intent_names, slot_names):
    inputs = tf.constant(tokenizer.encode(text))[None, :]  # batch_size = 1
    outputs = model(inputs)
    slot_logits, intent_logits = outputs

    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, :]
    intent_id = intent_logits.numpy().argmax(axis=-1)[0]

    info = {"intent": intent_names[intent_id], "slots": {}}

    out_dict = {}
    
    predicted_slots = set([slot_names[s] for s in slot_ids if s != 0])
    for ps in predicted_slots:
      out_dict[ps] = []

    
    if text[0].islower():
      tokens = tokenizer.tokenize(text, add_special_tokens=True)
    else:
      tokens = tokenizer.tokenize(text)
    for token, slot_id in zip(tokens, slot_ids):
       
        slot_name = slot_names[slot_id]

        if slot_name == "<PAD>":
            continue

        
        collected_tokens = [token]
        idx = tokens.index(token)

        
        if token.startswith("##"):
          
          if tokens[idx - 1] not in out_dict[slot_name]:
            collected_tokens.insert(0, tokens[idx - 1])

        
        out_dict[slot_name].extend(collected_tokens)

    
    for slot_name in out_dict:
        tokens = out_dict[slot_name]
        slot_value = tokenizer.convert_tokens_to_string(tokens)

        info["slots"][slot_name] = slot_value.strip()

    return info


In [31]:
nlu("add Madchild to Electro Latino", tokenizer, joint_model, 
    intent_names, slot_names)

Keyword arguments {'add_special_tokens': True} not recognized.


{'intent': 'AddToPlaylist',
 'slots': {'artist': 'to',
  'entity_name': 'Madchild',
  'playlist': 'Electro Latino'}}

In [32]:
nlu("add Brian May to my Reggae Infusions list", tokenizer, joint_model, 
    intent_names, slot_names)

Keyword arguments {'add_special_tokens': True} not recognized.


{'intent': 'AddToPlaylist',
 'slots': {'artist': 'May to',
  'playlist': 'Reggae Infusions list',
  'playlist_owner': 'Reg'}}

In [33]:
import calendar
import time

# to generate timestamps for prediction file
def get_time_stamp():
    ts = calendar.timegm(time.gmtime())
    return ts

get_time_stamp()

1647939458

In [34]:
def read_dev_data(file="dev.json"):
    dev_texts = []
    with open(file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

        for k in data.keys():
          text = data[k]["text"]
          dev_texts.append(text)
          
    return dev_texts
dev_texts = read_dev_data()

In [None]:
from tqdm import tqdm

results = []
for i in tqdm(range(len(dev_texts))):
    res = nlu(dev_texts[i], tokenizer, joint_model, intent_names, slot_names)
    results.append(res)

In [36]:

results_dict = dict()

for idx, res in enumerate(results):
    results_dict[str(idx)] = res

In [37]:
with open("prediction.json", "w") as f:
    json.dump(results_dict, f, indent=2)

In [40]:
!head prediction.json

{
  "0": {
    "intent": "AddToPlaylist",
    "slots": {
      "entity_name": "changes & things",
      "playlist": "hot 50 play"
    }
  },
  "1": {
    "intent": "AddToPlaylist",


In [42]:
dev_texts[0]

'Add changes & things to hot 50 playlist'