---

### Access to Data and Env



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/VietNamese_SlotFilling_IntentDetect

/content/drive/MyDrive/VietNamese_SlotFilling_IntentDetect


---

### Env v2

In [5]:
!pip install transformers



In [6]:
# Import Package

# Torch Library
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Metric from sklearn
from sklearn.metrics import f1_score,accuracy_score

# Transformer to get Bert model
from transformers import AutoModel, AutoTokenizer

# Other
import random
import numpy as np
import pickle
from tqdm import tqdm
import math
import re
import pandas as pd
import os
import time

---

### Some Config of model



In [7]:
# Path
# Dataset Training Path
train_input_path = "./dataset/bkai_dataset/training_data/training_data/seq.in"
train_labels_path = "./dataset/bkai_dataset/training_data/training_data/label"
train_intentTag_path = "./dataset/bkai_dataset/training_data/training_data/seq.out"

# Data Augmentation Dataset
rs_augment_input_path = "./dataset/bkai_dataset/training_data/training_data/rs_augment_seq.in"
rs_augment_labels_path = "./dataset/bkai_dataset/training_data/training_data/rs_augment_intent_label.txt"
rs_augment_intentTag_path = "./dataset/bkai_dataset/training_data/training_data/rs_augment_seq.out"

# Dataset Test Path
dev_input_path = "./dataset/bkai_dataset/dev_data/dev_data/seq.in"
dev_labels_path = "./dataset/bkai_dataset/dev_data/dev_data/label"
dev_slotTag_path = "./dataset/bkai_dataset/dev_data/dev_data/seq.out"

# List output path
lst_labels_path = "./dataset/bkai_dataset/public_test_data/public_test_data/intent_label.txt"
lst_slotTag_path = "./dataset/bkai_dataset/public_test_data/public_test_data/slot_label.txt"

In [8]:
_fn="final" # file unique id for saving and loading models

MAX_LEN=128 # Length of tokens input BERT
ENV_BERT_ID_CLS=False # use cls token for id classification
ENV_EMBEDDING_SIZE=768 # dimention of embbeding, bertbase=768,bertlarge&elmo=1024
ENV_SEED=1331
ENV_CNN_FILTERS=128
ENV_CNN_KERNELS=4
ENV_HIDDEN_SIZE=ENV_CNN_FILTERS*ENV_CNN_KERNELS

#these are related to training
BATCH_SIZE=32
STEP_SIZE=10

# you must use cuda to run this code. if this returns false, you can not proceed.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    print("You are using cuda. Good!")
else:
    print('You are NOT using cuda! Some problems may occur.')

torch.manual_seed(ENV_SEED)
random.seed(ENV_SEED)

You are using cuda. Good!


---

### Some function


In [9]:
def add_paddings(seq_out, MAX_LEN):
    sout=[]
    for i in range(len(seq_out)):
        # add padding inside output tokens
        temp = seq_out[i]
        if len(temp)<MAX_LEN:
            while len(temp)<MAX_LEN:
                temp.append('<PAD>')
        else:
            temp = temp[:MAX_LEN]
        sout.append(temp)
    return sout

In [10]:
import torch
import torch.nn as nn

def get_subtoken_mask(current_tokens, bert_tokenizer, MAX_LEN):
    '''
    Description:
        Create attention masks for BERT-based models that consider both word-level and subtoken-level information.
    Args:
        current_tokens: A list of input text strings.
        bert_tokenizer: A BERT tokenizer object used to split text into subtokens.
        MAX_LEN: An integer representing the maximum length of the input sequences.
    Returns:
        sub_mask: tensor sub-mask of sentences for BERT-based models.
    '''
    temp_mask = []
    for i in current_tokens:
        temp_row_mask = []
        temp_row_mask.append(False)  # for cls token
        temp = bert_tokenizer.tokenize(i)
        for j in temp:
            temp_row_mask.append(j[:2] != "##")  # Check if subtoken is not a padding token
        while len(temp_row_mask) < MAX_LEN:
            temp_row_mask.append(False)  # Pad mask to maximum length
        temp_mask.append(temp_row_mask)
        if sum(temp_row_mask) != len(i.split(" ")):
            print(f"inconsistent:{temp}")
            print(i)
            print(sum(temp_row_mask))
            print(len(i.split(" ")))
    return torch.tensor(temp_mask).cuda()

In [11]:
# this function turns class text to id
def prepare_intent(intent, to_ix):
    '''
    Converts an intent text string to its corresponding integer ID.

    Args:
        intent (str): The text of the intent class.
        to_ix (dict): A dictionary mapping intent text strings to their integer IDs.

    Returns:
        list: The integer ID of the intent, or the ID of the "UNKNOWN" intent if the
             provided intent is not found in the dictionary.

    Raises:
        KeyError: If the provided intent is not found in the dictionary and there is no
                 "UNKNOWN" intent defined in the dictionary.
    '''
    idxs = to_ix[intent] if intent in to_ix.keys() else to_ix['<UNK>']
    return idxs

In [12]:
#this function converts tokens to ids and then to a tensor
def prepare_sequence(seq, to_ix):
    '''
    Converts a sequence of tokens to a PyTorch tensor of integer IDs.

    Args:
        seq (list): A list of tokens (words).
        to_ix (dict): A dictionary mapping tokens to their integer IDs.

    Returns:
        torch.Tensor: A PyTorch tensor of integer IDs, where each element corresponds
                      to the ID of the corresponding token in the original sequence.

    Raises:
        KeyError: If any token in the sequence is not found in the `to_ix` dictionary.

    '''
    idxs = list(map(lambda w: to_ix[w] if w in to_ix.keys() else to_ix['<UNK>'], seq))
    return idxs

# converts numbers to <NUM> TAG
def number_to_tag(txt):
    return "<NUM>" if txt.isdecimal() else txt

# Here we remove multiple spaces and punctuation which cause errors in tokenization for bert & elmo.
def remove_punc(mlist):
    mlist = [re.sub(" +"," ",t.split("\t")[0][4:-4]) for t in mlist] # remove spaces down to 1
    temp_train_tokens = []
    # punct remove example:  play samuel-el jackson from 2009 - 2010 > play samuelel jackson from 2009 - 2010
    for row in mlist:
        tokens = row.split(" ")
        newtokens = []
        for token in tokens:
            newtoken = re.sub(r"[.,'\"\\/\-:&’—=–官方杂志¡…“”~%]",r"",token) # remove punc
            newtoken = re.sub(r"[楽園追放�]",r"A",newtoken)
            newtokens.append(newtoken if len(token)>1 else token)
        if newtokens[-1]=="":
            newtokens.pop(-1)
        if newtokens[0]=="":
            newtokens.pop(0)
        temp_train_tokens.append(" ".join(newtokens))
    return temp_train_tokens

# To flatten all
flatten = lambda l: [number_to_tag(item) for sublist in l for item in sublist]

---

### Get datatrain and dev

In [13]:
def file2list(path):
    '''
    Get a list of text strings from a file.

    Args:
        path (str): The path to the file.

    Returns:
        list: A list of text strings.
    '''
    dataList = []

    with open(path, 'r') as f_r:
        data = f_r.readlines()
        for text in data:
            text = text.strip()
            dataList.append(text)

    return dataList

In [14]:
# Get data from file
train_text = file2list(train_input_path)
train_label = file2list(train_labels_path)
train_intentTag = file2list(train_intentTag_path)

dev_text = file2list(dev_input_path)
dev_label = file2list(dev_labels_path)
dev_intentTag = file2list(dev_slotTag_path)

augment_text = file2list(rs_augment_input_path)
augment_label = file2list(rs_augment_labels_path)
augment_intentTag = file2list(rs_augment_intentTag_path)

In [15]:
# Print example
print("Train Example: ")
print(train_text[0])
print(train_label[0])
print(train_intentTag[0])

print("Dev Example: ")
print(dev_text[0])
print(dev_label[0])
print(dev_intentTag[0])

Train Example: 
tăng bóng 3 26 phần trăm
smart.home.increase.percentage
O B-devicedevice I-devicedevice B-change-valuesyspercentage I-change-valuesyspercentage I-change-valuesyspercentage
Dev Example: 
anh ơi thiết bị là đầy đủ rgb 4
smart.home.decrease.percentage
O O O O O O O B-devicedevice I-devicedevice


In [16]:
print(train_text)

['tăng bóng 3 26 phần trăm', 'hãy tăng thêm độ sáng phòng ngủ con trai lên 91 phần trăm', 'giúp mình tăng đèn âm trần thứ 2 lên 10 phần trăm ở phòng con nhỏ 4 tầng 5 nhé', 'bạn có thể tăng giúp mình bóng chùm thứ 3 lên mức 21 phần trăm được không', 'tăng bình nóng lạnh 5 phần trăm', 'mình muốn tăng bóng vách', 'tăng đèn cảnh lên 23 phần trăm ở khách 2 6', 'bạn có thể giúp mình tăng mức độ của đèn màu bên phòng giặt ủi 4 lên 8 phần trăm được không', 'bạn tăng đèn hắt tường thứ 1 ở phòng sách 5 lên 13 phần trăm giúp mình nhé', 'tăng đèn treo tường lên 7 phần trăm giúp mình', 'hãy tăng giúp ta cái nóng lạnh 4 ở phòng trẻ em 5', 'tăng thêm độ sáng bóng trụ cổng 3 lễ tân 4 lên 82 phần trăm', 'tăng điện thứ 1 lên 5 phần trăm giúp mình', 'tăng hắt trần thứ 2 lên 22 phần trăm', 'tăng bóng kiểng thứ 3 lên 1 phần trăm', 'tăng thêm ánh sáng bóng hắt tường 2 lên mức 51 phần trăm', 'bạn tăng bóng led 3 ở phòng tắm xông hơi 2 phòng 10 lên 26 phần trăm hộ mình với bôi đen bạn bỏ dấu ngoặc kép ra nhé 

In [17]:
# Print Augment data
print("Augmentation Data:")
print(len(augment_text))
print(augment_text[0])
print(augment_label[0])
print(augment_intentTag[0])

Augmentation Data:
4496
tăng đèn học 1 28 phần trăm
smart.home.increase.percentage
O B-devicedevice I-devicedevice I-devicedevice B-change-valuesyspercentage I-change-valuesyspercentage I-change-valuesyspercentage


In [18]:
print(augment_text)

['tăng đèn học 1 28 phần trăm', 'hãy tăng thêm độ sáng phòng ngủ con trai lên 42 phần trăm', 'giúp mình tăng đèn âm trần thứ 2 lên 42 phần trăm ở phòng con nhỏ 4 tầng 5 nhé', 'bạn có thể tăng giúp mình bóng chùm thứ 3 lên mức 72 phần trăm được không', 'tăng downlight thứ 4 76 phần trăm', 'mình muốn tăng đèn bếp', 'tăng đèn cảnh lên 61 phần trăm ở khách 2 6', 'bạn có thể giúp mình tăng mức độ của bóng trụ cổng bên phòng giặt ủi 4 lên 37 phần trăm được không', 'bạn tăng đèn trang trí thứ 2 ở phòng sách 5 lên 43 phần trăm giúp mình nhé', 'tăng đèn treo tường lên 5 phần trăm giúp mình', 'hãy tăng giúp ta cái loa còi 2 ở phòng trẻ em 5', 'chạy độ sáng bóng trụ cổng 3 lễ tân 4 lên 20 phần trăm', 'tăng điện thứ 1 lên 1 phần trăm giúp mình', 'tăng hắt trần thứ 2 lên 87 phần trăm', 'tăng kiểng thứ 2 lên 65 phần trăm', 'tăng thêm ánh sáng bóng hắt tường 2 lên mức 13 phần trăm', 'bạn tăng bóng led 3 ở phòng tắm xông hơi 2 phòng 10 lên 87 phần trăm hộ mình với bôi đen bạn bỏ dấu ngoặc kép ra nhé v

In [19]:
# merge
train_text = train_text + augment_text
train_label = train_label + augment_label
train_intentTag = train_intentTag + augment_intentTag

In [20]:
# Print example
print("Train Example: ")
print(len(train_text))
print(train_text[300])
print(train_label[300])
print(train_intentTag[300])


Train Example: 
6286
cài đặt cho mình cột đèn 2 ở phòng ăn 1 4 mức độ 4 với
smart.home.set.level
B-commandcommand I-commandcommand O O B-devicedevice I-devicedevice I-devicedevice O B-roomroom I-roomroom I-roomroom B-sysnumbersysnumber O O O O


---

### Pre Processing

In [21]:
import string
import re

def remove_dup(text):

    def replace(match):
        m = match.group(0)
        if d[m[0]] == d[m[1]]:
            return m[0]
        else:
            return m[0] + m[1]

    uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
    unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"
    uniChars += string.ascii_letters
    unsignChars += string.ascii_letters

    d = {k: v for (k, v) in zip(uniChars, unsignChars)}
    return re.sub(fr'\S([{uniChars}])\1+', replace, text)

In [22]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.0-py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: underthesea-core, python-crfsuite, underthesea
Successfully installed python-crfsuite-0.9.10 underthesea-6.8.0 underthesea-core-1.0.4


In [23]:
from underthesea import text_normalize

def normalize_text(text):
    text = text_normalize(text)
    text = remove_dup(text)
    return text

train_text = [normalize_text(text) for text in train_text]
dev_text = [normalize_text(text) for text in dev_text]

In [24]:
print("Train Text: ")
print(train_text[0])
print("Dev Text: ")
print(dev_text[0])

Train Text: 
tăng bóng 3 26 phần trăm
Dev Text: 
anh ơi thiết bị là đầy đủ rgb 4


---

### Get the dictionary of text, label and intentTag

In [25]:
# Get all unique tokens from labels
unique_labels = set(train_label)
print(unique_labels)
print(len(unique_labels))

{'smart.home.device.onoff', 'smart.home.set.percentage', 'smart.home.set.level', 'smart.home.decrease.level', 'smart.home.decrease.percentage', 'smart.home.increase.percentage', 'greeting', 'smart.home.check.status', 'smart.home.increase.level', 'smart.home.set.color'}
10


In [26]:
# Create dictionary token for labels
#initialize intent to index
label2index={'UNKNOWN':0}
for label in unique_labels:
    if label not in label2index.keys():
        label2index[label] = len(label2index)

# Covert from index to labels
index2intent = {v:k for k,v in label2index.items()}

In [27]:
print(label2index)
print(len(label2index))

{'UNKNOWN': 0, 'smart.home.device.onoff': 1, 'smart.home.set.percentage': 2, 'smart.home.set.level': 3, 'smart.home.decrease.level': 4, 'smart.home.decrease.percentage': 5, 'smart.home.increase.percentage': 6, 'greeting': 7, 'smart.home.check.status': 8, 'smart.home.increase.level': 9, 'smart.home.set.color': 10}
11


In [28]:
print(label2index)
print(len(label2index))

{'UNKNOWN': 0, 'smart.home.device.onoff': 1, 'smart.home.set.percentage': 2, 'smart.home.set.level': 3, 'smart.home.decrease.level': 4, 'smart.home.decrease.percentage': 5, 'smart.home.increase.percentage': 6, 'greeting': 7, 'smart.home.check.status': 8, 'smart.home.increase.level': 9, 'smart.home.set.color': 10}
11


In [29]:
# Get all unique tokens from intentTag
intentTag = []
for tag in train_intentTag:
    intentTag.extend(tag.split())

unique_intentTag = set(intentTag)
print(unique_intentTag)
print(len(unique_intentTag))

{'O', 'B-final-valuesyspercentage', 'I-final-valuesyspercentage', 'B-final-valuesysnumber', 'B-change-valuesysnumber', 'B-sysnumbersysnumber', 'B-colorcolor', 'I-colorcolor', 'B-statusstatus', 'I-floornumberfloornumber', 'I-commandcommand', 'I-statusstatus', 'B-devicedevice', 'I-change-valuesyspercentage', 'I-roomroom', 'B-roomroom', 'B-allall', 'B-floornumberfloornumber', 'B-change-valuesyspercentage', 'B-commandcommand', 'I-devicedevice'}
21


In [30]:
# Create a dictionảy token for intentTag
# Tag dictionary
tag2index = {'<BOS>':0, '<PAD>' : 1, '<EOS>':2, '<UNK>':3}

for tag in unique_intentTag:
    if tag not in tag2index.keys():
        tag2index[tag] = len(tag2index)

# Covert from index to tag
index2tag = {v:k for k,v in tag2index.items()}

In [31]:
print(tag2index)
print(len(tag2index))

{'<BOS>': 0, '<PAD>': 1, '<EOS>': 2, '<UNK>': 3, 'O': 4, 'B-final-valuesyspercentage': 5, 'I-final-valuesyspercentage': 6, 'B-final-valuesysnumber': 7, 'B-change-valuesysnumber': 8, 'B-sysnumbersysnumber': 9, 'B-colorcolor': 10, 'I-colorcolor': 11, 'B-statusstatus': 12, 'I-floornumberfloornumber': 13, 'I-commandcommand': 14, 'I-statusstatus': 15, 'B-devicedevice': 16, 'I-change-valuesyspercentage': 17, 'I-roomroom': 18, 'B-roomroom': 19, 'B-allall': 20, 'B-floornumberfloornumber': 21, 'B-change-valuesyspercentage': 22, 'B-commandcommand': 23, 'I-devicedevice': 24}
25


In [32]:
# Khởi tạo dictionary cho input
train_toks_text = []

for sen_text in train_text:
    listSenText = sen_text.split()
    train_toks_text.append(listSenText)

for test_text in dev_text:
    devSenText = test_text.split()
    train_toks_text.append(devSenText)


vocab = []
for lstSen in train_toks_text:
    vocab.extend(lstSen)

vocab = set(vocab)

# making dictionary (token:id), initial value
word2index = {'<PAD>': 1, '<UNK>':0,'<BOS>':2,'<EOS>':3,'<NUM>':4}
# add rest of token list to dictionary
for token in vocab:
    if token not in word2index.keys():
        word2index[token]=len(word2index)

#make id to token list ( reverse )
index2word = {v:k for k,v in word2index.items()}

print(word2index)

{'<PAD>': 1, '<UNK>': 0, '<BOS>': 2, '<EOS>': 3, '<NUM>': 4, '49': 5, 'cần': 6, '39': 7, 'trên': 8, '73': 9, 'rom': 10, 'thật': 11, '22': 12, '81': 13, 'chính': 14, 'đài': 15, 'chùm': 16, 'khởi': 17, 'thống': 18, '17': 19, 'đầu': 20, 'nhanh': 21, 'mọi': 22, 'tôi': 23, 'bắt': 24, 'ảo': 25, 'màu': 26, 'hệ': 27, 'cây': 28, 'nhée': 29, 'tiện': 30, 'sang': 31, 'quang': 32, 'ná': 33, 'vấn': 34, '72': 35, '20': 36, '60': 37, 'nhen': 38, 'nghiệm': 39, 'kiểm': 40, 'người': 41, 'rèm': 42, 'thái': 43, 'với': 44, 'quét': 45, 'nha': 46, 'mặt': 47, 'bòng': 48, 'trời': 49, 'nhất': 50, '53': 51, 'thang97': 52, 'alo': 53, 'cập': 54, 'bình': 55, 'bại': 56, 'và': 57, 'tường': 58, 'cậu': 59, 'đề': 60, '11': 61, 'biển': 62, 'bóng': 63, 'ngơi': 64, 'sinh': 65, '4': 66, 'nhà': 67, '48': 68, 'có': 69, 'viện': 70, 'các': 71, 'đánh': 72, 'cơ': 73, '59': 74, 'rồi': 75, 'cáo': 76, '97': 77, '33': 78, '64': 79, 'hồng': 80, 'bôi': 81, 'ơi': 82, 'gara': 83, 'tắm': 84, 'hồ': 85, 'ơn': 86, '100': 87, 'ốp': 88, 'nếu': 

---

### Convert all Input




In [33]:
# Convert from label
train_num_label = [prepare_intent(temp,label2index) for temp in train_label]
dev_num_label = [prepare_intent(temp,label2index) for temp in dev_label]

In [34]:
print(train_num_label)
print(len(train_num_label))
print(dev_num_label)
print(len(dev_num_label))

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 

In [35]:
# Convert from content Tag
# Convert from to list per tag
lst_slotTag = []
for tag in train_intentTag:
    lst_slotTag.append(tag.split())

train_num_slotTag = []
for sen_slotTag in lst_slotTag:
    sen_slotTag.extend(['<PAD>']*(MAX_LEN-len(sen_slotTag)))
    sen_slotTag = [prepare_intent(temp, tag2index) for temp in sen_slotTag]
    train_num_slotTag.append(sen_slotTag)

print(train_num_slotTag[0])
print(len(train_num_slotTag[0]))


[4, 16, 24, 22, 17, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
128


In [36]:
# Do the same with dev
dev_lst_slotTag = []
for dev_tag in dev_intentTag:
    dev_lst_slotTag.append(dev_tag.split())
print(dev_lst_slotTag[0])
print(len(dev_lst_slotTag[0]))

dev_num_slotTag = []
for slotTag in dev_lst_slotTag:
    slotTag.extend(['<PAD>']*(MAX_LEN-len(slotTag)))
    slotTag = [prepare_intent(temp, tag2index) for temp in slotTag]
    dev_num_slotTag.append(slotTag)

print(dev_num_slotTag[0])
print(len(dev_num_slotTag[0]))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-devicedevice', 'I-devicedevice']
9
[4, 4, 4, 4, 4, 4, 4, 16, 24, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
128


In [37]:
# Convert the text
train_lst_text = []
for text in train_text:
    train_lst_text.append(text.split())

print(train_lst_text[0])

['tăng', 'bóng', '3', '26', 'phần', 'trăm']


In [38]:
train_num_text = []
for trainText in train_lst_text:
    trainText.extend(['<PAD>']*(MAX_LEN-len(trainText)))
    trainText = [prepare_intent(temp, word2index) for temp in trainText]
    train_num_text.append(trainText)

print(train_num_text[0])
print(len(train_num_text[0]))

[479, 63, 198, 553, 122, 348, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
128


In [39]:
# Do the same with dev_test
dev_lst_text = []
for devText in dev_text:
    dev_lst_text.append(devText.split())

dev_num_text = []
for devText in dev_lst_text:
    devText.extend(['<PAD>']*(MAX_LEN-len(devText)))
    devText = [prepare_intent(temp, word2index) for temp in devText]
    dev_num_text.append(devText)

print(dev_num_text[0])
print(len(dev_num_text[0]))


[241, 82, 305, 514, 237, 98, 537, 458, 66, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
128


---

### Get tokennization


In [40]:
print(train_text)

['tăng bóng 3 26 phần trăm', 'hãy tăng thêm độ sáng phòng ngủ con trai lên 91 phần trăm', 'giúp mình tăng đèn âm trần thứ 2 lên 10 phần trăm ở phòng con nhỏ 4 tầng 5 nhé', 'bạn có thể tăng giúp mình bóng chùm thứ 3 lên mức 21 phần trăm được không', 'tăng bình nóng lạnh 5 phần trăm', 'mình muốn tăng bóng vách', 'tăng đèn cảnh lên 23 phần trăm ở khách 2 6', 'bạn có thể giúp mình tăng mức độ của đèn màu bên phòng giặt ủi 4 lên 8 phần trăm được không', 'bạn tăng đèn hắt tường thứ 1 ở phòng sách 5 lên 13 phần trăm giúp mình nhé', 'tăng đèn treo tường lên 7 phần trăm giúp mình', 'hãy tăng giúp ta cái nóng lạnh 4 ở phòng trẻ em 5', 'tăng thêm độ sáng bóng trụ cổng 3 lễ tân 4 lên 82 phần trăm', 'tăng điện thứ 1 lên 5 phần trăm giúp mình', 'tăng hắt trần thứ 2 lên 22 phần trăm', 'tăng bóng kiểng thứ 3 lên 1 phần trăm', 'tăng thêm ánh sáng bóng hắt tường 2 lên mức 51 phần trăm', 'bạn tăng bóng led 3 ở phòng tắm xông hơi 2 phòng 10 lên 26 phần trăm hộ mình với bôi đen bạn bỏ dấu ngoặc kép ra nhé 

In [41]:
# Add PhoBert Tokenization
from transformers import AutoTokenizer

phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")  # Or PhoBert-large

dataset_toks = phobert_tokenizer.batch_encode_plus(train_text,
                                              max_length=MAX_LEN ,
                                              add_special_tokens=True,
                                              return_tensors='pt',
                                              return_attention_mask=True,
                                              padding='max_length',
                                              truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [42]:
print(dataset_toks['input_ids'][0])
print(dataset_toks['attention_mask'][0])
print(dataset_toks['token_type_ids'][0])
print(dataset_toks['input_ids'].shape)
print(dataset_toks['attention_mask'].shape)
print(dataset_toks['token_type_ids'].shape)

tensor([   0,  128,  301,  107, 1742,  230,  877,    2,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [43]:
# Do the same with dev dataset
dev_toks = phobert_tokenizer.batch_encode_plus(dev_text,
                                              max_length=MAX_LEN ,
                                              add_special_tokens=True,
                                              return_tensors='pt',
                                              return_attention_mask=True,
                                              padding='max_length',
                                              truncation=True)

---
### Get subtokens mask


In [44]:
train_subtoken_mask = get_subtoken_mask(train_text,phobert_tokenizer, MAX_LEN)
print(train_subtoken_mask[0])
print(len(train_subtoken_mask))

inconsistent:['hãy', 'tăng', 'giúp', 'ta', 'cái', 'b@@', 'òng', 'hắt', 'trần', '1', 'ở', 'phòng', 'ăn', 'trưa', '1']
hãy tăng giúp ta cái bòng hắt trần 1 ở phòng ăn trưa 1
15
14
inconsistent:['tăng', 'đến', 'mỗi', 't@@', 'vs', 'thứ', '2', 'lên', '95', 'phần', 'trăm']
tăng đến mỗi tvs thứ 2 lên 95 phần trăm
11
10
inconsistent:['trợ', 'lý', 'béo', 'tăng', 'dow@@', 'n@@', 'light', 'thứ', '2', 'lên', '4', 'phần', 'trăm', 'ở', 'sân', 'sau', 'phòng', '8']
trợ lý béo tăng downlight thứ 2 lên 4 phần trăm ở sân sau phòng 8
18
16
inconsistent:['mình', 'cần', 'tăng', 'thiết', 'bị', 't@@', 'vs', 'thứ', '4', 'lên', '10', 'phần', 'trăm', 'với']
mình cần tăng thiết bị tvs thứ 4 lên 10 phần trăm với
14
13
inconsistent:['làm', 'ơn', 'tăng', 't@@', 'vs', '3', 'lên', '52', 'phần', 'trăm', 'giúp', 'tôi']
làm ơn tăng tvs 3 lên 52 phần trăm giúp tôi
12
11
inconsistent:['tăng', 't@@', 'vs', 'thứ', '1', 'lên', '21', 'phần', 'trăm']
tăng tvs thứ 1 lên 21 phần trăm
9
8
inconsistent:['n@@', 'ice', 'giúp', 'mình'

In [45]:
# Do the same in dev dataset
dev_subtoken_mask = get_subtoken_mask(dev_text,phobert_tokenizer, MAX_LEN)
print(dev_subtoken_mask[0])
print(len(dev_subtoken_mask))

inconsistent:['anh', 'ơi', 'thiết', 'bị', 'là', 'đầy', 'đủ', 'r@@', 'g@@', 'b', '4']
anh ơi thiết bị là đầy đủ rgb 4
11
9
inconsistent:['chào', 'bạn', 'kiểm', 'soát', 'được', 'bóng', 'dow@@', 'n@@', 'light', '2', 'ở', 'phòng', 'ngủ', 'con', 'trai', '5', 'trong', 'nhà', 'không']
chào bạn kiểm soát được bóng downlight 2 ở phòng ngủ con trai 5 trong nhà không
19
17
inconsistent:['giúp', 'tui', 'giảm', 'thiết', 'bị', 'dow@@', 'n@@', 'light', '1']
giúp tui giảm thiết bị downlight 1
9
7
inconsistent:['chào', 'giảm', 'giúp', 'tôi', 'b@@', 'òng', 'hắt', 'trần', 'thứ', '4', 'trong', 'phòng', 'nghỉ', '4']
chào giảm giúp tôi bòng hắt trần thứ 4 trong phòng nghỉ 4
14
13
inconsistent:['à', 'biết', 'r', 'kiểm', 'tra', 'cho', 'tôi', 'đèn', 'dow@@', 'n@@', 'light', '4', 'ở', 'nhà', 'vệ', 'sinh', 'nhé', 'v']
à biết r kiểm tra cho tôi đèn downlight 4 ở nhà vệ sinh nhé v
18
16
inconsistent:['hệ', 'th@@', 'ống', 'gì', 'vậy', 'kiểm', 'tra', 'ốp', 'trần', '3', 'nhé']
hệ thống gì vậy kiểm tra ốp trần 3 nhé
1

---

### DataLoader

In [46]:
#defining datasets.
def remove_values_from_list(the_list, val):
   return [value for value in the_list if value != val]

class NLUDataset(Dataset):
    def __init__(self, sin,sout,intent,input_ids,attention_mask,token_type_ids,subtoken_mask):
        self.test = sin
        self.sin = Variable(torch.LongTensor(sin)).cuda() if USE_CUDA else Variable(torch.LongTensor(sin))
        self.sout = Variable(torch.LongTensor(sout)).cuda() if USE_CUDA else Variable(torch.LongTensor(sin))
        self.intent = Variable(torch.LongTensor(intent)).cuda() if USE_CUDA else Variable(torch.LongTensor(sin))
        self.input_ids=input_ids.cuda()
        self.attention_mask=attention_mask.cuda()
        self.token_type_ids=token_type_ids.cuda()
        self.subtoken_mask=subtoken_mask.cuda()
        self.x_mask = [Variable(torch.BoolTensor(tuple(map(lambda s: s ==1, t )))).cuda() for t in self.sin]
    def __len__(self):
        return len(self.intent)
    def __getitem__(self, idx):
        sample = self.sin[idx],self.sout[idx],self.intent[idx],self.input_ids[idx],self.attention_mask[idx],self.token_type_ids[idx],self.subtoken_mask[idx],self.x_mask[idx]
        return sample

#making single list
train_data=NLUDataset(train_num_text, train_num_slotTag, train_num_label, dataset_toks['input_ids'], dataset_toks['attention_mask'], dataset_toks['token_type_ids'],train_subtoken_mask)
test_data=NLUDataset(dev_num_text, dev_num_slotTag, dev_num_label, dev_toks['input_ids'], dev_toks['attention_mask'], dev_toks['token_type_ids'],dev_subtoken_mask)

In [47]:
train_data.__len__()

6286

In [48]:
train_data.__getitem__(4)

(tensor([479,  55, 251, 152, 371, 122, 348,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1], device='cuda:0'),
 tensor([ 4, 16, 24, 24, 22, 17, 17,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        

In [49]:
train_data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_data = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

---

### Model

In [50]:

# generates transformer mask
def generate_square_subsequent_mask(sz: int) :
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
def generate_square_diagonal_mask(sz: int) :
    """Generates a matrix which there are zeros on diag and other indexes are -inf."""
    return torch.triu(torch.ones(sz,sz)-float('inf'), diagonal=1)+torch.tril(torch.ones(sz,sz)-float('inf'), diagonal=-1)
# positional embedding used in transformers
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


#start of the shared encoder
class BertLayer(nn.Module):
    def __init__(self):
        super(BertLayer, self).__init__()
        self.bert_model = AutoModel.from_pretrained("vinai/phobert-base")


    def forward(self, bert_info=None):
        (bert_tokens, bert_mask, bert_tok_typeid) = bert_info
        bert_encodings = self.bert_model(bert_tokens, bert_mask, bert_tok_typeid)
        bert_last_hidden = bert_encodings['last_hidden_state']
        bert_pooler_output = bert_encodings['pooler_output']
        return bert_last_hidden, bert_pooler_output


class Encoder(nn.Module):
    def __init__(self, p_dropout=0.5):
        super(Encoder, self).__init__()
        self.filter_number = ENV_CNN_FILTERS
        self.kernel_number = ENV_CNN_KERNELS  # tedad size haye filter : 2,3,5 = 3
        self.embedding_size = ENV_EMBEDDING_SIZE
        self.activation = nn.ReLU()
        self.p_dropout = p_dropout
        self.softmax = nn.Softmax(dim=1)
        self.conv1 = nn.Conv1d(in_channels=self.embedding_size, out_channels=self.filter_number, kernel_size=(2,),
                               padding="same", padding_mode="zeros")
        self.conv2 = nn.Conv1d(in_channels=self.embedding_size, out_channels=self.filter_number, kernel_size=(3,),
                               padding="same", padding_mode="zeros")
        self.conv3 = nn.Conv1d(in_channels=self.embedding_size, out_channels=self.filter_number, kernel_size=(5,),
                               padding="same", padding_mode="zeros")
        self.conv4 = nn.Conv1d(in_channels=self.embedding_size, out_channels=self.filter_number, kernel_size=(1,),
                               padding="same", padding_mode="zeros")

    def forward(self, bert_last_hidden):
        trans_embedded = torch.transpose(bert_last_hidden, dim0=1, dim1=2)
        convolve1 = self.activation(self.conv1(trans_embedded))
        convolve2 = self.activation(self.conv2(trans_embedded))
        convolve3 = self.activation(self.conv3(trans_embedded))
        convolve4 = self.activation(self.conv4(trans_embedded))
        convolve1 = torch.transpose(convolve1, dim0=1, dim1=2)
        convolve2 = torch.transpose(convolve2, dim0=1, dim1=2)
        convolve3 = torch.transpose(convolve3, dim0=1, dim1=2)
        convolve4 = torch.transpose(convolve4, dim0=1, dim1=2)
        output = torch.cat((convolve4, convolve1, convolve2, convolve3), dim=2)
        return output


In [51]:
#Middle
class Middle(nn.Module):
    def __init__(self ,p_dropout=0.5):
        super(Middle, self).__init__()
        self.activation = nn.ReLU()
        self.p_dropout = p_dropout
        self.softmax = nn.Softmax(dim=1)
        #Transformer
        nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
        self.pos_encoder = PositionalEncoding(ENV_HIDDEN_SIZE, dropout=0.1)
        encoder_layers = nn.TransformerEncoderLayer(ENV_HIDDEN_SIZE, nhead=2,batch_first=True, dim_feedforward=2048 ,activation="relu", dropout=0.1)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers,enable_nested_tensor=False)
        self.transformer_mask = generate_square_subsequent_mask(MAX_LEN).cuda()

    def forward(self, fromencoder,input_masking,training=True):
        src = fromencoder * math.sqrt(ENV_HIDDEN_SIZE)
        src = self.pos_encoder(src)
        output = (self.transformer_encoder(src,src_key_padding_mask=input_masking)) # outputs probably
        return output

In [52]:
#start of the decoder
class Decoder(nn.Module):

    def __init__(self,slot_size,intent_size,dropout_p=0.5):
        super(Decoder, self).__init__()
        self.slot_size = slot_size
        self.intent_size = intent_size
        self.dropout_p = dropout_p
        self.softmax= nn.Softmax(dim=1)
        # Define the layers
        self.embedding = nn.Embedding(self.slot_size, ENV_HIDDEN_SIZE)
        self.activation = nn.ReLU()
        self.dropout1 = nn.Dropout(self.dropout_p)
        self.dropout2 = nn.Dropout(self.dropout_p)
        self.dropout3 = nn.Dropout(self.dropout_p)
        self.slot_trans = nn.Linear(ENV_HIDDEN_SIZE, self.slot_size)
        self.intent_out = nn.Linear(ENV_HIDDEN_SIZE,self.intent_size)
        self.intent_out_cls = nn.Linear(ENV_EMBEDDING_SIZE,self.intent_size) # dim of bert
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=ENV_HIDDEN_SIZE, nhead=2,batch_first=True,dim_feedforward=300 ,activation="relu")
        self.transformer_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=2)
        self.transformer_mask = generate_square_subsequent_mask(MAX_LEN).cuda()
        self.transformer_diagonal_mask = generate_square_diagonal_mask(MAX_LEN).cuda()
        self.pos_encoder = PositionalEncoding(ENV_HIDDEN_SIZE, dropout=0.1)
        self.self_attention = nn.MultiheadAttention(embed_dim=ENV_HIDDEN_SIZE
                                                    ,num_heads=8,dropout=0.1
                                                    ,batch_first=True)
        self.layer_norm = nn.LayerNorm(ENV_HIDDEN_SIZE)


    def forward(self, input,encoder_outputs,encoder_maskings,bert_subtoken_maskings=None,infer=False):
        # encoder outputs: BATCH,LENGTH,Dims (16,60,1024)
        batch_size = encoder_outputs.shape[0]
        length = encoder_outputs.size(1) #for every token in batches
        embedded = self.embedding(input)

        # print("NOT CLS")
        encoder_outputs2=encoder_outputs
        context,attn_weight = self.self_attention(encoder_outputs2,encoder_outputs2,encoder_outputs2
                                                  ,key_padding_mask=encoder_maskings)
        encoder_outputs2 = self.layer_norm(self.dropout2(context))+encoder_outputs2
        sum_mask = (~encoder_maskings).sum(1).unsqueeze(1)
        sum_encoder = ((((encoder_outputs2)))*((~encoder_maskings).unsqueeze(2))).sum(1)
        intent_score = self.intent_out(self.dropout1(sum_encoder/sum_mask)) # B,D


        newtensor = torch.cuda.FloatTensor(batch_size, length,ENV_HIDDEN_SIZE).fill_(0.) # size of newtensor same as original
        for i in range(batch_size): # per batch
            newtensor_index=0
            for j in range(length): # for each token
                if bert_subtoken_maskings[i][j].item()==1:
                    newtensor[i][newtensor_index] = encoder_outputs[i][j]
                    newtensor_index+=1

        if infer==False:
            embedded=embedded*math.sqrt(ENV_HIDDEN_SIZE)
            embedded = self.pos_encoder(embedded)
            zol = self.transformer_decoder(tgt=embedded,memory=newtensor
                                           ,memory_mask=self.transformer_diagonal_mask
                                           ,tgt_mask=self.transformer_mask)

            scores = self.slot_trans(self.dropout3(zol))
            slot_scores = F.log_softmax(scores,dim=2)
        else:
            bos = Variable(torch.LongTensor([[tag2index['<BOS>']]*batch_size])).cuda().transpose(1,0)
            bos = self.embedding(bos)
            tokens=bos
            for i in range(length):
                temp_embedded=tokens*math.sqrt(ENV_HIDDEN_SIZE)
                temp_embedded = self.pos_encoder(temp_embedded)
                zol = self.transformer_decoder(tgt=temp_embedded,
                                               memory=newtensor,
                                               tgt_mask=self.transformer_mask[:i+1,:i+1],
                                               memory_mask=self.transformer_diagonal_mask[:i+1,:]
                                               )
                scores = self.slot_trans(self.dropout3(zol))
                softmaxed = F.log_softmax(scores,dim=2)
                #the last token is apended to vectors
                _,input = torch.max(softmaxed,2)
                newtok = self.embedding(input)
                tokens=torch.cat((bos,newtok),dim=1)
            slot_scores = softmaxed

        return slot_scores.view(input.size(0)*length,-1), intent_score

---

### Run code

In [53]:
from torch.nn import functional as F

smoothing = 0.1

def smooth_one_hot(labels, epsilon=smoothing, num_classes=0):
    """
    Applies label smoothing to one-hot encoded labels.

    Args:
        labels (torch.Tensor): One-hot encoded labels with shape (batch_size, num_classes).
        epsilon (float, optional): Smoothing factor. Defaults to 0.1.

    Returns:
        torch.Tensor: Smoothed labels with the same shape as input.
    """
    return (1 - epsilon) * labels + epsilon * labels.new_full((labels.size()), 1 / num_classes)

def smooth_labels(labels, epsilon=smoothing, num_classes=0):
    """
    Applies label smoothing to raw integer labels.

    Args:
        labels (torch.Tensor): Raw integer labels with shape (batch_size,).
        epsilon (float, optional): Smoothing factor. Defaults to 0.1.

    Returns:
        torch.Tensor: Smoothed one-hot encoded labels with shape (batch_size, num_classes).
    """
    one_hot = F.one_hot(labels, num_classes=num_classes)
    return smooth_one_hot(one_hot, epsilon, num_classes=num_classes)

In [54]:
# For loss_function_1 (with ignore_index):
def loss_function_1_smoothed(outputs, targets, num_classes=len(tag2index)):
    smoothed_targets = smooth_labels(targets.masked_fill_(targets == 0, -100), epsilon=smoothing, num_classes=num_classes)  # Ignore index handling
    return F.cross_entropy(outputs, smoothed_targets, ignore_index=-100)

# For loss_function_2 (without ignore_index):
def loss_function_2_smoothed(outputs, targets, num_classes=len(label2index)):
    smoothed_targets = smooth_labels(targets, epsilon=smoothing, num_classes=len(label2index))
    return F.cross_entropy(outputs, smoothed_targets)

In [55]:
bert_layer = BertLayer()
encoder = Encoder(len(word2index))
middle = Middle()
decoder = Decoder(len(tag2index),len(label2index))
if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    middle = middle.cuda()
    bert_layer.cuda()

dec_optim = optim.AdamW(decoder.parameters(),lr=0.0001)
enc_optim = optim.AdamW(encoder.parameters(),lr=0.001)
ber_optim = optim.AdamW(bert_layer.parameters(),lr=0.0001)
mid_optim = optim.AdamW(middle.parameters(), lr=0.0001)
enc_scheduler = torch.optim.lr_scheduler.StepLR(enc_optim, 1, gamma=0.96)
dec_scheduler = torch.optim.lr_scheduler.StepLR(dec_optim, 1, gamma=0.96)
mid_scheduler = torch.optim.lr_scheduler.StepLR(mid_optim, 1, gamma=0.96)
ber_scheduler = torch.optim.lr_scheduler.StepLR(ber_optim, 1, gamma=0.96)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [56]:
def mask_important_tags(predictions,tags,masks):
    result_tags=[]
    result_preds=[]
    for pred,tag,mask in zip(predictions.tolist(),tags.tolist(),masks.tolist()):
        #index [0] is to get the data
        for p,t,m in zip(pred,tag,mask):
            if not m:
                result_tags.append(p)
                result_preds.append(t)
        #result_tags.pop()
        #result_preds.pop()
    return result_preds,result_tags


In [57]:
max_id_prec=0.
max_sf_f1=0.
max_id_prec_both=0.
max_sf_f1_both=0.

for step in tqdm(range(2)):
    losses=[]
    id_precision=[]
    sf_f1=[]

    ### TRAIN
    encoder.train() # set to train mode
    middle.train()
    decoder.train()
    bert_layer.train()
    for i,(x,tag_target,intent_target,bert_tokens,bert_mask,bert_toktype,subtoken_mask,x_mask) in enumerate(train_data):
        batch_size=tag_target.size(0)
        bert_layer.zero_grad()
        encoder.zero_grad()
        middle.zero_grad()
        decoder.zero_grad()
        bert_hidden,bert_pooler = bert_layer(bert_info=(bert_tokens,bert_mask,bert_toktype))
        encoder_output = encoder(bert_last_hidden=bert_hidden)
        output = middle(encoder_output,bert_mask==0,training=True)
        start_decode = Variable(torch.LongTensor([[tag2index['<BOS>']]*batch_size])).cuda().transpose(1,0)
        start_decode = torch.cat((start_decode,tag_target[:,:-1]),dim=1)
        tag_score, intent_score = decoder(start_decode,output,bert_mask==0,bert_subtoken_maskings=subtoken_mask)
        loss_1 = loss_function_1_smoothed(tag_score, tag_target.view(-1), num_classes=len(tag2index))
        loss_2 = loss_function_2_smoothed(intent_score,intent_target)
        loss = loss_1+loss_2
        losses.append(loss.data.cpu().numpy() if USE_CUDA else loss.data.numpy()[0])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(middle.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(bert_layer.parameters(), 0.5)
        enc_optim.step()
        mid_optim.step()
        dec_optim.step()
        ber_optim.step()
        #print(bert_tokens[0])
        #print(tag_target[0])
        id_precision.append(accuracy_score(intent_target.detach().cpu(),torch.argmax(intent_score,dim=1).detach().cpu()))
        pred_list,target_list=mask_important_tags(torch.argmax(tag_score,dim=1).view(batch_size,MAX_LEN),tag_target,x_mask)
        sf_f1.append(f1_score(pred_list,target_list,average="micro",zero_division=0))
    #print report
    print("Step",step," batches",i," :")
    print("Train-")
    print(f"loss:{round(float(np.mean(losses)),4)}")
    print(f"SlotFilling F1:{round(float(np.mean(sf_f1)),3)}")
    print(f"IntentDet Prec:{round(float(np.mean(id_precision)),3)}")
    losses=[]
    sf_f1=[]
    id_precision=[]

    #### TEST
    encoder.eval() # set to test mode
    middle.eval()
    decoder.eval()
    bert_layer.eval()
    with torch.no_grad(): # to turn off gradients computation
        for i,(x,tag_target,intent_target,bert_tokens,bert_mask,bert_toktype,subtoken_mask,x_mask) in enumerate(test_data):
            batch_size=tag_target.size(0)
            encoder.zero_grad()
            middle.zero_grad()
            decoder.zero_grad()
            bert_layer.zero_grad()
            bert_hidden,bert_pooler = bert_layer(bert_info=(bert_tokens,bert_mask,bert_toktype))
            encoder_output = encoder(bert_last_hidden=bert_hidden)
            output = middle(encoder_output,bert_mask==0,training=True)
            start_decode = Variable(torch.LongTensor([[tag2index['<BOS>']]*batch_size])).cuda().transpose(1,0)
            tag_score, intent_score = decoder(start_decode,output,bert_mask==0,bert_subtoken_maskings=subtoken_mask,infer=True)
            loss_1 = loss_function_1_smoothed(tag_score,tag_target.view(-1))
            loss_2 = loss_function_2_smoothed(intent_score,intent_target)
            loss = loss_1 +  loss_2
            losses.append(loss.data.cpu().numpy() if USE_CUDA else loss.data.numpy()[0])
            id_precision.append(accuracy_score(intent_target.detach().cpu(),torch.argmax(intent_score,dim=1).detach().cpu()))
            pred_list,target_list=mask_important_tags(torch.argmax(tag_score,dim=1).view(batch_size,MAX_LEN),tag_target,x_mask)
            sf_f1.append(f1_score(pred_list,target_list,average="micro",zero_division=0))
    print("Test-")
    print(f"loss:{round(float(np.mean(losses)),4)}")
    print(f"SlotFilling F1:{round(float(np.mean(sf_f1)),4)}")
    print(f"IntentDet Prec:{round(float(np.mean(id_precision)),4)}")
    print("--------------")
    max_sf_f1 = max_sf_f1 if round(float(np.mean(sf_f1)),4)<=max_sf_f1 else round(float(np.mean(sf_f1)),4)
    max_id_prec = max_id_prec if round(float(np.mean(id_precision)),4)<=max_id_prec else round(float(np.mean(id_precision)),4)
    if max_sf_f1_both<=round(float(np.mean(sf_f1)),4) and max_id_prec_both<=round(float(np.mean(id_precision)),4):
        max_sf_f1_both=round(float(np.mean(sf_f1)),4)
        max_id_prec_both=round(float(np.mean(id_precision)),4)
        torch.save(bert_layer,f"models/ctran{_fn}-bertlayer.pkl")
        torch.save(encoder,f"models/ctran{_fn}-encoder.pkl")
        torch.save(middle,f"models/ctran{_fn}-middle.pkl")
        torch.save(decoder,f"models/ctran{_fn}-decoder.pkl")
    enc_scheduler.step()
    dec_scheduler.step()
    mid_scheduler.step()
    ber_scheduler.step()
print(f"max single SF F1: {max_sf_f1}")
print(f"max single ID PR: {max_id_prec}")
print(f"max mutual SF:{max_sf_f1_both}  PR: {max_id_prec_both}")

  return F.conv1d(input, weight, bias, self.stride,
  newtensor = torch.cuda.FloatTensor(batch_size, length,ENV_HIDDEN_SIZE).fill_(0.) # size of newtensor same as original


Step 0  batches 196  :
Train-
loss:0.634
SlotFilling F1:0.615
IntentDet Prec:0.838
Test-
loss:1.4856
SlotFilling F1:0.4785
IntentDet Prec:0.7668
--------------


 50%|█████     | 1/2 [03:23<03:23, 203.46s/it]

Step 1  batches 196  :
Train-
loss:0.2228
SlotFilling F1:0.707
IntentDet Prec:0.958
Test-
loss:1.6101
SlotFilling F1:0.628
IntentDet Prec:0.7885
--------------


100%|██████████| 2/2 [06:54<00:00, 207.08s/it]

max single SF F1: 0.628
max single ID PR: 0.7885
max mutual SF:0.628  PR: 0.7885





---

### Inference

In [58]:
# This cell reloads the best model during training from hard-drive.
bert_layer.load_state_dict(torch.load(f'models/ctran{_fn}-bertlayer.pkl').state_dict())
encoder.load_state_dict(torch.load(f'models/ctran{_fn}-encoder.pkl').state_dict())
middle.load_state_dict(torch.load(f'models/ctran{_fn}-middle.pkl').state_dict())
decoder.load_state_dict(torch.load(f'models/ctran{_fn}-decoder.pkl').state_dict())
if USE_CUDA:
    bert_layer = bert_layer.cuda()
    encoder = encoder.cuda()
    middle = middle.cuda()
    decoder = decoder.cuda()

In [59]:
global clipindex
clipindex=0
def removepads(toks,clip=False):
    global clipindex
    result = toks.copy()
    for i,t in enumerate(toks):
        if t=="<PAD>":
            result.remove(t)
        elif t=="<EOS>":
            result.remove(t)
            if not clip:
                clipindex=i
    if clip:
        result=result[:clipindex]
    return result

In [72]:
print("Example of model prediction on test dataset")
encoder.eval()
middle.eval()
decoder.eval()
bert_layer.eval()

with torch.no_grad():
    index = random.choice(range(len(dev_text)))
    test_raw = dev_text[index]
    print(test_raw)

    bert_tokens = dev_toks['input_ids'][index].unsqueeze(0).cuda()
    print(bert_tokens)
    bert_mask = dev_toks['attention_mask'][index].unsqueeze(0).cuda()
    bert_toktype = dev_toks['token_type_ids'][index].unsqueeze(0).cuda()
    subtoken_mask = dev_subtoken_mask[index].unsqueeze(0).cuda()
    test_in = Variable(torch.LongTensor(prepare_sequence(test_raw,word2index))).cuda()
    test_mask = Variable(torch.BoolTensor(tuple(map(lambda s: s ==0, test_in.data)))).cuda() if USE_CUDA else Variable(torch.ByteTensor(tuple(map(lambda s: s ==0, test_in.data)))).view(1,-1)
    start_decode = Variable(torch.LongTensor([[word2index['<BOS>']]*1])).cuda().transpose(1,0) if USE_CUDA else Variable(torch.LongTensor([[word2index['<BOS>']]*1])).transpose(1,0)
    # test_raw = [removepads(torch.LongTensor(test_raw))]
    bert_hidden,bert_pooler = bert_layer(bert_info=(bert_tokens,bert_mask,bert_toktype))
    encoder_output = encoder(bert_last_hidden=bert_hidden)
    output = middle(encoder_output,bert_mask==0)
    tag_score, intent_score = decoder(start_decode,output,bert_mask==0,bert_subtoken_maskings=subtoken_mask,infer=True)

    v,i = torch.max(tag_score,1)
    print("Sentence           : ",test_raw)
    print("Tag Truth          : ", dev_intentTag[index])
    number = (list(map(lambda ii:index2tag[ii],i.data.tolist()))[:len(test_raw)])
    print("Tag Prediction     : ", number)
    v,i = torch.max(intent_score,1)
    print("Intent Truth       : ", dev_label[index])
    test = index2intent[i.data.tolist()[0]]
    print("Intent Prediction  : ",test)

Example of model prediction on test dataset
kiểm tra giúp mình bóng sợi đốt thứ 4 ở cổng chính nhé
tensor([[   0, 7303, 5761,  171,   68,  301, 2809, 1933,  129,  163,   25, 1904,
          159, 2083,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1]], device='cuda:0')
Sentence       

In [84]:
def predict_intent(str_number):
    index = int(str_number)
    test_raw = dev_text[index]
    # print(test_raw)

    bert_tokens = dev_toks['input_ids'][index].unsqueeze(0).cuda()
    # print(bert_tokens)
    bert_mask = dev_toks['attention_mask'][index].unsqueeze(0).cuda()
    bert_toktype = dev_toks['token_type_ids'][index].unsqueeze(0).cuda()
    subtoken_mask = dev_subtoken_mask[index].unsqueeze(0).cuda()
    test_in = Variable(torch.LongTensor(prepare_sequence(test_raw,word2index))).cuda()
    test_mask = Variable(torch.BoolTensor(tuple(map(lambda s: s ==0, test_in.data)))).cuda() if USE_CUDA else Variable(torch.ByteTensor(tuple(map(lambda s: s ==0, test_in.data)))).view(1,-1)
    start_decode = Variable(torch.LongTensor([[word2index['<BOS>']]*1])).cuda().transpose(1,0) if USE_CUDA else Variable(torch.LongTensor([[word2index['<BOS>']]*1])).transpose(1,0)
    # test_raw = [removepads(torch.LongTensor(test_raw))]
    bert_hidden,bert_pooler = bert_layer(bert_info=(bert_tokens,bert_mask,bert_toktype))
    encoder_output = encoder(bert_last_hidden=bert_hidden)
    output = middle(encoder_output,bert_mask==0)
    tag_score, intent_score = decoder(start_decode,output,bert_mask==0,bert_subtoken_maskings=subtoken_mask,infer=True)

    v,i = torch.max(tag_score,1)
    tag_out = list(map(lambda ii:index2tag[ii],i.data.tolist()))[:len(test_raw)]
    str_tag = ' '.join(tag_out)
    v,i = torch.max(intent_score,1)
    intent_out = index2intent[i.data.tolist()[0]]

    return {"Predicted Tags": str(str_tag), "Predicted Intent": str(intent_out)}

In [85]:
t = predict_intent(str_number= 4)
print(t)

{'Predicted Tags': 'B-devicedevice I-devicedevice I-devicedevice I-devicedevice I-devicedevice I-devicedevice I-devicedevice I-devicedevice I-devicedevice I-devicedevice I-devicedevice O <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>', 'Predicted Intent': 'smart.home.decrease.percentage'}


## Demo Deploy

In [62]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.19.1-py3-none-any.whl (16.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.109.2-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.10.0 (from gradio)
  Downloading gradio_client-0.10.0-py3-none-any.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [90]:
import gradio as gr
import torch
from transformers import BertModel

def predict_intent(str_number):
    index = int(str_number)
    test_raw = dev_text[index]
    # print(test_raw)

    bert_tokens = dev_toks['input_ids'][index].unsqueeze(0).cuda()
    # print(bert_tokens)
    bert_mask = dev_toks['attention_mask'][index].unsqueeze(0).cuda()
    bert_toktype = dev_toks['token_type_ids'][index].unsqueeze(0).cuda()
    subtoken_mask = dev_subtoken_mask[index].unsqueeze(0).cuda()
    test_in = Variable(torch.LongTensor(prepare_sequence(test_raw,word2index))).cuda()
    test_mask = Variable(torch.BoolTensor(tuple(map(lambda s: s ==0, test_in.data)))).cuda() if USE_CUDA else Variable(torch.ByteTensor(tuple(map(lambda s: s ==0, test_in.data)))).view(1,-1)
    start_decode = Variable(torch.LongTensor([[word2index['<BOS>']]*1])).cuda().transpose(1,0) if USE_CUDA else Variable(torch.LongTensor([[word2index['<BOS>']]*1])).transpose(1,0)
    # test_raw = [removepads(torch.LongTensor(test_raw))]
    bert_hidden,bert_pooler = bert_layer(bert_info=(bert_tokens,bert_mask,bert_toktype))
    encoder_output = encoder(bert_last_hidden=bert_hidden)
    output = middle(encoder_output,bert_mask==0)
    tag_score, intent_score = decoder(start_decode,output,bert_mask==0,bert_subtoken_maskings=subtoken_mask,infer=True)

    v,i = torch.max(tag_score,1)
    tag_out = list(map(lambda ii:index2tag[ii],i.data.tolist()))[:len(test_raw)]
    str_tag = ' '.join(tag_out)
    v,i = torch.max(intent_score,1)
    intent_out = index2intent[i.data.tolist()[0]]

    return  str(str_tag), str(intent_out)

In [91]:
iface = gr.Interface(
    fn=predict_intent,
    inputs=["text"],
    outputs=["text", "text"],
    title="Intent Prediction",
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://83900a4a5d321d26ae.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


