In [11]:
!pip install -q -U "tensorflow-text==2.8.*"
!pip install -q -U "tensorflow==2.8.*"

In [31]:
import pandas as pd
import json
import string
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tf_text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Preprocessing

In [12]:
leyzer_dev_url = 'https://raw.githubusercontent.com/cartesinus/leyzer/master/corpora/0.1.0/leyzer-dev-en-US-0.1.0.tsv'
leyzer_dev_df = pd.DataFrame(pd.read_csv(leyzer_dev_url, sep='\t'))

In [13]:
leyzer_df = leyzer_dev_df
leyzer_df.drop(["bio"], axis = 1, inplace = True)
leyzer_df

Unnamed: 0,domain,intent,utterance
0,Airconditioner,TurnOff,switch the air conditioning off
1,Airconditioner,TurnOff,turn the ac off
2,Airconditioner,TurnOn,switch on the aircon
3,Airconditioner,TurnOn,turn the aircon on
4,Airconditioner,ChangeTemperature,change the minimum and maximum temperatures on...
...,...,...,...
373,Youtube,FindQueryOnChannel,show a video from niki and gabi matching seinf...
374,Youtube,FindQueryOnChannel,show me a video from cineamasin matching lipst...
375,Youtube,FindQueryOnChannel,show me a videos published by wwe matching gre...
376,Youtube,NotifyOnNewFromFollowing,notify me when there is new video from youtube...


In [14]:
leyzer_df['utterance'] = leyzer_df['utterance'].apply(lambda sequence: [ltrs.lower() for ltrs in sequence if ltrs not in string.punctuation])
leyzer_df

Unnamed: 0,domain,intent,utterance
0,Airconditioner,TurnOff,"[s, w, i, t, c, h, , t, h, e, , a, i, r, , ..."
1,Airconditioner,TurnOff,"[t, u, r, n, , t, h, e, , a, c, , o, f, f]"
2,Airconditioner,TurnOn,"[s, w, i, t, c, h, , o, n, , t, h, e, , a, ..."
3,Airconditioner,TurnOn,"[t, u, r, n, , t, h, e, , a, i, r, c, o, n, ..."
4,Airconditioner,ChangeTemperature,"[c, h, a, n, g, e, , t, h, e, , m, i, n, i, ..."
...,...,...,...
373,Youtube,FindQueryOnChannel,"[s, h, o, w, , a, , v, i, d, e, o, , f, r, ..."
374,Youtube,FindQueryOnChannel,"[s, h, o, w, , m, e, , a, , v, i, d, e, o, ..."
375,Youtube,FindQueryOnChannel,"[s, h, o, w, , m, e, , a, , v, i, d, e, o, ..."
376,Youtube,NotifyOnNewFromFollowing,"[n, o, t, i, f, y, , m, e, , w, h, e, n, , ..."


In [15]:
leyzer_df['utterance'] = leyzer_df['utterance'].apply(lambda word: ''.join(word))
leyzer_df

Unnamed: 0,domain,intent,utterance
0,Airconditioner,TurnOff,switch the air conditioning off
1,Airconditioner,TurnOff,turn the ac off
2,Airconditioner,TurnOn,switch on the aircon
3,Airconditioner,TurnOn,turn the aircon on
4,Airconditioner,ChangeTemperature,change the minimum and maximum temperatures on...
...,...,...,...
373,Youtube,FindQueryOnChannel,show a video from niki and gabi matching seinf...
374,Youtube,FindQueryOnChannel,show me a video from cineamasin matching lipst...
375,Youtube,FindQueryOnChannel,show me a videos published by wwe matching gre...
376,Youtube,NotifyOnNewFromFollowing,notify me when there is new video from youtube...


In [16]:
from nltk.tokenize import word_tokenize 
leyzer_df['utterance'] = leyzer_df['utterance'].apply(lambda sent: word_tokenize(sent))
leyzer_df

Unnamed: 0,domain,intent,utterance
0,Airconditioner,TurnOff,"[switch, the, air, conditioning, off]"
1,Airconditioner,TurnOff,"[turn, the, ac, off]"
2,Airconditioner,TurnOn,"[switch, on, the, aircon]"
3,Airconditioner,TurnOn,"[turn, the, aircon, on]"
4,Airconditioner,ChangeTemperature,"[change, the, minimum, and, maximum, temperatu..."
...,...,...,...
373,Youtube,FindQueryOnChannel,"[show, a, video, from, niki, and, gabi, matchi..."
374,Youtube,FindQueryOnChannel,"[show, me, a, video, from, cineamasin, matchin..."
375,Youtube,FindQueryOnChannel,"[show, me, a, videos, published, by, wwe, matc..."
376,Youtube,NotifyOnNewFromFollowing,"[notify, me, when, there, is, new, video, from..."


In [69]:
col = leyzer_df[['intent']]
print("Total number of intents:", len(col.intent.unique()))
# print(col.intent.unique())

Total number of intents: 178


In [18]:
leyzer_test_df = pd.DataFrame(pd.read_csv("https://raw.githubusercontent.com/cartesinus/leyzer/master/corpora/0.1.0/leyzer-test-en-US-0.1.0.tsv", sep="\t"))
print(len(leyzer_test_df[['intent']].intent.unique()))
# print(leyzer_test_df[['intent']].intent.unique())

154


In [73]:
tokenizer = Tokenizer(1000)
tokenizer.fit_on_texts(leyzer_df['utterance'])
train = tokenizer.texts_to_sequences(leyzer_df['utterance'])
print(train)
features = pad_sequences(train)
print(len(features[0]))

[[118, 4, 173, 174, 119], [120, 4, 277, 119], [118, 1, 4, 175], [120, 4, 175, 1], [56, 4, 176, 14, 177, 178, 1, 3, 101], [57, 57, 3, 101, 2, 278, 143], [57, 4, 81, 2, 279, 280, 281, 1, 3, 282, 101], [57, 4, 283, 81, 2, 284, 143, 1, 3, 101, 14, 4, 285, 2, 286, 143], [56, 4, 176, 14, 177, 178, 1, 3, 179, 144, 101], [9, 4, 180, 181], [58, 4, 180, 181], [121, 81, 74, 16, 82, 11, 102, 182, 2, 3, 287, 288], [121, 81, 59, 52, 11, 102, 182, 2, 3, 173, 174], [58, 4, 81, 8, 4, 289, 144, 81, 183], [121, 290, 4, 179, 144, 291, 81, 183, 9], [26, 3, 60], [184, 1, 3, 185, 1, 292], [184, 1, 3, 185, 1, 293, 294], [35, 21, 103, 122, 295], [35, 21, 103, 122, 296, 297, 2, 60], [35, 21, 103, 122, 145, 298, 1, 95, 299], [35, 21, 103, 122, 300, 2, 60, 1, 95, 8, 186], [48, 10, 53, 21, 123, 1, 3, 60, 83], [74, 146, 48, 10, 53, 21, 103, 1, 3, 60, 83], [74, 146, 48, 10, 53, 21, 187, 1, 3, 60, 32, 75, 301, 83], [74, 146, 48, 10, 53, 21, 123, 1, 3, 60, 32, 75, 302, 83], [48, 10, 53, 21, 187, 1, 3, 60, 32, 75, 147,

In [77]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
labels = OneHotEncoder().fit_transform(leyzer_df[['intent']]).toarray()
label_2 = LabelEncoder().fit_transform(leyzer_df['intent'])
print(labels)
print(label_2)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[171 171 172 172  10 120 121 118 119  30  30  31  31  32  32  50  11  11
   2   2   1   1  44  42  41  41  43  43  43  51  58  22  22  22  23  24
  25 128 128 128 129 129 129 130 130 130 130 130 130 130 130 130 130 130
 130 130 130 130 131 131 131 132 132 132  52  52  52  52 134 134 134 134
 134 134 134 136 136 136 136 135 135 135 138 138 137 108 108 108 108 107
 107 107 105 106 106 106  82  82  54  76  76  72  72  72  73  74  75 124
 154 154 154 155 155 155 155 155 125   6  46  56 142 141 139 140  14  15
 147 159  55 123  57 148 144 144 150 150 152 152 152 152 151  78 149 164
  77  77 145 145 146  47  48   8   8   8   8   7 153  83  83  83  84  84
  84  84  61  61  12  12  12  12  13 110 110 110 110 110 110 110 117 122
 122 112 112 112 112 113 113 113 114 114 114 173 173  34  35  18  19  19
  39  38  62  62  70  70  70  70  71  71 

In [89]:
tokenizer.word_index

{'on': 1,
 'to': 2,
 'my': 3,
 'the': 4,
 'with': 5,
 'a': 6,
 'from': 7,
 'of': 8,
 'show': 9,
 'me': 10,
 'in': 11,
 'new': 12,
 'slack': 13,
 'and': 14,
 'number': 15,
 'i': 16,
 '555': 17,
 'create': 18,
 'contact': 19,
 'spotify': 20,
 'an': 21,
 'for': 22,
 'email': 23,
 'playlist': 24,
 'play': 25,
 'open': 26,
 'google': 27,
 'emails': 28,
 'post': 29,
 'translation': 30,
 'youtube': 31,
 'at': 32,
 'get': 33,
 'send': 34,
 'add': 35,
 'as': 36,
 'that': 37,
 'message': 38,
 'instagram': 39,
 'restaurants': 40,
 'contacts': 41,
 'name': 42,
 'track': 43,
 'italian': 44,
 'display': 45,
 'translate': 46,
 'french': 47,
 'notify': 48,
 '1': 49,
 'this': 50,
 'by': 51,
 'it': 52,
 'when': 53,
 'picture': 54,
 'english': 55,
 'change': 56,
 'set': 57,
 'check': 58,
 'is': 59,
 'calendar': 60,
 'using': 61,
 'drive': 62,
 'pictures': 63,
 'polish': 64,
 'twitter': 65,
 'call': 66,
 'subject': 67,
 'facebook': 68,
 'channel': 69,
 'speaker': 70,
 'song': 71,
 'tweets': 72,
 'matching

In [82]:
vocabulary = len(tokenizer.word_index)
print("Total numbers of vocabulary:", vocabulary)

Total numbers of vocabulary: 861


# Model