In [1]:
import wget

In [2]:
pip install wget

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os

# Local file name -> source URL. The names match what later cells open.
DOWNLOADS = {
    'data_full.json': 'https://raw.githubusercontent.com/clinc/oos-eval/master/data/data_full.json',
    'glove.6B.100d.txt': 'https://www.dropbox.com/s/a247ju2qsczh0be/glove.6B.100d.txt?dl=1',
}

for filename, url in DOWNLOADS.items():
    # Skip files that are already present so re-running the notebook does not
    # fetch them again.
    if not os.path.exists(filename):
        wget.download(url, out=filename)

'glove.6B.100d.txt'

In [4]:
import numpy as np
import json

In [5]:
# Read the dataset with an explicit encoding (as the GloVe file is read later)
# instead of relying on the platform default, and let json parse the stream.
with open('data_full.json', encoding='utf-8') as file:
    data = json.load(file)

In [6]:
# The three 'oos_*' entries of the loaded JSON (presumably the out-of-scope
# examples of each split -- confirm against the dataset docs), as NumPy arrays.
val_ues, train, test = (
    np.array(data[key]) for key in ('oos_val', 'oos_train', 'oos_test')
)

In [7]:
# The plain 'val' / 'train' / 'test' entries of the loaded JSON as NumPy arrays.
val_others, train_o, test_o = (
    np.array(data[key]) for key in ('val', 'train', 'test')
)

In [8]:
# Join the 'oos_*' part of every split with its regular counterpart.
# NOTE(review): `train` and `test` are rebound to the merged arrays, so running
# this cell a second time on its own appends the regular examples again.
val = np.concatenate((val_ues, val_others))
train = np.concatenate((train, train_o))
test = np.concatenate((test, test_o))

In [9]:
# Pool all three splits and transpose, so that the first axis indexes the
# fields of an example rather than the examples themselves.
# NOTE(review): this rebinds `data`, replacing the dict loaded from JSON; the
# cells that index `data` by key cannot be re-run after this one.
data = np.concatenate((train, test, val)).T

In [10]:
# First row of the transposed array is taken as the utterances,
# second row as their labels.
text, labels = data[0], data[1]

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the pooled examples. A fixed random_state makes the split
# (and everything derived from it: vocabulary, maxLen, metrics) reproducible
# across kernel restarts.
train_txt, test_txt, train_label, test_labels = train_test_split(
    text, labels, test_size=0.3, random_state=42
)

In [13]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [14]:
# Whitespace-token count of every training utterance.
ls = [len(sentence.split()) for sentence in train_txt]
# Sequence length used for padding: the 98th percentile of those counts.
maxLen = int(np.percentile(ls, 98))

In [15]:
# Map each token of the GloVe file to its float32 vector. Every line is split
# on whitespace: first field is the token, the remaining fields are the vector.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [16]:
# np.stack expects a sequence of arrays; a dict view is not one (NumPy warns
# about it and newer releases reject it), so materialise the values first.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / std over every coefficient, used to initialise the rows of the
# embedding matrix for words that have no GloVe vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

  if (await self.run_code(code, result,  async_=asy)):


(0.004451992, 0.4081574)

In [17]:
# Distinct label values found in the pooled data (np.unique returns them sorted).
classes = np.unique(labels)

# Upper bound on the vocabulary size handed to the tokenizer.
max_num_words = 40000
# Width of one embedding vector, read off an entry of the GloVe table.
embedding_dim = len(embeddings_index['the'])

# Build the vocabulary from the training utterances only.
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(train_txt)

In [18]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Convert utterances to index sequences and pad them at the end ('post')
# to a common length of maxLen.
train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_txt), maxlen=maxLen, padding='post'
)
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_txt), maxlen=maxLen, padding='post'
)

In [19]:
# One row per vocabulary index, plus one extra row because indices start at 1.
num_words = min(max_num_words, len(word_index)) + 1

# Start from random rows drawn with the GloVe mean/std, then overwrite every
# row whose word has a pretrained vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_num_words:
        # Stop at the first index beyond the vocabulary cap.
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [20]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

# Fit the label -> integer mapping on the class names, shaping the resulting
# ids as a single column because OneHotEncoder expects 2-D input.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(classes).reshape(-1, 1)

# Dense (non-sparse) one-hot encoder fitted on the integer ids.
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(integer_encoded)

OneHotEncoder(sparse=False)

In [21]:
# Training labels: names -> integer ids (as a column) -> one-hot rows.
# NOTE(review): `train_label` is overwritten with the one-hot matrix, so this
# cell cannot be re-run without first re-creating the split.
train_label_encoded = label_encoder.transform(train_label).reshape(-1, 1)
train_label = onehot_encoder.transform(train_label_encoded)

In [22]:
# Held-out labels: names -> integer ids (as a column) -> one-hot rows.
# NOTE(review): `test_labels` is overwritten with the one-hot matrix, so this
# cell cannot be re-run without first re-creating the split.
test_labels_encoded = label_encoder.transform(test_labels).reshape(-1, 1)
test_labels = onehot_encoder.transform(test_labels_encoded)

In [23]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional,Embedding

In [24]:
# Frozen pretrained embeddings -> bidirectional LSTM -> LSTM -> small dense
# head with a softmax over all classes.
model = Sequential()

# The embedding width comes from embedding_dim (the width of the GloVe vectors
# used to build embedding_matrix) rather than a hard-coded literal, so the
# layer stays consistent with the weights it is given.
model.add(Embedding(num_words, embedding_dim, trainable=False,
                    input_length=train_sequences.shape[1],
                    weights=[embedding_matrix]))
model.add(Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.1, dropout=0.1),
                        merge_mode='concat'))
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=False, recurrent_dropout=0.1, dropout=0.1))
model.add(Dropout(0.3))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
# One output unit per class.
model.add(Dense(classes.shape[0], activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 100)           633700    
_________________________________________________________________
bidirectional (Bidirectional (None, 16, 512)           731136    
_________________________________________________________________
dropout (Dropout)            (None, 16, 512)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               787456    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                12850     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0

In [25]:
# Categorical cross-entropy matches the one-hot targets; accuracy is tracked
# under the name 'acc'.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [26]:
# validation_data is passed as an (x, y) tuple, the form documented for
# Model.fit; a list is not unpacked the same way by every Keras version.
# NOTE(review): the held-out split doubles as the validation set here.
history = model.fit(train_sequences, train_label, epochs = 20,
          batch_size = 64, shuffle=True,
          validation_data=(test_sequences, test_labels))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [27]:
import pickle
import json

In [29]:
import os

# Create the output folders first; model.save / open(..., 'wb') fail when the
# target directory does not exist.
os.makedirs('models', exist_ok=True)
os.makedirs('utils', exist_ok=True)

# Trained network.
model.save('models/intents.h5')

# Preprocessing objects needed to run the model on new text.
with open('utils/classes.pkl', 'wb') as file:
    pickle.dump(classes, file)

with open('utils/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

with open('utils/label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)

In [30]:
class IntentClassifier:
    """Predict the intent label of a piece of text with a trained model.

    Bundles the model with the preprocessing objects it needs.

    Parameters
    ----------
    classes
        Class labels; stored on the instance, not used by ``get_intent``.
    model
        Object with a ``predict`` method returning one score row per input.
    tokenizer
        Object with a ``texts_to_sequences`` method.
    label_encoder
        Object with an ``inverse_transform`` method mapping class indices
        back to label names.
    max_len : int, optional
        Length every sequence is padded/truncated to before prediction.
        Defaults to 16, the value that was previously hard-coded; it should
        equal the sequence length the model was built with.
    """

    def __init__(self, classes, model, tokenizer, label_encoder, max_len=16):
        self.classes=classes
        self.classifier=model
        self.tokenizer=tokenizer
        self.label_encoder=label_encoder
        self.max_len=max_len

    def get_intent(self, text):
        """Return the predicted intent label for a single string ``text``.

        The intermediate values are kept on the instance (``text``,
        ``test_keras``, ``test_keras_sequence``, ``pred``) as before.
        """
        self.text=[text]
        self.test_keras=self.tokenizer.texts_to_sequences(self.text)
        self.test_keras_sequence=pad_sequences(self.test_keras, maxlen=self.max_len, padding='post')
        self.pred=self.classifier.predict(self.test_keras_sequence)
        # Decode with the encoder given to this instance, not with whatever
        # global happens to be named `label_encoder`.
        return self.label_encoder.inverse_transform(np.argmax(self.pred, 1))[0]

In [31]:
import pickle

from tensorflow.python.keras.models import load_model

def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the trained network and the preprocessing objects saved earlier.
model = load_model('models/intents.h5')
classes = _unpickle('utils/classes.pkl')
tokenizer = _unpickle('utils/tokenizer.pkl')
label_encoder = _unpickle('utils/label_encoder.pkl')

In [32]:
# Build the classifier from the objects reloaded from disk.
nlu=IntentClassifier(classes, model, tokenizer, label_encoder)

In [33]:
# Smoke test: classify one sample utterance and display the predicted label.
nlu.get_intent("is it cold in India right now")

'weather'