In [75]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
tf.compat.v1.get_default_graph()

<tensorflow.python.framework.ops.Graph at 0x1c3a2436cd0>

In [76]:
def load_dataset(filename):
    df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
    print(df.head())
    intent = df["Intent"]
    unique_intent = list(set(intent))
    sentences = list(df["Sentence"])

    return (intent, unique_intent, sentences)

In [77]:
intent, unique_intent, sentences = load_dataset("Cars.csv")

                               Sentence                      Intent
0                           ï»¿Sentence                      Intent
1    I want to know about car insurance  Detail_about_car_insurance
2  Can you tell me about car insurance?  Detail_about_car_insurance
3          How can I buy car insurance?  Detail_about_car_insurance
4                What is car insurance?  Detail_about_car_insurance


In [78]:
print(sentences[:5])

['ï»¿Sentence', 'I want to know about car insurance', 'Can you tell me about car insurance?', 'How can I buy car insurance?', 'What is car insurance?']


In [79]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     H:\Anaconda\envs\tensorflow\lib\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     H:\Anaconda\envs\tensorflow\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [80]:
#define stemmer
stemmer = LancasterStemmer()

In [81]:
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        #stemming
        words.append([i.lower() for i in w])

    return words

In [82]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])

33
[['sentence'], ['i', 'want', 'to', 'know', 'about', 'car', 'insurance']]


In [83]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token

In [84]:
def max_length(words):
    return(len(max(words, key = len)))

In [85]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 93 and Maximum length = 14


In [86]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [87]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [88]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [89]:
padded_doc = padding_doc(encoded_doc, max_length)

In [90]:
padded_doc[:5]

array([[32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 2, 14,  8, 21, 22,  1,  3,  0,  0,  0,  0,  0,  0,  0],
       [ 5, 18, 33, 15, 22,  1,  3,  0,  0,  0,  0,  0,  0,  0],
       [ 9,  5,  2, 19,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7, 10,  1,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [91]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (33, 14)


In [92]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [93]:
output_tokenizer.word_index

{'detail_about_car_insurance': 1,
 'sell_a_car': 2,
 'purchasing_a_car': 3,
 'intent': 4}

In [94]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [95]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [96]:
encoded_output.shape

(33, 1)

In [97]:
def one_hot(encode):
    o = OneHotEncoder(sparse = False)
    return(o.fit_transform(encode))

In [98]:
encoded_output

array([[4],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2]])

In [99]:
output_one_hot = one_hot(encoded_output)

In [100]:
output_one_hot.shape

(33, 4)

In [101]:
from sklearn.model_selection import train_test_split

In [102]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.3)

In [103]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (23, 14) and train_Y = (23, 4)
Shape of val_X = (10, 14) and val_Y = (10, 4)


In [104]:
def create_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
    model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
    model.add(Dense(32, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(4, activation = "softmax"))
  
    return model

In [105]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 14, 128)           11904     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_8 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 4)                 132       
Total params: 283,428
Trainable params: 271,524
Non-trainable params: 11,904
_________________________________________________________________


In [107]:
filename = 'model.h4'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 1, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.29477, saving model to model.h4
INFO:tensorflow:Assets written to: model.h4\assets
Epoch 2/100
Epoch 00002: val_loss did not improve from 1.29477
Epoch 3/100
Epoch 00003: val_loss did not improve from 1.29477
Epoch 4/100
Epoch 00004: val_loss improved from 1.29477 to 1.23934, saving model to model.h4
INFO:tensorflow:Assets written to: model.h4\assets
Epoch 5/100
Epoch 00005: val_loss did not improve from 1.23934
Epoch 6/100
Epoch 00006: val_loss improved from 1.23934 to 1.23257, saving model to model.h4
INFO:tensorflow:Assets written to: model.h4\assets
Epoch 7/100
Epoch 00007: val_loss improved from 1.23257 to 1.05138, saving model to model.h4
INFO:tensorflow:Assets written to: model.h4\assets
Epoch 8/100
Epoch 00008: val_loss improved from 1.05138 to 0.77221, saving model to model.h4
INFO:tensorflow:Assets written to: model.h4\assets
Epoch 9/100
Epoch 00009: val_loss did not improve from 0.77221
Epoch 10/100
Epoch 00010: val_lo

In [109]:
model = load_model("model.h4")

In [110]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = word_tokenizer.texts_to_sequences(test_word)
    print(test_word)
  #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    
        test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
    x = padding_doc(test_ls, max_length)
  
    pred = model.predict_proba(x)
  
  
    return pred

In [111]:
def get_final_output(pred, classes):
    predictions = pred[0]
 
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)
 
    for i in range(pred.shape[1]):
        print("%s has confidence = %s" % (classes[i], (predictions[i])))

In [112]:
text = "I want to purchase a car"
pred = predictions(text)
get_final_output(pred, unique_intent)

['i', 'want', 'to', 'purchase', 'a', 'car']
Instructions for updating:
Please use `model.predict()` instead.
Purchasing_a_car has confidence = 0.967582
Sell_a_car has confidence = 0.029549874
Detail_about_car_insurance has confidence = 0.00280835
Intent has confidence = 5.969917e-05
