In [34]:
# https://towardsdatascience.com/a-brief-introduction-to-intent-classification-96fda6b1f557
# Classifies into 21 intents


import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

# Load Dataset

In [2]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)

In [3]:
intent, unique_intent, sentences = load_dataset("Dataset.csv")

                Sentence          Intent
0       Need help pleese  commonQ.assist
1              Need help  commonQ.assist
2       I need some info  commonQ.assist
3      Will you help me?  commonQ.assist
4  What else can you do?  commonQ.assist


# Get stopwords and punkt

In [4]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kathrine.swe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kathrine.swe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Define stemmer

In [5]:
stemmer = LancasterStemmer()

# Data cleaning

In [6]:
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        # lemmatizing
        words.append([i.lower() for i in w])
    return words

In [7]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])

1113
[['need', 'help', 'pleese'], ['need', 'help']]


# Input encoding

In [8]:
def create_tokenizer( words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    """Create tokenizer
    """
    token = Tokenizer(filters=filters)
    token.fit_on_texts(words)
    return token

def get_max_length(words):
    """Gets max length of a word
    """
    return(len(max(words, key=len)))


In [9]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = get_max_length(cleaned_words)

print("Vocab size = ", vocab_size, " and Maximum length = ", max_length)

Vocab size =  492  and Maximum length =  28


# Output Encoding

In [10]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [11]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [12]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen=max_length, padding="post"))

In [13]:
padded_doc = padding_doc(encoded_doc, max_length)
padded_doc[:5]

array([[ 25,  77, 332,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 25,  77,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  1,  25, 198, 181,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 51,  10,  77,  16,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  8, 268,   4,  10,  30,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int32)

In [14]:
print("Shape of padded docs = ", padded_doc.shape)

Shape of padded docs =  (1113, 28)


### Tokenizer

In [15]:
# tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters='!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [16]:
output_tokenizer.word_index

{'faq.borrow_limit': 1,
 'contact.contact': 2,
 'commonq.assist': 3,
 'faq.application_process': 4,
 'commonq.not_giving': 5,
 'faq.borrow_use': 6,
 'faq.biz_new': 7,
 'faq.bad_service': 8,
 'commonq.name': 9,
 'commonq.query': 10,
 'commonq.just_details': 11,
 'faq.biz_simpler': 12,
 'faq.aadhaar_missing': 13,
 'faq.address_proof': 14,
 'commonq.bot': 15,
 'commonq.wait': 16,
 'faq.banking_option_missing': 17,
 'faq.apply_register': 18,
 'faq.biz_category_missing': 19,
 'commonq.how': 20,
 'faq.approval_time': 21}

### Encode output given intent and tokenizer and reshape

In [17]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [18]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [19]:
encoded_output.shape

(1113, 1)

### Create one hot encoding
Example of one hot encoding:
Consider a domain of [ a, e, i, o, u] and an intent of [ a, i, u ]
The one hot encoding is [ 1, 0, 1, 0, 1] for the domain.

In [20]:
def one_hot(encode):
    o = OneHotEncoder(sparse=False)
    return(o.fit_transform(encode))

In [21]:
output_one_hot = one_hot(encoded_output)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [22]:
output_one_hot.shape

(1113, 21)

## Create Model

In [24]:
from sklearn.model_selection import train_test_split

In [25]:

train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [26]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (890, 28) and train_Y = (890, 21)
Shape of val_X = (223, 28) and val_Y = (223, 21)


### Sequential Model
[ conv ] -> [ batch norm ] -> [ relu ]

A sequential model allows you to create models layer-by-layer in a step-by-step fashion

We instantiate the sequential model first, then add each layer one at a time.
Layers:
    
    Embedding:
        Vocab size:
        Input length:
    Bidirectional:
        LSTM:
    Dense: Relu
    Dropout: 0.5
    Dense: Softmax

In [27]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(21, activation = "softmax"))
  
  return model

### Compile and give summary of model
#### Compile
A loss function (or objective function, or optimization score function) is one of the two parameters required to compile a model. We use categorical cross entropy to train a CNN to output a probability over the C classes for each image. It is used for multi-class classification. It is also called softmax loss, a softmax activation plus a cross-entropy loss.

Adam is an adaptive learning rate optimization algorithm designed to train deep neural nets. Adam computes individual learning rates for different parameters, using the first and second moments of gradient to adapt the learning rate for each weight of the neural network.



In [28]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 28, 128)           62976     
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 21)                693       
Total params: 335,061
Trainable params: 272,085
Non-trainable params: 62,976
_________________________________________________________________


## Train Model
Uses checkpoint to save best model at each training

In [29]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 890 samples, validate on 223 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 2.89138, saving model to model.h5
Epoch 2/100
Epoch 00002: val_loss improved from 2.89138 to 2.81374, saving model to model.h5
Epoch 3/100
Epoch 00003: val_loss improved from 2.81374 to 2.73351, saving model to model.h5
Epoch 4/100
Epoch 00004: val_loss improved from 2.73351 to 2.70432, saving model to model.h5
Epoch 5/100
Epoch 00005: val_loss improved from 2.70432 to 2.56915, saving model to model.h5
Epoch 6/100
Epoch 00006: val_loss improved from 2.56915 to 2.43322, saving model to model.h5
Epoch 7/100
Epoch 00007: val_loss improved from 2.43322 to 2.39440, saving model to model.h5
Epoch 8/100
Epoch 00008: val_loss improved from 2.39440 to 2.29890, saving model to model.h5
Epoch 9/100
Epoch 00009: val_loss improved from 2.29890 to 2.18376, saving model to model.h5
Epoch 10/100
Epoch 00010: val_loss improved from 2.18376 to 2.02262, saving model to model.h5
Epoch 11/100
Epoch 00011: v

## Load Model
Loads the best model found from training above, model.h5

In [37]:
model = load_model("model.h5")

### Get predicted probability for a given text

In [31]:

def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred

### Get final output for the prediction and the classes of intents

In [32]:

def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))

### Use the model =]

In [33]:

text = "Can you help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['can', 'you', 'help', 'me']
commonQ.assist has confidence = 0.32932204
commonQ.query has confidence = 0.139143
contact.contact has confidence = 0.09659231
commonQ.bot has confidence = 0.08079043
faq.biz_new has confidence = 0.077273935
commonQ.name has confidence = 0.07718031
commonQ.how has confidence = 0.069322
faq.bad_service has confidence = 0.059703138
faq.apply_register has confidence = 0.022363504
commonQ.wait has confidence = 0.014274337
commonQ.not_giving has confidence = 0.013744186
faq.aadhaar_missing has confidence = 0.006254018
commonQ.just_details has confidence = 0.0044244975
faq.application_process has confidence = 0.0040695267
faq.borrow_use has confidence = 0.0026477
faq.borrow_limit has confidence = 0.0013174948
faq.biz_simpler has confidence = 0.00070938957
faq.approval_time has confidence = 0.00048008468
faq.biz_category_missing has confidence = 0.00029588217
faq.banking_option_missing has confidence = 6.383373e-05
faq.address_proof has confidence = 2.8412622e-05


In [38]:

text = "How do I apply for this position?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['how', 'do', 'i', 'apply', 'for', 'this', 'position']
faq.application_process has confidence = 0.9228019
faq.apply_register has confidence = 0.07120995
commonQ.assist has confidence = 0.0022965271
contact.contact has confidence = 0.0022584863
faq.borrow_use has confidence = 0.0011869266
faq.biz_simpler has confidence = 7.222695e-05
faq.approval_time has confidence = 3.9839113e-05
faq.biz_new has confidence = 3.6369358e-05
faq.borrow_limit has confidence = 2.3749713e-05
faq.bad_service has confidence = 2.2953378e-05
faq.biz_category_missing has confidence = 2.0747395e-05
faq.banking_option_missing has confidence = 1.7692144e-05
commonQ.how has confidence = 8.244553e-06
commonQ.wait has confidence = 1.4011844e-06
commonQ.query has confidence = 1.3473338e-06
commonQ.name has confidence = 1.0343318e-06
faq.aadhaar_missing has confidence = 3.699744e-07
faq.address_proof has confidence = 1.3429752e-07
commonQ.just_details has confidence = 1.1053147e-07
commonQ.bot has confidence = 7.3278206

In [40]:

text = "Wait for me"
pred = predictions(text)
get_final_output(pred, unique_intent)

['wait', 'for', 'me']
faq.apply_register has confidence = 0.2630215
commonQ.assist has confidence = 0.1846362
commonQ.how has confidence = 0.119165204
commonQ.wait has confidence = 0.112347364
contact.contact has confidence = 0.07797397
commonQ.name has confidence = 0.052337676
commonQ.bot has confidence = 0.045500476
commonQ.query has confidence = 0.030452987
faq.aadhaar_missing has confidence = 0.027099965
faq.biz_new has confidence = 0.01953664
commonQ.not_giving has confidence = 0.015911063
faq.application_process has confidence = 0.01587559
faq.bad_service has confidence = 0.01562931
commonQ.just_details has confidence = 0.012118423
faq.approval_time has confidence = 0.0032076482
faq.borrow_use has confidence = 0.0016377135
faq.banking_option_missing has confidence = 0.0010719093
faq.biz_category_missing has confidence = 0.00094919227
faq.borrow_limit has confidence = 0.0007261337
faq.address_proof has confidence = 0.00048034557
faq.biz_simpler has confidence = 0.00032065253


In [41]:

text = "Who can I call to help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['who', 'can', 'i', 'call', 'to', 'help', 'me']
faq.apply_register has confidence = 0.89374787
faq.application_process has confidence = 0.036169574
commonQ.assist has confidence = 0.031955425
contact.contact has confidence = 0.030594548
commonQ.how has confidence = 0.005335092
commonQ.wait has confidence = 0.001275502
commonQ.name has confidence = 0.00029222755
faq.bad_service has confidence = 0.00012833613
faq.biz_new has confidence = 0.00011115512
commonQ.query has confidence = 0.00010855391
faq.approval_time has confidence = 8.3946055e-05
commonQ.just_details has confidence = 5.583135e-05
faq.aadhaar_missing has confidence = 3.8708153e-05
faq.banking_option_missing has confidence = 3.30851e-05
faq.borrow_use has confidence = 2.912586e-05
commonQ.bot has confidence = 1.8081726e-05
faq.biz_category_missing has confidence = 7.941922e-06
commonQ.not_giving has confidence = 7.613979e-06
faq.borrow_limit has confidence = 4.9950563e-06
faq.biz_simpler has confidence = 1.3852948e-06
faq.add