In [32]:
# https://towardsdatascience.com/a-brief-introduction-to-intent-classification-96fda6b1f557
# Classifies into 21 intents


import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

# Load Dataset

In [33]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])

  df.Intent=df.Intent.astype(str)
  df.Sentence=df.Sentence.astype(str)

  #df = df.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type='expand')

  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)

In [100]:
intent, unique_intent, sentences = load_dataset("questions_categories.csv")

                                Sentence              Intent
0                 When do classes start?  faq.important_date
1          When does the semester start?  faq.important_date
2      What day does the semester start?  faq.important_date
3     Which day does the semester start?  faq.important_date
4  What date does the semester start on?  faq.important_date


In [101]:
print("Intents: ")
print(intent)
print("Unique Intents: ")
print(unique_intent)
print("Sentences: ")
print(sentences)


Intents: 
0      faq.important_date
1      faq.important_date
2      faq.important_date
3      faq.important_date
4      faq.important_date
              ...        
114          faq.employee
115          faq.employee
116           faq.student
117           faq.student
118         faq.professor
Name: Intent, Length: 119, dtype: object
Unique Intents: 
['faq.professor', 'faq.department', 'class.professor', 'faq.employee', 'department.time', 'faq.important_date', 'faq.class', 'faq.student', 'class.location', 'class.time']
Sentences: 
['When do classes start?', 'When does the semester start?', 'What day does the semester start?', 'Which day does the semester start?', 'What date does the semester start on?', 'When does school start?', 'What day does school start?', 'What is the first day of class?', 'What day do classes start?', 'What day do classes end?', 'When do classes end?', 'When does the semester end?', 'When is the last day of classes?', 'What day does the semester end on?', 'When 

# Get stopwords and punkt

In [102]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kathrine.swe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kathrine.swe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Define stemmer

In [103]:
stemmer = LancasterStemmer()

# Data cleaning

In [104]:
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        # lemmatizing
        words.append([i.lower() for i in w])
    return words

In [105]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])

119
[['when', 'do', 'classes', 'start'], ['when', 'does', 'the', 'semester', 'start']]


# Input encoding

In [106]:
def create_tokenizer( words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    """Create tokenizer
    """
    token = Tokenizer(filters=filters)
    token.fit_on_texts(words)
    return token

def get_max_length(words):
    """Gets max length of a word
    """
    return(len(max(words, key=len)))


In [107]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = get_max_length(cleaned_words)

print("Vocab size = ", vocab_size, " and Maximum length = ", max_length)

Vocab size =  158  and Maximum length =  17


# Output Encoding

In [108]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [109]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [110]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen=max_length, padding="post"))

In [111]:
padded_doc = padding_doc(encoded_doc, max_length)
padded_doc[:5]

array([[ 6,  5, 24, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [ 6, 12,  3, 21, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [ 9, 14, 12,  3, 21, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [66, 14, 12,  3, 21, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [ 9, 94, 12,  3, 21, 17, 46,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]], dtype=int32)

In [112]:
print("Shape of padded docs = ", padded_doc.shape)

Shape of padded docs =  (119, 17)


### Tokenizer

In [113]:
# tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters='!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [114]:
output_tokenizer.word_index

{'faq.professor': 1,
 'faq.department': 2,
 'class.professor': 3,
 'faq.employee': 4,
 'department.time': 5,
 'faq.important_date': 6,
 'faq.class': 7,
 'faq.student': 8,
 'class.location': 9,
 'class.time': 10}

### Encode output given intent and tokenizer and reshape

In [115]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [117]:
print(encoded_output)

[[6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [9], [3], [3], [3], [3], [3], [7], [7], [7], [7], [7], [10], [10], [10], [5], [5], [5], [5], [5], [2], [2], [2], [2], [2], [2], [2], [2], [2], [2], [1], [1], [1], [1], [1], [1], [1], [8], [8], [2], [8], [8], [8], [8], [8], [8], [8], [2], [2], [8], [8], [4], [4], [4], [4], [4], [8], [8], [1]]


In [118]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [119]:
encoded_output.shape

(119, 1)

In [120]:
print(encoded_output)

[[ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 9]
 [ 3]
 [ 3]
 [ 3]
 [ 3]
 [ 3]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [ 7]
 [10]
 [10]
 [10]
 [ 5]
 [ 5]
 [ 5]
 [ 5]
 [ 5]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 1]
 [ 8]
 [ 8]
 [ 2]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 8]
 [ 2]
 [ 2]
 [ 8]
 [ 8]
 [ 4]
 [ 4]
 [ 4]
 [ 4]
 [ 4]
 [ 8]
 [ 8]
 [ 1]]


### Create one hot encoding
Example of one hot encoding:
Consider a domain of [ a, e, i, o, u] and an intent of [ a, i, u ]
The one hot encoding is [ 1, 0, 1, 0, 1] for the domain.

In [121]:
def one_hot(encode):
    o = OneHotEncoder(sparse=False)
    return(o.fit_transform(encode))

In [122]:
output_one_hot = one_hot(encoded_output)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [123]:
output_one_hot.shape

(119, 10)

## Create Model

In [124]:
from sklearn.model_selection import train_test_split

In [125]:

train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [126]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (95, 17) and train_Y = (95, 10)
Shape of val_X = (24, 17) and val_Y = (24, 10)


### Sequential Model
[ conv ] -> [ batch norm ] -> [ relu ]

A sequential model allows you to create models layer-by-layer in a step-by-step fashion

We instantiate the sequential model first, then add each layer one at a time.
Layers:
    
    Embedding:
        Vocab size:
        Input length:
    Bidirectional:
        LSTM:
    Dense: Relu
    Dropout: 0.5
    Dense: Softmax

In [130]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(len(unique_intent), activation = "softmax"))
  
  return model

In [131]:
print(vocab_size, max_length)
print(len(unique_intent))

158 17
10


### Compile and give summary of model
#### Compile
A loss function (or objective function, or optimization score function) is one of the two parameters required to compile a model. We use categorical cross entropy to train a CNN to output a probability over the C classes for each image. It is used for multi-class classification. It is also called softmax loss, a softmax activation plus a cross-entropy loss.

Adam is an adaptive learning rate optimization algorithm designed to train deep neural nets. Adam computes individual learning rates for different parameters, using the first and second moments of gradient to adapt the learning rate for each weight of the neural network.



In [132]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 17, 128)           20224     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_10 (Dense)             (None, 32)                8224      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                330       
Total params: 291,946
Trainable params: 271,722
Non-trainable params: 20,224
_________________________________________________________________


## Train Model
Uses checkpoint to save best model at each training

In [133]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 95 samples, validate on 24 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 2.27188, saving model to model.h5
Epoch 2/100
Epoch 00002: val_loss improved from 2.27188 to 2.22835, saving model to model.h5
Epoch 3/100
Epoch 00003: val_loss improved from 2.22835 to 2.15434, saving model to model.h5
Epoch 4/100
Epoch 00004: val_loss improved from 2.15434 to 2.03957, saving model to model.h5
Epoch 5/100
Epoch 00005: val_loss improved from 2.03957 to 1.96964, saving model to model.h5
Epoch 6/100
Epoch 00006: val_loss improved from 1.96964 to 1.94586, saving model to model.h5
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.94586
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.94586
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.94586
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.94586
Epoch 11/100
Epoch 00011: val_loss did not improve from 1.94586
Epoch 12/100
Epoch 00012: val_loss did not improve from 1.94586
Epoch 13/100
Epoch 00

## Load Model
Loads the best model found from training above, model.h5

In [134]:
model = load_model("model.h5")

### Get predicted probability for a given text

In [135]:

def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred

### Get final output for the prediction and the classes of intents

In [136]:

def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))

### Use the model =]

In [137]:

text = "Can you help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['can', 'you', 'help', 'me']
department.time has confidence = 0.21431595
faq.department has confidence = 0.17673804
class.location has confidence = 0.15498534
class.professor has confidence = 0.14523438
faq.professor has confidence = 0.09591928
faq.student has confidence = 0.075154
faq.important_date has confidence = 0.051016223
class.time has confidence = 0.036218736
faq.employee has confidence = 0.027541487
faq.class has confidence = 0.022876546


In [138]:

text = "How do I apply for this position?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['how', 'do', 'i', 'apply', 'for', 'this', 'position']
faq.student has confidence = 0.2879084
faq.department has confidence = 0.18928896
faq.class has confidence = 0.15100592
faq.professor has confidence = 0.13906564
faq.employee has confidence = 0.08079777
department.time has confidence = 0.05024669
class.location has confidence = 0.037927132
class.professor has confidence = 0.031630915
faq.important_date has confidence = 0.018002763
class.time has confidence = 0.014125747


In [139]:

text = "Wait for me"
pred = predictions(text)
get_final_output(pred, unique_intent)

['wait', 'for', 'me']
faq.important_date has confidence = 0.4579492
class.location has confidence = 0.41506466
class.professor has confidence = 0.11208097
faq.department has confidence = 0.0049666595
department.time has confidence = 0.0033122844
faq.student has confidence = 0.0032640186
class.time has confidence = 0.0025997008
faq.employee has confidence = 0.00046487432
faq.professor has confidence = 0.00022244213
faq.class has confidence = 7.513198e-05


In [141]:

text = "Where is the bathroom?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['where', 'is', 'the', 'bathroom']
class.location has confidence = 0.8445389
class.professor has confidence = 0.05849019
faq.important_date has confidence = 0.056206543
faq.department has confidence = 0.01820569
faq.student has confidence = 0.009420158
department.time has confidence = 0.0061977515
faq.employee has confidence = 0.002738522
class.time has confidence = 0.0026168828
faq.professor has confidence = 0.0012651766
faq.class has confidence = 0.00032005287


In [142]:

text = "Where is the CS Lab?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['where', 'is', 'the', 'cs', 'lab']
class.location has confidence = 0.8445389
class.professor has confidence = 0.05849019
faq.important_date has confidence = 0.056206543
faq.department has confidence = 0.01820569
faq.student has confidence = 0.009420158
department.time has confidence = 0.0061977515
faq.employee has confidence = 0.002738522
class.time has confidence = 0.0026168828
faq.professor has confidence = 0.0012651766
faq.class has confidence = 0.00032005287
