In [1]:
# https://towardsdatascience.com/a-brief-introduction-to-intent-classification-96fda6b1f557
# Classifies into 21 intents


import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

# Load Dataset

In [2]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])

  df.Intent = df.Intent.astype(str)
  df.Sentence = df.Sentence.astype(str)
  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)

In [3]:
intent, unique_intent, sentences = load_dataset("questions_categories.csv")

                                Sentence              Intent
0                 When do classes start?  faq.important_date
1          When does the semester start?  faq.important_date
2      What day does the semester start?  faq.important_date
3     Which day does the semester start?  faq.important_date
4  What date does the semester start on?  faq.important_date


# Get stopwords and punkt

In [4]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrusbaker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cyrusbaker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Define stemmer

In [5]:
stemmer = LancasterStemmer()

# Data cleaning

In [6]:
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        # lemmatizing
        words.append([i.lower() for i in w])
    return words

In [7]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])

119
[['when', 'do', 'classes', 'start'], ['when', 'does', 'the', 'semester', 'start']]


# Input encoding

In [8]:
def create_tokenizer( words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    """Create tokenizer
    """
    token = Tokenizer(filters=filters)
    token.fit_on_texts(words)
    return token

def get_max_length(words):
    """Gets max length of a word
    """
    return(len(max(words, key=len)))


In [9]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = get_max_length(cleaned_words)

print("Vocab size = ", vocab_size, " and Maximum length = ", max_length)

Vocab size =  158  and Maximum length =  17


# Output Encoding

In [10]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [11]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [12]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen=max_length, padding="post"))

In [13]:
padded_doc = padding_doc(encoded_doc, max_length)
padded_doc[:5]

array([[ 6,  5, 24, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [ 6, 12,  3, 21, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [ 9, 14, 12,  3, 21, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [66, 14, 12,  3, 21, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [ 9, 94, 12,  3, 21, 17, 46,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]], dtype=int32)

In [14]:
print("Shape of padded docs = ", padded_doc.shape)

Shape of padded docs =  (119, 17)


### Tokenizer

In [15]:
# tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters='!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [16]:
output_tokenizer.word_index

{'class.location': 1,
 'faq.professor': 2,
 'class.time': 3,
 'faq.student': 4,
 'faq.class': 5,
 'department.time': 6,
 'faq.department': 7,
 'faq.important_date': 8,
 'faq.employee': 9,
 'class.professor': 10}

### Encode output given intent and tokenizer and reshape

In [17]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [18]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [19]:
encoded_output.shape

(119, 1)

### Create one hot encoding
Example of one hot encoding:
Consider a domain of [ a, e, i, o, u] and an intent of [ a, i, u ]
The one hot encoding is [ 1, 0, 1, 0, 1] for the domain.

In [20]:
def one_hot(encode):
    o = OneHotEncoder(sparse=False)
    return(o.fit_transform(encode))

In [21]:
output_one_hot = one_hot(encoded_output)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [22]:
output_one_hot.shape

(119, 10)

## Create Model

In [23]:
from sklearn.model_selection import train_test_split

In [24]:

train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [49]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (95, 17) and train_Y = (95, 10)
Shape of val_X = (24, 17) and val_Y = (24, 10)
10


### Sequential Model
[ conv ] -> [ batch norm ] -> [ relu ]

A sequential model allows you to create models layer-by-layer in a step-by-step fashion

We instantiate the sequential model first, then add each layer one at a time.
Layers:
    
    Embedding:
        Vocab size:
        Input length:
    Bidirectional:
        LSTM:
    Dense: Relu
    Dropout: 0.5
    Dense: Softmax

In [36]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(10, activation = "softmax"))
  
  return model

### Compile and give summary of model
#### Compile
A loss function (or objective function, or optimization score function) is one of the two parameters required to compile a model. We use categorical cross entropy to train a CNN to output a probability over the C classes for each image. It is used for multi-class classification. It is also called softmax loss, a softmax activation plus a cross-entropy loss.

Adam is an adaptive learning rate optimization algorithm designed to train deep neural nets. Adam computes individual learning rates for different parameters, using the first and second moments of gradient to adapt the learning rate for each weight of the neural network.



In [37]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 17, 128)           20224     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_8 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                330       
Total params: 291,946
Trainable params: 271,722
Non-trainable params: 20,224
_________________________________________________________________


## Train Model
Uses checkpoint to save best model at each training

In [38]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 95 samples, validate on 24 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 2.27334, saving model to model.h5
Epoch 2/100
Epoch 00002: val_loss improved from 2.27334 to 2.22797, saving model to model.h5
Epoch 3/100
Epoch 00003: val_loss improved from 2.22797 to 2.13948, saving model to model.h5
Epoch 4/100
Epoch 00004: val_loss improved from 2.13948 to 2.03254, saving model to model.h5
Epoch 5/100
Epoch 00005: val_loss improved from 2.03254 to 2.01575, saving model to model.h5
Epoch 6/100
Epoch 00006: val_loss did not improve from 2.01575
Epoch 7/100
Epoch 00007: val_loss did not improve from 2.01575
Epoch 8/100
Epoch 00008: val_loss did not improve from 2.01575
Epoch 9/100
Epoch 00009: val_loss did not improve from 2.01575
Epoch 10/100
Epoch 00010: val_loss did not improve from 2.01575
Epoch 11/100
Epoch 00011: val_loss did not improve from 2.01575
Epoch 12/100
Epoch 00012: val_loss did not improve from 2.01575
Epoch 13/100
Epoch 00013: val_loss did not improve 

Epoch 00029: val_loss did not improve from 1.91873
Epoch 30/100
Epoch 00030: val_loss did not improve from 1.91873
Epoch 31/100
Epoch 00031: val_loss did not improve from 1.91873
Epoch 32/100
Epoch 00032: val_loss did not improve from 1.91873
Epoch 33/100
Epoch 00033: val_loss did not improve from 1.91873
Epoch 34/100
Epoch 00034: val_loss did not improve from 1.91873
Epoch 35/100
Epoch 00035: val_loss did not improve from 1.91873
Epoch 36/100
Epoch 00036: val_loss did not improve from 1.91873
Epoch 37/100
Epoch 00037: val_loss improved from 1.91873 to 1.88770, saving model to model.h5
Epoch 38/100
Epoch 00038: val_loss improved from 1.88770 to 1.84789, saving model to model.h5
Epoch 39/100
Epoch 00039: val_loss did not improve from 1.84789
Epoch 40/100
Epoch 00040: val_loss did not improve from 1.84789
Epoch 41/100
Epoch 00041: val_loss did not improve from 1.84789
Epoch 42/100
Epoch 00042: val_loss did not improve from 1.84789
Epoch 43/100
Epoch 00043: val_loss did not improve from 1

Epoch 00058: val_loss did not improve from 1.84789
Epoch 59/100
Epoch 00059: val_loss did not improve from 1.84789
Epoch 60/100
Epoch 00060: val_loss did not improve from 1.84789
Epoch 61/100
Epoch 00061: val_loss did not improve from 1.84789
Epoch 62/100
Epoch 00062: val_loss did not improve from 1.84789
Epoch 63/100
Epoch 00063: val_loss did not improve from 1.84789
Epoch 64/100
Epoch 00064: val_loss did not improve from 1.84789
Epoch 65/100
Epoch 00065: val_loss did not improve from 1.84789
Epoch 66/100
Epoch 00066: val_loss did not improve from 1.84789
Epoch 67/100
Epoch 00067: val_loss did not improve from 1.84789
Epoch 68/100
Epoch 00068: val_loss did not improve from 1.84789
Epoch 69/100
Epoch 00069: val_loss did not improve from 1.84789
Epoch 70/100
Epoch 00070: val_loss did not improve from 1.84789
Epoch 71/100
Epoch 00071: val_loss did not improve from 1.84789
Epoch 72/100
Epoch 00072: val_loss did not improve from 1.84789
Epoch 73/100
Epoch 00073: val_loss did not improve fr

Epoch 88/100
Epoch 00088: val_loss did not improve from 1.84789
Epoch 89/100
Epoch 00089: val_loss did not improve from 1.84789
Epoch 90/100
Epoch 00090: val_loss did not improve from 1.84789
Epoch 91/100
Epoch 00091: val_loss did not improve from 1.84789
Epoch 92/100
Epoch 00092: val_loss did not improve from 1.84789
Epoch 93/100
Epoch 00093: val_loss did not improve from 1.84789
Epoch 94/100
Epoch 00094: val_loss did not improve from 1.84789
Epoch 95/100
Epoch 00095: val_loss did not improve from 1.84789
Epoch 96/100
Epoch 00096: val_loss did not improve from 1.84789
Epoch 97/100
Epoch 00097: val_loss did not improve from 1.84789
Epoch 98/100
Epoch 00098: val_loss did not improve from 1.84789
Epoch 99/100
Epoch 00099: val_loss did not improve from 1.84789
Epoch 100/100
Epoch 00100: val_loss did not improve from 1.84789


## Load Model
Loads the best model found from training above, model.h5

In [39]:
model = load_model("model.h5")

### Get predicted probability for a given text

In [40]:

def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred

### Get final output for the prediction and the classes of intents

In [41]:

def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))

### Use the model =]

In [46]:

text = "When do classes start?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['when', 'do', 'classes', 'start']
faq.important_date has confidence = 0.66495943
class.location has confidence = 0.30868408
faq.department has confidence = 0.0103741735
faq.professor has confidence = 0.0066116094
department.time has confidence = 0.00581364
faq.student has confidence = 0.0013940028
class.professor has confidence = 0.0011072055
class.time has confidence = 0.00042878397
faq.employee has confidence = 0.0004191445
faq.class has confidence = 0.00020797239


In [43]:

text = "What time is CS448?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['what', 'time', 'is', 'cs448']
faq.important_date has confidence = 0.6010197
class.location has confidence = 0.36131033
faq.department has confidence = 0.015469164
faq.professor has confidence = 0.008923462
department.time has confidence = 0.0077641583
faq.student has confidence = 0.0020513462
class.professor has confidence = 0.0016670878
class.time has confidence = 0.00073204376
faq.employee has confidence = 0.00068655517
faq.class has confidence = 0.0003762511


In [44]:

text = "Why is Cyrus soooo gay"
pred = predictions(text)
get_final_output(pred, unique_intent)

['why', 'is', 'cyrus', 'soooo', 'gay']
faq.important_date has confidence = 0.558418
class.location has confidence = 0.42270595
faq.department has confidence = 0.008368289
faq.professor has confidence = 0.0042926595
department.time has confidence = 0.003756578
faq.student has confidence = 0.0008758987
class.professor has confidence = 0.0007826111
class.time has confidence = 0.0003623263
faq.employee has confidence = 0.00028308245
faq.class has confidence = 0.00015470317


In [45]:

text = "Who can I call to help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['who', 'can', 'i', 'call', 'to', 'help', 'me']
faq.student has confidence = 0.2683525
faq.professor has confidence = 0.14057757
faq.department has confidence = 0.132255
faq.employee has confidence = 0.11118568
class.location has confidence = 0.08039793
faq.class has confidence = 0.07556181
department.time has confidence = 0.07238655
faq.important_date has confidence = 0.052555244
class.professor has confidence = 0.037116643
class.time has confidence = 0.029611086
