In [1]:
# https://towardsdatascience.com/a-brief-introduction-to-intent-classification-96fda6b1f557
# Classifies into 21 intents


import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

# Load Dataset

In [2]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])

  df.Intent=df.Intent.astype(str)
  df.Sentence=df.Sentence.astype(str)

  #df = df.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type='expand')

  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)

In [3]:
intent, unique_intent, sentences = load_dataset("intents_and_categories.csv")

                                            Sentence  \
0                             When do classes start?   
1                      When does the semester start?   
2                  What day does the semester start?   
3           I want to know when the semester starts.   
4  I was wondering when the beginning of the seme...   

                          Intent  
0   important_date.class_actions  
1  important_date.semester_start  
2  important_date.semester_start  
3  important_date.semester_start  
4  important_date.semester_start  


In [4]:
print("Intents: ")
print(intent)
print("Unique Intents: ")
print(unique_intent)
print("Sentences: ")
print(sentences)


Intents: 
0       important_date.class_actions
1      important_date.semester_start
2      important_date.semester_start
3      important_date.semester_start
4      important_date.semester_start
                   ...              
129                     employee.pay
130                     employee.pay
131                     undetermined
132                     undetermined
133          professor.help.meetings
Name: Intent, Length: 134, dtype: object
Unique Intents: 
['class.time', 'important_date.semester_end', 'important_date.break', 'important_date.drop_class', 'important_date.add_class', 'important_date.semester_start', 'important_date.graduation', 'professor.help.meetings', 'student.actions', 'important_date.registration', 'student.tutor', 'student.degree', 'cs_department.department_head', 'class.assistant', 'cs_department.advisor', 'professor.general_information', 'cs_department.candy', 'employee.pay', 'class.professor', 'location.lost_and_found', 'professor.actions', 'importa

# Get stopwords and punkt

In [5]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kathrine.swe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kathrine.swe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Define stemmer

In [6]:
stemmer = LancasterStemmer()

# Data cleaning

In [7]:
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        # lemmatizing
        words.append([i.lower() for i in w])
    return words

In [8]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])

134
[['when', 'do', 'classes', 'start'], ['when', 'does', 'the', 'semester', 'start']]


# Input encoding

In [9]:
def create_tokenizer( words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    """Create tokenizer
    """
    token = Tokenizer(filters=filters)
    token.fit_on_texts(words)
    return token

def get_max_length(words):
    """Gets max length of a word
    """
    return(len(max(words, key=len)))


In [10]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = get_max_length(cleaned_words)

print("Vocab size = ", vocab_size, " and Maximum length = ", max_length)

Vocab size =  172  and Maximum length =  17


# Output Encoding

In [11]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [12]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [13]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen=max_length, padding="post"))

In [14]:
padded_doc = padding_doc(encoded_doc, max_length)
padded_doc[:5]

array([[  5,   6,  25,  18,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  5,  12,   3,  19,  18,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [ 10,  15,  12,   3,  19,  18,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  1,  29,   8,  36,   5,   3,  19, 102,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  1, 103, 104,   5,   3, 105,  23,   3,  19,   2,   0,   0,   0,
          0,   0,   0,   0]], dtype=int32)

In [15]:
print("Shape of padded docs = ", padded_doc.shape)

Shape of padded docs =  (134, 17)


### Tokenizer

In [16]:
# tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters='!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [17]:
output_tokenizer.word_index

{'class.time': 1,
 'important_date.semester_end': 2,
 'important_date.break': 3,
 'important_date.drop_class': 4,
 'important_date.add_class': 5,
 'important_date.semester_start': 6,
 'important_date.graduation': 7,
 'professor.help.meetings': 8,
 'student.actions': 9,
 'important_date.registration': 10,
 'student.tutor': 11,
 'student.degree': 12,
 'cs_department.department_head': 13,
 'class.assistant': 14,
 'cs_department.advisor': 15,
 'professor.general_information': 16,
 'cs_department.candy': 17,
 'employee.pay': 18,
 'class.professor': 19,
 'location.lost_and_found': 20,
 'professor.actions': 21,
 'important_date.finals': 22,
 'class.schedule': 23,
 'undetermined': 24,
 'important_date.class_actions': 25,
 'cs_department.employee': 26,
 'class.actions': 27,
 'professor.contact': 28,
 'location.room': 29,
 'professor.office_hours': 30,
 'cs_department.time': 31,
 'cs_department.lost_and_found': 32}

### Encode output given intent and tokenizer and reshape

In [18]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [19]:
print(encoded_output)

[[25], [6], [6], [6], [6], [6], [6], [6], [6], [6], [6], [2], [2], [2], [2], [2], [2], [3], [22], [22], [22], [22], [3], [3], [3], [10], [10], [10], [5], [5], [4], [7], [7], [3], [3], [3], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [29], [20], [29], [29], [29], [29], [29], [30], [28], [14], [16], [19], [27], [27], [27], [27], [27], [1], [1], [1], [31], [31], [31], [31], [31], [32], [32], [15], [26], [13], [13], [13], [13], [13], [13], [13], [21], [21], [21], [21], [21], [21], [9], [9], [23], [11], [11], [12], [12], [12], [12], [12], [17], [17], [17], [17], [18], [18], [18], [18], [18], [24], [24], [8]]


In [20]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [21]:
encoded_output.shape

(134, 1)

In [22]:
print(encoded_output)

[[25]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 6]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 2]
 [ 3]
 [22]
 [22]
 [22]
 [22]
 [ 3]
 [ 3]
 [ 3]
 [10]
 [10]
 [10]
 [ 5]
 [ 5]
 [ 4]
 [ 7]
 [ 7]
 [ 3]
 [ 3]
 [ 3]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [29]
 [20]
 [29]
 [29]
 [29]
 [29]
 [29]
 [30]
 [28]
 [14]
 [16]
 [19]
 [27]
 [27]
 [27]
 [27]
 [27]
 [ 1]
 [ 1]
 [ 1]
 [31]
 [31]
 [31]
 [31]
 [31]
 [32]
 [32]
 [15]
 [26]
 [13]
 [13]
 [13]
 [13]
 [13]
 [13]
 [13]
 [21]
 [21]
 [21]
 [21]
 [21]
 [21]
 [ 9]
 [ 9]
 [23]
 [11]
 [11]
 [12]
 [12]
 [12]
 [12]
 [12]
 [17]
 [17]
 [17]
 [17]
 [18]
 [18]
 [18]
 [18]
 [18]
 [24]
 [24]
 [ 8]]


### Create one hot encoding
Example of one hot encoding:
Consider a domain of [ a, e, i, o, u] and an intent of [ a, i, u ]
The one hot encoding is [ 1, 0, 1, 0, 1] for the domain.

In [23]:
def one_hot(encode):
    o = OneHotEncoder(sparse=False)
    return(o.fit_transform(encode))

In [24]:
output_one_hot = one_hot(encoded_output)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [25]:
output_one_hot.shape

(134, 32)

## Create Model

In [26]:
from sklearn.model_selection import train_test_split

In [27]:

train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)

In [28]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (107, 17) and train_Y = (107, 32)
Shape of val_X = (27, 17) and val_Y = (27, 32)


### Sequential Model
[ conv ] -> [ batch norm ] -> [ relu ]

A sequential model allows you to create models layer-by-layer in a step-by-step fashion

We instantiate the sequential model first, then add each layer one at a time.
Layers:
    
    Embedding:
        Vocab size:
        Input length:
    Bidirectional:
        LSTM:
    Dense: Relu
    Dropout: 0.5
    Dense: Softmax

In [29]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(len(unique_intent), activation = "softmax"))
  
  return model

In [30]:
print(vocab_size, max_length)
print(len(unique_intent))

172 17
32


### Compile and give summary of model
#### Compile
A loss function (or objective function, or optimization score function) is one of the two parameters required to compile a model. We use categorical cross entropy to train a CNN to output a probability over the C classes for each image. It is used for multi-class classification. It is also called softmax loss, a softmax activation plus a cross-entropy loss.

Adam is an adaptive learning rate optimization algorithm designed to train deep neural nets. Adam computes individual learning rates for different parameters, using the first and second moments of gradient to adapt the learning rate for each weight of the neural network.



In [31]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 17, 128)           22016     
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
Total params: 294,464
Trainable params: 272,448
Non-trainable params: 22,016
_________________________________________________________________


## Train Model
Uses checkpoint to save best model at each training

In [32]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 107 samples, validate on 27 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 3.41008, saving model to model.h5
Epoch 2/100
Epoch 00002: val_loss improved from 3.41008 to 3.31614, saving model to model.h5
Epoch 3/100
Epoch 00003: val_loss improved from 3.31614 to 3.08467, saving model to model.h5
Epoch 4/100
Epoch 00004: val_loss improved from 3.08467 to 2.88996, saving model to model.h5
Epoch 5/100
Epoch 00005: val_loss did not improve from 2.88996
Epoch 6/100
Epoch 00006: val_loss did not improve from 2.88996
Epoch 7/100
Epoch 00007: val_loss did not improve from 2.88996
Epoch 8/100
Epoch 00008: val_loss improved from 2.88996 to 2.82574, saving model to model.h5
Epoch 9/100
Epoch 00009: val_loss did not improve from 2.82574
Epoch 10/100
Epoch 00010: val_loss did not improve from 2.82574
Epoch 11/100
Epoch 00011: val_loss did not improve from 2.82574
Epoch 12/100
Epoch 00012: val_loss did not improve from 2.82574
Epoch 13/100
Epoch 00013: val_loss did not improve

## Load Model
Loads the best model found from training above, model.tf

In [33]:
model = load_model("model.h5")

### Get predicted probability for a given text

In [34]:

def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred

### Get final output for the prediction and the classes of intents

In [35]:

def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))

### Use the model =]

In [36]:

text = "Where is the bathroom?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['where', 'is', 'the', 'bathroom']
location.room has confidence = 0.9996063
student.degree has confidence = 0.00015015986
important_date.break has confidence = 0.00014791139
cs_department.candy has confidence = 6.617663e-05
cs_department.department_head has confidence = 2.1902215e-05
student.tutor has confidence = 3.064558e-06
student.actions has confidence = 1.1676095e-06
important_date.semester_start has confidence = 8.4472487e-07
cs_department.time has confidence = 6.812841e-07
professor.actions has confidence = 3.6305713e-07
important_date.finals has confidence = 3.018916e-07
professor.contact has confidence = 2.7457568e-07
employee.pay has confidence = 1.8582833e-07
location.lost_and_found has confidence = 1.3947381e-07
important_date.semester_end has confidence = 1.3061843e-07
class.actions has confidence = 1.18704975e-07
important_date.graduation has confidence = 1.086158e-07
important_date.class_actions has confidence = 2.843027e-08
class.assistant has confidence = 2.5561594e-0

In [37]:
# Missing a category for this
text = "How do I apply for this position?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['how', 'do', 'i', 'apply', 'for', 'this', 'position']
location.room has confidence = 0.22405767
student.degree has confidence = 0.17415778
important_date.break has confidence = 0.15930165
cs_department.candy has confidence = 0.06802481
cs_department.department_head has confidence = 0.064941704
professor.actions has confidence = 0.042349346
student.tutor has confidence = 0.035329804
class.actions has confidence = 0.03477942
cs_department.time has confidence = 0.025120987
important_date.finals has confidence = 0.014790873
employee.pay has confidence = 0.014046697
important_date.graduation has confidence = 0.012799045
professor.general_information has confidence = 0.012777041
professor.contact has confidence = 0.012455371
location.lost_and_found has confidence = 0.011779281
student.actions has confidence = 0.011028904
important_date.add_class has confidence = 0.0087904595
important_date.semester_start has confidence = 0.007919656
class.time has confidence = 0.007057428
class.schedule has

In [39]:

text = "When does class start"
pred = predictions(text)
get_final_output(pred, unique_intent)

['when', 'does', 'class', 'start']
important_date.semester_start has confidence = 0.21085246
important_date.semester_end has confidence = 0.19307576
cs_department.time has confidence = 0.07706747
class.time has confidence = 0.07469574
important_date.break has confidence = 0.06873134
employee.pay has confidence = 0.048331194
important_date.registration has confidence = 0.044060472
cs_department.department_head has confidence = 0.029015362
important_date.graduation has confidence = 0.0268216
cs_department.candy has confidence = 0.020679187
important_date.add_class has confidence = 0.020010762
important_date.finals has confidence = 0.017509388
student.actions has confidence = 0.016473062
cs_department.advisor has confidence = 0.014108512
location.lost_and_found has confidence = 0.013121838
important_date.class_actions has confidence = 0.012605652
professor.actions has confidence = 0.011815542
professor.contact has confidence = 0.011517313
class.schedule has confidence = 0.011372663
locati

In [42]:

text = "What day is the first day of class"
pred = predictions(text)
get_final_output(pred, unique_intent)

['what', 'day', 'is', 'the', 'first', 'day', 'of', 'class']
important_date.semester_start has confidence = 0.47262448
important_date.semester_end has confidence = 0.23996478
class.time has confidence = 0.0572898
cs_department.time has confidence = 0.041748505
important_date.break has confidence = 0.040201973
employee.pay has confidence = 0.022894034
important_date.registration has confidence = 0.021745028
important_date.graduation has confidence = 0.0155054405
cs_department.department_head has confidence = 0.011088861
location.room has confidence = 0.009305677
cs_department.candy has confidence = 0.008228852
student.actions has confidence = 0.0068848077
important_date.finals has confidence = 0.006128978
location.lost_and_found has confidence = 0.0049523483
important_date.add_class has confidence = 0.004923139
professor.contact has confidence = 0.004215413
cs_department.advisor has confidence = 0.0037628065
class.assistant has confidence = 0.0037062506
class.schedule has confidence = 0.

In [43]:

text = "What days off do we have?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['what', 'days', 'off', 'do', 'we', 'have']
important_date.break has confidence = 0.10852956
cs_department.department_head has confidence = 0.06220601
cs_department.candy has confidence = 0.06135723
cs_department.time has confidence = 0.05287456
important_date.semester_end has confidence = 0.048279174
employee.pay has confidence = 0.046318375
important_date.semester_start has confidence = 0.045832865
professor.actions has confidence = 0.04221223
student.degree has confidence = 0.041241072
class.time has confidence = 0.03780835
location.room has confidence = 0.0361469
important_date.graduation has confidence = 0.03488124
class.actions has confidence = 0.034471262
important_date.finals has confidence = 0.033373248
student.actions has confidence = 0.028275525
professor.contact has confidence = 0.026553785
student.tutor has confidence = 0.025194345
location.lost_and_found has confidence = 0.024997849
important_date.add_class has confidence = 0.023661029
important_date.registration has conf