<a href="https://colab.research.google.com/github/Dark-Sied/Intent_Classification/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout,LayerNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf
from tensorflow import keras
import spacy


In [2]:
#from bert4keras.models import build_transformer_model
#from bert4keras.tokenizers import Tokenizer
#config_path = '../bert/bert_config.json'
#checkpoint_path = '../bert/bert_model.ckpt'
#dict_path = '../bert/vocab.txt'
#tokenizer = Tokenizer(dict_path, do_lower_case=True)
#model = build_transformer_model(config_path, checkpoint_path)

In [3]:
#token_ids, segment_ids = tokenizer.encode(u'语言模型')

#print('\n ===== predicting =====\n')
#print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

In [4]:
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [5]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)
  


In [6]:
intent, unique_intent, sentences = load_dataset("Dataset.csv")

                Sentence          Intent
0       Need help pleese  commonQ.assist
1              Need help  commonQ.assist
2       I need some info  commonQ.assist
3      Will you help me?  commonQ.assist
4  What else can you do?  commonQ.assist


In [7]:
print(sentences[:5])

['Need help pleese', 'Need help', 'I need some info', 'Will you help me?', 'What else can you do?']


In [8]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cklam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cklam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
#define stemmer
stemmer = LancasterStemmer()

In [10]:
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    #stemming
    words.append([i.lower() for i in w])
    
  return words  

In [11]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
  


1113
[['need', 'help', 'pleese'], ['need', 'help']]


In [12]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [13]:
def max_length(words):
  return(len(max(words, key = len)))
  

In [14]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)


print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 492 and Maximum length = 28


In [15]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [16]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)
word_index = word_tokenizer.word_index

In [17]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [18]:
padded_doc = padding_doc(encoded_doc, max_length)

In [19]:
padded_doc[:5]

array([[ 25,  77, 332,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 25,  77,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  1,  25, 198, 181,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 51,  10,  77,  16,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  8, 268,   4,  10,  30,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])

In [20]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (1113, 28)


In [21]:
# 轉成 Embedding 層的 input vector
num_words = min(20000,len(word_index)+1)
embedding_matrix = np.zeros((num_words, 100))
for word, i in word_index.items():
    if i >= len(word_index):
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# 載入預訓模型，trainable = False 表示不重新計算
embedding_layer = Embedding(num_words,
                            100,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

In [22]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')


In [23]:
output_tokenizer.word_index

{'faq.biz_new': 1,
 'faq.borrow_limit': 2,
 'commonq.just_details': 3,
 'commonq.bot': 4,
 'faq.address_proof': 5,
 'faq.apply_register': 6,
 'faq.banking_option_missing': 7,
 'faq.bad_service': 8,
 'faq.approval_time': 9,
 'faq.aadhaar_missing': 10,
 'faq.biz_category_missing': 11,
 'commonq.wait': 12,
 'faq.biz_simpler': 13,
 'commonq.how': 14,
 'commonq.name': 15,
 'commonq.assist': 16,
 'faq.application_process': 17,
 'faq.borrow_use': 18,
 'commonq.not_giving': 19,
 'contact.contact': 20,
 'commonq.query': 21}

In [24]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [25]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [26]:
encoded_output.shape

(1113, 1)

In [27]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [28]:
output_one_hot = one_hot(encoded_output)

In [29]:
output_one_hot.shape

(1113, 21)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)


In [32]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (890, 28) and train_Y = (890, 21)
Shape of val_X = (223, 28) and val_Y = (223, 21)


In [33]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 100, input_length = max_length, trainable = False, weights=[embedding_matrix]))
#  model.add(Embedding(vocab_size, 100, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(100, return_sequences=True)))
  model.add(Bidirectional(LSTM(50)))
#   model.add(LSTM(128))
  model.add(Dense(32, activation = "relu"))
  model.add(LayerNormalization())
  model.add(Dropout(0.3))
#  model.add(Dense(32, activation = "relu"))
#  model.add(LayerNormalization())
#  model.add(Dropout(0.3))
  model.add(Dense(21, activation = "softmax"))
  
  return model

In [34]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adamax", metrics = ["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 100)           49200     
_________________________________________________________________
bidirectional (Bidirectional (None, 28, 200)           160800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100)               100400    
_________________________________________________________________
dense (Dense)                (None, 32)                3232      
_________________________________________________________________
layer_normalization (LayerNo (None, 32)                64        
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 21)                6

In [35]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint,earlystop])

Train on 890 samples, validate on 223 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 2.19133, saving model to model.h5
Epoch 2/100
Epoch 00002: val_loss improved from 2.19133 to 1.75076, saving model to model.h5
Epoch 3/100
Epoch 00003: val_loss improved from 1.75076 to 1.53985, saving model to model.h5
Epoch 4/100
Epoch 00004: val_loss improved from 1.53985 to 1.30849, saving model to model.h5
Epoch 5/100
Epoch 00005: val_loss improved from 1.30849 to 1.17329, saving model to model.h5
Epoch 6/100
Epoch 00006: val_loss improved from 1.17329 to 1.05542, saving model to model.h5
Epoch 7/100
Epoch 00007: val_loss improved from 1.05542 to 0.97566, saving model to model.h5
Epoch 8/100
Epoch 00008: val_loss improved from 0.97566 to 0.87641, saving model to model.h5
Epoch 9/100
Epoch 00009: val_loss improved from 0.87641 to 0.83400, saving model to model.h5
Epoch 10/100
Epoch 00010: val_loss improved from 0.83400 to 0.76908, saving model to model.h5
Epoch 11/100
Epoch 00011: v

In [36]:
model = load_model("model.h5")

In [37]:
def pos(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text) # English: 'Where are you?'
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)

In [38]:
def predictions(text):
  pos(text)
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred


  

In [39]:
def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))



In [40]:
import pickle
with open('word_tokenizer.pickle', 'wb') as handle:
    pickle.dump(word_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
text = "Can you help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)

Can can AUX MD aux Xxx True True
you you PRON PRP nsubj xxx True True
help help VERB VB ROOT xxxx True False
me I PRON PRP dobj xx True True
? ? PUNCT . punct ? False False
['can', 'you', 'help', 'me']
commonQ.assist has confidence = 0.65746045
commonQ.name has confidence = 0.06545326
commonQ.how has confidence = 0.034884408
faq.bad_service has confidence = 0.030022996
commonQ.just_details has confidence = 0.026091795
faq.borrow_limit has confidence = 0.02516772
faq.aadhaar_missing has confidence = 0.023635486
faq.application_process has confidence = 0.021200769
commonQ.query has confidence = 0.02079014
commonQ.not_giving has confidence = 0.017898811
commonQ.wait has confidence = 0.015623264
faq.borrow_use has confidence = 0.014280699
faq.biz_simpler has confidence = 0.011900881
faq.banking_option_missing has confidence = 0.010449684
faq.biz_category_missing has confidence = 0.008156389
faq.apply_register has confidence = 0.005921877
commonQ.bot has confidence = 0.0028822846
faq.addres

In [42]:
text = "what is the status of machine"
pred = predictions(text)
get_final_output(pred, unique_intent)

what what PRON WP attr xxxx True True
is be AUX VBZ ROOT xx True True
the the DET DT det xxx True True
status status NOUN NN nsubj xxxx True False
of of ADP IN prep xx True True
machine machine NOUN NN pobj xxxx True False
['what', 'is', 'the', 'status', 'of', 'machine']
commonQ.bot has confidence = 0.44582468
faq.apply_register has confidence = 0.25063354
contact.contact has confidence = 0.08918841
faq.application_process has confidence = 0.052796543
commonQ.query has confidence = 0.02955908
commonQ.name has confidence = 0.025358351
commonQ.how has confidence = 0.016818201
faq.biz_new has confidence = 0.013923896
faq.approval_time has confidence = 0.013074705
faq.bad_service has confidence = 0.010916483
commonQ.not_giving has confidence = 0.009460566
faq.address_proof has confidence = 0.008145311
faq.borrow_use has confidence = 0.006877996
commonQ.assist has confidence = 0.006130393
commonQ.wait has confidence = 0.0051999525
faq.biz_simpler has confidence = 0.0051171454
commonQ.just_d

In [43]:
text = "How to subscribe a IaaS resource"
pred = predictions(text)
get_final_output(pred, unique_intent)

How how ADV WRB advmod Xxx True True
to to PART TO aux xx True True
subscribe subscribe VERB VB ROOT xxxx True False
a a DET DT det x True True
IaaS IaaS PROPN NNP compound XxxX True False
resource resource NOUN NN dobj xxxx True False
['how', 'to', 'subscribe', 'a', 'iaas', 'resource']
faq.application_process has confidence = 0.38295612
commonQ.query has confidence = 0.20202105
faq.apply_register has confidence = 0.19973065
commonQ.bot has confidence = 0.066315725
commonQ.how has confidence = 0.05262089
commonQ.assist has confidence = 0.015372348
contact.contact has confidence = 0.01315653
faq.bad_service has confidence = 0.012977205
faq.approval_time has confidence = 0.012356449
commonQ.name has confidence = 0.0089979945
commonQ.not_giving has confidence = 0.007391835
faq.biz_simpler has confidence = 0.0049456125
faq.address_proof has confidence = 0.003961111
commonQ.wait has confidence = 0.0035287107
faq.biz_new has confidence = 0.0035102002
faq.aadhaar_missing has confidence = 0.00

In [44]:
text = "Really bad support"
pred = predictions(text)
get_final_output(pred, unique_intent)

Really really ADV RB advmod Xxxxx True True
bad bad ADJ JJ amod xxx True False
support support NOUN NN ROOT xxxx True False
['really', 'bad', 'support']
faq.bad_service has confidence = 0.40887183
faq.aadhaar_missing has confidence = 0.26385894
faq.application_process has confidence = 0.05557092
commonQ.name has confidence = 0.049335055
commonQ.wait has confidence = 0.047366463
commonQ.assist has confidence = 0.022716876
commonQ.how has confidence = 0.022009714
commonQ.not_giving has confidence = 0.020639041
commonQ.query has confidence = 0.01553854
faq.borrow_limit has confidence = 0.0151531035
faq.biz_category_missing has confidence = 0.012907332
commonQ.bot has confidence = 0.011364317
commonQ.just_details has confidence = 0.010146258
faq.borrow_use has confidence = 0.010041847
faq.address_proof has confidence = 0.010033593
faq.banking_option_missing has confidence = 0.007573587
faq.biz_simpler has confidence = 0.00617479
faq.biz_new has confidence = 0.0036256227
faq.approval_time h

In [45]:
text = "What is the status of PVM20210607"
pred = predictions(text)
get_final_output(pred, unique_intent)

What what PRON WP attr Xxxx True True
is be AUX VBZ ROOT xx True True
the the DET DT det xxx True True
status status NOUN NN nsubj xxxx True False
of of ADP IN prep xx True True
PVM20210607 pvm20210607 NOUN NN pobj XXXdddd False False
['what', 'is', 'the', 'status', 'of', 'pvm20210607']
contact.contact has confidence = 0.428287
faq.application_process has confidence = 0.14621717
commonQ.name has confidence = 0.107909255
faq.apply_register has confidence = 0.07806759
commonQ.query has confidence = 0.035805337
commonQ.how has confidence = 0.0357539
commonQ.not_giving has confidence = 0.027166702
commonQ.bot has confidence = 0.024286345
faq.borrow_limit has confidence = 0.020852601
commonQ.assist has confidence = 0.018779764
faq.biz_simpler has confidence = 0.01683103
faq.approval_time has confidence = 0.011606105
faq.address_proof has confidence = 0.011262473
faq.bad_service has confidence = 0.009746549
commonQ.just_details has confidence = 0.008879144
commonQ.wait has confidence = 0.005