<a href="https://colab.research.google.com/github/Dark-Sied/Intent_Classification/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout,LayerNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf
from tensorflow import keras


In [57]:
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
config_path = '/bert/bert_config.json'
checkpoint_path = '/bert/bert_model.ckpt'
dict_path = '/bert/vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)
model = build_transformer_model(config_path, checkpoint_path)

AttributeError: module 'keras.utils.generic_utils' has no attribute 'populate_dict_with_module_objects'

In [None]:
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

In [2]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)
  


In [3]:
intent, unique_intent, sentences = load_dataset("Dataset.csv")

                Sentence          Intent
0       Need help pleese  commonQ.assist
1              Need help  commonQ.assist
2       I need some info  commonQ.assist
3      Will you help me?  commonQ.assist
4  What else can you do?  commonQ.assist


In [4]:
print(sentences[:5])

['Need help pleese', 'Need help', 'I need some info', 'Will you help me?', 'What else can you do?']


In [5]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cklam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cklam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
#define stemmer
stemmer = LancasterStemmer()

In [7]:
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    #stemming
    words.append([i.lower() for i in w])
    
  return words  

In [8]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
  


1113
[['need', 'help', 'pleese'], ['need', 'help']]


In [9]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [10]:
def max_length(words):
  return(len(max(words, key = len)))
  

In [11]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 492 and Maximum length = 28


In [12]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [13]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [14]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [15]:
padded_doc = padding_doc(encoded_doc, max_length)

In [16]:
padded_doc[:5]

array([[ 25,  77, 332,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 25,  77,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  1,  25, 198, 181,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 51,  10,  77,  16,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  8, 268,   4,  10,  30,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])

In [17]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (1113, 28)


In [18]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')


In [19]:
output_tokenizer.word_index

{'faq.biz_simpler': 1,
 'commonq.how': 2,
 'commonq.wait': 3,
 'commonq.bot': 4,
 'commonq.query': 5,
 'faq.borrow_limit': 6,
 'faq.banking_option_missing': 7,
 'commonq.not_giving': 8,
 'contact.contact': 9,
 'faq.address_proof': 10,
 'faq.approval_time': 11,
 'faq.bad_service': 12,
 'faq.borrow_use': 13,
 'faq.application_process': 14,
 'faq.apply_register': 15,
 'commonq.just_details': 16,
 'faq.aadhaar_missing': 17,
 'faq.biz_category_missing': 18,
 'commonq.assist': 19,
 'faq.biz_new': 20,
 'commonq.name': 21}

In [20]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [21]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [22]:
encoded_output.shape

(1113, 1)

In [23]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [24]:
output_one_hot = one_hot(encoded_output)

In [25]:
output_one_hot.shape

(1113, 21)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)


In [28]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (890, 28) and train_Y = (890, 21)
Shape of val_X = (223, 28) and val_Y = (223, 21)


In [43]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
  model.add(Dense(64, activation = "relu"))
  model.add(LayerNormalization())
  model.add(Dropout(0.3))
  model.add(Dense(32, activation = "relu"))
  model.add(LayerNormalization())
  model.add(Dropout(0.3))
  model.add(Dense(21, activation = "softmax"))
  
  return model

In [46]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adamax", metrics = ["accuracy"])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 28, 128)           62976     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_4 (Dense)              (None, 64)                16448     
_________________________________________________________________
layer_normalization (LayerNo (None, 64)                128       
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
_________________________________________________________________
layer_normalization_1 (Layer (None, 32)               

In [47]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint,earlystop])

Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.65332, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.65332 to 2.58182, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 2.58182 to 2.40871, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 2.40871 to 2.19282, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 2.19282 to 1.99979, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 1.99979 to 1.88193, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 1.88193 to 1.73753, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 1.73753 to 1.57421, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss improved from 1.57421 to 1.51681, saving model to model.h5
Epoch 10/100

Epoch 00010: val_loss improved from 1.51681 to 1.41828, saving model to model.h5
Epoch 11/100

Epoch 00011: val_loss improved from 1.41828 to 1.


Epoch 00038: val_loss did not improve from 0.60631
Epoch 39/100

Epoch 00039: val_loss did not improve from 0.60631
Epoch 40/100

Epoch 00040: val_loss improved from 0.60631 to 0.58299, saving model to model.h5
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.58299
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.58299
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.58299


In [48]:
 model = load_model("model.h5")

In [49]:
def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred


  

In [50]:
def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))



In [51]:
text = "Can you help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)

['can', 'you', 'help', 'me']
commonQ.assist has confidence = 0.8149865
faq.biz_simpler has confidence = 0.041353233
commonQ.bot has confidence = 0.026135573
contact.contact has confidence = 0.024465024
commonQ.how has confidence = 0.018269645
faq.apply_register has confidence = 0.013250038
commonQ.wait has confidence = 0.011071756
faq.aadhaar_missing has confidence = 0.009382638
faq.bad_service has confidence = 0.00924835
commonQ.not_giving has confidence = 0.008607407
commonQ.name has confidence = 0.0069128983
faq.borrow_use has confidence = 0.0041292086
faq.borrow_limit has confidence = 0.0036818
commonQ.just_details has confidence = 0.0034714518
faq.biz_new has confidence = 0.001510999
commonQ.query has confidence = 0.0013555232
faq.biz_category_missing has confidence = 0.0010349959
faq.approval_time has confidence = 0.00050180295
faq.address_proof has confidence = 0.00023229868
faq.application_process has confidence = 0.00020509967
faq.banking_option_missing has confidence = 0.0001

In [54]:
text = "what is the status of machine"
pred = predictions(text)
get_final_output(pred, unique_intent)

['what', 'is', 'the', 'status', 'of', 'machine']
commonQ.assist has confidence = 0.33879754
faq.apply_register has confidence = 0.21809928
faq.biz_simpler has confidence = 0.19926198
faq.application_process has confidence = 0.07832856
contact.contact has confidence = 0.0423267
faq.borrow_limit has confidence = 0.025300356
faq.biz_new has confidence = 0.01993843
commonQ.not_giving has confidence = 0.016335577
faq.bad_service has confidence = 0.014217495
faq.aadhaar_missing has confidence = 0.010039139
commonQ.wait has confidence = 0.008689242
commonQ.how has confidence = 0.008107038
commonQ.name has confidence = 0.004722679
commonQ.bot has confidence = 0.0040169135
faq.banking_option_missing has confidence = 0.003937143
faq.biz_category_missing has confidence = 0.0019291642
commonQ.query has confidence = 0.0017043999
faq.borrow_use has confidence = 0.0017018206
commonQ.just_details has confidence = 0.0012658816
faq.address_proof has confidence = 0.0009377083
faq.approval_time has confid