In [1]:
import pandas as pd
import nltk
import gensim
import numpy as np
df = pd.read_csv('./content/archive/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv')
df.head(5)

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account


# Basic Processing

In [2]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
english_stopwords = stopwords.words('english')
porter = PorterStemmer()
def preprocess(text):
    text = text.lower()
    token = word_tokenize(text)
    token = [porter.stem(word) for word in token if (word not in english_stopwords) and word.isalpha()]
    return token
df['words'] = df['utterance'].apply(lambda x: preprocess(x))
print(df['words'].head())
print(df['words'].describe())

0                             [onlin, account, regist]
1    [tell, regisg, two, account, singl, email, add...
2                   [onlin, account, open, one, pleas]
3            [could, ask, agent, open, account, pleas]
4                   [want, onlin, account, creat, one]
Name: words, dtype: object
count           21534
unique          10836
top       [see, bill]
freq               31
Name: words, dtype: object


In [3]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2, random_state = 1337)

# Building Embedding Layer

In [4]:
import pandas as pd
import numpy as np
import gensim
import tensorflow as tf

In [5]:
# Download from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
w2v_pretrain= gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['words'])
encoded_train_docs = tokenizer.texts_to_sequences(df_train['words'])
encoded_test_docs = tokenizer.texts_to_sequences(df_test['words'])
max_length = max([len(s) for s in df_train['words']])
X_train = pad_sequences(encoded_train_docs, 
                        maxlen=max_length, 
                        padding='post')
y_train = pd.get_dummies(df_train['category'])
X_test = pad_sequences(encoded_test_docs, 
                        maxlen=max_length, 
                        padding='post')
y_test = pd.get_dummies(df_test['category'])

In [7]:
# word2vec + cnn
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def get_word2vec_embed_layer(max_length, tokenizer, model):
    word_index = tokenizer.word_index
    embedding_mat = np.zeros((len(word_index)+1, 300))
    for word, i in word_index.items():
        try:
            vector = model.get_vector(word)
            embedding_mat[i] = vector
        except:
            continue
    word2vec_embedding_layer = Embedding(input_dim=embedding_mat.shape[0],
                                      output_dim=embedding_mat.shape[1], 
                                      weights=[embedding_mat],
                                      input_length=max_length, 
                                      trainable=False)
    return word2vec_embedding_layer

embedding = get_word2vec_embed_layer(X_train.shape[1], tokenizer, w2v_pretrain)

In [8]:
y_train_arr = np.array(y_train).reshape(-1,1,y_train.shape[1])
y_test_arr = np.array(y_test).reshape(-1,1,y_test.shape[1])

# RNN Model for topic detection

In [9]:
from tensorflow.keras.layers import LSTM, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [29]:
model = Sequential()
model.add(embedding)

model.add(LSTM(64))
model.add(Dense(y_train.shape[1],activation='softmax'))
callback = EarlyStopping(monitor='val_loss', patience=3)

model.compile(loss='CategoricalCrossentropy',metrics=['Accuracy'])

In [30]:
model.fit(X_train,y_train,validation_split=0.1,epochs=100,callbacks=[callback])

Epoch 1/100


2022-07-17 20:57:27.860184: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-17 20:57:27.974115: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-17 20:57:28.043507: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-07-17 20:57:33.644653: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-17 20:57:33.689240: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.callbacks.History at 0x3c4e702b0>

# Model Inference

In [31]:
model.save("RNN_chatbot")



INFO:tensorflow:Assets written to: RNN_chatbot/assets


INFO:tensorflow:Assets written to: RNN_chatbot/assets


In [32]:
topic_mapping =dict(zip(range(11),list(y_train.columns)))
print('Maxlen of prediction: ',max_length) # The incoming texts will be pad/ cut to length 13
topic_mapping

Maxlen of prediction:  13


{0: 'ACCOUNT',
 1: 'CANCELLATION_FEE',
 2: 'CONTACT',
 3: 'DELIVERY',
 4: 'FEEDBACK',
 5: 'INVOICES',
 6: 'NEWSLETTER',
 7: 'ORDER',
 8: 'PAYMENT',
 9: 'REFUNDS',
 10: 'SHIPPING'}

In [33]:
def accuracy_rate(df):
    return np.sum(df['True']!=df['Predict'])/len(df)
def model_evaluation(model,X,y):
    # Pass a list of word, label
    X_preprocessed = list(map(preprocess,X))
    X_input = pad_sequences(tokenizer.texts_to_sequences(X),maxlen=max_length,padding='post')
    y_pred = model.predict(X_input)
    class_prob = np.max(y_pred,axis=1)
    label = np.argmax(y_pred,axis=1)
    result = pd.DataFrame({'Word':X,'True':y,'Prob':class_prob,'Predict':label})
    result['Predict']= result['Predict'].apply(lambda x: topic_mapping[x])
    print('Accuracy: ',np.sum(result['True']==result['Predict'])/len(X))
    print('Top 5 misclassification:')
    temp = result[result['True']!=result['Predict']].sort_values(by='Prob').head(5)
    print(temp.to_string())
    print("-"*100)
    print("Misclassification Rate by category:")
    temp = result.groupby('True').apply(lambda x: accuracy_rate(x))
    print(temp)
    return result

## Evaluation on Training Set
The overall accuracy is quite satisfactory
The performance on some minority catrgories like Shipping,  Delivery,  cancellation_fee is quite bad subjucted to small sample size

In [34]:
result =model_evaluation(model,df_train['utterance'],df_train['category'])

 27/539 [>.............................] - ETA: 1s

2022-07-17 20:58:25.140776: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-17 20:58:25.184864: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Accuracy:  0.8423985603993731
Top 5 misclassification:
                                                                         Word      True      Prob     Predict
20961          please, could u ask an agnt where to receive an reimbursement?   REFUNDS  0.267685  NEWSLETTER
6587   I would like information, I need help calling Customer Service, please   CONTACT  0.295178  NEWSLETTER
9827                                i want to checkl my bills, can u help me?  INVOICES  0.295393     ACCOUNT
8412                                ask an agent hhow long the delivery takes  DELIVERY  0.300667  NEWSLETTER
6418                      tell me what the free number of Customer Support is   CONTACT  0.302037    DELIVERY
----------------------------------------------------------------------------------------------------
Misclassification Rate by category:
True
ACCOUNT             0.100966
CANCELLATION_FEE    0.740351
CONTACT             0.096616
DELIVERY            0.412935
FEEDBACK            0.03564

## Evalution on Test Set
The performance on testing set is quite consistent with the one in training set, indicate there is no sign of overfitting

In [35]:
result =model_evaluation(model,df_test['utterance'],df_test['category'])

Accuracy:  0.8344555374970978
Top 5 misclassification:
                                                                         Word        True      Prob     Predict
6442   I want information, I ned help speaking with Customer Servgice, please     CONTACT  0.234240     ACCOUNT
13758               can u ask an agnet if my fucking orders are on their way?       ORDER  0.284226     REFUNDS
8358                      can u ask an agent how soon i can expect my tickets    DELIVERY  0.312659    INVOICES
12498                       I wantto know avout subscribing to the newsletter  NEWSLETTER  0.320301       ORDER
2487                          I don't want my profile and I want to delete it     ACCOUNT  0.334384  NEWSLETTER
----------------------------------------------------------------------------------------------------
Misclassification Rate by category:
True
ACCOUNT             0.085837
CANCELLATION_FEE    0.746667
CONTACT             0.106688
DELIVERY            0.484848
FEEDBACK       

# Inference on some self prepare enquiry

In [37]:
Enquiry=['I want to open an account',"The hat quality is poor, can I return it?","Find somebody to talk with me or cotact me later"]
# Enquiry 1 and 2 should be some quite standard Account and Refund enquiry.
# Enquiry 3 is about contact, but there is some typo
topic_ans = ['ACCOUNT','REFUNDS','CONTACT']

In [38]:
model_evaluation(model,Enquiry,topic_ans)

Accuracy:  0.6666666666666666
Top 5 misclassification:
                                        Word     True      Prob Predict
1  The hat quality is poor, can I return it?  REFUNDS  0.669783   ORDER
----------------------------------------------------------------------------------------------------
Misclassification Rate by category:
True
ACCOUNT    0.0
CONTACT    0.0
REFUNDS    1.0
dtype: float64


Unnamed: 0,Word,True,Prob,Predict
0,I want to open an account,ACCOUNT,0.999975,ACCOUNT
1,"The hat quality is poor, can I return it?",REFUNDS,0.669783,ORDER
2,Find somebody to talk with me or cotact me later,CONTACT,0.885407,CONTACT
