# CHATBOT PROJECT 

# Start date : Friday 8th November 2024

In [None]:
import zipfile 
import os 

In [None]:
import os 

# print the current directory
print(os.getcwd())

In [None]:
print(os.getcwd())

In [None]:
print(os.listdir())

In [None]:
print(os.listdir())

In [None]:
import zipfile
import os 

# Unzip the file to the current directory 
with zipfile.ZipFile("20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.zip", 'r') as zip_ref:
    zip_ref.extractall(".")
    print("Files extracted to the current directory.")

# Training Dataset for chatbots/Virtual Assistants

*Training your customer service chatbot*

# About Dataset

# Bitext Sample Pre-built Customer Support Dataset for English

# Overview

This dataset contains example utterances and their corresponding intents from the Customer Support domain. The data can be used to train 
intent recognition models Natural Language Understanding (NLU) platforms.


The dataset covers the "Customer Support" domain and includes 27 intents grouped in 11 categories. These intents have been selected from 
Bitext's collection of 20 domain-specific datasets (banking, retail, utilities…), keeping the intents that are common across domains. See below 
for a full list of categories and intents.

# Utterances

The dataset contains over 20,000 utterances, with a varying number of utterances per intent. These utterances have been extracted from a 
larger dataset of 288,000 utterances (approx. 10,000 per intent), including language register variations such as politeness, colloquial, swearing, 
indirect style… To select the utterances, we use stratified sampling to generate a dataset with a general user language register profile.



The dataset also reflects commonly ocurring linguistic phenomena of real-life chatbots, such as:
    
- spelling mistakes


- run-on words


- missing punctuation

# Contents


Each entry in the dataset contains an example utterance from the Customer Support domain, along with its corresponding intent, category and 
additional linguistic information. Each line contains the following four fields:


    
- flags: the applicable linguistic flags

- utterance: an example user utterance

- category: the high-level intent category

- intent: the intent corresponding to the user utterance



# Linguistic flags


The dataset contains annotations for linguistic phenomena, which can be used to adapt bot training to different user language profiles.
These flags are:


B - Basic syntactic structure


S - Syntactic structure


L - Lexical variation (synonyms)


M - Morphological variation (plurals, tenses…)


I - Interrogative structure

C - Complex/Coordinated syntactic structure

P - Politeness variation

Q - Colloquial variation

W - Offensive language

E - Expanded abbreviations (I'm -> I am, I'd -> I would…)

D - Indirect speech (ask an agent to…)

Z - Noise (spelling, punctuation…)

These phenomena make the training dataset more effective and make bots more accurate and robust.

# Categories and Intents


The intent categories covered by the dataset are:


ACCOUNT


CANCELLATION_FEE


CONTACT


DELIVERY

FEEDBACK

INVOICES

NEWSLETTER

ORDER

PAYMENT

REFUNDS

SHIPPING

# The intents covered by the dataset are:


cancel_order


complaint


contact_customer_service


contact_human_agent


create_account


change_order

change_shipping_address

check_cancellation_fee

check_invoices

check_payment_methods

check_refund_policy

delete_account

delivery_options

delivery_period

edit_account

get_invoice

get_refund

newsletter_subscription

payment_issue

place_order

recover_password

registration_problems

review

set_up_shipping_address

switch_account

track_order

track_refund

(c) Bitext Innovations, 2020

# Import necessary libraries 

In [1]:
# Import necessary libraries 

import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout  

# Load your dataset into a DataFrame

In [2]:
# Load your dataset 

df = pd.read_csv('20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample/20000-Utterances-Training-dataset-for-chatbots-virtual-assistant-Bitext-sample.csv')
df

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account
...,...,...,...,...
21529,BILQC,"i have no shipping address, what do i have to ...",SHIPPING,set_up_shipping_address
21530,BLQC,I have no shipping address and I want to set o...,SHIPPING,set_up_shipping_address
21531,BIQC,"i want to set up my shipping address, what do ...",SHIPPING,set_up_shipping_address
21532,BILC,"I don't have a shipping address, can you set o...",SHIPPING,set_up_shipping_address


In [3]:
df.head()

Unnamed: 0,flags,utterance,category,intent
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account
4,BLQC,"i want an online account, create one",ACCOUNT,create_account


# Explore and Preprocess the Data

*First, inspect the dataset to understand its structure.*

In [4]:
# Explore the data 
print(df['intent'].value_counts())
print(df['category'].value_counts())

payment_issue               4366
create_account              2122
contact_customer_service    2055
get_invoice                 1430
track_order                 1224
get_refund                  1150
contact_human_agent         1026
check_invoices              1013
recover_password             986
change_order                 926
delete_account               913
complaint                    746
review                       580
check_refund_policy          479
delivery_options             360
check_cancellation_fee       360
track_refund                 303
switch_account               273
check_payment_methods        270
newsletter_subscription      236
delivery_period              141
edit_account                 133
registration_problems        130
change_shipping_address      110
set_up_shipping_address       96
place_order                   70
cancel_order                  36
Name: intent, dtype: int64
PAYMENT             4636
ACCOUNT             4557
CONTACT             3081
INVOICE

In [5]:
print(df['intent'].unique())

['create_account' 'delete_account' 'edit_account' 'recover_password'
 'registration_problems' 'switch_account' 'check_cancellation_fee'
 'contact_customer_service' 'contact_human_agent' 'delivery_options'
 'delivery_period' 'complaint' 'review' 'check_invoices' 'get_invoice'
 'newsletter_subscription' 'cancel_order' 'change_order' 'place_order'
 'track_order' 'check_payment_methods' 'payment_issue'
 'check_refund_policy' 'get_refund' 'track_refund'
 'change_shipping_address' 'set_up_shipping_address']


**Clean and preprocess the text data to make it suitable for training. Use tokenization and 
padding for standardizing input lengths.**

In [6]:
from textblob import Word
import nltk 
# nltk.download('stopwords')
from nltk.corpus import stopwords 

# Remove stopwords and perform basic text cleaning 

def clean_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = text.lower().split()
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens) 

df['cleaned_utterance'] = df['utterance'].apply(clean_text)

In [7]:
df.head()

Unnamed: 0,flags,utterance,category,intent,cleaned_utterance
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account,online
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account,tell regisger two accounts single email
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account,online open please
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account,could ask agent open
4,BLQC,"i want an online account, create one",ACCOUNT,create_account,want online create one


In [8]:
# checking data tyypes of features 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21534 entries, 0 to 21533
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   flags              21534 non-null  object
 1   utterance          21534 non-null  object
 2   category           21534 non-null  object
 3   intent             21534 non-null  object
 4   cleaned_utterance  21534 non-null  object
dtypes: object(5)
memory usage: 841.3+ KB


#  Encode Labels and Prepare Data for Training

*Convert intent labels into numerical categories, which the model will learn to classify.*

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode intent labels 
le = LabelEncoder()
df['intent_encoded'] = le.fit_transform(df['intent'])

In [10]:
df

Unnamed: 0,flags,utterance,category,intent,cleaned_utterance,intent_encoded
0,BILC,"I don't have an online account, what do I have...",ACCOUNT,create_account,online,10
1,BILQZ,can you tell me if i can regisger two accounts...,ACCOUNT,create_account,tell regisger two accounts single email,10
2,BPLC,"I have no online account, open one, please",ACCOUNT,create_account,online open please,10
3,BIPLD,"could you ask an agent how to open an account,...",ACCOUNT,create_account,could ask agent open,10
4,BLQC,"i want an online account, create one",ACCOUNT,create_account,want online create one,10
...,...,...,...,...,...,...
21529,BILQC,"i have no shipping address, what do i have to ...",SHIPPING,set_up_shipping_address,shipping set,23
21530,BLQC,I have no shipping address and I want to set o...,SHIPPING,set_up_shipping_address,shipping address want set one,23
21531,BIQC,"i want to set up my shipping address, what do ...",SHIPPING,set_up_shipping_address,want set shipping,23
21532,BILC,"I don't have a shipping address, can you set o...",SHIPPING,set_up_shipping_address,shipping set one,23


In [11]:
df['intent_encoded'].value_counts()

18    4366
10    2122
8     2055
15    1430
25    1224
16    1150
9     1026
4     1013
20     986
1      926
11     913
7      746
22     580
6      479
12     360
3      360
26     303
24     273
5      270
17     236
13     141
14     133
21     130
2      110
23      96
19      70
0       36
Name: intent_encoded, dtype: int64

# Tokenize and pad the text data.

In [12]:
tokenizer = Tokenizer(num_words=5000, oov_token='<OO>')
tokenizer.fit_on_texts(df['cleaned_utterance'])

X = tokenizer.texts_to_sequences(df['cleaned_utterance'])
X = pad_sequences(X, padding='post')
y = df['intent_encoded'].values

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21534 entries, 0 to 21533
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   flags              21534 non-null  object
 1   utterance          21534 non-null  object
 2   category           21534 non-null  object
 3   intent             21534 non-null  object
 4   cleaned_utterance  21534 non-null  object
 5   intent_encoded     21534 non-null  int32 
dtypes: int32(1), object(5)
memory usage: 925.4+ KB


# Train-Test Split

*Divide your data into training and testing sets.*

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LSTM Model

In [14]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=X.shape[1]),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



# Train the Model

*Train the model on your training data.*

In [15]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.3260 - loss: 2.3431 - val_accuracy: 0.7681 - val_loss: 0.8044
Epoch 2/10
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.7877 - loss: 0.7439 - val_accuracy: 0.8560 - val_loss: 0.4926
Epoch 3/10
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8698 - loss: 0.4551 - val_accuracy: 0.8825 - val_loss: 0.3644
Epoch 4/10
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9053 - loss: 0.3195 - val_accuracy: 0.9113 - val_loss: 0.3142
Epoch 5/10
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9287 - loss: 0.2413 - val_accuracy: 0.9234 - val_loss: 0.2355
Epoch 6/10
[1m539/539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.9343 - loss: 0.2084 - val_accuracy: 0.9280 - val_loss: 0.2352
Epoch 7/10
[1m539/539[0m 

In [16]:
X_train

array([[   5,    3,   22, ...,    0,    0,    0],
       [   7,    8,    5, ...,    0,    0,    0],
       [   2,  100,    0, ...,    0,    0,    0],
       ...,
       [   9,   30,   35, ...,    0,    0,    0],
       [ 172,   14,   21, ...,    0,    0,    0],
       [1390,  103,   99, ...,    0,    0,    0]])

In [17]:
y_train

array([18, 15, 11, ...,  8, 10, 18])

In [18]:
X_test 

array([[ 19,  38,  15, ...,   0,   0,   0],
       [  8,  42,  31, ...,   0,   0,   0],
       [319, 183,   0, ...,   0,   0,   0],
       ...,
       [  2,  16,  77, ...,   0,   0,   0],
       [  6,  58,  92, ...,   0,   0,   0],
       [  5,   3,  16, ...,   0,   0,   0]])

In [19]:
y_test  

array([18,  8, 25, ...,  6, 10, 25])

# Evaluate the Model

*Assess the model’s performance using the test set.*

In [21]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')

[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9418 - loss: 0.1846
Test Accuracy: 0.94


# Build a Chat Interface for Testing

*Now create a simple function to input a message and receive an intent prediction from the chatbot.*

In [25]:
def predict_intent(message):
    cleaned_message = clean_text(message)
    sequence = tokenizer.texts_to_sequences([cleaned_message])
    padded_sequence = pad_sequences(sequence, maxlen=X.shape[1], padding='post')
    prediction = model.predict(padded_sequence)
    intent_label = le.inverse_transform([np.argmax(prediction)])
    return intent_label[0]


# Test chatbot 
while True: 
    message = input("You:  ")
    if message.lower() =='quit':
        break
    intent = predict_intent(message)
    print(f"Chatbot: Detected intent -  {intent}")

You:  I made a mistake in my shipping address
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step
Chatbot: Detected intent -  change_shipping_address
You:   I did not get my order
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Chatbot: Detected intent -  get_invoice


KeyboardInterrupt: Interrupted by user

In [None]:
def predict_intent(message):
    cleaned_message = clean_text(message)
    sequence = tokenizer.texts_to_sequences([cleaned_message])
    padded_sequence = pad_sequences(sequence, maxlen=X.shape[1], padding='post')
    prediction = model.predict(padded_sequence)
    intent_label = le.inverse_transform([np.argmax(prediction)])
    return intent_label[0]

# Test chatbot
while True:
    message = input("You: ")
    if message.lower() == 'quit':
        break
    intent = predict_intent(message)
    print(f"Chatbot: Detected intent - {intent}")
