### Get an E-Commerce Dataset from Hugging Face

In [1]:
import pandas as pd

df = pd.read_parquet("hf://datasets/NebulaByte/E-Commerce_Customer_Support_Conversations/data/train-00000-of-00001-a5a7c6e4bb30b016.parquet")

In [2]:
df.head()

Unnamed: 0,issue_area,issue_category,issue_sub_category,issue_category_sub_category,customer_sentiment,product_category,product_sub_category,issue_complexity,agent_experience_level,agent_experience_level_desc,conversation
0,Login and Account,Mobile Number and Email Verification,Verification requirement for mobile number or ...,Mobile Number and Email Verification -> Verifi...,neutral,Appliances,Oven Toaster Grills (OTG),medium,junior,"handles customer inquiries independently, poss...",Agent: Thank you for calling BrownBox Customer...
1,Cancellations and returns,Pickup and Shipping,Reasons for being asked to ship the item,Pickup and Shipping -> Reasons for being asked...,neutral,Electronics,Computer Monitor,less,junior,"handles customer inquiries independently, poss...",Agent: Thank you for calling BrownBox customer...
2,Cancellations and returns,Replacement and Return Process,Inability to click the 'Cancel' button,Replacement and Return Process -> Inability to...,neutral,Appliances,Juicer/Mixer/Grinder,medium,experienced,"confidently handles complex customer issues, e...",Agent: Thank you for calling BrownBox Customer...
3,Login and Account,Login Issues and Error Messages,Error message regarding exceeded attempts to e...,Login Issues and Error Messages -> Error messa...,neutral,Appliances,Water Purifier,less,inexperienced,"may struggle with ambiguous queries, rely on c...","Customer: Hi, I am facing an issue while loggi..."
4,Order,Order Delivery Issues,Delivery not attempted again,Order Delivery Issues -> Delivery not attempte...,negative,Electronics,Bp Monitor,medium,experienced,"confidently handles complex customer issues, e...",Agent: Thank you for contacting BrownBox custo...


In [3]:
df.shape

(1000, 11)

In [4]:
df["issue_area"].unique()

array(['Login and Account', 'Cancellations and returns', 'Order',
       'Shopping', 'Warranty', 'Shipping'], dtype=object)

In [5]:
print("Number of categories this dataset contains is = ", len(df["issue_area"].unique()))

Number of categories this dataset contains is =  6


In [6]:
# Let's define each problem under each category
df["issue_category"].unique()

array(['Mobile Number and Email Verification', 'Pickup and Shipping',
       'Replacement and Return Process',
       'Login Issues and Error Messages', 'Order Delivery Issues',
       'Account Reactivation and Deactivation',
       'Cash on Delivery (CoD) Refunds',
       'Product Availability and Status', 'Product Installation',
       'Order Cancellation', 'Lost or Missing Warranty Card',
       'Return and Exchange', 'Start Date of Warranty',
       'Invoice and Payment', 'Account and Shopping', 'Miscellaneous',
       'Accessing Warranty Details',
       'Availability of Faster Delivery Options', 'Returns and Refunds',
       'Warranty Terms and Changes', 'Pricing and Discounts',
       'Login Methods', 'Product Availability for Shipping',
       'Return Checks and Fees', 'Book Pricing Discrepancies',
       'Order Confirmation and Status', 'Product Information and Tags',
       'Loyalty program', 'Installation and Accessories',
       'Warranty Claim Process', 'Product Registrati

In [7]:
df["issue_category_sub_category"].unique()

array(['Mobile Number and Email Verification -> Verification requirement for mobile number or email address during login',
       'Pickup and Shipping -> Reasons for being asked to ship the item',
       "Replacement and Return Process -> Inability to click the 'Cancel' button",
       'Login Issues and Error Messages -> Error message regarding exceeded attempts to enter the correct verification code',
       'Order Delivery Issues -> Delivery not attempted again',
       'Account Reactivation and Deactivation -> Reactivating an inactive account',
       'Cash on Delivery (CoD) Refunds -> Refund timelines for Cash on Delivery  returns',
       "Product Availability and Status -> Ordering 'Out of Stock' or 'Temporarily Unavailable' products",
       'Product Installation -> Installation after delivery',
       'Mobile Number and Email Verification -> Issues with receiving the OTP or verification code',
       'Order Cancellation -> Time taken to cancel an order',
       'Lost or Missing

In [8]:
len(df["conversation"])

1000

In [9]:
df["conversation"].iloc[0]

"Agent: Thank you for calling BrownBox Customer Support. My name is Tom. How may I assist you today?\n\nCustomer: Hi Tom, I'm trying to log in to my account to purchase an Oven Toaster Grill (OTG), but I'm unable to proceed as it's asking for mobile number or email verification. Can you help me with that?\n\nAgent: Sure, I can assist you with that. May I know your registered mobile number or email address, please?\n\nCustomer: My registered mobile number is +1 123-456-7890.\n\nAgent: Thank you. Let me check that for you. I'm sorry to inform you that we don't have this number on our records. Can you please confirm if this is the correct number?\n\nCustomer: Oh, I'm sorry. I might have registered with a different number. Can you please check with my email address instead? It's johndoe@email.com.\n\nAgent: Sure, let me check that for you. (After a few moments) I see that we have your email address on our records. We'll be sending you a verification code shortly. Please check your email an

In [10]:
import re

In [11]:
# Let's split the conversation column
context_list = []
response_list = []

        
for j in range(len(df["conversation"])):
    conversation = df["conversation"].iloc[j]
    # Step 1: Split based on "Agent" and "Customer" turns
    turns = re.split(r'(Agent:|Customer:)', conversation)
    for i in range(1, len(turns) - 1, 2):
        # Assuming the agent's response follows the customer's context
        context_list.append(turns[i + 0].strip())  # Customer's query
        response_list.append(turns[i + 1].strip())  # Agent's response

In [12]:
# Output the first few context-response pairs
for context, response in zip(context_list[:5], response_list[:5]):
    print(f"Context: {context}")
    print(f"Response: {response}")
    print("-" * 50)

Context: Agent:
Response: Thank you for calling BrownBox Customer Support. My name is Tom. How may I assist you today?
--------------------------------------------------
Context: Customer:
Response: Hi Tom, I'm trying to log in to my account to purchase an Oven Toaster Grill (OTG), but I'm unable to proceed as it's asking for mobile number or email verification. Can you help me with that?
--------------------------------------------------
Context: Agent:
Response: Sure, I can assist you with that. May I know your registered mobile number or email address, please?
--------------------------------------------------
Context: Customer:
Response: My registered mobile number is +1 123-456-7890.
--------------------------------------------------
Context: Agent:
Response: Thank you. Let me check that for you. I'm sorry to inform you that we don't have this number on our records. Can you please confirm if this is the correct number?
--------------------------------------------------


## Clean and Tokenize Text 

#### 1. Text Cleaning: Clean the text for both context and response columns (lowercasing, removing special characters, etc.).

In [13]:
context_list = [text.lower().replace(r'[^a-zA-Z\s]', '') for text in context_list]
response_list = [text.lower().replace(r'[^a-zA-Z\s]', '') for text in response_list]

In [14]:
for context, response in zip(context_list[:5], response_list[:5]):
    print(f"Context: {context}")
    print(f"Response: {response}")
    print("-" * 50)

Context: agent:
Response: thank you for calling brownbox customer support. my name is tom. how may i assist you today?
--------------------------------------------------
Context: customer:
Response: hi tom, i'm trying to log in to my account to purchase an oven toaster grill (otg), but i'm unable to proceed as it's asking for mobile number or email verification. can you help me with that?
--------------------------------------------------
Context: agent:
Response: sure, i can assist you with that. may i know your registered mobile number or email address, please?
--------------------------------------------------
Context: customer:
Response: my registered mobile number is +1 123-456-7890.
--------------------------------------------------
Context: agent:
Response: thank you. let me check that for you. i'm sorry to inform you that we don't have this number on our records. can you please confirm if this is the correct number?
--------------------------------------------------


#### 2.Tokenization: Use a tokenizer (e.g., Keras Tokenizer or Hugging Face Tokenizer) to convert the text into sequences of tokens.

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(context_list + response_list)  # Fit on both context and responses
context_seq = tokenizer.texts_to_sequences(context_list)
response_seq = tokenizer.texts_to_sequences(response_list)

#### 3.Padding Sequences: Make sure that the input (context) and output (response) sequences are of uniform length by padding. 

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 50  # Choose a suitable maximum length
context_seq = pad_sequences(context_seq, maxlen=max_len, padding='post')
response_seq = pad_sequences(response_seq, maxlen=max_len, padding='post')

## Prepare for Sequence-to-Sequence Model

#### 1.Format Data for Model: Your data should be organized as (context, response) pairs for training.

In [17]:
X = context_seq  # User query (context)
y = response_seq  # System response

#### 2.Split Data: Divide the data into training, validation, and test sets.


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Build the Sequence-to-Sequence Model with LSTM

In [19]:
X_train.shape

(13748, 50)

In [20]:
X_val.shape

(3438, 50)

In [21]:
y_train.shape, y_val.shape

((13748, 50), (3438, 50))

In [22]:
len(tokenizer.word_index)

3514

In [25]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam


vocab_size = 3515  # Ensure this matches your tokenizer vocab size
output_dim = 200  # Embedding vector size

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=output_dim, input_length=50),
    LSTM(256, return_sequences=True),  # LSTM hidden size
    Dropout(0.3),  # Dropout with 30% probability
    Dense(vocab_size, activation='softmax')  # Final layer for classification
])

model.compile(optimizer=Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 200)           703000    
                                                                 
 lstm_2 (LSTM)               (None, 50, 256)           467968    
                                                                 
 dropout_1 (Dropout)         (None, 50, 256)           0         
                                                                 
 dense_1 (Dense)             (None, 50, 3515)          903355    
                                                                 
Total params: 2074323 (7.91 MB)
Trainable params: 2074323 (7.91 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Train the Model

In [27]:
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [58]:
history = model.fit(X_train, y_train, 
                    validation_data=(X_val, y_val),
                    epochs=20, 
                    batch_size=64,
                    callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x19578a0bd90>

## Evaluate model

In [60]:
# Evaluate the model on validation/test data
loss, accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Loss: 2.4650
Validation Accuracy: 0.6090


## Generate Text

In [62]:
import tensorflow as tf

In [63]:
# Generate a response given a seed text
def generate_text(model, tokenizer, seed_text, max_length):
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([seed_text])
        padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=50)
        predictions = model.predict(padded_sequence, verbose=0)
        
        predicted_word_index = np.argmax(predictions[0, -1, :])
        output_word = tokenizer.index_word.get(predicted_word_index, '')
        seed_text += " " + output_word
    return seed_text

# Example generation
seed_text = "Thank you for calling"
print(generate_text(model, tokenizer, seed_text, max_length=20))

Thank you for calling                    


In [64]:
model.save("text_generator_model.h5")

  saving_api.save_model(
