### Get an E-Commerce Dataset from Hugging Face

In [1]:
import pandas as pd

df = pd.read_parquet("hf://datasets/NebulaByte/E-Commerce_Customer_Support_Conversations/data/train-00000-of-00001-a5a7c6e4bb30b016.parquet")

In [2]:
df.head()

Unnamed: 0,issue_area,issue_category,issue_sub_category,issue_category_sub_category,customer_sentiment,product_category,product_sub_category,issue_complexity,agent_experience_level,agent_experience_level_desc,conversation
0,Login and Account,Mobile Number and Email Verification,Verification requirement for mobile number or ...,Mobile Number and Email Verification -> Verifi...,neutral,Appliances,Oven Toaster Grills (OTG),medium,junior,"handles customer inquiries independently, poss...",Agent: Thank you for calling BrownBox Customer...
1,Cancellations and returns,Pickup and Shipping,Reasons for being asked to ship the item,Pickup and Shipping -> Reasons for being asked...,neutral,Electronics,Computer Monitor,less,junior,"handles customer inquiries independently, poss...",Agent: Thank you for calling BrownBox customer...
2,Cancellations and returns,Replacement and Return Process,Inability to click the 'Cancel' button,Replacement and Return Process -> Inability to...,neutral,Appliances,Juicer/Mixer/Grinder,medium,experienced,"confidently handles complex customer issues, e...",Agent: Thank you for calling BrownBox Customer...
3,Login and Account,Login Issues and Error Messages,Error message regarding exceeded attempts to e...,Login Issues and Error Messages -> Error messa...,neutral,Appliances,Water Purifier,less,inexperienced,"may struggle with ambiguous queries, rely on c...","Customer: Hi, I am facing an issue while loggi..."
4,Order,Order Delivery Issues,Delivery not attempted again,Order Delivery Issues -> Delivery not attempte...,negative,Electronics,Bp Monitor,medium,experienced,"confidently handles complex customer issues, e...",Agent: Thank you for contacting BrownBox custo...


In [3]:
df.shape

(1000, 11)

In [4]:
df["issue_area"].unique()

array(['Login and Account', 'Cancellations and returns', 'Order',
       'Shopping', 'Warranty', 'Shipping'], dtype=object)

In [5]:
print("Number of categories this dataset contains is = ", len(df["issue_area"].unique()))

Number of categories this dataset contains is =  6


In [6]:
# Let's define each problem under each category
df["issue_category"].unique()

array(['Mobile Number and Email Verification', 'Pickup and Shipping',
       'Replacement and Return Process',
       'Login Issues and Error Messages', 'Order Delivery Issues',
       'Account Reactivation and Deactivation',
       'Cash on Delivery (CoD) Refunds',
       'Product Availability and Status', 'Product Installation',
       'Order Cancellation', 'Lost or Missing Warranty Card',
       'Return and Exchange', 'Start Date of Warranty',
       'Invoice and Payment', 'Account and Shopping', 'Miscellaneous',
       'Accessing Warranty Details',
       'Availability of Faster Delivery Options', 'Returns and Refunds',
       'Warranty Terms and Changes', 'Pricing and Discounts',
       'Login Methods', 'Product Availability for Shipping',
       'Return Checks and Fees', 'Book Pricing Discrepancies',
       'Order Confirmation and Status', 'Product Information and Tags',
       'Loyalty program', 'Installation and Accessories',
       'Warranty Claim Process', 'Product Registrati

In [7]:
df["issue_category_sub_category"].unique()

array(['Mobile Number and Email Verification -> Verification requirement for mobile number or email address during login',
       'Pickup and Shipping -> Reasons for being asked to ship the item',
       "Replacement and Return Process -> Inability to click the 'Cancel' button",
       'Login Issues and Error Messages -> Error message regarding exceeded attempts to enter the correct verification code',
       'Order Delivery Issues -> Delivery not attempted again',
       'Account Reactivation and Deactivation -> Reactivating an inactive account',
       'Cash on Delivery (CoD) Refunds -> Refund timelines for Cash on Delivery  returns',
       "Product Availability and Status -> Ordering 'Out of Stock' or 'Temporarily Unavailable' products",
       'Product Installation -> Installation after delivery',
       'Mobile Number and Email Verification -> Issues with receiving the OTP or verification code',
       'Order Cancellation -> Time taken to cancel an order',
       'Lost or Missing

In [8]:
len(df["conversation"])

1000

In [9]:
df["conversation"].iloc[0]

"Agent: Thank you for calling BrownBox Customer Support. My name is Tom. How may I assist you today?\n\nCustomer: Hi Tom, I'm trying to log in to my account to purchase an Oven Toaster Grill (OTG), but I'm unable to proceed as it's asking for mobile number or email verification. Can you help me with that?\n\nAgent: Sure, I can assist you with that. May I know your registered mobile number or email address, please?\n\nCustomer: My registered mobile number is +1 123-456-7890.\n\nAgent: Thank you. Let me check that for you. I'm sorry to inform you that we don't have this number on our records. Can you please confirm if this is the correct number?\n\nCustomer: Oh, I'm sorry. I might have registered with a different number. Can you please check with my email address instead? It's johndoe@email.com.\n\nAgent: Sure, let me check that for you. (After a few moments) I see that we have your email address on our records. We'll be sending you a verification code shortly. Please check your email an

# Step-1: Data Preprocessing 

## 1.1 Preprocess the Conversation Text

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['conversation'])

# Convert conversations to sequences of integers
conversation_sequences = tokenizer.texts_to_sequences(df['conversation'])

# Pad the sequences to make them uniform in length (maxlen=50 is an example)
max_sequence_length = 50
X_conversation = pad_sequences(conversation_sequences, maxlen=max_sequence_length, padding='pre')

## 1.2 Preprocess the Metadata

In [15]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
# For each categorical metadata column, apply LabelEncoder
label_encoders = {}
metadata_columns  = df.select_dtypes(include=["object", "category"]).columns

#print("Categorical columns:", categorical_columns)

# Encode metadata columns
X_metadata = []
for column in metadata_columns:
    le = LabelEncoder()
    label_encoders[column] = le
    encoded_column = le.fit_transform(df[column])
    X_metadata.append(encoded_column)

# Stack metadata columns into a single array

X_metadata = np.stack(X_metadata, axis=1)

 # Step-2: Combine Conversation and Metadata
 You need to combine the conversation and metadata features into one unified input

In [16]:
# Concatenate conversation input and metadata input
X_combined = np.concatenate([X_conversation, X_metadata], axis=1)

# Step-3: Prepare the Target for Text Generation (Next Word Prediction)
To train a text generator, you need to define a target sequence (next word prediction). For each conversation, the target will be the next word after the current sequence.

In [17]:
# Create the next word labels (target) for training
y = []
for seq in conversation_sequences:
    for i in range(1, len(seq)):
        y.append(seq[i])

y = np.array(y)

# Step-4: Build the LSTM Model

In [27]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional

# Decrease learning rate to fine-tune
optimizer = Adam(learning_rate=0.0001)

# Define the inputs
conversation_input = Input(shape=(max_sequence_length,))
metadata_input = Input(shape=(X_metadata.shape[1],))

# Define the embedding layer for conversation text
embedding_layer = Embedding(input_dim=5000, output_dim=200)(conversation_input)

# LSTM layer for processing conversation text
lstm_layer = Bidirectional(LSTM(256, return_sequences=False))(embedding_layer)

# Concatenate the LSTM output with the metadata input
combined = Concatenate()([lstm_layer, metadata_input])

# Define the output layer (dense layer for next word prediction)
output = Dense(5000, activation='softmax')(combined)

# Build and compile the model
model = Model(inputs=[conversation_input, metadata_input], outputs=output)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Summary of the model
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_11 (InputLayer)       [(None, 50)]                 0         []                            
                                                                                                  
 embedding_5 (Embedding)     (None, 50, 200)              1000000   ['input_11[0][0]']            
                                                                                                  
 bidirectional (Bidirection  (None, 512)                  935936    ['embedding_5[0][0]']         
 al)                                                                                              
                                                                                                  
 input_12 (InputLayer)       [(None, 11)]                 0         []                      

# Step-5: Train the Model

In [28]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train with early stopping
model.fit([X_conversation, X_metadata], y, epochs=30, batch_size=32, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1488eaa43a0>