In [1]:
import warnings

import re
import string
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk

from time import time
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import remove_stopwords
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding,Bidirectional
from tensorflow.keras.models import Sequential 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

warnings.filterwarnings('ignore')

In [2]:
review_data = pd.read_csv("../b. Datasets/Online Shopping Reviews.csv")
display(review_data.head(2))
display(review_data.tail(2))

Unnamed: 0,Review Text,Rating
0,Absolutely wonderful - silky and sexy and comf...,4
1,Love this dress! it's sooo pretty. i happene...,5


Unnamed: 0,Review Text,Rating
497,"Very cute, very comfortable. for me aesthetics...",5
498,Cropped and wide- would look cuter on someone ...,3


In [3]:
review_data['Emotion'] = np.where(review_data['Rating'] > 3, 'Positive', 'Negative')

In [4]:
display(review_data.head(2))
display(review_data.tail(2))

Unnamed: 0,Review Text,Rating,Emotion
0,Absolutely wonderful - silky and sexy and comf...,4,Positive
1,Love this dress! it's sooo pretty. i happene...,5,Positive


Unnamed: 0,Review Text,Rating,Emotion
497,"Very cute, very comfortable. for me aesthetics...",5,Positive
498,Cropped and wide- would look cuter on someone ...,3,Negative


## Data Preprocessing

In [5]:
review_data['Review Text'] = review_data['Review Text'].astype(str).fillna("")

In [6]:
review_data['Review Text'] = review_data['Review Text'].apply(lambda x: x.lower())

In [7]:
tokeniser = Tokenizer()
tokeniser.fit_on_texts(review_data['Review Text'])
list(tokeniser.word_counts.items())[:5]

[('absolutely', 12),
 ('wonderful', 9),
 ('silky', 2),
 ('and', 1099),
 ('sexy', 9)]

In [8]:
def remove_punctuations(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

review_data['Review Text'] = review_data['Review Text'].apply(remove_punctuations)
review_data.head(2)

Unnamed: 0,Review Text,Rating,Emotion
0,absolutely wonderful silky and sexy and comfo...,4,Positive
1,love this dress its sooo pretty i happened t...,5,Positive


In [9]:
def remove_stopwords_function(text):
    return remove_stopwords(text)

review_data['Review Text'] = review_data['Review Text'].apply(remove_stopwords_function)
review_data.head(2)

Unnamed: 0,Review Text,Rating,Emotion
0,absolutely wonderful silky sexy comfortable,4,Positive
1,love dress sooo pretty happened store im glad ...,5,Positive


In [10]:
wordnet_lemmatiser = WordNetLemmatizer()
wordnet_map = {
    'N': wordnet.NOUN,
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'R': wordnet.ADV
}

def lemmatise_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return ' '.join([wordnet_lemmatiser.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

review_data['Review Text'] = review_data['Review Text'].apply(lemmatise_words)
display(review_data.head(2))
display(review_data.tail(2))

Unnamed: 0,Review Text,Rating,Emotion
0,absolutely wonderful silky sexy comfortable,4,Positive
1,love dress sooo pretty happen store im glad bc...,5,Positive


Unnamed: 0,Review Text,Rating,Emotion
497,cute comfortable aesthetic comfort hand hand d...,5,Positive
498,crop wide look cuter petite wide crop sheer,3,Negative


## Word2Vec = CBOW (Continuous Bag of Words)

In [11]:
token_list = []
for word in review_data['Review Text']:
    token_list.append(RegexpTokenizer('\w+').tokenize(word))

token_list[:2]

[['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable'],
 ['love',
  'dress',
  'sooo',
  'pretty',
  'happen',
  'store',
  'im',
  'glad',
  'bc',
  'order',
  'online',
  'bc',
  'petite',
  'buy',
  'petite',
  '58',
  'love',
  'length',
  'hit',
  'little',
  'knee',
  'definitely',
  'true',
  'midi',
  'truly',
  'petite']]

In [12]:
start_time = time()
cbow_model = Word2Vec(min_count=2, window=2, sg=0, sample=5e-5, alpha=0.05, min_alpha=0.0005, negative=20)
cbow_model.build_vocab(token_list)
cbow_model.train(token_list, total_examples=cbow_model.corpus_count, epochs=5000, report_delay=1)
print("Time to build and train the CBOW model vocabulary {} mins".format(round((time()-start_time)/ 60, 2)))

Time to build and train the CBOW model vocabulary 1.58 mins


In [13]:
cbow_model.wv.__getitem__('look')

array([ 4.4970006e-01,  1.4481029e-01, -3.3299336e-01,  1.0815486e+00,
        1.8659927e-01,  9.7541157e-03, -6.1849755e-01, -2.2549291e-01,
       -8.1570365e-02, -1.5310153e-02, -6.0348469e-01,  4.1442069e-01,
       -4.2936131e-02,  5.9963572e-01, -1.6378808e-01, -8.6428243e-01,
        5.3765021e-02,  3.8537908e-01, -1.8285264e-01, -5.0456357e-01,
       -8.5532641e-01,  3.9504370e-01, -7.6105124e-01, -3.4410292e-01,
        1.1005403e+00, -1.0346520e-01,  1.6382396e+00,  2.4964552e-01,
       -6.5069997e-01, -5.0101840e-01,  1.9166665e-02,  6.8660438e-02,
        2.0231715e-01,  1.0029162e-01, -3.1870019e-01,  1.5651911e-01,
       -3.4575057e-01,  6.4419933e-02,  2.8742218e-02, -4.6252999e-02,
        8.0844849e-02,  6.9430369e-01,  8.5663341e-02,  4.1341874e-01,
        4.1485196e-01,  5.3475875e-01,  3.8409519e-01, -7.9785541e-02,
        1.3393161e-01,  1.8016180e-01, -7.2145587e-01, -7.5172198e-01,
        9.7166888e-02,  4.8654845e-01, -9.1107255e-01, -8.1156361e-01,
      

In [14]:
print('Using word embedding vocabulary model to find top 5 similar words to the word `comfort`')
cbow_model.wv.most_similar('comfort', topn=5)

Using word embedding vocabulary model to find top 5 similar words to the word `comfort`


[('wonderfully', 0.4730899930000305),
 ('machine', 0.3544391393661499),
 ('band', 0.3534582555294037),
 ('sleep', 0.35129183530807495),
 ('charm', 0.3066907525062561)]

In [15]:
print('Using word embedding vocabulary model to find top 5 similar words to the word `order`')
cbow_model.wv.most_similar('order', topn=5)

Using word embedding vocabulary model to find top 5 similar words to the word `order`


[('size', 0.5465220808982849),
 ('im', 0.43615227937698364),
 ('large', 0.43381091952323914),
 ('small', 0.43364858627319336),
 ('fit', 0.4155549705028534)]

In [16]:
X = review_data.drop(columns='Emotion', axis=0)
y = review_data['Emotion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((399, 2), (100, 2), (399,), (100,))

In [17]:
encoder = LabelEncoder()
y_encoding = encoder.fit_transform(y)

print("Classes", encoder.classes_)
print("Encoding labels", y_encoding[:10])

Classes ['Negative' 'Positive']
Encoding labels [1 1 0 1 1 0 1 1 1 1]


In [18]:
MAX_SEQUENCE_LENGTH = 40
MAX_WORDS = 1000

tokeniser = Tokenizer(num_words=MAX_WORDS)
tokeniser.fit_on_texts(X["Review Text"])
vocab_size = len(tokeniser.word_index)
print("Vocab size:", vocab_size)
X_sequences = tokeniser.texts_to_sequences(X["Review Text"])
print("\nX Sequences - First 5:", X_sequences[:5])

X_padded = pad_sequences(X_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print("\nX Padded - First 2:", X_padded[:2])

Vocab size: 2178

X Sequences - First 5: [[205, 263, 739, 264, 36], [4, 5, 24, 486, 50, 10, 157, 414, 14, 60, 414, 26, 16, 26, 265, 4, 38, 121, 13, 149, 65, 56, 580, 353, 26], [122, 309, 5, 42, 21, 740, 14, 26, 9, 123, 1, 9, 9, 310, 266, 581, 26, 51, 354, 166, 244, 36, 2, 129, 244, 68, 103, 415, 177, 741, 103, 63, 582, 741, 103, 742, 206, 487], [4, 4, 4, 743, 135, 583, 584, 66, 6, 12, 104], [15, 20, 744, 167, 27, 38, 6, 288, 585, 116, 586, 4, 15]]

X Padded - First 2: [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 205
  263 739 264  36]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   4   5  24
  486  50  10 157 414  14  60 414  26  16  26 265   4  38 121  13 149  65
   56 580 353  26]]


In [19]:
list(tokeniser.word_index.items())[:5]

[('size', 1), ('fit', 2), ('look', 3), ('love', 4), ('dress', 5)]

## Train Test Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoding, test_size=0.2, random_state=0)

## Building an Bidirectional LSTM Model

In [21]:
embedding_dim = cbow_model.vector_size
word_index = tokeniser.word_index
embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))

for word, i in word_index.items():
    if word in cbow_model.wv:
        embedding_matrix[i] = cbow_model.wv[word]

In [22]:
model = Sequential()
model.add(Embedding(input_dim=len(word_index)+1, 
                    output_dim=embedding_dim, 
                    weights=[embedding_matrix], 
                    input_length=MAX_SEQUENCE_LENGTH, 
                    trainable=True))

model.add(Bidirectional(LSTM(256, return_sequences=False), merge_mode='concat'))
model.add(tf.keras.layers.Dense(len(word_index)+1, activation='softmax'))
model.build(input_shape=(None, MAX_SEQUENCE_LENGTH))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [23]:
model.summary()

In [24]:
model.fit(X_train, y_train, epochs=20, verbose=True, batch_size=32, validation_split=0.25)

Epoch 1/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 225ms/step - accuracy: 0.4341 - loss: 6.9260 - val_accuracy: 0.7900 - val_loss: 1.1930
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 144ms/step - accuracy: 0.7917 - loss: 1.0333 - val_accuracy: 0.8200 - val_loss: 0.6177
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 142ms/step - accuracy: 0.6688 - loss: 0.6143 - val_accuracy: 0.7900 - val_loss: 0.5855
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - accuracy: 0.8111 - loss: 0.4883 - val_accuracy: 0.8000 - val_loss: 0.5307
Epoch 5/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 143ms/step - accuracy: 0.7984 - loss: 0.5133 - val_accuracy: 0.7900 - val_loss: 0.5470
Epoch 6/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 154ms/step - accuracy: 0.7909 - loss: 0.5044 - val_accuracy: 0.8200 - val_loss: 0.4970
Epoch 7/20
[1m10/10[0m [3

<keras.src.callbacks.history.History at 0x1d957668450>

In [25]:
y_test_pred = model.predict(X_test)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 243ms/step


In [26]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.8198 - loss: 0.4917
