In [1]:
import numpy as np 
import pandas as pd 
from plotly import graph_objs as go

In [2]:
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
del df['Unnamed: 0']

In [4]:

print('There are', df.shape[0], 'reviews in this dataset')
print('Number of Duplicates:', len(df[df.duplicated()]))
print('Number of Missing Values:', df.isnull().sum().sum())

There are 23486 reviews in this dataset
Number of Duplicates: 21
Number of Missing Values: 4697


In [5]:
print('Number of Missing Values per column:')
df.isnull().sum().sort_values(ascending=False)

Number of Missing Values per column:


Title                      3810
Review Text                 845
Class Name                   14
Division Name                14
Department Name              14
Clothing ID                   0
Age                           0
Rating                        0
Recommended IND               0
Positive Feedback Count       0
dtype: int64

In [6]:
df = df.dropna(subset = ['Review Text', 'Division Name', 'Department Name', 'Class Name'])

In [7]:
classes = (
    df
    .groupby(['Recommended IND', 'Class Name'])
    .size()
    .to_frame()
    .rename(columns={0:'Count'})
    .reset_index()
)  

In [8]:
a = classes.groupby('Class Name')['Count'].transform('sum')

In [9]:
classes['Count'] = classes['Count'].div(a)

In [10]:
classes = classes.pivot(index='Class Name', columns='Recommended IND')  

In [11]:
fig = go.Figure()
fig.add_trace(go.Bar(
    y=classes.index,
    x=classes.iloc[:,0],
    name='Not Recommended',
    orientation='h',
    marker=dict(
        color='#f6b220')
    ))

fig.add_trace(go.Bar(
    y=classes.index,
    x=classes.iloc[:,1],
    name='Recommended',
    orientation='h',
    marker=dict(
        color='#0E2F44')
    ))
fig.update_layout(barmode='stack')

fig.update_layout(
                title = 'Distribution of <b>Product Class</b> by Recommendation ',
                barmode='stack', 
                autosize=False,
                width=2000,
                height=1200,
                font=dict(family="Rockwell, sans-serif", size=18, color='#000000'),
                margin=dict(
                  l=150,
                  r=100,
                   b=30,
                   t=100,
                   pad=4
                          ))
fig.layout.xaxis.tickformat = ',.0%'

fig.show()


In [12]:

from sklearn.model_selection import train_test_split
from sklearn import metrics
import re
import string
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import SimpleRNN, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-11-21 12:21:41.875792: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-21 12:21:41.936837: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-21 12:21:42.002756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732171902.059399    4365 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732171902.075333    4365 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-21 12:21:42.206533: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [13]:
data = df[['Review Text', 'Recommended IND']]

In [14]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, 
    remove links, remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


invalid escape sequence '\['


invalid escape sequence '\S'


invalid escape sequence '\w'


invalid escape sequence '\['


invalid escape sequence '\S'


invalid escape sequence '\w'


invalid escape sequence '\['


invalid escape sequence '\S'


invalid escape sequence '\w'



In [15]:
data['Review Text'] = data['Review Text'].apply(lambda x:clean_text(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
X = data.drop('Recommended IND', axis=1)
y = data['Recommended IND']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.2,random_state=42,shuffle=True)

In [18]:
num_words = None

In [19]:
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train['Review Text'].tolist() + X_test['Review Text'].tolist())   # introduce text in list


In [20]:
word_index = tokenizer.word_index

In [21]:
X_train_seq = tokenizer.texts_to_sequences(X_train['Review Text'].tolist())
X_test_seq = tokenizer.texts_to_sequences(X_test['Review Text'].tolist())


In [22]:
max_len = max([len(x) for x in X_train_seq])

In [23]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [24]:
#using RNN
model=Sequential([
    Embedding(len(word_index)+1, 50,input_length=max_len),
    SimpleRNN(100),
    Dense(1, activation='sigmoid')
])


Argument `input_length` is deprecated. Just remove it.

2024-11-21 12:21:45.967290: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [25]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
model.fit(X_train_pad, y_train, epochs=5, batch_size=512)

Epoch 1/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 70ms/step - accuracy: 0.7579 - loss: 0.5353
Epoch 2/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 70ms/step - accuracy: 0.8133 - loss: 0.4578
Epoch 3/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step - accuracy: 0.8242 - loss: 0.4216
Epoch 4/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 68ms/step - accuracy: 0.8246 - loss: 0.4057
Epoch 5/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step - accuracy: 0.8610 - loss: 0.3236


<keras.src.callbacks.history.History at 0x7551354b6f60>

In [27]:
acc=model.evaluate(X_test_pad, y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(acc[0],acc[1]))

[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8544 - loss: 0.3703
Test set
  Loss: 0.365
  Accuracy: 0.856


In [28]:
#using lstm
model2=Sequential([ 
    Embedding(len(word_index)+1, 50,input_length=max_len),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation='sigmoid')
])

In [29]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
model2.fit(X_train_pad, y_train, epochs=5, batch_size=512)

Epoch 1/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 293ms/step - accuracy: 0.7804 - loss: 0.5773
Epoch 2/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 272ms/step - accuracy: 0.8231 - loss: 0.4010
Epoch 3/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 259ms/step - accuracy: 0.8718 - loss: 0.3013
Epoch 4/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 274ms/step - accuracy: 0.8971 - loss: 0.2463
Epoch 5/5
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 263ms/step - accuracy: 0.9110 - loss: 0.2204


<keras.src.callbacks.history.History at 0x755131bb4380>

In [31]:
acc=model2.evaluate(X_test_pad, y_test)
print('Accuracy:', acc[1])

[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.8905 - loss: 0.2694
Accuracy: 0.888422429561615


In [32]:

review = ['I liked this product!']
review_seq = tokenizer.texts_to_sequences(review)
review_pad = pad_sequences(review_seq, maxlen=max_len)
model.predict(review_pad)
if model.predict(review_pad) > 0.5:
    print('Recommended')
else:
    print('Not Recommended')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Recommended
