In [14]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Embedding, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.initializers import Constant
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2, l1, l1_l2

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
import sklearn.metrics as mt
import warnings
from tensorflow import keras

warnings.filterwarnings('ignore')

## Pre-process and train models

In [2]:
df = pd.read_csv("./cleaned_data.csv")
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Clean_Review
0,0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,absolutely wonderful silky and sexy and comf...
1,1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,love this dress it 's sooo pretty i happene...
2,2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,i had such high hope for this dress and really...
3,3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,i love love love this jumpsuit it 's fun ...
4,4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,this shirt is very flattering to all due to th...


In [3]:
df.shape

(22641, 13)

In [4]:
X = np.array(df['Clean_Review'])

y_recom = np.array(df['Recommended IND'])
y_rating = np.array(df['Rating'])

### Tokenize words (change the to numeric values)

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

sentence_sq = tokenizer.texts_to_sequences(X)

In [6]:
max_len = 0
for sub_str in sentence_sq:
    temp = len(sub_str)
    if temp > max_len:
        max_len = temp
        
max_len

117

**Make all sentence vectors same length**

In [7]:
padded_docs = np.array(pad_sequences(sentence_sq, maxlen=max_len, padding='post'))

In [8]:
padded_docs.shape

(22641, 117)

### Predict recommendation

In [9]:
len(y_recom)

22641

**Train test data split** 

In [10]:
X_recom_train, X_recom_test, y_recom_train, y_recom_test = train_test_split(padded_docs, y_recom, test_size=0.3)

In [20]:
early_stopping = EarlyStopping(monitor='val_accuracy', mode="auto", patience = 5)
model_save = ModelCheckpoint('model.hdf5', save_best_only=True)

# we use sequential model
model = Sequential()

# embedding layer is a layer of the model which transform every world to a feature vector
# and it learns it through learning process
model.add(Embedding(
    input_dim = len(tokenizer.word_counts)+1,
    output_dim = 50,
    input_length = padded_docs.shape[1]
))

# we use two bidirectional LSTM (long short term memory)
model.add(Bidirectional(LSTM(units = 10, return_sequences = True)))
model.add(Bidirectional(LSTM(units = 10, return_sequences = False)))
model.add(Dense(10, activation='relu', activity_regularizer = l1(0.1)))
model.add(Dropout(rate=0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 117, 50)           654350    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 117, 20)           4880      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 20)                2480      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                210       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 661,931
Trainable params: 661,931
Non-trainable params: 0
________________________________________________

In [23]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
model_his = model.fit(X_recom_train, y_recom_train, epochs = 100, batch_size = 64, validation_split = 0.1, shuffle = True, verbose = True, callbacks = [early_stopping, model_save])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


### Evaluate model on test data

In [24]:
test_result = model.evaluate(X_recom_test, y_recom_test)
print ('model test loss: ', test_result[0])
print ('model test accuracy: ', test_result[1])

model test loss:  0.28052762150764465
model test accuracy:  0.9445016980171204


### Evaluate model on train data

In [25]:
train_result = model.evaluate(X_recom_train, y_recom_train)
print ('model train loss: ', train_result[0])
print ('model train accuracy: ', train_result[1])

model train loss:  0.28288668394088745
model train accuracy:  0.9422008991241455


## Rating prediction

In [11]:
y_rating = y_rating - 1

In [12]:
X_rate_train, X_rate_test, y_rate_train, y_rate_test = train_test_split(padded_docs, y_rating, test_size=0.3)

In [19]:
early_stopping = EarlyStopping(monitor='val_loss', mode="auto", patience = 5)
model_save = ModelCheckpoint('model_rating.hdf5', save_best_only=True)

model1 = Sequential()
model1.add(Embedding(
    input_dim = len(tokenizer.word_counts)+1,
    output_dim = 50,
    input_length = padded_docs.shape[1]
))
model1.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model1.add(Dense(512, activation='relu'))
model1.add(Dropout(0.50))
model1.add(Dense(5, activation='softmax'))
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 117, 50)           654350    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dense (Dense)                (None, 512)               131584    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 2565      
Total params: 971,795
Trainable params: 971,795
Non-trainable params: 0
_________________________________________________________________


In [None]:
y_rate_train

array([4, 4, 1, ..., 4, 4, 2])

In [None]:
model1.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])
model1_his = model1.fit(X_rate_train, y_rate_train, epochs = 100, batch_size = 32, validation_split = 0.1, shuffle = True, verbose = True, callbacks = [early_stopping, model_save])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


### Evaluate model on test data

In [26]:
test_result_rate = model1.evaluate(X_rate_test, y_rate_test)
print ('model test loss: ', test_result_rate[0])
print ('model test accuracy: ', test_result_rate[1])

model test loss:  0.8395806550979614
model test accuracy:  0.6430148482322693


### Evaluate model on train data

In [27]:
train_result_rate = model1.evaluate(X_rate_train, y_rate_train)
print ('model train loss: ', train_result_rate[0])
print ('model train accuracy: ', train_result_rate[1])

model train loss:  0.8334828019142151
model train accuracy:  0.6559818387031555


## Load Saved models

In [23]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
model1.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics=['accuracy'])

model.load_weights('./model.hdf5')
model1.load_weights('./model_rating.hdf5')

### Evaluate recommendation model on test data

In [24]:
test_result = model.evaluate(X_recom_test, y_recom_test)
print ('model test loss: ', test_result[0])
print ('model test accuracy: ', test_result[1])

model test loss:  0.28052762150764465
model test accuracy:  0.9445016980171204


### Evaluate recommendation model on train data

In [25]:
train_result = model.evaluate(X_recom_train, y_recom_train)
print ('model train loss: ', train_result[0])
print ('model train accuracy: ', train_result[1])

model train loss:  0.28288668394088745
model train accuracy:  0.9422008991241455


### Evaluate rating model on test data

In [26]:
test_result_rate = model1.evaluate(X_rate_test, y_rate_test)
print ('model test loss: ', test_result_rate[0])
print ('model test accuracy: ', test_result_rate[1])

model test loss:  0.8395806550979614
model test accuracy:  0.6430148482322693


### Evaluate  rating model on train data

In [27]:
train_result_rate = model1.evaluate(X_rate_train, y_rate_train)
print ('model train loss: ', train_result_rate[0])
print ('model train accuracy: ', train_result_rate[1])

model train loss:  0.8334828019142151
model train accuracy:  0.6559818387031555
