In [4]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# (google.colab is only available inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pwd

/content


In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf

In [5]:
# Load only the first 50,000 reviews. Passing `nrows` lets pandas stop parsing
# after that many rows instead of reading the whole CSV into memory and then
# throwing most of it away with .head().
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/amazon/FshionProductReviews_V3.csv",
    nrows=50000,
)

In [6]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,overall,verified,reviewTime,reviewerID,reviewText,summary
0,0,2,True,"09 28, 2014",A3DDWDH9PX2YX2,agree review opening small almost bent hook ex...,"I agree with the other review, the opening is ..."
1,1,4,False,"08 25, 2014",A2MWC41EW7XL15,love going order another pack keep work someon...,My New 'Friends' !!
2,2,2,True,"08 24, 2014",A2UH2QQ275NV45,tiny opening,Two Stars
3,3,4,True,"05 31, 2014",A7QS961ROI6E0,little plastic back work great loosing hook ea...,Works great!
4,4,3,True,"09 22, 2013",A1BB77SEBQT8VX,mother law wanted present sister liked said wo...,bought as a present


In [7]:
# Model input: the review text, one Python string per row.
# Missing reviews come back from the CSV as NaN; calling str() on them directly
# would produce the literal word "nan", which the tokenizer would then learn as
# a real token. Replace them with an empty string before stringifying.
headline = data["reviewText"].fillna("").map(str)

# Target: the integer rating stored in the `overall` column.
labels = data["overall"]

In [8]:
# Display the label Series to check its values, length and dtype.
labels

0        2
1        4
2        2
3        4
4        3
        ..
49995    4
49996    4
49997    5
49998    1
49999    5
Name: overall, Length: 50000, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
(
    train_headline,
    test_headline,
    train_labels,
    test_labels,
) = train_test_split(
    headline,
    labels,
    test_size=0.3,
    random_state=42,
)

In [10]:
# Encode labels
# The `overall` ratings shown in the label preview are 1..5, but tf.one_hot
# expects zero-based class indices in [0, depth). Passed unshifted, a rating of
# 5 lies outside depth=5 and is encoded as an all-zero row (no target class at
# all), while class index 0 is never used. Shift the ratings down by one so
# that class index k stands for a rating of k + 1; add 1 to an argmax over the
# model output to get back to a rating.
train_labels = tf.one_hot(train_labels.to_numpy() - 1, depth=5)

test_labels = tf.one_hot(test_labels.to_numpy() - 1, depth=5)

# Pre-Processing

In [11]:
# Word-level tokenizer. `num_words=5000` caps the word indices that are kept
# when texts are converted to sequences; words outside that range, or never
# seen during fitting, are replaced by the "<OOV>" placeholder.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token="<OOV>")

# Build the vocabulary from the training texts only.
tokenizer.fit_on_texts(train_headline)

# Turn every training text into its list of integer word ids.
train_sequence = tokenizer.texts_to_sequences(train_headline)

In [23]:
# Force every training sequence to exactly 100 ids: longer ones lose their
# tail, shorter ones are filled with zeros at the end.
train_padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequence,
    maxlen=100,
    padding='post',
    truncating='post',
)
train_padded_sequence

array([[  35,    6,  272, ..., 2953,   10,   42],
       [ 372,   34,    7, ...,    0,    0,    0],
       [  40,  392,  357, ...,    0,    0,    0],
       ...,
       [ 582,    5,   14, ...,    0,    0,    0],
       [  27,  681,  347, ...,    0,    0,    0],
       [ 270,   29,  139, ...,    0,    0,    0]], dtype=int32)

In [24]:
# Encode the held-out texts with the tokenizer that was fitted on the training
# texts, then pad/truncate them to the same fixed length of 100 ids.
test_sequence = tokenizer.texts_to_sequences(test_headline)
test_padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequence,
    maxlen=100,
    padding='post',
    truncating='post',
)
test_padded_sequence

array([[   3,   14, 1128, ...,    0,    0,    0],
       [3166,   13,    0, ...,    0,    0,    0],
       [ 461,   51,  544, ...,    0,    0,    0],
       ...,
       [ 407,  102,  726, ...,    0,    0,    0],
       [ 115,  621,  158, ...,    0,    0,    0],
       [   2,    3,   77, ...,    0,    0,    0]], dtype=int32)

# Modeling LSTM

In [25]:
# Bidirectional-LSTM classifier, assembled layer by layer:
#   embedding (5000-id vocabulary -> 300-d vectors, sequences of length 100)
#   -> bidirectional LSTM with 64 units per direction
#   -> two ReLU dense layers (64, then 10 units)
#   -> 5-way softmax output.
# The 5000 and 100 here mirror the tokenizer's num_words and the padding maxlen.
lstm_model = tf.keras.Sequential()
lstm_model.add(tf.keras.layers.Embedding(5000, 300, input_length=100))
lstm_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
lstm_model.add(tf.keras.layers.Dense(64, activation='relu'))
lstm_model.add(tf.keras.layers.Dense(10, activation='relu'))
lstm_model.add(tf.keras.layers.Dense(5, activation='softmax'))

In [26]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output layer; accuracy is tracked as the reported metric.
lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [27]:
# Train for 10 epochs under an explicit GPU device scope, evaluating on the
# held-out split at the end of every epoch.
with tf.device('/GPU:0'):
    lstm_model.fit(
        x=train_padded_sequence,
        y=train_labels,
        epochs=10,
        validation_data=(test_padded_sequence, test_labels),
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Print the layer-by-layer table of output shapes and parameter counts.
# NOTE(review): re-run this cell after changing the model so the saved output
# matches the architecture defined in this notebook.
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 228)           11400000  
                                                                 
 bidirectional (Bidirectiona  (None, 128)              150016    
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 10)                650       
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 11,558,933
Trainable params: 11,558,933
Non-trainable params: 0
____________________________________________

In [None]:
# Tabulate the metrics recorded by the most recent fit() call: one column per
# metric, one row per epoch. A single DataFrame constructor is enough; the
# former outer pd.DataFrame(...) only re-wrapped an identical frame.
hist = pd.DataFrame(lstm_model.history.history)
hist

# Predicting

In [None]:
def predict(data: list, model, maxlen: int = 100):
  """Return `model`'s predictions for raw text strings.

  The texts are converted to integer sequences with the notebook-level
  `tokenizer`, post-padded/truncated to `maxlen`, and passed to
  `model.predict`.

  Args:
    data: Texts to score. A single string is treated as one text instead of
      being iterated character by character.
    model: Object exposing `predict(padded_sequences)`.
    maxlen: Length every sequence is padded/truncated to. Defaults to 100,
      the length used for the training and test sequences in this notebook;
      the previously hard-coded 50 did not match the model's input length.

  Returns:
    Whatever `model.predict` returns for the padded batch.
  """
  # Guard against the easy mistake of passing one bare string.
  if isinstance(data, str):
    data = [data]
  sequence = tokenizer.texts_to_sequences(data)
  padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
      sequence, maxlen=maxlen, padding='post', truncating='post'
  )
  return model.predict(padded_sequence)

In [None]:
# Score two example strings with the trained model.
predict(
    [
        'thirtysomething scientists unveil doomsday clock of hair loss',
        'eat your veggies: 9 deliciously different recipes',
    ],
    lstm_model,
)

array([[9.9974948e-01],
       [5.2917004e-04]], dtype=float32)