In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import os

def check_dir(dir:str):
    if not os.path.exists(dir):
        os.mkdir(dir)
        print(f'Created dir: {dir}')
    else:
        print(f'{dir} exists')

In [2]:
const = tf.constant(["This is a string", "This is another string"])
const[0]


<tf.Tensor: shape=(), dtype=string, numpy=b'This is a string'>

In [3]:
def transform_sentiment(input):
    dict = {
        1: 'very negative',
        2: 'slightly negative',
        3: 'neutral',
        4: 'slightly positive',
        5: 'very positive'
    }

    return dict[input]


df = pd.read_json('./Data/consolidated/final_2.json')
df = df.drop(columns=['index'], axis=0)
df['count'] = 1
df['sentiment'] = df['overall'].apply(transform_sentiment)
df = df.dropna(inplace=False)
df

Unnamed: 0,overall,verified,reviewTime,reviewerName,reviewText,summary,count,sentiment
0,1,True,"02 19, 2015",theodore j bigham,great,One Star,1,very negative
1,1,True,"04 10, 2017",Jacqueline Diaz,I didn't like this product it smudged all unde...,One Star,1,very negative
2,1,True,"11 19, 2016",rabiyaa123,it burns your eyes when u put it on and very ...,i do not recommend.,1,very negative
3,1,True,"03 24, 2018",Skip,It rusts.,It rusts.,1,very negative
4,1,True,"03 14, 2018",VB,Bought it as a present...doesn't fit a standar...,One Star,1,very negative
...,...,...,...,...,...,...,...,...
1316778,5,False,"11 29, 2001",Tina Evans,The only way to describe this wonderful piece ...,"A extream, but fun game!",1,very positive
1316779,5,False,"11 15, 1999",Kimberly P.Curtis,I found these games highly impressive and exci...,Excellent Game,1,very positive
1316780,5,False,"11 25, 2001",Amazon Customer,I hav had this game for more than a year and I...,Best GAmE ever,1,very positive
1316781,5,False,"11 12, 2001",Anne Callanan,"This game is as good as tehy come. Sure, ther...",a ma zing,1,very positive


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

X = df['reviewText'].values
y = enc.fit_transform(np.array(df['overall'].to_list()).reshape(-1,1)).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=20000,oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_seq_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen=200,truncating='post',padding='post')
X_test_seq_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq, maxlen=200,truncating='post',padding='post')


In [5]:
word_index = tokenizer.word_index
word_index.keys()



In [6]:
enc.categories_

[array([1, 2, 3, 4, 5])]

In [7]:
type(X)

numpy.ndarray

In [8]:
len(X_train), len(X_test), len(y_train), len(y_test)

(1052881, 263221, 1052881, 263221)

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_index), 100, input_length=200),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(75)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 100)          28394700  
                                                                 
 bidirectional_2 (Bidirectio  (None, 200, 300)         301200    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 150)              225600    
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 100)               15100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_4 (Dense)             (None, 50)               

In [18]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(0.7e-3),
              metrics=['accuracy'])

checkpoint_dir = './model_checkpoint'
check_dir(checkpoint_dir)

callbacks_list = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=1, min_lr=0.00001,verbose=1),
    ModelCheckpoint(filepath=checkpoint_dir, monitor='val_loss', save_best_only=True)
]

./model_checkpoint exists


In [19]:
history = model.fit(X_train_seq_pad,y_train, epochs=20,
                    validation_data=(X_test_seq_pad,y_test),batch_size=700,
                    validation_steps=30, callbacks=callbacks_list)

Epoch 1/20



INFO:tensorflow:Assets written to: .\model_checkpoint\assets


INFO:tensorflow:Assets written to: .\model_checkpoint\assets


Epoch 2/20



INFO:tensorflow:Assets written to: .\model_checkpoint\assets


INFO:tensorflow:Assets written to: .\model_checkpoint\assets


Epoch 3/20



INFO:tensorflow:Assets written to: .\model_checkpoint\assets


INFO:tensorflow:Assets written to: .\model_checkpoint\assets


Epoch 4/20

KeyboardInterrupt: 

In [None]:
model.predict(['I dont really like this product'])

In [None]:
model.save('model_saved/LSTM')