In [20]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import os

def check_dir(dir:str):
    """Ensure that the directory ``dir`` exists, creating it when needed.

    Missing parent directories are created as well, so nested paths such as
    ``'a/b/c'`` work on a fresh checkout.

    Args:
        dir: Path of the directory to check or create.

    Prints ``'<dir> exists'`` when the path is already present, otherwise
    ``'Created dir: <dir>'`` after creating it.
    """
    if os.path.exists(dir):
        print(f'{dir} exists')
        return

    # exist_ok guards against the directory appearing between the existence
    # check above and the creation call.
    os.makedirs(dir, exist_ok=True)
    print(f'Created dir: {dir}')

In [21]:
# Scratch check: build a rank-1 string tensor and index its first element,
# which comes back as a scalar string tensor.
const = tf.constant(["This is a string", "This is another string"])
const[0]


<tf.Tensor: shape=(), dtype=string, numpy=b'This is a string'>

In [22]:
def transform_sentiment(input):
    """Translate a numeric rating into its sentiment label.

    Args:
        input: Rating value; must be one of 1, 2, 3, 4 or 5.

    Returns:
        The sentiment label, from ``'very negative'`` (1) up to
        ``'very positive'`` (5).

    Raises:
        KeyError: If ``input`` is not one of the five known ratings.
    """
    labels = (
        'very negative',
        'slightly negative',
        'neutral',
        'slightly positive',
        'very positive',
    )
    # Ratings are 1-based, so label position k corresponds to rating k + 1.
    rating_to_label = {score: label for score, label in enumerate(labels, start=1)}
    return rating_to_label[input]


# Load the consolidated reviews and discard the serialized `index` column.
df = pd.read_json('./Data/consolidated/final_3.json')
df = df.drop(columns=['index'])

# Drop incomplete rows BEFORE deriving the sentiment label: a missing
# `overall` value is not a key of the rating -> label mapping and would raise
# KeyError inside `transform_sentiment` instead of being filtered out.
df = df.dropna()

df['count'] = 1
df['sentiment'] = df['overall'].apply(transform_sentiment)
df

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder for the `overall` rating; categories not seen while fitting
# are ignored at transform time rather than raising.
enc = OneHotEncoder(handle_unknown='ignore')

# Features are the raw review strings; targets are the dense one-hot ratings.
X = df['reviewText'].values
y = enc.fit_transform(
    np.array(df['overall'].to_list()).reshape(-1, 1)
).toarray()

# Hold out 20% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer id sequences with the same tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Truncate / pad every sequence at the end so each one has exactly 200 ids.
X_train_seq_pad, X_test_seq_pad = (
    tf.keras.preprocessing.sequence.pad_sequences(
        seqs, maxlen=200, truncating='post', padding='post'
    )
    for seqs in (X_train_seq, X_test_seq)
)


In [None]:
# Keep a handle on the tokenizer's vocabulary mapping for later cells.
word_index = tokenizer.word_index
# Display the vocabulary entries.
# NOTE(review): this prints every key; presumably large for a review corpus.
word_index.keys()

In [None]:
# Show the rating categories the one-hot encoder learned from `overall`.
enc.categories_

In [None]:
# Check which container type `df['reviewText'].values` produced.
type(X)

In [None]:
# Sanity check: features and labels should have matching lengths per split.
len(X_train), len(X_test), len(y_train), len(y_test)

In [None]:
# Size of the embedding table. The tokenizer numbers words from 1 (0 is the
# padding value) and only emits ids below `num_words`, so the table needs
# `highest id + 1` rows, capped at `num_words`. Using `len(word_index)` alone
# is one row short for a small vocabulary and needlessly large for a big one.
vocab_size = min(tokenizer.num_words, len(word_index) + 1)

# Two stacked bidirectional LSTMs over 100-dimensional embeddings, followed by
# a small dense head with dropout and a 5-way softmax (one unit per rating).
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 100, input_length=200),
    # return_sequences=True so the second LSTM receives the full sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(75)),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=7e-4),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Location handed to the checkpoint callback; created up front if missing.
checkpoint_dir = './model_checkpoint'
check_dir(checkpoint_dir)

callbacks_list = [
    # Multiply the learning rate by 0.8 after one epoch without val_loss
    # improvement, never going below 1e-5.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.8,
        patience=1,
        min_lr=1e-5,
        verbose=1,
    ),
    # Only overwrite the checkpoint when val_loss improves.
    ModelCheckpoint(
        filepath=checkpoint_dir,
        monitor='val_loss',
        save_best_only=True,
    ),
]

In [None]:
# Train on the padded training sequences and validate on the held-out split.
# NOTE(review): validation_steps=30 with batch_size=700 presumably limits each
# validation pass to 30 batches rather than the full held-out split - confirm
# this is intended, since val_loss drives both callbacks.
history = model.fit(
    x=X_train_seq_pad,
    y=y_train,
    batch_size=700,
    epochs=20,
    validation_data=(X_test_seq_pad, y_test),
    validation_steps=30,
    callbacks=callbacks_list,
)

In [None]:
# The network consumes padded integer id sequences, not raw strings, so the
# sample goes through the same tokenizer and padding settings that were used
# for the training data before it is passed to the model.
sample_reviews = ['I dont really like this product']
sample_seq_pad = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(sample_reviews),
    maxlen=200, truncating='post', padding='post',
)

# One row of five softmax scores per review.
model.predict(sample_seq_pad)

In [None]:
# Persist the trained model under model_saved/LSTM.
# NOTE(review): the fitted `tokenizer` is not saved here; presumably it is
# needed to preprocess text for this model later - confirm.
model.save('model_saved/LSTM')