In [27]:
import keras
from keras import layers, callbacks
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.model_selection import train_test_split

from lstm_model import load_model_weights_from_checkpoint

# --- Notebook configuration -------------------------------------------------
# Token cap for the TextVectorization layer; also the Embedding input dimension.
VOCAB_SIZE = 15_000
# Every review is padded / truncated to this many token ids.
SEQUENCE_LENGTH = 100
# Width of each learned embedding vector.
EMBED_DIM = 24
# Random state used for the train/test split.
SEED = 0

# CSV partition that the reviews are read from (relative to the notebook).
DATA_PATH = r"../../data/transformed/amazon_reviews_5_partition_1.csv"

In [None]:
# Open the CSV lazily: with `chunksize` set, read_csv yields DataFrames of up to
# 10,000 rows each instead of loading the whole file at once.
df_reader = pd.read_csv(DATA_PATH, index_col=0, chunksize=10_000)

In [None]:
# Pull the first chunk from the chunked reader. Use the built-in next() rather
# than calling the __next__ dunder directly.
df = next(df_reader)

In [4]:
# Read the whole partition, discard every row that has a missing value, and store
# the review text with pandas' dedicated "string" dtype.
df = (
    pd.read_csv(DATA_PATH, index_col=0)
    .dropna()
    .astype({"reviewText": "string"})
)

# Model input: the raw review text.
x = df["reviewText"]
# Target: the "overall" column shifted down by one.
# NOTE(review): presumably maps 1-5 star ratings onto 0-4 -- confirm against the data.
y = df["overall"] - 1

In [5]:
# Hold out 20% of the rows as a test split; random_state=SEED keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=SEED)

In [16]:
# Load the saved vectorization vocabulary: one token per line.
# NOTE(review): no explicit encoding is passed, so the platform default is used --
# confirm it matches the encoding the file was written with.
with open("../../checkpoints/vectorization_vocabulary.txt", "r") as file:
    vocab = file.read().split("\n")

# A trailing newline leaves one empty string at the end of the split result.
# Remove it only when it is really there: blindly slicing off the last element
# would silently discard a real token if the file does not end with a newline.
if vocab and vocab[-1] == "":
    vocab.pop()

In [17]:
# Text -> integer-id preprocessing layer, initialised from the saved vocabulary.
# Each input string is standardised, tokenised, and emitted as exactly
# SEQUENCE_LENGTH token ids.
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
    vocabulary=vocab,
)

In [30]:
# Higher-capacity variant: two stacked LSTM layers feeding a single-unit output.
# The model takes raw strings and vectorises them internally.

lstm_model = keras.Sequential()
# One string per sample.
lstm_model.add(layers.Input(shape=(1,), dtype="string"))
# String -> SEQUENCE_LENGTH token ids.
lstm_model.add(vectorize_layer)
# Token ids -> EMBED_DIM-wide vectors.
lstm_model.add(layers.Embedding(VOCAB_SIZE, EMBED_DIM))
# First LSTM returns the full sequence so the second LSTM can consume it.
lstm_model.add(layers.LSTM(24, dropout=0.5, return_sequences=True))
# Second LSTM keeps only its final output.
lstm_model.add(layers.LSTM(24, dropout=0.5))
# Single linear output unit.
lstm_model.add(layers.Dense(1))

In [31]:
# Restore previously saved weights into lstm_model from a checkpoint path prefix.
# The call returns a load-status object, which the notebook displays as the cell output.

lstm_model.load_weights(r"../../checkpoints/lstm_model_24_24_250k_rows/model")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x13a983190f0>

In [None]:
# Configure training: Adam optimiser, mean-squared-error loss, and mean absolute
# error reported as an extra metric.
lstm_model.compile(optimizer="adam", loss="mse", metrics=["mae"])

In [None]:
# Write weights only, and only when the monitored quantity improves.
# NOTE(review): the "124_16_10k_rows" path does not match the 24/24-unit model or
# the "24_24_250k_rows" checkpoint loaded in this notebook -- confirm it is intended.
checkpoint_cb = callbacks.ModelCheckpoint(
    filepath="../../checkpoints/lstm_model_124_16_10k_rows",
    save_best_only=True,
    save_weights_only=True,
)

# Stop training after 8 consecutive epochs without improvement.
early_stopping_cb = callbacks.EarlyStopping(patience=8)

cbs = [checkpoint_cb, early_stopping_cb]

In [None]:
# Hold out part of the training split for validation, so that early stopping and
# best-checkpoint selection never see the test split. Validating on
# (x_test, y_test) would leak test information into model selection and make any
# test metric reported afterwards optimistically biased.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=SEED
)

# Train for at most 30 epochs; the callbacks in `cbs` handle checkpointing and
# early stopping based on the validation data.
lstm_model.fit(
    x=x_fit,
    y=y_fit,
    epochs=30,
    validation_data=(x_val, y_val),
    callbacks=cbs,
)

In [None]:
# Sanity check: the model's output for a single empty review string.
lstm_model.predict([""])

In [None]:
# Naive baseline: predict the same constant for every training row, so the model
# can be compared against a trivial predictor.
# NOTE(review): the fill value is hard-coded; presumably it is the mean of y_train
# from an earlier run -- confirm, or compute it with y_train.mean().
y_train_naive = np.full_like(y_train, 2.1539134783695926, dtype="float32")

In [1]:
# MAE of the constant baseline on the training targets.
# scikit-learn's mean_absolute_error already returns a plain float, which has no
# `.numpy()` method, so the former `.numpy()` call is dropped.
mean_absolute_error(y_train, y_train_naive)

NameError: name 'mean_absolute_error' is not defined

In [None]:
# Predict for every review in x (the full dataset, training and test rows alike).
predicted = lstm_model.predict(x)

In [None]:
# Side-by-side table of review text, true target and model prediction.
# The prediction array is flattened to one dimension so it fits in a single column.
stats = pd.DataFrame(
    dict(
        text=x,
        actual=y,
        predicted=predicted.ravel(),
    )
)

In [None]:
# Show 10 random rows for a qualitative look at the predictions. The sample is
# seeded so the displayed rows are the same on every run.
stats.sample(10, random_state=SEED)

# Load model

In [6]:
# Load a complete saved model from disk and bind it to `model`.
model = keras.models.load_model(r"../../saved_models/lstm_model")



In [9]:
# Despite the x_ prefix, this holds the loaded model's predictions for x_test.
x_test_pred =  model.predict(x_test)



In [11]:
# Mean absolute error of the loaded model on the test split.
mean_absolute_error(y_test, x_test_pred)

0.5817508957319908

In [12]:
# Mean squared error of the loaded model on the test split.
mean_squared_error(y_test, x_test_pred)

0.6016878475227962

In [13]:
# Print the loaded model's layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 24)           360000    
                                                                 
 lstm (LSTM)                 (None, 100, 24)           4704      
                                                                 
 lstm_1 (LSTM)               (None, 24)                4704      
                                                                 
 dense (Dense)               (None, 1)                 25        
                                                                 
Total params: 369,433
Trainable params: 369,433
Non-trainable params: 0
__________________________________________________

In [23]:
# Rebind `model` to the result of the project helper imported from lstm_model.
# NOTE(review): presumably this rebuilds the architecture and restores weights from
# a checkpoint -- confirm against lstm_model.load_model_weights_from_checkpoint.
model = load_model_weights_from_checkpoint()