In [None]:
import keras
from keras import layers, callbacks
import tensorflow as tf
import pandas as pd
import numpy as np

from keras.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split

# Tokenizer / embedding hyper-parameters shared by the vectorization layer
# and the CNN defined further down in this notebook.
VOCAB_SIZE = 15000     # given to TextVectorization and to Embedding (input dim)
SEQUENCE_LENGTH = 100  # output_sequence_length of the vectorization layer
EMBED_DIM = 8          # output dimension of the Embedding layer

# CSV read by the data-loading cells (relative path).
DATA_PATH = r"../../data/transformed/amazon_reviews_5_partition_1.csv"

In [None]:
# Load a small subset of the dataset for quick experiments.
# `nrows` reads just the first 10,000 rows and releases the file right away,
# instead of opening a chunked reader, pulling a single chunk from it and
# leaving the reader (and its file handle) open for the life of the kernel.
df = pd.read_csv(
    DATA_PATH,
    index_col=0,
    nrows=10000,
)

In [None]:
# Full-data load used for real training runs. When the notebook is executed
# top to bottom this rebinds `df`, replacing the 10,000-row sample read by
# the previous cell.
df = pd.read_csv(filepath_or_buffer=DATA_PATH, index_col=0)

In [None]:
# Discard every row that has a missing value in any column, then store the
# review text with pandas' dedicated "string" dtype. Done as one chained
# expression so the column is never assigned onto the result of `dropna`.
df = df.dropna().assign(
    reviewText=lambda frame: frame["reviewText"].astype("string")
)

In [None]:
# Model input: the review text column.
x = df["reviewText"]

# Regression target: the "overall" rating shifted down by one.
# Original note: normalize to 0-4 (presumably ratings arrive as 1-5 -- confirm).
y = df["overall"].sub(1)

In [None]:
# Hold out 20% of the rows for evaluation. Fixing `random_state` makes the
# shuffle deterministic, so a Restart Kernel -> Run All trains and evaluates
# on the same rows and metrics stay comparable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Read the vocabulary handed to the TextVectorization layer, one token per line.
with open("../../checkpoints/vectorization_vocabulary.txt", "r") as file:
    vocab = file.read().split("\n")

# A file that ends with a newline yields one extra empty entry at the end.
# Remove it only when it really is empty, so a file saved without a trailing
# newline does not silently lose its final token.
if vocab and vocab[-1] == "":
    vocab.pop()

In [None]:
# Text -> integer-id preprocessing layer. It is built from the vocabulary
# loaded from disk (passed via `vocabulary=`) and emits sequences of
# SEQUENCE_LENGTH token ids.
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
    vocabulary=vocab,
)

In [None]:
# 2D-CNN rating regressor (the larger-capacity variant: three conv layers).
# Raw strings go in; the model vectorizes and embeds them itself, then treats
# the (SEQUENCE_LENGTH x EMBED_DIM) embedding matrix as a one-channel image.

cnn_model = keras.Sequential(
    [
        layers.Input(shape=(1,), dtype="string"),
        vectorize_layer,
        layers.Embedding(VOCAB_SIZE, EMBED_DIM),
        # Append a channel axis for Conv2D. The shape is derived from the
        # notebook constants so it stays in sync with the vectorization and
        # embedding layers when either constant is tuned.
        layers.Reshape(target_shape=(SEQUENCE_LENGTH, EMBED_DIM, 1)),
        layers.Conv2D(32, 3, padding="same", activation="relu"),
        layers.Conv2D(16, 3, padding="same", activation="relu"),
        layers.MaxPooling2D(padding="same"),
        # NOTE(review): unlike the two convolutions above, this one has no
        # activation -- confirm that is intentional.
        layers.Conv2D(32, 3, padding="same"),
        layers.MaxPooling2D(padding="same"),
        layers.Flatten(),
        # Single linear unit: the model outputs one scalar rating estimate.
        layers.Dense(1),
    ]
)

In [None]:
# Configure training as a regression: optimize mean squared error with Adam
# and report mean absolute error alongside the loss.
cnn_model.compile(optimizer="adam", loss="mse", metrics=["mae"])

In [None]:
# Training callbacks. Both watch the validation loss (the Keras default,
# spelled out here so the intent is visible).
cbs = [
    # Write weights to disk only when the monitored value improves.
    callbacks.ModelCheckpoint(
        filepath="../../checkpoints/2d_cnn_32_16_32_250k/model",
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=True,
    ),
    # Stop after 8 epochs without improvement. `restore_best_weights` rolls
    # the model back to its best epoch when training is stopped early, so the
    # model that is saved and evaluated afterwards is not the one from 8
    # epochs past the best validation loss.
    callbacks.EarlyStopping(
        monitor="val_loss",
        patience=8,
        restore_best_weights=True,
    ),
]

In [None]:
# Train for at most 30 epochs; the callbacks in `cbs` run during training.
# NOTE(review): the held-out split from train_test_split is used as the
# validation data here, so there is no separate untouched set left for a
# final evaluation -- confirm this is acceptable.
cnn_model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=30,
    callbacks=cbs,
)

In [None]:
# Write the trained model to disk; a later cell reloads it from this same
# location with keras.models.load_model.
cnn_model.save(filepath="../../saved_models/2dcnn_32_16_32_250k_rows")

In [None]:
# Smoke test on a single hand-written review that mixes negative and
# positive words.
cnn_model.predict(
    x=["worst bad bad hated loved it so much omagahhd love love love "]
)

In [None]:
# Naive baseline: predict one constant for every training sample.
# The constant is computed from `y_train` itself rather than pasted in from an
# earlier run, so it stays correct when the data subset or the split changes.
# (Assumes the previously hard-coded value was the training-set mean -- confirm.)
y_train_naive = np.full_like(
    y_train,
    fill_value=float(y_train.mean()),
    dtype="float32",
)

In [None]:
# Mean absolute error of the constant baseline on the training targets --
# the number the model's MAE has to beat.
naive_mae = mean_absolute_error(y_train, y_train_naive)
naive_mae.numpy()

In [None]:
# Score every review with the CNN trained above. The previous reference to
# `lstm_model` pointed at a name that is never defined in this notebook and
# raised NameError on a fresh kernel.
predicted = cnn_model.predict(x)

In [None]:
# Side-by-side table of review text, true rating and model output.
# `predicted` is flattened to one value per row before it becomes a column.
stats = pd.DataFrame(
    dict(
        text=x,
        actual=y,
        predicted=predicted.flatten(),
    )
)

In [None]:
# Eyeball ten randomly chosen rows of the comparison table.
stats.sample(n=10)

In [None]:
# Reload the model written by the save cell to check that it round-trips
# from disk.
model = keras.models.load_model(
    filepath="../../saved_models/2dcnn_32_16_32_250k_rows/"
)

In [None]:
# Run the reloaded model on one ad-hoc sentence.
model.predict(x=["tesla is an obvious scam"])