In [4]:
import tensorflow as tf
import pandas as pd
import numpy as np

from keras.metrics import mean_absolute_error
from keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

VOCAB_SIZE = 15000
SEQUENCE_LENGTH = 100
EMBED_DIM = 8

DATA_PATH=r"../../data/transformed/amazon_reviews_5_partition_1.csv"

In [5]:
df_reader = pd.read_csv(
    DATA_PATH,
    index_col=0,
    chunksize=10000
)

In [6]:
df = df_reader.__next__()

In [32]:
# df = pd.read_csv(DATA_PATH, index_col=0)

In [None]:
df.sample(4)

In [7]:
df = df.dropna()
df["reviewText"] = df["reviewText"].astype("string")

In [8]:
with open("../../checkpoints/vectorization_vocabulary.txt", "r") as file:
    vocab = file.read()
    vocab=vocab.split("\n")[:-1] #last line is an empty string

vectorize_layer = TextVectorization(
    VOCAB_SIZE,
    "lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
    vocabulary=vocab
)

In [29]:
def to_vec_array(x):
    return np.c_[x]

In [23]:
x = df["reviewText"].apply(vectorize_layer)
y = df["overall"] -1 # Now [0-4] scoring

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

array([71,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

In [35]:
model = SVR()

In [43]:
model.fit(x.to_list(), y)

In [46]:
mean_absolute_error(y_test, model.predict(x_test.to_list())).numpy()

0.8228319489968221

In [52]:
def predict_svr(text):
    x = np.array(vectorize_layer(text)).reshape(1, -1)
    return model.predict(x)

In [68]:
predict_svr("dead dead dead dead dead dead")

array([2.4744697])

In [74]:
df_testing = df.iloc[:20,:2]

Unnamed: 0,overall,reviewText
4,5,great
12,3,years later the cheese is government cheese th...
21,2,looking for a louis untermeyer book from the s...
24,5,i don t know that i can truly explain why i li...
29,5,one of my daughters growing up also loved this...
39,3,dr seuss has some really brilliant books this ...
43,1,completly boring yes it s a childerns book tha...
74,3,the carpet wars is a sampler of informal writi...
79,3,i love this series but this entry is not hille...
82,4,legendary lieutenant joe leaphorn and his prot...


In [75]:
df_testing["preds"] = df_testing["reviewText"].apply(predict_svr)

In [76]:
df_testing

Unnamed: 0,overall,reviewText,preds
4,5,great,[2.3513871980204177]
12,3,years later the cheese is government cheese th...,[1.899986764096484]
21,2,looking for a louis untermeyer book from the s...,[2.427924901443747]
24,5,i don t know that i can truly explain why i li...,[1.8068471143168336]
29,5,one of my daughters growing up also loved this...,[1.7704203416876796]
39,3,dr seuss has some really brilliant books this ...,[1.9000791309705711]
43,1,completly boring yes it s a childerns book tha...,[2.0337852015675817]
74,3,the carpet wars is a sampler of informal writi...,[1.9674114217256387]
79,3,i love this series but this entry is not hille...,[1.9000652627214585]
82,4,legendary lieutenant joe leaphorn and his prot...,[2.8875847618190242]
