# ANALYSIS OF REVIEW SEMANTICS (<u>PREDICTION MODEL</u>)

## <u>1. Preliminaries</u>

### 1.1 Importing packages

In [1]:
#Import necessary packages
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


### 1.2 Version check

In [2]:
#Check Python version
print(f'TensorFlow version: {tf.__version__}')
print(f'Keras version: {keras.__version__}')
print(f'NumPy version: {np.__version__}')
print(f'Pandas version: {pd.__version__}')
!python --version

TensorFlow version: 2.16.1
Keras version: 3.3.3
NumPy version: 1.26.4
Pandas version: 2.2.2
Python 3.9.7


## <u>2. Data preperation</u>

In [3]:
#Load the data
df = pd.read_csv("data/order_reviews.csv")

#Modify the original dataframe
df_model = df.drop(['review_id', 'order_id', 'review_comment_title', 'review_creation_date', 'review_answer_timestamp'], axis=1).dropna()

#Define and apply score mapping: 0 - bad, 1 - neutral, 2 - good
score_mapping = {1: 0,
                 2: 0,
                 3: 1,
                 4: 2,
                 5: 2}

df_model['review_score'] = df_model['review_score'].map(score_mapping)

print('\nINITIALLY PROCESSED DATA TABLE')
df_model.head()


INITIALLY PROCESSED DATA TABLE


Unnamed: 0,review_score,review_comment_message
3,2,Recebi bem antes do prazo estipulado.
4,2,Parabéns lojas lannister adorei comprar pela I...
9,2,aparelho eficiente. no site a marca do aparelh...
12,2,"Mas um pouco ,travando...pelo valor ta Boa.\r\n"
15,2,"Vendedor confiável, produto ok e entrega antes..."


## <u>3. Creating a word index</u>

### 3.1 Creating and filtering words list

In [4]:
#Get all words from reviews
all_words = ' '.join(df_model['review_comment_message'].astype(str)).lower()

#Clean up the words by removing special characters
translation_table = str.maketrans({
    ',': ' ',
    '.': ' ',
    '(': ' ',
    ')': ' ',
    ':': ' ',
    "/": ' ',
    "!": ' ',
    "?": ' '})

words_list = all_words.translate(translation_table).strip().split()

#Calculate word counts
word_counts = {}

for word in words_list:
    if word in word_counts:
        word_counts[word] += 1
    else:
        word_counts[word] = 1

#Filter off rarely used words
cutoff = 5
word_counts = {k: v for k, v in sorted(word_counts.items(), key=lambda item: item[1], reverse=True) if v > cutoff}

#Check the current state of the index
print('\nMOST COMMON WORDS IN THE DATASET:')
i = 0
for k, v in word_counts.items():
        i += 1
        print(k, ": ", v)
        if i >= 10:
            break

print(f'\nNumber of actual words in vocabulary: {len(word_counts)}')


MOST COMMON WORDS IN THE DATASET:
o :  19637
produto :  18851
e :  16376
a :  12730
de :  11778
do :  11392
não :  11311
que :  8760
prazo :  8528
muito :  8045

Number of actual words in vocabulary: 3476


### 3.2 Creating the final word index for the model

In [5]:
#Modify the word list so it can be used by the model
word_index = {k: (v+3) for v, k in enumerate(word_counts.keys())}
word_index = {'<PAD>': 0, '<START>': 1, '<UNK>': 2, **word_index}
word_index = dict(list(word_index.items()))

#Check the number of elements in index
vocab_n = len(word_index)

#Check the final state of the index
print('\nFIRST FEW TOKENS IN VOCABULARY:')

for k, v in word_index.items():
    print(k, ": ", v)
    if v >= 10:
        break
        
print(f'\nFinal number of elements in vocabulary: {vocab_n}')


FIRST FEW TOKENS IN VOCABULARY:
<PAD> :  0
<START> :  1
<UNK> :  2
o :  3
produto :  4
e :  5
a :  6
de :  7
do :  8
não :  9
que :  10

Final number of elements in vocabulary: 3479


## <u>4. Tokenization</u>

### 4.1 Creating functions to encode and decode reviews

In [6]:
#Create review encoding function
def review_encode(s):
    encoded = [1]
    for word in s.split():
        word = word.translate(translation_table).strip().lower()
        if word in word_index:
            encoded.append(word_index[word])
        else:
            encoded.append(2)

    return encoded

#Create review decoding function
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def review_decode(s):
    return ' '.join([reverse_word_index.get(i, '?') for i in s])

### 4.2 Tokenizing reviews

In [7]:
#Encode reviews
df_model['review_comment_message'] = df_model['review_comment_message'].apply(review_encode)

print('\nENCODED REVIEWS TABLE')
df_model.head()


ENCODED REVIEWS TABLE


Unnamed: 0,review_score,review_comment_message
3,2,"[1, 17, 30, 15, 8, 11, 227]"
4,2,"[1, 77, 163, 62, 96, 80, 109, 457, 670, 5, 648..."
9,2,"[1, 436, 362, 18, 65, 6, 388, 8, 436, 135, 245..."
12,2,"[1, 31, 22, 143, 2, 170, 767, 61]"
15,2,"[1, 132, 395, 4, 99, 5, 13, 15, 8, 11]"


## <u>5. Model creation</u>

### 5.1 Creating test and train data

In [8]:
#Set up train test split
X = df_model['review_comment_message']
y = df_model['review_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Check the maximum comment length to decide on maxlen argument for padding
print(f'\nMaximum comment length: {df_model["review_comment_message"].apply(len).max()}')

#Pad train and test sequences
X_train = keras.preprocessing.sequence.pad_sequences(X_train, value=word_index['<PAD>'], padding='post', maxlen=50)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, value=word_index['<PAD>'], padding='post', maxlen=50)


Maximum comment length: 46


### 5.2 Building or loading a model

In [9]:
#load the model
model = keras.models.load_model("model_reviews.keras")

'''
#Create the model
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_n, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(3, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#Train the model
fit_model = model.fit(X_train, y_train, epochs=200, batch_size=512, validation_data=(X_test, y_test), verbose=1)

#Save the model
model.save("model_reviews.keras")
'''

'\n#Create the model\nmodel = keras.Sequential()\nmodel.add(keras.layers.Embedding(vocab_n, 16))\nmodel.add(keras.layers.GlobalAveragePooling1D())\nmodel.add(keras.layers.Dense(16, activation=\'relu\'))\nmodel.add(keras.layers.Dense(16, activation=\'relu\'))\nmodel.add(keras.layers.Dense(3, activation=\'softmax\'))\n\nmodel.compile(optimizer=\'adam\', loss=\'sparse_categorical_crossentropy\', metrics=[\'accuracy\'])\n\n#Train the model\nfit_model = model.fit(X_train, y_train, epochs=200, batch_size=512, validation_data=(X_test, y_test), verbose=1)\n\n#Save the model\nmodel.save("model_reviews.keras")\n'

### 5.3 Model assessment

In [10]:
#Model assessment
model.summary()
results = model.evaluate(X_test, y_test)
print(f'\nAccuracy: {results[1]}, Loss: {results[0]}')

[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.8126 - loss: 0.9488

Accuracy: 0.8179858922958374, Loss: 0.9397571682929993


# <u>6. User input prediction</u>

### 6.1 Create appropriate function

In [11]:
#Create a comment prediction function
def predict_comment(comment):
    comment_encoded = keras.preprocessing.sequence.pad_sequences([review_encode(comment)], value=word_index["<PAD>"], padding="post", maxlen=50)
    predictions = model.predict(comment_encoded)
    
    for i, prediction in enumerate(predictions):
        predicted_class = np.argmax(prediction)
        class_labels = ["Bad", "Neutral", "Good"]
        predicted_label = class_labels[predicted_class]
        
        print(f'Comment: {comment}\n')
        print(f'Bad comment chance:     {prediction[0] * 100:.2f}%')
        print(f'Neutral comment chance: {prediction[1] * 100:.2f}%')
        print(f'Good comment chance:    {prediction[2] * 100:.2f}%\n')
        print(f"FINAL PREDICTION:       {predicted_label} comment")

### 6.2 Comment prediction

In [12]:
#Input your comment as the argument to check model's prediction
predict_comment("Compra ruim, não estou satisfeito.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Comment: Compra ruim, não estou satisfeito.

Bad comment chance:     96.04%
Neutral comment chance: 2.35%
Good comment chance:    1.61%

FINAL PREDICTION:       Bad comment
