**#Importing libraries**

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.utils import shuffle
from scipy.sparse import csr_matrix
import tensorflow as tf
import re
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from joblib import dump,load
from sklearn.metrics import precision_recall_fscore_support as score
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

**#Read Dataset/Calculating weight**

In [2]:
# Load the review dump (one JSON object per line).
df = pd.read_json("./Video_Games_5.json", lines=True)

# Count reviews per star rating once and pick out each rating by label.
rating_counts = df["overall"].value_counts()
num_overall_1, num_overall_2, num_overall_3, num_overall_4, num_overall_5 = (
    rating_counts[stars] for stars in (1, 2, 3, 4, 5)
)

# Ratings 1-2 form class 0 and ratings 4-5 form class 2; rating 3 is class 1.
num_class_0 = num_overall_1 + num_overall_2
num_class_2 = num_overall_4 + num_overall_5

# Minority-class weights are the ratio of the class-2 count to their own count.
weight_0 = num_class_2 / num_class_0
weight_1 = num_class_2 / num_overall_3

# Class priors handed to MultinomialNB.
bayes_weight = [0.4, 0.4, 0.2]
# Class weights handed to the random forest and the linear SVM.
# NOTE(review): the class-2 weight is a hard-coded 4 rather than derived
# from the counts above -- confirm this is intentional.
svm_rf_weight = {0: weight_0, 1: weight_1, 2: 4}

**#Creating classes**

In [3]:
def _rating_to_sentiment(rating):
    """Map a star rating to a class id: 0 for <= 2, 1 for exactly 3, else 2."""
    if rating <= 2:
        return 0
    if rating == 3:
        return 1
    return 2


df["sentiment"] = df["overall"].map(_rating_to_sentiment)

**#Preprocessing**

In [4]:

# Missing reviews become empty strings so every row can be tokenised.
df["reviewText"] = df["reviewText"].fillna("")

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def _clean_review(text):
    """Return `text` with punctuation removed, stop words dropped and tokens stemmed.

    Every non-word character is replaced by a space, the result is tokenised
    once, and each surviving token is stemmed. The stop-word test compares the
    lower-cased token, so capitalised stop words ("The", "I", ...) are removed
    as well instead of slipping through the filter.
    """
    tokens = word_tokenize(re.sub(r"\W", " ", str(text)))
    return " ".join(
        stemmer.stem(token) for token in tokens if token.lower() not in stop_words
    )


df["reviewText"] = df["reviewText"].apply(_clean_review)

**#Vectorization**

In [5]:
# Learn the TF-IDF vocabulary/weights from the cleaned reviews and encode them.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split further down -- confirm that is acceptable.
vectorizer = TfidfVectorizer()
cleaned_reviews = df["reviewText"]
reviews_tfidf = vectorizer.fit_transform(cleaned_reviews)

**#UnderSampling**

In [6]:
# Requested sample count per class for the under-sampler (only class 2 is listed).
target_counts = {2: 50000}
rus = RandomUnderSampler(sampling_strategy=target_counts, random_state=42)
reviews_sampled, sentiments_sampled = rus.fit_resample(
    reviews_tfidf, df["sentiment"]
)

**#Splitting dataset**

In [7]:
# Hold out 20% of the resampled data for testing; the seed fixes the partition.
(
    reviews_train,
    reviews_test,
    sentiments_train,
    sentiments_test,
) = train_test_split(
    reviews_sampled,
    sentiments_sampled,
    test_size=0.2,
    random_state=42,
)

**#Random Forest Classifier**

In [None]:
# Train a class-weighted random forest on the TF-IDF features.
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=100,
    min_samples_leaf=5,
    min_samples_split=10,
    random_state=42,
    class_weight=svm_rf_weight,
)
clf.fit(reviews_train, sentiments_train)

# Persist the model and evaluate the reloaded copy; `rf_loaded` is reused by
# the metrics-graph cell.
dump(clf, 'random_forest_classifier_partition.joblib')
rf_loaded = load('random_forest_classifier_partition.joblib')

# Predict the test set once and reuse it for accuracy, report and matrix.
sentiments_pred = rf_loaded.predict(reviews_test)
train_accuracy = accuracy_score(sentiments_train, rf_loaded.predict(reviews_train))
test_accuracy = accuracy_score(sentiments_test, sentiments_pred)
print("Train accuracy: ", train_accuracy)
print("Test accuracy: ", test_accuracy)
print(classification_report(sentiments_test, sentiments_pred))

cm = confusion_matrix(sentiments_test, sentiments_pred)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

**#SVM Classifier**

In [None]:
# Train a class-weighted linear SVM on the TF-IDF features.
clf = LinearSVC(dual=True, max_iter=10000, class_weight=svm_rf_weight)
clf.fit(reviews_train, sentiments_train)

# Persist the model and reload it from the SAME file. The previous code
# loaded 'svm.joblib', which this notebook never writes.
dump(clf, 'svm_partition.joblib')
svm_loaded = load('svm_partition.joblib')

# Evaluate the reloaded model (as the random-forest cell does) and predict
# the test set only once.
sentiments_pred = svm_loaded.predict(reviews_test)
train_accuracy = accuracy_score(sentiments_train, svm_loaded.predict(reviews_train))
test_accuracy = accuracy_score(sentiments_test, sentiments_pred)
print("Train accuracy: ", train_accuracy)
print("Test accuracy: ", test_accuracy)
print(classification_report(sentiments_test, sentiments_pred))

cm = confusion_matrix(sentiments_test, sentiments_pred)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

**#Multinomial Naive Bayes Classifier**

In [None]:
# Train multinomial Naive Bayes with fixed class priors.
clf = MultinomialNB(class_prior=bayes_weight)
clf.fit(reviews_train, sentiments_train)

# Persist the model and reload it from the SAME file. The previous code
# loaded 'bayes.joblib', which this notebook never writes.
dump(clf, "bayes_partition.joblib")
nb_loaded = load("bayes_partition.joblib")

# Evaluate the reloaded model and predict the test set only once.
sentiments_pred = nb_loaded.predict(reviews_test)

print(classification_report(sentiments_test, sentiments_pred))

train_accuracy = accuracy_score(sentiments_train, nb_loaded.predict(reviews_train))
test_accuracy = accuracy_score(sentiments_test, sentiments_pred)
print("Train accuracy: ", train_accuracy)
print("Test accuracy: ", test_accuracy)

cm = confusion_matrix(sentiments_test, sentiments_pred)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

**#WordCloud**

In [None]:
def _show_cloud(cloud, title):
    """Draw a generated word cloud on a 10x10 figure with the given title."""
    plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


# All preprocessed class-2 (positive) reviews joined into one string.
positive_reviews = df.loc[df['sentiment'] == 2, 'reviewText'].str.cat(sep=' ')
wordcloud = WordCloud(background_color='white', max_words=200).generate(positive_reviews)
_show_cloud(wordcloud, 'Word Cloud for Positive Reviews')

# Same for class-0 (negative) reviews.
negative_reviews = df.loc[df['sentiment'] == 0, 'reviewText'].str.cat(sep=' ')
wordcloud_negative = WordCloud(
    background_color='white', max_words=200, contour_color='red'
).generate(negative_reviews)
_show_cloud(wordcloud_negative, 'Negative Sentiment Word Cloud')

**#Metrics Graph**

In [None]:
# Reload the persisted SVM and Naive Bayes models; `rf_loaded` comes from the
# random-forest cell.
svm_loaded = load('svm_partition.joblib')
nb_loaded = load('bayes_partition.joblib')

models = ['Random Forest', 'SVM', 'Naive Bayes']

# Macro-averaged (precision, recall, f-score, support) per model, in the
# same order as `models`.
macro_scores = [
    score(sentiments_test, estimator.predict(reviews_test), average='macro')
    for estimator in (rf_loaded, svm_loaded, nb_loaded)
]
(
    (precision_rf, recall_rf, fscore_rf, _),
    (precision_svm, recall_svm, fscore_svm, _),
    (precision_nb, recall_nb, fscore_nb, _),
) = macro_scores

precision_scores = [precision_rf, precision_svm, precision_nb]
recall_scores = [recall_rf, recall_svm, recall_nb]
fscore_scores = [fscore_rf, fscore_svm, fscore_nb]

# One group of three bars per model.
x = np.arange(len(models))
width = 0.25

fig, ax = plt.subplots()
rects1 = ax.bar(x - width, precision_scores, width, label='Precision')
rects2 = ax.bar(x, recall_scores, width, label='Recall')
rects3 = ax.bar(x + width, fscore_scores, width, label='F1-Score')

ax.set_title('Scores by model and metric')
ax.set_ylabel('Scores')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

fig.tight_layout()
plt.show()

**#Fully Connected Neural Network**

In [9]:
# One input unit per TF-IDF feature.
input_size = reviews_tfidf.shape[1]

# Single hidden layer of 72 ReLU units feeding a 3-way softmax.
input_layer = tf.keras.Input(shape=(input_size,))
hidden_dense = tf.keras.layers.Dense(72, activation='relu')
hidden_layer = hidden_dense(input_layer)
output_dense = tf.keras.layers.Dense(3, activation='softmax')
output_layer = output_dense(hidden_layer)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

# Integer class labels, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the fully connected network on the TF-IDF training split.
model.fit(
    reviews_train,
    sentiments_train,
    epochs=50,
    batch_size=256,
    shuffle=True,
)

In [None]:
# Class probabilities for the test split; the most probable class is the prediction.
predictions = model.predict(reviews_test)
predicted_classes = predictions.argmax(axis=1)
real_labels = sentiments_test.astype(int)

print(classification_report(real_labels, predicted_classes))
print("\nAccuracy:", accuracy_score(real_labels, predicted_classes))

cm = confusion_matrix(real_labels, predicted_classes)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Per-class scores for the bar chart. The chart has one tick per class, so it
# needs one value per class; the previous code plotted the single macro
# average three times, so every class showed identical bars.
class_ids = [0, 1, 2]
classes = ["Negative", "Neutral", "Positive"]
report = classification_report(
    real_labels, predicted_classes, labels=class_ids, output_dict=True
)
precision = [report[str(class_id)]["precision"] for class_id in class_ids]
recall = [report[str(class_id)]["recall"] for class_id in class_ids]
f1_score = [report[str(class_id)]["f1-score"] for class_id in class_ids]

x = np.arange(len(classes))
width = 0.2

fig, ax = plt.subplots()
rects1 = ax.bar(x - width, precision, width, label="Precision")
rects2 = ax.bar(x, recall, width, label="Recall")
rects3 = ax.bar(x + width, f1_score, width, label="F1-score")

ax.set_ylabel("Score")
ax.set_title("Scores by  metric")
ax.set_xticks(x)
ax.set_xticklabels(classes)
ax.legend()

fig.tight_layout()
plt.show()

**#FCN With Dropout Layer**

In [None]:

# CSR copies of both splits.
reviews_train_csr = csr_matrix(reviews_train)
reviews_test_csr = csr_matrix(reviews_test)

# Shuffle features and labels together (no seed is given, so the order
# differs between runs).
reviews_train_csr, sentiments_train_shuffle = shuffle(
    reviews_train_csr, sentiments_train
)

# Two hidden ReLU layers (72 and 32 units), each followed by 50% dropout,
# then a 3-way softmax output.
n_features = reviews_train_csr.shape[1]
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(72, activation='relu', input_shape=(n_features,)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    reviews_train_csr,
    sentiments_train_shuffle,
    epochs=20,
    batch_size=256,
    shuffle=True,
)

loss, accuracy = model.evaluate(reviews_test_csr, sentiments_test)
print("Test Accuracy:", accuracy)

In [None]:
# Class probabilities for the test split; the most probable class is the prediction.
predictions = model.predict(reviews_test)
predicted_classes = predictions.argmax(axis=1)
real_labels = sentiments_test.astype(int)

print(classification_report(real_labels, predicted_classes))
print("\nAccuracy:", accuracy_score(real_labels, predicted_classes))

cm = confusion_matrix(real_labels, predicted_classes)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Per-class scores for the bar chart. The chart has one tick per class, so it
# needs one value per class; the previous code plotted the single macro
# average three times, so every class showed identical bars.
class_ids = [0, 1, 2]
classes = ["Negative", "Neutral", "Positive"]
report = classification_report(
    real_labels, predicted_classes, labels=class_ids, output_dict=True
)
precision = [report[str(class_id)]["precision"] for class_id in class_ids]
recall = [report[str(class_id)]["recall"] for class_id in class_ids]
f1_score = [report[str(class_id)]["f1-score"] for class_id in class_ids]

x = np.arange(len(classes))
width = 0.2

fig, ax = plt.subplots()
rects1 = ax.bar(x - width, precision, width, label="Precision")
rects2 = ax.bar(x, recall, width, label="Recall")
rects3 = ax.bar(x + width, f1_score, width, label="F1-score")

ax.set_ylabel("Score")
ax.set_title("Scores by  metric")
ax.set_xticks(x)
ax.set_xticklabels(classes)
ax.legend()

fig.tight_layout()
plt.show()

**#1DCNN**

In [None]:
# BEFORE STARTING CNN CODE
# DOWNLOAD glove.6B.200d.txt FROM THE FOLLOWING LINK AND PLACE IT IN THE SAME DIRECTORY AS THIS SCRIPT
# https://www.kaggle.com/datasets/rtatman/glove-global-vectors-for-word-representation?resource=download

In [8]:
def load_glove_vectors(file_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as floats into a NumPy array.
    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``IndexError``.

    Args:
        file_path: Path to a UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict mapping each word to its vector, in file order.
    """
    word_vectors = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.split()
            if not parts:
                continue
            word_vectors[parts[0]] = np.array([float(val) for val in parts[1:]])
    return word_vectors

glove_file_path = "./glove.6B.200d.txt"
glove_vectors = load_glove_vectors(glove_file_path)

# Matrix dimensions: one row per GloVe word, width taken from the first vector.
vocab_size = len(glove_vectors)
embedding_dim = len(next(iter(glove_vectors.values())))

# Row number of each word, following the dictionary's iteration order.
word_index = {word: row for row, word in enumerate(glove_vectors)}

# Copy every vector into its row of the matrix.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, row in word_index.items():
    embedding_matrix[row] = glove_vectors[word]

In [None]:
# 1D CNN on top of a frozen embedding initialised from the GloVe matrix:
# Embedding -> Conv1D(64 filters, kernel 5) -> MaxPooling1D(4) -> Flatten
# -> Dense(64, relu) -> Dense(3, softmax).
modelCNN = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            trainable=False,  # keep the pretrained vectors fixed during training
        ),
        tf.keras.layers.Conv1D(64, 5, activation="relu"),
        tf.keras.layers.MaxPooling1D(pool_size=4),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(3, activation="softmax"),
    ]
)

# Integer class labels, hence the sparse categorical loss.
modelCNN.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)


# NOTE(review): `reviews_train` / `reviews_test` are the TF-IDF matrices from
# the train/test split, and the GloVe `word_index` built next to
# `embedding_matrix` is never used here. An Embedding layer presumably expects
# integer token ids indexed like `embedding_matrix` -- confirm whether this
# cell should instead be fed padded id sequences built from `word_index`
# (the evaluation cell that calls `modelCNN.predict` would need the same input).
modelCNN.fit(
    reviews_train,
    sentiments_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
)

loss, accuracy = modelCNN.evaluate(reviews_test, sentiments_test)

print("Test Accuracy:", accuracy)

In [None]:

# Class probabilities for the test split; the most probable class is the prediction.
predictions = modelCNN.predict(reviews_test)
predicted_classes = predictions.argmax(axis=1)
real_labels = sentiments_test.astype(int)

print(classification_report(real_labels, predicted_classes))
print("\nAccuracy:", accuracy_score(real_labels, predicted_classes))

cm = confusion_matrix(real_labels, predicted_classes)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Per-class scores for the bar chart. The chart has one tick per class, so it
# needs one value per class; the previous code plotted the single macro
# average three times, so every class showed identical bars.
class_ids = [0, 1, 2]
classes = ["Negative", "Neutral", "Positive"]
report = classification_report(
    real_labels, predicted_classes, labels=class_ids, output_dict=True
)
precision = [report[str(class_id)]["precision"] for class_id in class_ids]
recall = [report[str(class_id)]["recall"] for class_id in class_ids]
f1_score = [report[str(class_id)]["f1-score"] for class_id in class_ids]

x = np.arange(len(classes))
width = 0.2

fig, ax = plt.subplots()
rects1 = ax.bar(x - width, precision, width, label="Precision")
rects2 = ax.bar(x, recall, width, label="Recall")
rects3 = ax.bar(x + width, f1_score, width, label="F1-score")

ax.set_ylabel("Score")
ax.set_title("Scores by  metric")
ax.set_xticks(x)
ax.set_xticklabels(classes)
ax.legend()

fig.tight_layout()
plt.show()

**#HAN**


In [None]:

vocab = vectorizer.get_feature_names_out()
# Word ids start at 1 so that 0 stays reserved for the padding value written
# by `pad_sequences`. Previously the first vocabulary word shared id 0 with
# the padding.
word_to_index = {word: index for index, word in enumerate(vocab, start=1)}


def _tfidf_rows_to_sequences(matrix):
    """Turn each TF-IDF row into the ascending list of ids of its non-zero terms.

    The ids are the TF-IDF column numbers shifted by one, i.e. the values of
    `word_to_index`. Rows are read straight from the CSR index arrays rather
    than being densified one by one.
    """
    rows = csr_matrix(matrix)
    sequences = []
    for start, stop in zip(rows.indptr[:-1], rows.indptr[1:]):
        columns = rows.indices[start:stop]
        present = np.sort(columns[rows.data[start:stop] > 0])
        sequences.append((present + 1).tolist())
    return sequences


train_sequence = _tfidf_rows_to_sequences(reviews_train)

# Pad every sequence to the longest training sequence.
max_sequence_length = max(len(seq) for seq in train_sequence)
train_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequence, maxlen=max_sequence_length, padding="post"
)

# Test rows are encoded the same way as training rows. The earlier code
# rebuilt a text from each row and re-vectorised it, which recovers the same
# set of terms.
test_sequences = _tfidf_rows_to_sequences(reviews_test)

test_sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences, maxlen=max_sequence_length, padding="post"
)


In [None]:
def attention_layer(inputs):
    """Weight each timestep of `inputs` by a learned softmax attention score.

    A one-unit tanh Dense layer scores every timestep, the scores are
    softmax-normalised over the sequence, and the weights are repeated across
    the feature axis and multiplied element-wise with `inputs`.

    The repeat count is read from the last dimension of `inputs` instead of
    the hard-coded ``2 * 64``, so the layer follows the encoder width.

    Args:
        inputs: Keras tensor whose last dimension is statically known
            (assumed to be (batch, timesteps, features) -- TODO confirm).

    Returns:
        Tensor with the same shape as `inputs`, scaled by the attention weights.
    """
    feature_dim = int(inputs.shape[-1])
    scores = tf.keras.layers.Dense(1, activation="tanh")(inputs)
    scores = tf.keras.layers.Flatten()(scores)
    weights = tf.keras.layers.Activation("softmax")(scores)
    # Repeat the per-timestep weight once per feature, then swap the axes back
    # so the weights line up with `inputs`.
    weights = tf.keras.layers.RepeatVector(feature_dim)(weights)
    weights = tf.keras.layers.Permute([2, 1])(weights)
    return tf.keras.layers.multiply([inputs, weights])

def create_han(max_sequence_length, max_words, embedding_dim):
    """Build the attention classifier: Embedding -> BiLSTM -> attention -> softmax.

    Args:
        max_sequence_length: Length of the padded input id sequences.
        max_words: Size of the embedding vocabulary.
        embedding_dim: Width of the learned word embeddings.

    Returns:
        An uncompiled Keras model with a 3-way softmax output.
    """
    token_ids = tf.keras.layers.Input(shape=(max_sequence_length,))

    embedded = tf.keras.layers.Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    )(token_ids)

    # Bidirectional LSTM with 64 units per direction, one output per timestep.
    encoder = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )
    encoded = encoder(embedded)

    # Attention-weighted timesteps, summed over the sequence axis.
    weighted = attention_layer(encoded)
    pooled = tf.keras.layers.Lambda(
        lambda steps: tf.keras.backend.sum(steps, axis=1), output_shape=(128,)
    )(weighted)

    class_probs = tf.keras.layers.Dense(3, activation="softmax")(pooled)

    return tf.keras.models.Model(inputs=token_ids, outputs=class_probs)

# Model dimensions taken from the padded training sequences and the vocabulary.
max_sequence_length = train_sequences_padded.shape[1]
max_words = len(word_to_index) + 1
# Note: this rebinds `embedding_dim`, which the GloVe cell also sets.
embedding_dim = 100

modelHAN = create_han(max_sequence_length, max_words, embedding_dim)

# One-hot targets below, hence the (non-sparse) categorical loss.
modelHAN.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

sentiments_train_onehot = tf.keras.utils.to_categorical(
    sentiments_train, num_classes=3
)
sentiments_test_onehot = tf.keras.utils.to_categorical(
    sentiments_test, num_classes=3
)

# The test split is passed as validation data during training.
validation_pair = (test_sequences_padded, sentiments_test_onehot)
modelHAN.fit(
    train_sequences_padded,
    sentiments_train_onehot,
    epochs=10,
    batch_size=64,
    validation_data=validation_pair,
)

modelHAN.summary()

In [None]:
# Class probabilities for the test sequences; the most probable class is the prediction.
predictions = modelHAN.predict(test_sequences_padded)
predicted_classes = predictions.argmax(axis=1)
real_labels = sentiments_test.astype(int)

print(classification_report(real_labels, predicted_classes))
print("\nAccuracy:", accuracy_score(real_labels, predicted_classes))

cm = confusion_matrix(real_labels, predicted_classes)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Per-class scores for the bar chart. The chart has one tick per class, so it
# needs one value per class; the previous code plotted the single macro
# average three times, so every class showed identical bars.
class_ids = [0, 1, 2]
classes = ["Negative", "Neutral", "Positive"]
report = classification_report(
    real_labels, predicted_classes, labels=class_ids, output_dict=True
)
precision = [report[str(class_id)]["precision"] for class_id in class_ids]
recall = [report[str(class_id)]["recall"] for class_id in class_ids]
f1_score = [report[str(class_id)]["f1-score"] for class_id in class_ids]

x = np.arange(len(classes))
width = 0.2

fig, ax = plt.subplots()
rects1 = ax.bar(x - width, precision, width, label="Precision")
rects2 = ax.bar(x, recall, width, label="Recall")
rects3 = ax.bar(x + width, f1_score, width, label="F1-score")
ax.set_ylabel("Score")
ax.set_title("Scores by  metric")
ax.set_xticks(x)
ax.set_xticklabels(classes)
ax.legend()

fig.tight_layout()
plt.show()

**#DISTIL BERT**

In [None]:
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer

In [23]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Take the vocabulary from the fitted vectorizer here, so this cell does not
# depend on the HAN cell having been run first.
vocab = vectorizer.get_feature_names_out()


def _tfidf_rows_to_text(matrix):
    """Rebuild one space-joined string per TF-IDF row from its non-zero terms.

    Terms appear in ascending column order, as in the previous dense
    enumeration, but rows are read from the CSR index arrays instead of being
    converted to dense vectors one at a time.
    """
    rows = csr_matrix(matrix)
    texts = []
    for start, stop in zip(rows.indptr[:-1], rows.indptr[1:]):
        columns = rows.indices[start:stop]
        present = np.sort(columns[rows.data[start:stop] > 0])
        texts.append(" ".join(vocab[present]))
    return texts


train_texts = _tfidf_rows_to_text(reviews_train)
test_texts = _tfidf_rows_to_text(reviews_test)

# Encode to token ids, truncated to at most 512 ids per text.
train_tokenizer = [
    tokenizer.encode(review, max_length=512, truncation=True) for review in train_texts
]
test_tokenizer = [
    tokenizer.encode(review, max_length=512, truncation=True) for review in test_texts
]

# Pad both splits to the longest encoded sequence.
# NOTE(review): no attention mask is built, so padded positions are passed to
# the model like ordinary tokens -- confirm this is acceptable.
max_seq_length = max(len(seq) for seq in train_tokenizer + test_tokenizer)
train_tokenizer_padded = tf.keras.preprocessing.sequence.pad_sequences(
    train_tokenizer, maxlen=max_seq_length, padding='post'
)
test_tokenizer_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_tokenizer, maxlen=max_seq_length, padding='post'
)

train_tokenizer_padded = tf.convert_to_tensor(train_tokenizer_padded)
test_tokenizer_padded = tf.convert_to_tensor(test_tokenizer_padded)

In [None]:
# Pretrained DistilBERT with a fresh 3-class classification head.
modelDB = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3
)

# The classification head outputs raw logits, so the loss must be built with
# from_logits=True; the string alias "sparse_categorical_crossentropy" assumes
# probabilities. A small learning rate replaces Adam's default, which is
# generally too large for fine-tuning a pretrained transformer.
modelDB.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

modelDB.fit(
    train_tokenizer_padded,
    sentiments_train,
    epochs=1,
    batch_size=32,
    validation_split=0.1,
)

modelDB.summary()

In [None]:
predictions = modelDB.predict(test_tokenizer_padded)

# The transformers model returns an output mapping whose "logits" entry holds
# the class scores, not a bare array; take the scores before the argmax.
logits = predictions if isinstance(predictions, np.ndarray) else predictions["logits"]
predicted_classes = np.asarray(logits).argmax(axis=1)
real_labels = sentiments_test.astype(int)

print(classification_report(real_labels, predicted_classes))
print("\nAccuracy:", accuracy_score(real_labels, predicted_classes))

cm = confusion_matrix(real_labels, predicted_classes)
sns.heatmap(cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Per-class scores for the bar chart. The chart has one tick per class, so it
# needs one value per class; the previous code plotted the single macro
# average three times, so every class showed identical bars.
class_ids = [0, 1, 2]
classes = ["Negative", "Neutral", "Positive"]
report = classification_report(
    real_labels, predicted_classes, labels=class_ids, output_dict=True
)
precision = [report[str(class_id)]["precision"] for class_id in class_ids]
recall = [report[str(class_id)]["recall"] for class_id in class_ids]
f1_score = [report[str(class_id)]["f1-score"] for class_id in class_ids]

x = np.arange(len(classes))
width = 0.2

fig, ax = plt.subplots()
rects1 = ax.bar(x - width, precision, width, label="Precision")
rects2 = ax.bar(x, recall, width, label="Recall")
rects3 = ax.bar(x + width, f1_score, width, label="F1-score")
ax.set_ylabel("Score")
ax.set_title("Scores by  metric")
ax.set_xticks(x)
ax.set_xticklabels(classes)
ax.legend()

fig.tight_layout()
plt.show()