In [1]:
import numpy as np
import pandas as pd

import time
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import en_nlp_utils
import model_evaluation_utils

In [2]:
# Folder that holds the input data files
SRC_PATH = "src/"

# Remember when the run began so the elapsed time can be reported later
start_time = datetime.now()
print(f"Text processing started at {start_time}")

Text processing started at 2024-06-23 22:39:27.459417


In [3]:
# Read the hotel review data set from the source folder
review_csv = SRC_PATH + "en_hotel_review.csv"
df_review = pd.read_csv(review_csv)

In [4]:
# Print the NULL summary for the loaded frame (per-column counts, see output)
en_nlp_utils.check_null(df_review, "df_review")

# Keep only the rows whose "review_cleaned_v1" is present, then renumber from 0
has_cleaned_text = df_review["review_cleaned_v1"].notna()
df_review = df_review[has_cleaned_text].reset_index(drop=True)

[1mdf_review:[0m
source                     0
hotel_id                   0
hotel_name                 0
country                    0
group_name                 0
room_type                  0
stay_length                0
stay_date                  0
review_score               0
review_score_category      0
sentiment                  0
review_date                0
review_title               1
review                     0
review_cleaned_v1        143
dtype: int64
[1mTotal rows in df_review:[0m 39974 



# Using Full Dataset

## 1) Data set partitioning

In [5]:
# Features are the cleaned review text; labels are the sentiment classes
X, y = df_review["review_cleaned_v1"], df_review["sentiment"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [6]:
from collections import Counter

# Per-class row counts for the training and the test split
trd = dict(Counter(y_train))
tsd = dict(Counter(y_test))

# Walk the labels of BOTH splits and default a missing count to 0, so a class
# that is absent from one split is shown as 0 instead of raising KeyError.
labels = list(trd) + [label for label in tsd if label not in trd]
pd.DataFrame(
    [[label, trd.get(label, 0), tsd.get(label, 0)] for label in labels],
    columns=["Target Label", "Train Count", "Test Count"],
).sort_values(by=["Train Count", "Test Count"], ascending=False)

Unnamed: 0,Target Label,Train Count,Test Count
1,positive,25316,6360
0,negative,6548,1607


## 2) Feature extraction

### a) Bag of Words

In [7]:
# Bag of Words: the vocabulary is learned from the training text only and then
# reused to encode the test text
bow_vectorizer = CountVectorizer()
X_train_bow, X_test_bow = (
    bow_vectorizer.fit_transform(X_train),
    bow_vectorizer.transform(X_test),
)

### b) TF-IDF

In [8]:
# TF-IDF: fitted on the training text only, then applied to the test text
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

## 3) Model training and evaluation

### a) Naive Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Bag of Words features: fit, predict the held-out split, report accuracy
nb_bow = MultinomialNB().fit(X_train_bow, y_train)
y_pred_bow_nb = nb_bow.predict(X_test_bow)
bow_nb_accuracy = accuracy_score(y_test, y_pred_bow_nb)
print("Naive Bayes with BoW accuracy: ", bow_nb_accuracy)

# TF-IDF features: same procedure with a fresh classifier
nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf_nb = nb_tfidf.predict(X_test_tfidf)
tfidf_nb_accuracy = accuracy_score(y_test, y_pred_tfidf_nb)
print("Naive Bayes with TF-IDF accuracy: ", tfidf_nb_accuracy)

Naive Bayes with BoW accuracy:  0.8223923685201456
Naive Bayes with TF-IDF accuracy:  0.8260323835822769


In [10]:
# Display classification report
# Sort the labels so the label order handed to the report helper is the same
# on every run; iterating a bare set of strings can change order between
# kernel sessions because string hashing is randomised per process.
unique_classes = sorted(set(y_test))

# The headings name the classification report that follows (the plain accuracy
# figures are printed by the training cell).
print("Naive Bayes with BoW classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_bow_nb, classes=unique_classes)

print("Naive Bayes with TF-IDF classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_tfidf_nb, classes=unique_classes)

Naive Bayes with BoW accuracy:
              precision    recall  f1-score   support

    negative       0.56      0.54      0.55      1607
    positive       0.88      0.89      0.89      6360

    accuracy                           0.82      7967
   macro avg       0.72      0.72      0.72      7967
weighted avg       0.82      0.82      0.82      7967

Naive Bayes with TF-IDF accuracy:
              precision    recall  f1-score   support

    negative       0.76      0.20      0.32      1607
    positive       0.83      0.98      0.90      6360

    accuracy                           0.83      7967
   macro avg       0.79      0.59      0.61      7967
weighted avg       0.82      0.83      0.78      7967



### b) SVM

In [11]:
from sklearn.svm import LinearSVC

# Bag of Words features: this model uses C=0.1 and up to 5000 iterations
svm_bow = LinearSVC(C=0.1, max_iter=5000).fit(X_train_bow, y_train)
y_pred_bow_svm = svm_bow.predict(X_test_bow)
bow_svm_accuracy = accuracy_score(y_test, y_pred_bow_svm)
print("SVM with BoW accuracy: ", bow_svm_accuracy)

# TF-IDF features: LinearSVC with its default settings
svm_tfidf = LinearSVC().fit(X_train_tfidf, y_train)
y_pred_tfidf_svm = svm_tfidf.predict(X_test_tfidf)
tfidf_svm_accuracy = accuracy_score(y_test, y_pred_tfidf_svm)
print("SVM with TF-IDF accuracy: ", tfidf_svm_accuracy)

SVM with BoW accuracy:  0.8315551650558554
SVM with TF-IDF accuracy:  0.8393372662231706


In [12]:
# Display classification report
# The headings name the classification report that follows (the plain accuracy
# figures are printed by the training cell).
print("SVM with BoW classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_bow_svm, classes=unique_classes)

print("SVM with TF-IDF classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_tfidf_svm, classes=unique_classes)

SVM with BoW accuracy:
              precision    recall  f1-score   support

    negative       0.62      0.42      0.50      1607
    positive       0.87      0.93      0.90      6360

    accuracy                           0.83      7967
   macro avg       0.74      0.68      0.70      7967
weighted avg       0.82      0.83      0.82      7967

SVM with TF-IDF accuracy:
              precision    recall  f1-score   support

    negative       0.63      0.48      0.55      1607
    positive       0.88      0.93      0.90      6360

    accuracy                           0.84      7967
   macro avg       0.76      0.71      0.72      7967
weighted avg       0.83      0.84      0.83      7967



### c) Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Bag of Words features: fit, predict the held-out split, report accuracy
lr_bow = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)
y_pred_bow_lr = lr_bow.predict(X_test_bow)
bow_lr_accuracy = accuracy_score(y_test, y_pred_bow_lr)
print("Logistic Regression with BoW accuracy: ", bow_lr_accuracy)

# TF-IDF features: same procedure with a fresh classifier
lr_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_tfidf_lr = lr_tfidf.predict(X_test_tfidf)
tfidf_lr_accuracy = accuracy_score(y_test, y_pred_tfidf_lr)
print("Logistic Regression with TF-IDF accuracy: ", tfidf_lr_accuracy)

Logistic Regression with BoW accuracy:  0.8349441445964604
Logistic Regression with TF-IDF accuracy:  0.844860047696749


In [14]:
# Display classification report
# The headings name the classification report that follows (the plain accuracy
# figures are printed by the training cell).
print("Logistic Regression with BoW classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_bow_lr, classes=unique_classes)

print("Logistic Regression with TF-IDF classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_tfidf_lr, classes=unique_classes)

Logistic Regression with BoW accuracy:
              precision    recall  f1-score   support

    negative       0.63      0.45      0.52      1607
    positive       0.87      0.93      0.90      6360

    accuracy                           0.83      7967
   macro avg       0.75      0.69      0.71      7967
weighted avg       0.82      0.83      0.82      7967

Logistic Regression with TF-IDF accuracy:
              precision    recall  f1-score   support

    negative       0.67      0.45      0.54      1607
    positive       0.87      0.94      0.91      6360

    accuracy                           0.84      7967
   macro avg       0.77      0.70      0.72      7967
weighted avg       0.83      0.84      0.83      7967



### d) LSTM

In [15]:
import tensorflow.compat.v1 as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed
from sklearn.model_selection import train_test_split

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, batch shuffling and dropout are repeatable across runs
# (bit-exact results can still depend on the hardware / op determinism).
set_random_seed(42)

# The vocabulary size is shared by the tokenizer and the embedding layer:
# the Tokenizer only emits word indices below num_words, so the embedding
# table needs exactly that many rows. Keeping one constant keeps them in sync.
VOCAB_SIZE = 5000
EMBEDDING_DIM = 128

# Converts the label to numeric form
label_encoder = LabelEncoder()
y2 = label_encoder.fit_transform(y) # Convert "negative" to 0, and "positive" to 1

# Data set partitioning (same test_size and seed as the split used by the
# classical models above)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42)

# Tokenizer: the word index is learned from the training text only
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train2)
X_train_seq = tokenizer.texts_to_sequences(X_train2)
X_test_seq = tokenizer.texts_to_sequences(X_test2)

# Padding sequences: every review becomes exactly max_sequence_length ids
max_sequence_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# LSTM model: embedding -> dropout -> LSTM -> single sigmoid unit
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer=Adam(learning_rate=1e-3), loss="binary_crossentropy", metrics=["accuracy"])

# Train LSTM model
# NOTE(review): the test split doubles as validation data here. That is only
# safe while nothing (epoch count, early stopping, checkpoints) is chosen from
# val_* metrics; switch to validation_split if such tuning is added.
model.fit(X_train_pad, y_train2, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test2), verbose=2)

# Evaluate model
loss, accuracy = model.evaluate(X_test_pad, y_test2, verbose=2)
print("LSTM accuracy: ", accuracy)

Epoch 1/5
498/498 - 41s - 82ms/step - accuracy: 0.8251 - loss: 0.3822 - val_accuracy: 0.8450 - val_loss: 0.3392
Epoch 2/5
498/498 - 43s - 87ms/step - accuracy: 0.8584 - loss: 0.3155 - val_accuracy: 0.8436 - val_loss: 0.3326
Epoch 3/5
498/498 - 45s - 90ms/step - accuracy: 0.8716 - loss: 0.2953 - val_accuracy: 0.8474 - val_loss: 0.3363
Epoch 4/5
498/498 - 45s - 89ms/step - accuracy: 0.8821 - loss: 0.2743 - val_accuracy: 0.8436 - val_loss: 0.3433
Epoch 5/5
498/498 - 44s - 89ms/step - accuracy: 0.8930 - loss: 0.2553 - val_accuracy: 0.8441 - val_loss: 0.3606
249/249 - 4s - 16ms/step - accuracy: 0.8441 - loss: 0.3606
LSTM accuracy:  0.844106912612915


In [16]:
# Report when this stage finished and how long it took since start_time
end_time = datetime.now()
elapsed = end_time - start_time
print(f"Text processing ended at {end_time}")
print(f"Text processing spent {elapsed}")

Text processing ended at 2024-06-23 22:43:18.703596
Text processing spent 0:03:51.244179


# Applying Stratified Sampling

In [17]:
# Parse "stay_date" (day/month/year text) into a datetime column.
# Only "stay_date" is converted here; "review_date" is left as loaded.
df_review["stay_date"] = pd.to_datetime(df_review["stay_date"], format="%d/%m/%Y")

# Number of reviews for every (sentiment, stay year) combination
year_sentiment_keys = [df_review["sentiment"], df_review["stay_date"].dt.year]
sentiment_count = df_review.groupby(year_sentiment_keys).size().reset_index(name="count")

# Reshape to one row per year and one column per sentiment, with "Total" margins
sentiment_count2 = sentiment_count.pivot_table(
    index="stay_date",
    columns="sentiment",
    values="count",
    aggfunc="sum",
    margins=True,
    margins_name="Total",
)
sentiment_count2.columns.name = None
sentiment_count2

Unnamed: 0_level_0,negative,positive,Total
stay_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022,1691,5560,7251
2023,5364,21375,26739
2024,1100,4741,5841
Total,8155,31676,39831


## 1) Stratified sampling

In [18]:
# Function for stratified sampling
def stratified_sample(df, stratify_col, frac, random_state=42):
    """Draw the same fraction of rows from every group of ``stratify_col``.

    Uses ``GroupBy.sample`` rather than ``groupby(...).apply(...)``: it keeps
    every column of ``df`` (including the grouping column) and avoids the
    pandas deprecation around ``apply`` operating on grouping columns.

    Args:
        df: DataFrame to sample from.
        stratify_col: Column (or list of columns) whose groups are sampled
            independently.
        frac: Fraction of each group to keep.
        random_state: Seed forwarded to the sampler. Defaults to 42, the seed
            the train/test splits in this notebook use, so re-running the
            notebook reproduces the same sample. Pass ``None`` for a fresh
            random draw on every call.

    Returns:
        DataFrame holding the sampled rows with their original index labels.
    """
    return df.groupby(stratify_col).sample(frac=frac, random_state=random_state)

# Sample each stay year separately, stratified by sentiment within the year
frac = 0.30  # Sampling ratio
stay_year = df_review["stay_date"].dt.year
sampled_2022, sampled_2023, sampled_2024 = (
    stratified_sample(df_review[stay_year == year], "sentiment", frac)
    for year in (2022, 2023, 2024)
)

# Stack the per-year samples into a single frame
sampled_data = pd.concat([sampled_2022, sampled_2023, sampled_2024])

In [19]:
# Number of sampled reviews for every (sentiment, stay year) combination
sampled_keys = [sampled_data["sentiment"], sampled_data["stay_date"].dt.year]
sampled_sentiment_count = sampled_data.groupby(sampled_keys).size().reset_index(name="count")

# Reshape to one row per year and one column per sentiment, with "Total" margins
sampled_sentiment_count2 = sampled_sentiment_count.pivot_table(
    index="stay_date",
    columns="sentiment",
    values="count",
    aggfunc="sum",
    margins=True,
    margins_name="Total",
)
sampled_sentiment_count2.columns.name = None
sampled_sentiment_count2

Unnamed: 0_level_0,negative,positive,Total
stay_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022,507,1668,2175
2023,1609,6412,8021
2024,330,1422,1752
Total,2446,9502,11948


## 2) Data set partitioning

In [20]:
# Features and labels now come from the stratified sample
# (this rebinds X, y and the split variables used for the full data set)
X, y = sampled_data["review_cleaned_v1"], sampled_data["sentiment"]

# Hold out 20% of the sampled rows for testing; fixed seed for repeatability
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [21]:
# Per-class row counts for the training and the test split
trd = dict(Counter(y_train))
tsd = dict(Counter(y_test))

# Walk the labels of BOTH splits and default a missing count to 0, so a class
# that is absent from one split is shown as 0 instead of raising KeyError.
labels = list(trd) + [label for label in tsd if label not in trd]
pd.DataFrame(
    [[label, trd.get(label, 0), tsd.get(label, 0)] for label in labels],
    columns=["Target Label", "Train Count", "Test Count"],
).sort_values(by=["Train Count", "Test Count"], ascending=False)

Unnamed: 0,Target Label,Train Count,Test Count
0,positive,7616,1886
1,negative,1942,504


## 3) Feature extraction

### a) Bag of Words

In [22]:
# Bag of Words on the sampled data: vocabulary learned from the training text
# only, then reused to encode the test text
bow_vectorizer = CountVectorizer()
X_train_bow, X_test_bow = (
    bow_vectorizer.fit_transform(X_train),
    bow_vectorizer.transform(X_test),
)

### b) TF-IDF

In [23]:
# TF-IDF on the sampled data: fitted on the training text only, then applied
# to the test text
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

## 4) Model training and evaluation

### a) Naive Bayes

In [24]:
# Bag of Words features: fit, predict the held-out split, report accuracy
nb_bow = MultinomialNB().fit(X_train_bow, y_train)
y_pred_bow_nb = nb_bow.predict(X_test_bow)
bow_nb_accuracy = accuracy_score(y_test, y_pred_bow_nb)
print("Naive Bayes with BoW accuracy: ", bow_nb_accuracy)

# TF-IDF features: same procedure with a fresh classifier
nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf_nb = nb_tfidf.predict(X_test_tfidf)
tfidf_nb_accuracy = accuracy_score(y_test, y_pred_tfidf_nb)
print("Naive Bayes with TF-IDF accuracy: ", tfidf_nb_accuracy)

Naive Bayes with BoW accuracy:  0.8317991631799163
Naive Bayes with TF-IDF accuracy:  0.8108786610878661


In [25]:
# Display classification report
# Sort the labels so the label order handed to the report helper is the same
# on every run; iterating a bare set of strings can change order between
# kernel sessions because string hashing is randomised per process.
unique_classes = sorted(set(y_test))

# The headings name the classification report that follows (the plain accuracy
# figures are printed by the training cell).
print("Naive Bayes with BoW classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_bow_nb, classes=unique_classes)

print("Naive Bayes with TF-IDF classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_tfidf_nb, classes=unique_classes)

Naive Bayes with BoW accuracy:
              precision    recall  f1-score   support

    negative       0.63      0.48      0.55       504
    positive       0.87      0.93      0.90      1886

    accuracy                           0.83      2390
   macro avg       0.75      0.70      0.72      2390
weighted avg       0.82      0.83      0.82      2390

Naive Bayes with TF-IDF accuracy:
              precision    recall  f1-score   support

    negative       0.87      0.12      0.21       504
    positive       0.81      1.00      0.89      1886

    accuracy                           0.81      2390
   macro avg       0.84      0.56      0.55      2390
weighted avg       0.82      0.81      0.75      2390



### b) SVM

In [26]:
# Bag of Words features: this model uses C=0.1 and up to 5000 iterations
svm_bow = LinearSVC(C=0.1, max_iter=5000).fit(X_train_bow, y_train)
y_pred_bow_svm = svm_bow.predict(X_test_bow)
bow_svm_accuracy = accuracy_score(y_test, y_pred_bow_svm)
print("SVM with BoW accuracy: ", bow_svm_accuracy)

# TF-IDF features: LinearSVC with its default settings
svm_tfidf = LinearSVC().fit(X_train_tfidf, y_train)
y_pred_tfidf_svm = svm_tfidf.predict(X_test_tfidf)
tfidf_svm_accuracy = accuracy_score(y_test, y_pred_tfidf_svm)
print("SVM with TF-IDF accuracy: ", tfidf_svm_accuracy)

SVM with BoW accuracy:  0.8271966527196652
SVM with TF-IDF accuracy:  0.8309623430962343


In [27]:
# Display classification report
# The headings name the classification report that follows (the plain accuracy
# figures are printed by the training cell).
print("SVM with BoW classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_bow_svm, classes=unique_classes)

print("SVM with TF-IDF classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_tfidf_svm, classes=unique_classes)

SVM with BoW accuracy:
              precision    recall  f1-score   support

    negative       0.64      0.42      0.51       504
    positive       0.86      0.94      0.90      1886

    accuracy                           0.83      2390
   macro avg       0.75      0.68      0.70      2390
weighted avg       0.81      0.83      0.81      2390

SVM with TF-IDF accuracy:
              precision    recall  f1-score   support

    negative       0.64      0.46      0.53       504
    positive       0.87      0.93      0.90      1886

    accuracy                           0.83      2390
   macro avg       0.75      0.69      0.71      2390
weighted avg       0.82      0.83      0.82      2390



### c) Logistic Regression

In [28]:
# Bag of Words features: fit, predict the held-out split, report accuracy
lr_bow = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)
y_pred_bow_lr = lr_bow.predict(X_test_bow)
bow_lr_accuracy = accuracy_score(y_test, y_pred_bow_lr)
print("Logistic Regression with BoW accuracy: ", bow_lr_accuracy)

# TF-IDF features: same procedure with a fresh classifier
lr_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_tfidf_lr = lr_tfidf.predict(X_test_tfidf)
tfidf_lr_accuracy = accuracy_score(y_test, y_pred_tfidf_lr)
print("Logistic Regression with TF-IDF accuracy: ", tfidf_lr_accuracy)

Logistic Regression with BoW accuracy:  0.8288702928870293
Logistic Regression with TF-IDF accuracy:  0.8301255230125523


In [29]:
# Display classification report
# The headings name the classification report that follows (the plain accuracy
# figures are printed by the training cell).
print("Logistic Regression with BoW classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_bow_lr, classes=unique_classes)

print("Logistic Regression with TF-IDF classification report:")
model_evaluation_utils.display_classification_report(true_labels=y_test, predicted_labels=y_pred_tfidf_lr, classes=unique_classes)

Logistic Regression with BoW accuracy:
              precision    recall  f1-score   support

    negative       0.64      0.43      0.52       504
    positive       0.86      0.93      0.90      1886

    accuracy                           0.83      2390
   macro avg       0.75      0.68      0.71      2390
weighted avg       0.81      0.83      0.82      2390

Logistic Regression with TF-IDF accuracy:
              precision    recall  f1-score   support

    negative       0.70      0.34      0.46       504
    positive       0.84      0.96      0.90      1886

    accuracy                           0.83      2390
   macro avg       0.77      0.65      0.68      2390
weighted avg       0.81      0.83      0.81      2390



### d) LSTM

In [30]:
import tensorflow.compat.v1 as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed
from sklearn.model_selection import train_test_split

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, batch shuffling and dropout are repeatable across runs
# (bit-exact results can still depend on the hardware / op determinism).
set_random_seed(42)

# The vocabulary size is shared by the tokenizer and the embedding layer:
# the Tokenizer only emits word indices below num_words, so the embedding
# table needs exactly that many rows. Keeping one constant keeps them in sync.
VOCAB_SIZE = 5000
EMBEDDING_DIM = 128

# Converts the label to numeric form
label_encoder = LabelEncoder()
y2 = label_encoder.fit_transform(y) # Convert "negative" to 0, and "positive" to 1

# Data set partitioning (same test_size and seed as the split used by the
# classical models on the sampled data)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42)

# Tokenizer: the word index is learned from the training text only
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train2)
X_train_seq = tokenizer.texts_to_sequences(X_train2)
X_test_seq = tokenizer.texts_to_sequences(X_test2)

# Padding sequences: every review becomes exactly max_sequence_length ids
max_sequence_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# LSTM model: embedding -> dropout -> LSTM -> single sigmoid unit
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer=Adam(learning_rate=1e-3), loss="binary_crossentropy", metrics=["accuracy"])

# Train LSTM model
# NOTE(review): the test split doubles as validation data here. That is only
# safe while nothing (epoch count, early stopping, checkpoints) is chosen from
# val_* metrics; switch to validation_split if such tuning is added.
model.fit(X_train_pad, y_train2, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test2), verbose=2)

# Evaluate model
loss, accuracy = model.evaluate(X_test_pad, y_test2, verbose=2)
print("LSTM accuracy: ", accuracy)

Epoch 1/5
150/150 - 16s - 108ms/step - accuracy: 0.8002 - loss: 0.4516 - val_accuracy: 0.8209 - val_loss: 0.3791
Epoch 2/5
150/150 - 14s - 91ms/step - accuracy: 0.8487 - loss: 0.3435 - val_accuracy: 0.8343 - val_loss: 0.3528
Epoch 3/5
150/150 - 15s - 97ms/step - accuracy: 0.8732 - loss: 0.3003 - val_accuracy: 0.8402 - val_loss: 0.3629
Epoch 4/5
150/150 - 14s - 92ms/step - accuracy: 0.8876 - loss: 0.2695 - val_accuracy: 0.8331 - val_loss: 0.3905
Epoch 5/5
150/150 - 14s - 92ms/step - accuracy: 0.9009 - loss: 0.2469 - val_accuracy: 0.8301 - val_loss: 0.4139
75/75 - 1s - 16ms/step - accuracy: 0.8301 - loss: 0.4139
LSTM accuracy:  0.8301255106925964


In [32]:
# Report the finish time; the duration is measured from start_time, so it
# covers everything run since that variable was set
end_time = datetime.now()
elapsed = end_time - start_time
print(f"Text processing ended at {end_time}")
print(f"Text processing spent {elapsed}")

Text processing ended at 2024-06-23 22:44:41.650135
Text processing spent 0:05:14.190718
