<h1 style="font-family: Impact, Charcoal, sans-serif; padding: 12px; font-size: 30px; color: #8b4513; text-align: center; line-height: 1.25;">Sentiment Analysis<br><span style="color: #800000; font-size: 48px"><b>TripAdvisor Hotel Reviews</b></span><br><span style="color: #a0522d; font-size: 20px">Using Sklearn and Tensorflow</span></h1>

<div style="display: flex; justify-content: center;">
  <img src="assets/review.png" alt="Hotel Review" style="width:200px;">
</div>

<p>Data source: <a href="https://www.kaggle.com/datasets/thedevastator/tripadvisor-hotel-reviews">TripAdvisor Hotel Reviews</a></p>


<h1><b><span style="color: #8b4513; font-size: 28px">TABLE OF CONTENTS</span></b></h1>

* [Importing Libraries](#1)
* [Loading Dataset](#2)
* [Text Preprocessing](#3)
    * [Clean Text](#3.1)
* [Data Visualization](#4)
* [Building Model with Sklearn](#5)
    * [Make Predictions](#5.1)
    * [Prediction Interpretability using SHAP Values](#5.2)
* [Building Model with Tensorflow](#6)
* [Prediction](#7)

<a id="1"></a>
<h1><b><span style="color: #8b4513; font-size: 28px">Importing Libraries</span></b></h1>


In [1]:
import numpy as np # for array, linear algebra
import pandas as pd # for data processing
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string

# Preprocessing and evaluation
import nltk
from nltk.stem import WordNetLemmatizer # lemmatize a word
from nltk.corpus import stopwords
from nltk.corpus import wordnet # large lexical database of English words
from wordcloud import WordCloud # to visualize text
from sklearn.feature_extraction.text import TfidfVectorizer # converts a collection of raw documents into a matrix

# NLTK resources used by clean_text: 'punkt' for the tokenizers, 'stopwords'
# for stopwords.words('english') and 'wordnet' for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# Models
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
#import optuna
import shap


ModuleNotFoundError: No module named 'numpy'

<a id="2"></a>
<h1><b><span style="color: #8b4513; font-size: 28px">Loading Dataset</span></b></h1>

In [None]:
# Read the raw reviews; the file is decoded as ISO-8859-1.
df = pd.read_csv('data/trip_advisor_reviews.csv', encoding = 'ISO-8859-1')
# Lower-case the column names; later cells refer to 'rating', 'review' and 's.no.'.
df.columns = df.columns.str.lower()
df.head()

In [None]:
# Column dtypes and non-null counts before sub-sampling.
df.info()

In [None]:
# Keep a reproducible 30% random sample of the rows (drawn without replacement).
# `df` is overwritten, so re-running this cell alone shrinks the frame again.
df = df.sample(frac = 0.3, replace = False, random_state=42)

In [None]:
# Same summary after sub-sampling, to confirm the reduced row count.
df.info()

In [None]:
# The 's.no.' column only repeats the row index, so it is dropped.
df = df.drop(['s.no.'], axis = 1)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

<a id="3"></a>
<h1><b><span style="color: #8b4513; font-size: 28px">Text Preprocessing</span></b></h1>
- For this dataset, we categorize all 5-star and 4-star reviews as Good, 3-star reviews as Neutral, and all 2-star and 1-star reviews as Bad.

In [None]:
def rating(score):
    """Map a numeric star rating to a sentiment label.

    Scores of 4 or more are 'good', a score of exactly 3 is 'neutral',
    and every other value is 'bad'.
    """
    if score >= 4:
        return 'good'
    return 'neutral' if score == 3 else 'bad'

In [None]:
# Derive the sentiment label ('good' / 'neutral' / 'bad') from the star rating.
df['score'] = df['rating'].apply(rating)

In [None]:
def count_chars(text: str) -> int:
    """Return the number of characters in ``text``, whitespace included."""
    n_chars = len(text)
    return n_chars


def count_words(text: str) -> int:
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

In [None]:
# 'review_length' is the character count, 'words_count' the whitespace-token count.
df['review_length'] = df['review'].apply(count_chars)
df['words_count'] = df['review'].apply(count_words)

In [None]:
# Preview the frame with the new 'score', 'review_length' and 'words_count' columns.
df.head()

In [None]:
# 'review_length' holds per-review character counts, so this total is in
# characters (the message previously called it "words").
length = df['review_length'].sum()
print(f'Total characters in the dataset before cleaning: {length}')

<a id="3.1"></a>
<h1 style="font-family: Trebuchet MS; font-size: 20px; color: #b47238; text-align: left; "><b>Clean up the text</b></h1>

<ul>
  <li>Remove 'empty' reviews and words with only 1 letter</li>
  <li>Lowercase all text</li>
  <li>Tokenize and split text into words</li>
  <li>Remove stop words ('a', 'an', 'the', 'of', 'in', etc.)</li>
  <li>Lemmatize the text: transform every word into its root form</li>
</ul>


In [None]:
def clean_text(text):
    """Normalise a raw review into a space-separated string of lemmatised tokens.

    Steps: remove tab characters, lower-case, split into sentences and then
    word tokens with NLTK, discard tokens that contain a digit, strip leading
    and trailing punctuation, drop empty tokens and English stop words, and
    lemmatise every remaining token as a verb.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens of all sentences joined by single spaces.
    """
    text = text.replace('\t', '').lower()

    # Build the stop-word set once per call. The previous version called
    # stopwords.words('english') for every token, re-reading the list each time
    # and doing a linear list lookup.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_sentences = []
    for sentence in nltk.sent_tokenize(text):
        # Drop tokens containing digits, then strip surrounding punctuation
        stripped = (
            word.strip(string.punctuation)
            for word in nltk.word_tokenize(sentence)
            if not any(c.isdigit() for c in word)
        )

        # Remove tokens that became empty and English stop words
        kept = [token for token in stripped if token and token not in stop_words]

        # 'v' asks WordNet to lemmatise each token as a verb
        cleaned_sentences.append(
            ' '.join(lemmatizer.lemmatize(token, 'v') for token in kept)
        )

    return ' '.join(cleaned_sentences)

In [None]:
# Clean every review; this runs NLTK tokenisation and lemmatisation per row.
df['clean_review'] = df['review'].apply(clean_text)

In [None]:
# len() of a cleaned review is its character count, the same unit as `length`
# (sum of 'review_length'), so both totals are reported as characters.
new_length = df['clean_review'].apply(len).sum()

print(f'Total characters in the dataset before cleaning: {length}')
print(f'Total characters in the dataset after cleaning: {new_length}')

In [None]:
# Preview the frame including the new 'clean_review' column.
df.head()

In [None]:
# Persist the cleaned frame (without the index) to the working directory.
df.to_csv('df_cleaned.csv', index=False)

<a id="4"></a>
<h1><b><span style="color: #8b4513; font-size: 28px">Data Visualization</span></b></h1>

In [None]:
# Share of reviews per star rating, ordered from 1 to 5 stars
rating_proportions = df['rating'].value_counts(normalize=True).sort_index()

# Chart title styling, centred horizontally
title_style = {
    'text': '<b>Hotel Review Rating Proportions</b>',
    'font': {'size': 24, 'color': '#8b4513'},
    'x': 0.5,
    'xanchor': 'center',
}

fig = px.bar(
    rating_proportions,
    x=rating_proportions.index,
    y=rating_proportions.values,
    hover_data=[rating_proportions.values],
    color=rating_proportions.index,
    height=400,
)

# Set the axis labels, the title and the background colour
fig.update_layout(
    xaxis_title='Rating',
    yaxis_title='Proportion',
    title=title_style,
    plot_bgcolor='white',
)

fig.update_traces(marker_coloraxis=None)

fig.show()


<p>
<strong>Ratings Breakdown:</strong><br>
1 star: 7.03%<br>
2 stars: 8.51%<br>
3 stars: 10.49%<br>
4 stars: 30.70%<br>
5 stars: 43.27%
</p>
<p>
The majority of TripAdvisor hotel reviews (43.27%) are rated with 5 stars, indicating a high level of satisfaction. Additionally, 4-star ratings hold a significant proportion (30.70%). Lower ratings (1 to 3 stars) represent a smaller proportion of the reviews.
</p>


In [None]:
# 'review_length' is a character count (see count_chars), so the x-axis is
# labelled in characters rather than words.
plot = sns.displot(data=df, x='review_length', hue='rating', palette='viridis', kind='kde', fill=True, aspect=2)

plt.suptitle('Distribution of Review Length by Rating', fontweight='bold', fontsize=18, color='#8b4513')
plot.set(xlabel='Review length (characters)', ylabel='Density')
plt.show()

In [None]:
from PIL import Image

def wordCloud_generator(data, color, color_map, mask_path='assets/cloud.png'):
    """Render a masked word cloud built from the ``clean_review`` column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``clean_review`` column of strings; all rows are joined
        with single spaces to form the corpus.
    color : str
        Background colour passed to ``WordCloud``.
    color_map : str
        Colormap name passed to ``WordCloud``.
    mask_path : str, optional
        Image file used as the cloud mask. Defaults to ``'assets/cloud.png'``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Read the mask inside a context manager so the image file is closed
    # as soon as its pixels have been copied into the array.
    with Image.open(mask_path) as mask_image:
        wave_mask = np.array(mask_image)

    corpus = ' '.join(data['clean_review'].values)
    wordcloud = WordCloud(width=1000, height=1000,
                          background_color=color,
                          min_font_size=12,
                          colormap=color_map,
                          mask=wave_mask
                          ).generate(corpus)

    # Plot the word cloud without axes
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

    plt.show()


In [None]:
# Split the cleaned reviews by sentiment label for the two word clouds
is_good_or_neutral = df['score'].isin(['good', 'neutral'])
is_bad = df['score'] == 'bad'

df_good_neutral = df.loc[is_good_or_neutral, ['clean_review']]
df_bad = df.loc[is_bad, ['clean_review']]

In [None]:
# Word cloud of good and neutral reviews ('ocean' colormap on a white background).
wordCloud_generator(df_good_neutral, 'white', 'ocean')

In [None]:
# Word cloud of bad reviews ('Reds' colormap on a white background).
wordCloud_generator(df_bad, 'white', 'Reds')

<a id="5"></a>
<h1><b><span style="color: #8b4513; font-size: 28px">Building Model with Sklearn Classifiers Models</span></b></h1>

In [None]:
# Features are the cleaned review texts; the target is the sentiment label.
X = df['clean_review']
Y = df['score']

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label so the train and test sets keep the same class mix;
# the ratings are heavily skewed towards 'good', and a purely random split
# can shift the share of the small 'bad' and 'neutral' classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y)

# Calculate the proportion of values in each set
train_proportions = y_train.value_counts() / len(y_train)
test_proportions = y_test.value_counts() / len(y_test)

print("Proportions in the training set:")
print(train_proportions)

print("\nProportions in the test set:")
print(test_proportions)

In [None]:
# Peek at the cleaned texts that feed the vectorizer.
X.head()

In [None]:
# Encode the string labels as integers; the encoder is fitted on the training labels only
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Integer code -> original label, for reading the model outputs
class_mapping = dict(enumerate(label_encoder.classes_))
print("Class Mapping:", class_mapping)

In [None]:
# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the
# test texts so both matrices share the same feature columns.
tfid = TfidfVectorizer()
train_tfid_matrix = tfid.fit_transform(X_train)
test_tfid_matrix = tfid.transform(X_test)

##### Save the transformer 

In [None]:
import pickle

# Write through a context manager so the file is flushed and closed
# before any later cell reads 'tfidf.pkl' back.
with open('tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfid, vectorizer_file)

In [None]:
# Candidate classifiers compared by cross-validation in the next cell.
# XGBClassifier is left on its default objective: 'error' is an XGBoost
# evaluation metric, not a training objective, and the final `best_model`
# cell also uses the default.
models = [DecisionTreeClassifier(random_state=42),
          RandomForestClassifier(random_state=42),
          XGBClassifier(random_state=42),
          SVC(random_state=42),
          LogisticRegression(random_state=42, max_iter=1000),
          KNeighborsClassifier()]

In [None]:
from sklearn.model_selection import cross_val_score

# Mean 10-fold stratified cross-validation accuracy on the training TF-IDF
# matrix, one value per candidate and in the same order as `models`.
folds = StratifiedKFold(10)
accuracy = [
    cross_val_score(estimator, train_tfid_matrix, y_train_encoded,
                    scoring='accuracy', cv=folds).mean()
    for estimator in models
]

In [None]:
# Take the display names from the estimators themselves so the table cannot
# drift out of sync with `models` when a candidate is added, removed or reordered.
models_name = [type(estimator).__name__ for estimator in models]

# One row per candidate with its mean cross-validation accuracy
acc = pd.DataFrame({'Model': models_name, 'Accuracy': accuracy})
acc

In [None]:
# Refit an XGBoost classifier on the whole training matrix and predict the test set.
# NOTE(review): XGBoost is hard-coded here rather than picked from the `acc` table — confirm it is the top scorer.
best_model = XGBClassifier(random_state=42)
best_model.fit(train_tfid_matrix, y_train_encoded)
pred = best_model.predict(test_tfid_matrix)

In [None]:
""" def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }

    model = XGBClassifier(random_state=42, **params)
    model.fit(train_tfid_matrix, y_train_encoded)
    
    y_pred = model.predict(test_tfid_matrix)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

best_params = study.best_params
best_model = XGBClassifier(random_state=42, **{f'xgb_{key}': value for key, value in best_params.items()})
best_model.fit(train_tfid_matrix, y_train_encoded)

pred = best_model.predict(test_tfid_matrix) """

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# Rows are the actual classes, columns the predicted classes (encoded ids)
cf = confusion_matrix(y_test_encoded, pred)

# Draw the matrix as an annotated heatmap on an explicit axes object
cm_fig, cm_ax = plt.subplots()
sns.heatmap(cf, annot=True, cmap='Blues', fmt='d', cbar=False, ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()


In [None]:
import scikitplot as skplt

probs = best_model.predict_proba(test_tfid_matrix)  # Get predicted probabilities

# skplt.metrics.plot_cumulative_gain only accepts binary targets, so with three
# sentiment classes one one-vs-rest gain curve is drawn per class instead.
# The encoded labels are used because the model was fitted on them, so column
# i of `probs` corresponds to encoded class i.
gain_fig, gain_ax = plt.subplots()
for class_idx, class_name in enumerate(label_encoder.classes_):
    percentages, gains = skplt.helpers.cumulative_gain_curve(
        y_test_encoded, probs[:, class_idx], pos_label=class_idx)
    gain_ax.plot(percentages, gains, lw=2, label=f'Class {class_name}')

# Diagonal = gain expected from picking samples at random
gain_ax.plot([0, 1], [0, 1], 'k--', lw=1, label='Baseline')
gain_ax.set_xlabel('Percentage of Samples')
gain_ax.set_ylabel('Gain')
gain_ax.set_title('Cumulative Gain Curve')
gain_ax.legend(loc='lower right')
plt.show()

In [None]:

# Per-class precision, recall and F1 on the test set (classes appear as encoded ids)
cr = classification_report(y_test_encoded, pred)
print(cr)
# Raw confusion-matrix counts, displayed as the cell output
cf = confusion_matrix(y_test_encoded, pred)
cf

In [None]:
# Write through a context manager so the file is flushed and closed
# before the next cell loads 'ml_model.pkl' back.
with open('ml_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

<a id="5.1"></a>
#### Load the trained model and the fitted TF-IDF transformer used during training

In [None]:
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load artifacts that this notebook (or another trusted source) produced.
with open('ml_model.pkl', 'rb') as model_file:
    ml = pickle.load(model_file)
with open('tfidf.pkl', 'rb') as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)

In [None]:
def ml_predict(text):
    """Classify one raw review with the reloaded model and vectorizer.

    The text is cleaned with ``clean_text``, vectorised with ``tfidf`` and
    scored with ``ml``.

    Returns
    -------
    tuple
        ``(predicted_class, probability)`` where ``predicted_class`` is the
        entry of ``ml.classes_`` with the highest predicted probability and
        ``probability`` is that probability.
    """
    features = tfidf.transform([clean_text(text)])
    pred_proba = ml.predict_proba(features)

    # Only one row is scored, so the flat argmax is also the column index
    idx = np.argmax(pred_proba)

    return ml.classes_[idx], pred_proba[0][idx]

# Quick check on a short negative phrase; displays (predicted class, probability).
ml_predict('poor room service')

<a id="5.2"></a>
### SHAP Values

In [None]:
# Tree-model SHAP explainer built on the fitted XGBoost classifier.
explainer = shap.TreeExplainer(best_model)

In [None]:
# Calculate SHAP values for the test set
# Calculate SHAP values for the training TF-IDF matrix
shap_values = explainer.shap_values(train_tfid_matrix)

# Visualize the SHAP values
shap.summary_plot(shap_values, train_tfid_matrix)

In [None]:
shap.initjs()
# NOTE(review): for a multi-class model, `expected_value` and `shap_values` presumably hold one entry per class,
# in which case `shap_values[0]` would be the class-0 block rather than the first sample — confirm against the installed shap version.
shap.force_plot(explainer.expected_value, shap_values[0], train_tfid_matrix[0], matplotlib=True)
plt.show()

<a id="6"></a>
<h1><b><span style="color: #8b4513; font-size: 28px">Building Model with Tensorflow</span></b></h1>

In [1]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l1, l2

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Keep at most the 50,000 most frequent words; unknown words map to '<OOV>'
tokenizer = Tokenizer(num_words=50000, oov_token='<OOV>')

# The vocabulary is learnt from the training texts only
tokenizer.fit_on_texts(X_train)
# print(tokenizer.word_index)
total_word = len(tokenizer.word_index)
print('Total distinct words: {}'.format(total_word))

train_seq = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_seq)

# Pad or truncate the test sequences to the training length so both sets feed
# the network inputs of the same shape; previously the test set was padded to
# its own longest review.
max_len = train_padded.shape[1]
test_seq = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_seq, maxlen=max_len)

# One hot encoding the label
lb = LabelBinarizer()
train_labels = lb.fit_transform(y_train)
test_labels = lb.transform(y_test)

In [None]:
# Persist the fitted tokenizer and label binarizer; the context managers make
# sure both files are flushed and closed.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
with open('label.pkl', 'wb') as label_file:
    pickle.dump(lb, label_file)

In [None]:
# Keras Tokenizer ids start at 1 (0 is the padding value), so the largest id can
# equal len(word_index). The embedding table therefore needs total_word + 1 rows;
# with only total_word rows the highest id would be out of range.
vocab_size = total_word + 1

model = tf.keras.models.Sequential([
    # 8-dimensional learned word embeddings
    tf.keras.layers.Embedding(vocab_size, 8),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(8, kernel_regularizer=l2(0.001),
                          bias_regularizer=l2(0.001), activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # One softmax output per sentiment class
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.summary()

In [None]:
# Adam with a small learning rate; categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split is passed as validation data, so the val_* metrics are not an independent hold-out estimate.
model.fit(train_padded, train_labels, epochs=25, validation_data=(test_padded, test_labels))

In [None]:
# Training history as a DataFrame, then training vs. validation curves.
metrics = pd.DataFrame(model.history.history)
metrics[['accuracy', 'val_accuracy']].plot()
metrics[['loss', 'val_loss']].plot()

In [None]:
# Network outputs for the padded test sequences (the final layer is a 3-unit softmax).
pred2 = model.predict(test_padded)

In [None]:
# Save the trained Keras model to 'dl_model.h5' in the working directory.
model.save('dl_model.h5')

<a id="7"></a>
<h1><b><span style="color: #8b4513; font-size: 28px">Prediction</span></b></h1>

In [None]:
# Classical ML model (`best_model`, the XGBoost classifier fitted on TF-IDF features)
def ml_predict(text):
    """Predict the sentiment class of one raw review with ``best_model``.

    Parameters
    ----------
    text : str
        Raw review text; it is cleaned with ``clean_text`` and vectorised
        with the fitted ``tfid`` vectorizer.

    Returns
    -------
    The class predicted by ``best_model``. The model was fitted on the
    encoded labels, so this is the encoded class id, not the label name.
    """
    # The local must not be named `clean_text`: assigning to that name makes it
    # local to the function and the call on the right-hand side would raise
    # UnboundLocalError.
    cleaned = clean_text(text)
    tfid_matrix = tfid.transform([cleaned])
    pred = best_model.predict(tfid_matrix)[0]

    return pred

# Deep Neural Network
def dl_predict(text):
    """Predict the sentiment label of one raw review with the Keras ``model``.

    Parameters
    ----------
    text : str
        Raw review text; it is cleaned with ``clean_text``, converted to a
        padded id sequence with the fitted ``tokenizer`` and scored by ``model``.

    Returns
    -------
    The label recovered from the network output with ``lb.inverse_transform``.
    """
    # The local must not be named `clean_text`: assigning to that name makes it
    # local to the function and the call on the right-hand side would raise
    # UnboundLocalError.
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq)

    pred = model.predict(padded)
    # Get the label name back
    result = lb.inverse_transform(pred)[0]

    return result

In [None]:
# Positive example. ml_predict uses `best_model`, which is an XGBClassifier,
# so the output is labelled XGBoost rather than Logistic Regression.
text = 'Such a comfy place to stay with the loved one'

print('Prediction using XGBoost: {}'.format(ml_predict(text)))
print('Prediction using DNN: {}'.format(dl_predict(text)))

In [None]:
# Negative example. ml_predict uses `best_model`, which is an XGBClassifier,
# so the output is labelled XGBoost rather than Logistic Regression.
text2 = 'Awful room services and slow wifi connection'

print('Prediction using XGBoost: {}'.format(ml_predict(text2)))
print('Prediction using DNN: {}'.format(dl_predict(text2)))

In [None]:
# Mixed-sentiment example. ml_predict uses `best_model`, which is an
# XGBClassifier, so the output is labelled XGBoost rather than Logistic Regression.
text3 = 'Hard to get here but the scenery is wonderful'

print('Prediction using XGBoost: {}'.format(ml_predict(text3)))
print('Prediction using DNN: {}'.format(dl_predict(text3)))