<h1><center><font size="6">Sentiment Analysis on Tripadvisor Hotel Reviews</font></center></h1>


<center><img src="https://plus.unsplash.com/premium_photo-1661775662573-f681f8700ac4?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxzZWFyY2h8MTN8fHRyaXB8ZW58MHx8MHx8&auto=format&fit=crop&w=500&q=60" width="1100"></img></center>


# <a id='0'>Content</a>

- <a href='#1'>Import relevant libraries</a>  
- <a href='#2'>Load the dataset</a> 
- <a href='#3'>Exploratory Data Analysis</a>  
- <a href='#4'>Text Preprocessing</a>  
- <a href='#5'>Using sklearn for model building</a>  
- <a href='#6'>Using TensorFlow for model building</a> 
- <a href='#7'>Prediction</a>

# <a id="1">Import relevant libraries</a>  

In [None]:
#nltk will be used in this project
!pip install nltk


In [None]:
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from wordcloud import WordCloud

# Preprocessing and evaluation
import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l1, l2

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

# <a id="2">Load the dataset</a>  

Load the dataset and take a sneak peek at what it looks like.

In [None]:
# Read the reviews CSV into a DataFrame and preview the first rows.
reviews_csv = '../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
# Count the missing values in each column.
print(df.isnull().sum())

From above you can see that there are no null values in this dataset.

# <a id="3">Exploratory Data Analysis</a>  

In [None]:
# Bar chart of how many reviews carry each rating value.
rating_ax = sns.countplot(data=df, x='Rating', palette='flare')
rating_ax.set_title('Distribution of Rating')
print("Distribution of Rating")

From above we can see that the lowest rating is 1 and the highest rating is 5.

Of these, reviews with a rating of 5 are the most common in this dataset.

In [None]:
# Number of space-separated tokens in each review
df['Length'] = df['Review'].map(lambda review: len(str(review).split(" ")))
df.head()

In [None]:
# Smoothed distribution of review length, one filled curve per rating.
sns.displot(
    data=df,
    x='Length',
    hue='Rating',
    kind='kde',
    fill=True,
    palette='flare',
    aspect=4,
)

# Histogram of review length, drawn in a separate panel for each rating.
g = sns.FacetGrid(data=df, col='Rating')
g.map(plt.hist, 'Length', color='#973aa8')

From the plots above, we can deduce that the longer the review the higher the rating given.

# <a id="4">Text preprocessing</a>  

In [None]:
# Let's categorise the rating a bit more to make it easier to understand
def rating(score):
    """Map a numeric rating to 'Good' (> 3), 'Average' (== 3) or 'Bad' (anything else)."""
    if score == 3:
        return 'Average'
    return 'Good' if score > 3 else 'Bad'

In [None]:
# Replace the numeric rating with its sentiment class.
# The dtype guard keeps this cell idempotent: once the column holds strings,
# re-running it would otherwise raise a TypeError inside `rating` (str > int).
if pd.api.types.is_numeric_dtype(df['Rating']):
    df['Rating'] = df['Rating'].apply(rating)
df.head()

In [None]:
# Word count of the dataset before cleaning: the sum of the per-review 'Length' column
length = df['Length'].sum()

* Stemming vs Lemmatization

In [None]:
# Compare stemming and lemmatization on the first review.
first_review = df['Review'][0]

print('Original:')
print(first_review)
print()

# Build each normaliser once instead of once per word.
stemmer = SnowballStemmer('english')
sentence = [stemmer.stem(word) for word in first_review.split()]
print('Stemming:')
print(' '.join(sentence))
print()

lemmatizer = WordNetLemmatizer()
# 'v' asks the lemmatizer to treat every word as a verb.
sentence = [lemmatizer.lemmatize(word, 'v') for word in first_review.split()]
print('Lemmatization:')
print(' '.join(sentence))

Lemmatization will be used in this instance. 

Stemming does not always produce a meaningful word: for example, 'arrived' is reduced to 'arriv',
whereas lemmatization returns the base form of the word, 'arrive'.

In [None]:
# Lazily filled cache so the stopword list is loaded and the lemmatizer is built only once.
_CLEANING_RESOURCES = {}


def _cleaning_resources():
    """Return the (stopword set, lemmatizer) pair used by `cleaning`, creating it on first use."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANING_RESOURCES['stop_words'], _CLEANING_RESOURCES['lemmatizer']


def cleaning(text):
    """Normalise one review for modelling.

    Strips ASCII punctuation, lower-cases the text, drops English stopwords and
    lemmatizes every remaining word as a verb.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    stop_words, lemmatizer = _cleaning_resources()

    # removal of punctuations and uppercase
    clean_text = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # removal of stopwords (set membership; previously stopwords.words('english')
    # was called again for every single word)
    words = [word for word in clean_text.split() if word not in stop_words]

    # lemmatize the word
    return ' '.join(lemmatizer.lemmatize(word, 'v') for word in words)

In [None]:
# Clean every review; this overwrites the raw text in the 'Review' column.
df['Review'] = df['Review'].apply(cleaning)

In [None]:
# Recompute the per-review token counts on the cleaned text and compare the totals.
df['Length'] = df['Review'].map(lambda review: len(str(review).split(" ")))
new_length = df['Length'].sum()

print('Total text length before cleaning: {}'.format(length))
print('Total text length after cleaning: {}'.format(new_length))

In [None]:
# Write the current frame (cleaned reviews, without the index column) to 'cleaned_df.csv'.
df.to_csv('cleaned_df.csv', index=False)

In [None]:
# After cleaning, let's see the most common word used
all_reviews = ' '.join(df['Review'])

plt.figure(figsize=(20, 20))
wc = WordCloud(
    width=1600,
    height=800,
    max_words=1000,
    min_font_size=10,
    background_color="white",
    colormap='flare',
).generate(all_reviews)

plt.imshow(wc)

# <a id="5">Using sklearn for model building</a>

In [None]:
# Hold out 20% of the reviews for testing.
# random_state makes the split reproducible across kernel restarts; stratify keeps the
# class proportions of 'Rating' the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df['Review'], df['Rating'], test_size=0.2, random_state=42, stratify=df['Rating'])

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only,
# then reuse the fitted vectorizer to transform the test reviews.
tfid = TfidfVectorizer()
train_tfid_matrix = tfid.fit_transform(X_train)
test_tfid_matrix = tfid.transform(X_test)

In [None]:
# Persist the fitted vectorizer; the context manager guarantees the file is flushed and closed.
with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfid, tfidf_file)

In [None]:
# Candidate classifiers compared with cross-validation.
# The tree-based models are randomised, so they get a fixed seed to keep their scores reproducible.
models = [DecisionTreeClassifier(random_state=42),
          RandomForestClassifier(random_state=42),
          SVC(),
          LogisticRegression(max_iter=1000),
          KNeighborsClassifier(),
          BernoulliNB()]

In [None]:
# Mean 10-fold stratified cross-validation accuracy for each candidate, in `models` order.
folds = StratifiedKFold(10)
accuracy = [
    cross_val_score(estimator, train_tfid_matrix, y_train, scoring='accuracy', cv=folds).mean()
    for estimator in models
]

In [None]:
# Derive the display names from the estimators themselves so the table cannot
# drift out of sync with the `models` list.
models_name = [type(estimator).__name__ for estimator in models]

acc = pd.DataFrame({'Model': models_name, 'Accuracy': accuracy})
acc

The Logistic Regression model performs best, so it will be used as our final scikit-learn classifier.

In [None]:
# Train the chosen classifier on the training matrix, then label the held-out reviews.
log = LogisticRegression(max_iter=1000).fit(train_tfid_matrix, y_train)
pred = log.predict(test_tfid_matrix)

In [None]:
# Persist the trained classifier; the context manager guarantees the file is flushed and closed.
with open('ml_model.pkl', 'wb') as model_file:
    pickle.dump(log, model_file)

In [None]:
# Reload the persisted artefacts from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open('ml_model.pkl', 'rb') as model_file:
    ml = pickle.load(model_file)
with open('tfidf.pkl', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)


def ml_predict(text):
    """Classify one raw review with the reloaded vectorizer and classifier.

    Parameters
    ----------
    text : str
        Raw review text; it is passed through `cleaning` before vectorization.

    Returns
    -------
    tuple
        (predicted label, predicted probability of that label).
    """
    clean_text = cleaning(text)
    tfid_matrix = tfidf.transform([clean_text])
    # predict_proba returns one row per input; only a single review is passed in.
    class_probabilities = ml.predict_proba(tfid_matrix)[0]
    idx = np.argmax(class_probabilities)

    return ml.classes_[idx], class_probabilities[idx]


ml_predict('poor room service')

In [None]:
# Evaluate the logistic-regression predictions against the held-out test labels.
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# <a id="6">Using TensorFlow for model building</a>

In [None]:
# Words that are not kept in the vocabulary are replaced by the '<OOV>' token.
tokenizer = Tokenizer(num_words=50000, oov_token='<OOV>')

# Build the vocabulary from the training reviews only.
tokenizer.fit_on_texts(X_train)
total_word = len(tokenizer.word_index)
print('Total distinct words: {}'.format(total_word))

train_seq = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_seq)

# Pad/truncate the test sequences to the training length so both sets share one shape;
# without maxlen they would be padded to the longest *test* review instead.
test_seq = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_seq, maxlen=train_padded.shape[1])

# One hot encoding the label
lb = LabelBinarizer()
train_labels = lb.fit_transform(y_train)
test_labels = lb.transform(y_test)

In [None]:
# Persist the fitted tokenizer and label binarizer; the context managers close the files.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
with open('label.pkl', 'wb') as label_file:
    pickle.dump(lb, label_file)

In [None]:
# Tokenizer word indices start at 1 (0 is the padding value), so the embedding needs
# total_word + 1 rows to cover every index the tokenizer can emit.
model = tf.keras.models.Sequential([tf.keras.layers.Embedding(total_word + 1, 8),
                                    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
                                    tf.keras.layers.Dropout(0.5),
                                    tf.keras.layers.Dense(8, kernel_regularizer=l2(0.001),
                                                          bias_regularizer=l2(0.001), activation='relu'),
                                    tf.keras.layers.Dropout(0.5),
                                    # One softmax output per sentiment class.
                                    tf.keras.layers.Dense(3, activation='softmax')])

model.summary()

In [None]:
# Adam with a small learning rate; categorical cross-entropy matches the one-hot labels.
optimizer = tf.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# The test split doubles as the validation set during training.
model.fit(
    train_padded,
    train_labels,
    epochs=25,
    validation_data=(test_padded, test_labels),
)

In [None]:
# Per-epoch metrics recorded on the model's history, plotted as accuracy and loss curves
# (training vs. validation).
metrics = pd.DataFrame(model.history.history)
metrics[['accuracy', 'val_accuracy']].plot()
metrics[['loss', 'val_loss']].plot()

In [None]:
# Network outputs (one softmax vector per review) for the padded test sequences.
pred2 = model.predict(test_padded)

In [None]:
# Convert the one-hot targets and the predicted probabilities to integer class indices.
true_labels = test_labels.argmax(axis=-1)
pred_labels = pred2.argmax(axis=-1)

In [None]:
# Evaluate the network's predicted class indices against the true class indices.
print(confusion_matrix(true_labels, pred_labels))
print(classification_report(true_labels, pred_labels))

In [None]:
# Save the trained network to 'dl_model.h5'.
model.save('dl_model.h5')

# <a id="7">Prediction</a>

The fun part, let's use our own text to do some predictions!

In [None]:
# Logistic Regression
def ml_predict(text):
    """Return the label the in-memory logistic-regression model predicts for `text`.

    The text is cleaned with `cleaning` and vectorized with the fitted `tfid`
    vectorizer first. Note that this rebinds the name `ml_predict`, which an
    earlier cell defined with a (label, probability) return value.
    """
    features = tfid.transform([cleaning(text)])
    return log.predict(features)[0]

# Deep Neural Network
def dl_predict(text):
    """Return the label the trained Keras model predicts for one raw review.

    Parameters
    ----------
    text : str
        Raw review text; it is passed through `cleaning` before tokenization.

    Returns
    -------
    The class label recovered from the network output via the fitted label binarizer.
    """
    clean_text = cleaning(text)
    seq = tokenizer.texts_to_sequences([clean_text])
    # Pad to the training sequence length so inference sees the same input shape as
    # training; without maxlen a text with no remaining words gives a zero-length sequence.
    padded = pad_sequences(seq, maxlen=train_padded.shape[1])

    pred = model.predict(padded)
    # Get the label name back
    result = lb.inverse_transform(pred)[0]

    return result

In [None]:
# Example review to classify with both models.
text = 'The location was such a comfy place to stay with a loved one'

print('Prediction using Logistic Regression: {}'.format(ml_predict(text)))
print('Prediction using DNN: {}'.format(dl_predict(text)))

In [None]:
# Second example review to classify with both models.
text2 = 'Very slow wifi and awful room service'

print('Prediction using Logistic Regression: {}'.format(ml_predict(text2)))
print('Prediction using DNN: {}'.format(dl_predict(text2)))

In [None]:
# Third example review to classify with both models.
text3 = 'The location is not easy to get to but the scenery is amazing'

print('Prediction using Logistic Regression: {}'.format(ml_predict(text3)))
print('Prediction using DNN: {}'.format(dl_predict(text3)))