# Booking review sentiment analysis

In [None]:
# Mount Google Drive inside the Colab VM so files stored there can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Shell out to nvidia-smi to display the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
!pip install gdown
!pip install tensorflow_text

In [None]:
!pip install wordcloud

In [None]:
!pip install tensorflow-gpu

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
from pylab import rcParams
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib import rc
from pandas.plotting import register_matplotlib_converters
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline
%config InlineBackend.figure_format='retina'

register_matplotlib_converters()
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
# True when TensorFlow can see at least one GPU device.
# tf.test.is_gpu_available() is deprecated; tf.config.list_physical_devices is
# the supported replacement. The comparison keeps the cell output a boolean.
len(tf.config.list_physical_devices('GPU')) > 0

In [None]:
# Load the hotel reviews CSV from the mounted Drive, parsing the review date
# column as datetimes, then show (rows, columns).
REVIEWS_CSV_PATH = '/content/drive/MyDrive/Sentiment analysis/Booking/Hotel_Reviews.csv'

df = pd.read_csv(
  REVIEWS_CSV_PATH,
  parse_dates=['Review_Date'],
)
df.shape

## Exploration of the dataset

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# Column dtypes as inferred by read_csv (Review_Date was requested as a date).
df.dtypes

In [None]:
# Distribution of the numeric reviewer scores.
reviewer_scores = df["Reviewer_Score"]
sns.displot(reviewer_scores);
plt.show()

In [None]:
# Build one free-text field per row from the negative and positive parts of the
# review, and derive a binary sentiment label from the numeric score.
BAD_SCORE_THRESHOLD = 7  # scores strictly below this are labelled "bad"

# Join with an explicit space so the last word of the negative part and the
# first word of the positive part cannot fuse into a single token.
df["review"] = df["Negative_Review"] + " " + df["Positive_Review"]

# Vectorised equivalent of the per-row lambda: "bad" below the threshold,
# "good" otherwise.
df["review_type"] = np.where(
  df["Reviewer_Score"] < BAD_SCORE_THRESHOLD, "bad", "good")

In [None]:
# Number of rows per label ("good" / "bad") before any rebalancing.
df.review_type.value_counts()

In [None]:
# Two-column frame holding only the label and the combined review text.
# NOTE(review): df1 is not referenced by any later cell visible here — confirm
# whether it is still needed.
df1 = df[['review_type','review']]

In [None]:
# Bar chart of label frequencies, most frequent label first.
label_order = df.review_type.value_counts().index

ax = sns.countplot(x='review_type', data=df, order=label_order)
ax.set_xlabel("type")
ax.set_title("Review type");

In [None]:
# Partition the rows by sentiment label.
good_reviews = df.loc[df["review_type"] == "good"]
bad_reviews = df.loc[df["review_type"] == "bad"]

In [None]:
# Concatenate every review of each class into one space-separated string
# (input for the word clouds).
good_reviews_text = " ".join(good_reviews["review"].tolist())
bad_reviews_text = " ".join(bad_reviews["review"].tolist())

In [None]:
# One word cloud per class, both built with the same settings: the library's
# default stop-word list and a white background.
cloud_options = dict(stopwords=STOPWORDS, background_color="white")

good_reviews_cloud = WordCloud(**cloud_options).generate(good_reviews_text)
bad_reviews_cloud = WordCloud(**cloud_options).generate(bad_reviews_text)

In [None]:
def show_word_cloud(cloud, title):
  """Render a word cloud image in its own 16x10 figure.

  Args:
    cloud: image-like object accepted by ``imshow`` (here, a generated WordCloud).
    title: text displayed above the image.
  """
  fig, ax = plt.subplots(figsize=(16, 10))
  ax.imshow(cloud, interpolation='bilinear')
  ax.set_title(title)
  ax.axis("off")  # an image needs no ticks or frame
  plt.show()

In [None]:
# Most frequent words across the reviews labelled "good".
show_word_cloud(good_reviews_cloud, "Good reviews common words")

In [None]:
# Most frequent words across the reviews labelled "bad".
show_word_cloud(bad_reviews_cloud, "Bad reviews common words")

In [None]:
# Balance the classes: keep every bad review and draw an equally sized,
# seeded random sample of the good ones.
bad_df = bad_reviews
good_df = good_reviews.sample(n=bad_df.shape[0], random_state=RANDOM_SEED)

In [None]:
# Stack the sampled good reviews on top of the bad ones and renumber the rows.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# is the equivalent of append(...).reset_index(drop=True).
review_df = pd.concat([good_df, bad_df], ignore_index=True)
review_df.shape

In [None]:
# Same bar chart as before, now on the rebalanced frame.
resampled_order = review_df.review_type.value_counts().index

ax = sns.countplot(x='review_type', data=review_df, order=resampled_order)
ax.set_xlabel("type")
ax.set_title("Review type (resampled)");

## Universal Sentence Encoder

In [None]:
# Load the sentence-encoder model from TF Hub; `use` is then called on lists of
# strings to obtain their embeddings.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"
use = hub.load(USE_MODEL_URL)

In [None]:
# Two near-paraphrases used to sanity-check the encoder.
sentence1, sentence2 = ['the location is great'], ['wonderful location']

# Encode each one-sentence batch separately.
e1, e2 = (use(batch) for batch in (sentence1, sentence2))

In [None]:
# Shape of the encoder output for a one-sentence batch.
e1.shape

In [None]:
# Inner product of the two sentence embeddings, used as a similarity score.
# Presumably np.inner yields a 1x1 array here (one sentence per input), from
# which flatten()[0] extracts the scalar — TODO confirm against e1.shape.
np.inner(e1, e2).flatten()[0]

## Preprocessing for modeling

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the string labels into a dense 2-D array (one column per class).
# The encoder orders categories by sorted value, so column 0 is "bad" and
# column 1 is "good".
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed; taking the default sparse result and densifying it with
# .toarray() works on old and new releases alike.
label_column = review_df.review_type.to_numpy().reshape(-1, 1)
type_one_hot = OneHotEncoder().fit_transform(label_column).toarray()

In [None]:
# (number of reviews, number of label classes)
type_one_hot.shape

In [None]:
# Hold out 10% of the balanced reviews (and their one-hot labels) for testing;
# the split is seeded for reproducibility.
train_reviews, test_reviews, y_train, y_test = train_test_split(
  review_df["review"],
  type_one_hot,
  test_size=0.1,
  random_state=RANDOM_SEED,
)

In [None]:
# Number of reviews held out for testing.
test_reviews.shape

In [None]:
X_train = []
for r in tqdm(train_reviews):
  emb = use(r)
  review_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(review_emb)

X_train = np.array(X_train)

In [None]:
X_test = []
for r in tqdm(test_reviews):
  emb = use(r)
  review_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(review_emb)

X_test = np.array(X_test)

## Sentiment Analysis

In [None]:
# Feed-forward classifier on top of the fixed sentence embeddings:
# two ReLU hidden layers, each followed by 50% dropout, then a 2-way softmax.
model = keras.Sequential([
  keras.layers.Dense(
    units=256,
    input_shape=(X_train.shape[1], ),  # one input per embedding dimension
    activation='relu',
  ),
  keras.layers.Dropout(rate=0.5),
  keras.layers.Dense(units=128, activation='relu'),
  keras.layers.Dropout(rate=0.5),
  keras.layers.Dense(2, activation='softmax'),
])

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
  loss='categorical_crossentropy',
  optimizer=keras.optimizers.Adam(0.001),
  metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in mini-batches of 16, holding out 10% of the training
# data for validation and reshuffling each epoch.
fit_options = dict(
  epochs=10,
  batch_size=16,
  validation_split=0.1,
  verbose=1,
  shuffle=True,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train loss')
ax.plot(history.history['val_loss'], label='val loss')
ax.set_xlabel("epoch")
ax.set_ylabel("Cross-entropy loss")
ax.legend();

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train accuracy')
ax.plot(history.history['val_accuracy'], label='val accuracy')
ax.set_xlabel("epoch")
ax.set_ylabel("accuracy")
ax.legend();

## Predictions of new reviews

In [None]:
# First held-out review and its true label (a 1 in the first one-hot column
# is reported as "Bad").
first_review = test_reviews.iloc[0]
first_label = "Bad" if y_test[0][0] == 1 else "Good"
print(first_review)
print(first_label)

In [None]:
# Class probabilities for the first held-out review; index 0 is reported as "Bad".
y_pred = model.predict(X_test[:1])
print(y_pred)
predicted_class = np.argmax(y_pred)
"Good" if predicted_class != 0 else "Bad"

In [None]:
# Second held-out review and its true label (a 1 in the first one-hot column
# is reported as "Bad").
second_review = test_reviews.iloc[1]
second_label = "Bad" if y_test[1][0] == 1 else "Good"
print(second_review)
print(second_label)

In [None]:
# Class probabilities for the second held-out review; index 0 is reported as "Bad".
y_pred = model.predict(X_test[1:2])
print(y_pred)
predicted_class = np.argmax(y_pred)
"Good" if predicted_class != 0 else "Bad"