# <b> Hotel Reviews

### <b> Setup

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from keras import * #change later
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from sklearn.model_selection import train_test_split 

Using TensorFlow backend.


### <b> Data Access 

In [2]:
data = pd.read_csv('../data/Hotel/hotel-reviews/rev.csv')

### <b> Exploratory Data Analysis

In [4]:
data.head(2)

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,


In [5]:
data = data[['reviews.rating', 'reviews.text', 'reviews.title']]

In [6]:
data.head()

Unnamed: 0,reviews.rating,reviews.text,reviews.title
0,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds
1,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!
2,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge
3,5.0,We stayed here for four nights in October. The...,Good location on the Lido.
4,5.0,We stayed here for four nights in October. The...,������ ���������������


In [7]:
data.isnull().sum()

reviews.rating     862
reviews.text        22
reviews.title     1622
dtype: int64

In [8]:
data.shape

(35912, 3)

In [9]:
data.dropna(inplace=True)

In [10]:
data.isnull().sum()

reviews.rating    0
reviews.text      0
reviews.title     0
dtype: int64

In [11]:
data.shape

(34155, 3)

In [12]:
data['word_count'] = data['reviews.text'].str.split()

In [13]:
data['word_count'] = [len(w) for w in data['word_count']]

In [14]:
data['word_count'].sum()

1593730

In [15]:
target_len = data['reviews.rating'].value_counts().count()

In [16]:
text = data['reviews.text'][0]

### <b> Data Processing

In [17]:
def _cleasing_(text):
    words = word_tokenize(text) 
    text = [w.lower() for w in words if w.isalpha() ]
    text = [w for w in text if w not in stop_words]
    return ' '.join(text)

In [18]:
data['text_clean'] = [_cleasing_(line) for line in data['reviews.text']]

In [19]:
data['title_clean'] = [_cleasing_(line) for line in data['reviews.title']]

In [20]:
data.head()

Unnamed: 0,reviews.rating,reviews.text,reviews.title,word_count,text_clean,title_clean
0,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,33,pleasant min walk along sea front water bus re...,good location away crouds
1,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,44,really lovely hotel stayed top floor surprised...,great hotel jacuzzi bath
2,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,28,ett mycket bra hotell det som drog ner betyget...,lugnt
3,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,59,stayed four nights october hotel staff welcomi...,good location lido
4,5.0,We stayed here for four nights in October. The...,������ ���������������,59,stayed four nights october hotel staff welcomi...,


In [21]:
data.dropna(inplace=True)

### <b> Prepare Training Data

In [22]:
features = data['text_clean']
target = data['reviews.rating']

In [23]:
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size = 0.5, random_state = 42)

In [24]:
feature_train.shape, feature_test.shape, target_train.shape, target_test.shape

((17077,), (17078,), (17077,), (17078,))

#### <b> Enconding data

In [25]:
vocab_size = 500
embedding_dim = 120
max_length = 50
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_portion = .8

In [26]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# create the vocabulary based on training features 
tokenizer.fit_on_texts(feature_train)
# vocabulary
vocab = tokenizer.word_index

In [27]:
# FEATURES DATA
# turn sentences in integers
train_sequences = tokenizer.texts_to_sequences(feature_train)
test_sequences = tokenizer.texts_to_sequences(feature_test)

# pad sequences 
padded_training_sequences = pad_sequences(train_sequences, padding = padding_type, maxlen = max_length)
padded_test_sequences = pad_sequences(test_sequences, padding = padding_type, maxlen = max_length)

In [28]:
feature_train.head(1)

18119    kids us ages years old something ages great time
Name: text_clean, dtype: object

In [29]:
train_sequences[0]

[180, 22, 1, 259, 57, 319, 1, 7, 27]

In [30]:
padded_training_sequences[0]

array([180,  22,   1, 259,  57, 319,   1,   7,  27,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

### <b> Model Architecture
    

#### <b> Dense Neural Network

In [31]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(target_len, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 120)           60000     
_________________________________________________________________
global_average_pooling1d (Gl (None, 120)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                7744      
_________________________________________________________________
dense_1 (Dense)              (None, 41)                2665      
Total params: 70,409
Trainable params: 70,409
Non-trainable params: 0
_________________________________________________________________


In [32]:
num_epochs = 30
history = model.fit(padded_training_sequences, target_train, 
                    epochs=num_epochs, validation_data=(padded_test_sequences, target_test), verbose=2)

Train on 17077 samples, validate on 17078 samples
Epoch 1/30
17077/17077 - 3s - loss: 1.6265 - acc: 0.3683 - val_loss: 1.3128 - val_acc: 0.4390
Epoch 2/30
17077/17077 - 1s - loss: 1.2528 - acc: 0.4579 - val_loss: 1.2337 - val_acc: 0.4857
Epoch 3/30
17077/17077 - 1s - loss: 1.1961 - acc: 0.4910 - val_loss: 1.2126 - val_acc: 0.4924
Epoch 4/30
17077/17077 - 1s - loss: 1.1707 - acc: 0.5033 - val_loss: 1.2023 - val_acc: 0.4939
Epoch 5/30
17077/17077 - 1s - loss: 1.1598 - acc: 0.5129 - val_loss: 1.2003 - val_acc: 0.4959
Epoch 6/30
17077/17077 - 1s - loss: 1.1500 - acc: 0.5164 - val_loss: 1.2098 - val_acc: 0.4943
Epoch 7/30
17077/17077 - 1s - loss: 1.1425 - acc: 0.5218 - val_loss: 1.2011 - val_acc: 0.4954
Epoch 8/30
17077/17077 - 1s - loss: 1.1336 - acc: 0.5232 - val_loss: 1.2220 - val_acc: 0.4857
Epoch 9/30
17077/17077 - 1s - loss: 1.1295 - acc: 0.5231 - val_loss: 1.1999 - val_acc: 0.4947
Epoch 10/30
17077/17077 - 1s - loss: 1.1205 - acc: 0.5250 - val_loss: 1.1974 - val_acc: 0.4965
Epoch 11/

In [33]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

plot_graphs(history, 'acc')
plot_graphs(history, 'loss')

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

#### <b> Convolutional Neural Network

In [34]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(target_len, activation='softmax')
])

In [35]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 120)           60000     
_________________________________________________________________
conv1d (Conv1D)              (None, 46, 128)           76928     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 41)                2665      
Total params: 147,849
Trainable params: 147,849
Non-trainable params: 0
_________________________________________________________________


In [37]:
num_epochs = 10
history = model.fit(padded_training_sequences, target_train, 
                    epochs=num_epochs, validation_data=(padded_test_sequences, target_test), verbose=2)

Train on 17077 samples, validate on 17078 samples
Epoch 1/10
17077/17077 - 6s - loss: 0.2479 - acc: 0.9219 - val_loss: 2.5786 - val_acc: 0.4428
Epoch 2/10
17077/17077 - 7s - loss: 0.2412 - acc: 0.9286 - val_loss: 2.6234 - val_acc: 0.4417
Epoch 3/10
17077/17077 - 6s - loss: 0.2191 - acc: 0.9347 - val_loss: 2.6904 - val_acc: 0.4366
Epoch 4/10
17077/17077 - 7s - loss: 0.2126 - acc: 0.9356 - val_loss: 2.6931 - val_acc: 0.4453
Epoch 5/10
17077/17077 - 7s - loss: 0.2007 - acc: 0.9380 - val_loss: 2.8405 - val_acc: 0.4429
Epoch 6/10
17077/17077 - 7s - loss: 0.1969 - acc: 0.9376 - val_loss: 2.8366 - val_acc: 0.4247
Epoch 7/10
17077/17077 - 7s - loss: 0.1838 - acc: 0.9422 - val_loss: 2.9498 - val_acc: 0.4352
Epoch 8/10
17077/17077 - 7s - loss: 0.1868 - acc: 0.9403 - val_loss: 2.8360 - val_acc: 0.4424
Epoch 9/10
17077/17077 - 8s - loss: 0.1673 - acc: 0.9441 - val_loss: 2.9299 - val_acc: 0.4416
Epoch 10/10
17077/17077 - 9s - loss: 0.1710 - acc: 0.9432 - val_loss: 2.8974 - val_acc: 0.4424
