In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from random import randint

In [0]:
import tensorflow as tf
from string import punctuation
from collections import Counter

In [0]:
import numpy as np 
import pandas as pd 
import time
from collections import Counter
import re, nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from matplotlib.colors import LinearSegmentedColormap
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import wordnet as wn

In [0]:
data = pd.read_csv('Hotel_Reviews.csv')

**Data preprocessing**

In [0]:
#Data preprocessing
## since we found the words are in mixed case letters and with trailing whitespace
#we remove those white spaces and converting the reviews to lowercases

# convert all positive reviews to lower case, remove trailing whitespace and drop misleading information
pos = data.Positive_Review
pos = pos.str.lower().str.strip()
pos1 = pos[((pos != 'no positive') &
           (pos != 'nothing'))]
# all punctuation is removed, recover needed info
pos1 = pos1.replace({'n t ':' not ', 'dont':'do not'}, regex=True)

# convert all negative reviews to lower case, remove trailing whitespace and drop misleading information
neg = data.Negative_Review
neg = neg.str.lower().str.strip()
neg1 = neg[(neg != 'no negative') &
      (neg != 'nothing')]
# all punctuation is removed, recover needed info
neg1 = neg1.replace({'n t ':' not ','dont':'do not'}, regex=True)

# # concat positive and negative reviews
total_reviews = pd.concat([pos1, neg1], axis=0)


# add score to reviews
scores = ['positive' for i in range(len(pos1))]
scores += ['negative' for i in range(len(neg1))]

# one hot encoding (1 for positive, 0 for negative)
for i in range(0, len(scores)):
    if scores[i] == 'positive':
        scores[i] = 1
    else:
        scores[i] = 0



#removing numbers
total_reviews = total_reviews.map(lambda x: re.sub('[0-9]','',str(x)))


####  remove stopwords

# load package

import nltk
import ssl
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()   #ToktokTokenizer is faster than word_tokenize
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')
nltk.download('punkt')


# refine stopwords as we do not want to drop privative
# (most reviews are short, dropping them can reverse actual meaning)
stop_words = set(stopwords.words('english'))
refine_stopword = set(('no', 'nor','not'))
new_stopwords = stop_words - refine_stopword


text = total_reviews.values

def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    cleaned_tokens = [token for token in tokens if token not in new_stopwords]
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

total_reviews = total_reviews.apply(remove_stopwords)
 
# tokenizing
total_reviews = total_reviews.apply(tokenizer.tokenize)



# create required data frame.
review_score = pd.DataFrame()
review_score['reviews'] = total_reviews
review_score['score'] = scores

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
review_score.head()

Unnamed: 0,reviews,score
0,park outside hotel beautiful,1
1,no real complaints hotel great great location ...,1
2,location good staff ok cute hotel breakfast ra...,1
3,great location nice surroundings bar restauran...,1
4,amazing location building romantic setting,1


In [0]:
from sklearn.model_selection import train_test_split
seed=100
x=review_score['reviews']
y=review_score['score']
X_train, X_test,Y_train, Y_test = train_test_split(x,y, test_size=0.3,random_state=seed)


In [0]:
# Tokenize Text
from keras.preprocessing.text import Tokenizer
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [0]:
totalNumWords = [len(one_comment) for one_comment in X_train]
plt.hist(totalNumWords,bins = 30)
plt.show()

<Figure size 576x396 with 1 Axes>


**LSTM Network**

In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence


In [0]:
from keras.preprocessing import sequence
# truncate and pad input sequences
max_review_length = 110
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [0]:
top_words = 100000
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 110, 32)           3200000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 3,253,301
Trainable params: 3,253,301
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model.fit(X_train, Y_train, epochs=3, batch_size=64)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 95.29%


**LSTM with Dropout**

In [0]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
print(model.summary())

In [0]:
model.fit(X_train, Y_train, epochs=3, batch_size=64)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

**LSTM and Convolutional Neural Network **

In [0]:
# add Conv layer and max pooling layer
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 110, 32)           3200000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 110, 32)           3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 55, 32)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 55, 32)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 3,256,405
Trainable params: 3,256,405
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.fit(X_train, Y_train, epochs=3, batch_size=64)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 95.18%
