# Hotel Review Sentiment Analysis


### Importing the relevant libraries

In [1]:
import numpy as np
import re
import pickle 
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
raw_data = pd.read_csv('Hotel_Reviews.csv')
raw_data.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


### Copy data to ensure original data is not modified

In [3]:
df = raw_data.copy()

In [4]:
df['Review'] = df.Negative_Review + df.Positive_Review
df['Review'][2]

' Rooms are nice but for elderly a bit difficult as most rooms are two story with narrow steps So ask for single level Inside the rooms are very very basic just tea coffee and boiler and no bar empty fridge  Location was good and staff were ok It is cute hotel the breakfast range is nice Will go back '

### Create the target and drop all columns that are not relevant

* Reviewer_score lower than 5 will be considered negative
* Reviewer_score equal or higher than 5 will be consider positive

In [5]:
# create the target
df['Target'] = df["Reviewer_Score"].apply(lambda x: 0 if x < 5 else 1)

df = df[['Review', 'Target']]

df.head()

Unnamed: 0,Review,Target
0,I am so angry that i made this post available...,0
1,No Negative No real complaints the hotel was g...,1
2,Rooms are nice but for elderly a bit difficul...,1
3,My room was dirty and I was afraid to walk ba...,0
4,You When I booked with your company on line y...,1


In [6]:
X, y = df.Review, df.Target

In [7]:
print('Total number of rows: ', df.shape[0])
print('Total number of positive reviews: ', y.sum())
print('Percentage of positive reivews:', y.sum()/df.shape[0])

Total number of rows:  515738
Total number of positive reviews:  493457
Percentage of positive reivews: 0.9567978314570577


### From the above output, the dataset is highly imbalance. We will have to balance in such a way it is approximately 50% positive and negative

In [8]:
#remove excess 0s
one_counter = 0
counter = 0
indices_to_remove =[]


for index, row in df.iterrows():
    if row['Target'] == 1:
        one_counter+=1
        if one_counter >= (df.shape[0] - df.Target.sum()):
            indices_to_remove.append(index)
    
df_balanced = df.drop(indices_to_remove)
df_balanced.reset_index(inplace=True, drop=True)

#check if targets are balance (approx. 50%)

print(df_balanced['Target'].sum())
print(df_balanced['Target'].shape[0])
print(df_balanced['Target'].sum()/df_balanced['Target'].shape[0])

22280
44561
0.49998877942595543


In [9]:
X = df_balanced['Review']
y = df_balanced['Target']

### Text cleaning
* Removing white spaces, punctuations, single letter words

In [10]:
# Creating the corpus

def text_cleaner(X):
    corpus = []
    for i in range(0, len(X)):
        review = re.sub(r'\W', ' ', str(X[i]))
        review = review.lower()
        review = re.sub(r'^br$', ' ', review)
        review = re.sub(r'\s+br\s+',' ',review)
        review = re.sub(r'\s+[a-z]\s+', ' ',review)
        review = re.sub(r'^b\s+', '', review)
        review = re.sub(r'\s+', ' ', review)
        corpus.append(review)
    return X

corpus = text_cleaner(X)

### Prior to lemmatizing, we have to take a part of speech parameter, “pos”.

### If not supplied, the default is “noun”. We will make a function to solve it

In [11]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

### In lemmatisation, the part of speech of a word should be first determined and will return the dictionary form of a word, which must be a valid word.

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
lemmatizer = WordNetLemmatizer()

# Lemmatization
pos_tags = pos_tag(corpus)
corpus = [WordNetLemmatizer().lemmatize(text[0], get_wordnet_pos(text[1])) for text in pos_tags]

### Creating the Tf-Idf model

In [13]:
# Creating the Tf-Idf model directly
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 6000, min_df = 3, max_df = 0.6, stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

### Splitting the dataset into training and test set

In [14]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
text_train, text_test, sent_train, sent_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Training the model

In [15]:
# Training the classifier
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(text_train,sent_train)

LogisticRegression()

### Testing model performance

In [16]:
sent_pred = model.predict(text_test)

In [17]:
model.score(text_test, sent_test)

0.8633456748569506

In [22]:
sample = [text_cleaner("""The room was simple, quite small and had basic equipments. The room was also clean. 
The main advantage of the room was the balcony which offers a really nice view on the street. 
The breakfast was also simple. We preferred to take it outside as the continental breakfast was not really suitable for us. 
At last, the location is really great, in the Chinatown Food Street and close to the Chinatown train station. 
The only problem is the street which is quite noisy until late night.""")]

sample = vectorizer.transform(sample).toarray()
sentiment = model.predict(sample)
sentiment

array([1], dtype=int64)