In [121]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/daru/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/daru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/daru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [122]:
reviews = pd.read_csv("data/reviews_with_touchpoints.csv")

reviews.head()

Unnamed: 0,review_date,average_score,reviewer_nationality,negative_review,positive_review,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,positive_touchpoints,negative_touchpoints
0,8/3/2017,7.1,United Kingdom,The car park was small and unpleasant People ...,The location was excellent for getting to the O2,3,7.9,"[' Leisure trip ', ' Group ', ' Standard Doubl...",[1],[8]
1,8/3/2017,7.1,United Kingdom,We weren t told that the only spa facility op...,The house keeping lady made my boyfriends day...,3,8.3,"[' Leisure trip ', ' Couple ', ' Standard Doub...","[7, 8]",[8]
2,8/2/2017,7.1,United Kingdom,I asked how far the O2 was and got told a 7 m...,No Positive,1,6.3,"[' Leisure trip ', ' Solo traveler ', ' Standa...",,
3,8/2/2017,7.1,United Kingdom,Hot stuffy room air con not working properly ...,The bed was OK,2,5.4,"[' Couple ', ' Standard Twin Room ', ' Stayed ...",[8],[8]
4,8/2/2017,7.1,United Kingdom,Although the price seems like it is cheap you...,The Receptionists and Cleaners are very polit...,4,6.3,"[' Leisure trip ', ' Family with young childre...","[4, 7, 8]","[8, 10, 11]"


Previous research revealed that touchpoint 8 is a pain point. Many negative reviews reference it. Let's find out why.

In [123]:
# Replace NaN in negative_touchpoints and positive_touchpoints with empty string
reviews["negative_touchpoints"] = reviews["negative_touchpoints"].fillna("")
reviews["positive_touchpoints"] = reviews["positive_touchpoints"].fillna("")

print(len(reviews))
reviews.head()


4789


Unnamed: 0,review_date,average_score,reviewer_nationality,negative_review,positive_review,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,positive_touchpoints,negative_touchpoints
0,8/3/2017,7.1,United Kingdom,The car park was small and unpleasant People ...,The location was excellent for getting to the O2,3,7.9,"[' Leisure trip ', ' Group ', ' Standard Doubl...",[1],[8]
1,8/3/2017,7.1,United Kingdom,We weren t told that the only spa facility op...,The house keeping lady made my boyfriends day...,3,8.3,"[' Leisure trip ', ' Couple ', ' Standard Doub...","[7, 8]",[8]
2,8/2/2017,7.1,United Kingdom,I asked how far the O2 was and got told a 7 m...,No Positive,1,6.3,"[' Leisure trip ', ' Solo traveler ', ' Standa...",,
3,8/2/2017,7.1,United Kingdom,Hot stuffy room air con not working properly ...,The bed was OK,2,5.4,"[' Couple ', ' Standard Twin Room ', ' Stayed ...",[8],[8]
4,8/2/2017,7.1,United Kingdom,Although the price seems like it is cheap you...,The Receptionists and Cleaners are very polit...,4,6.3,"[' Leisure trip ', ' Family with young childre...","[4, 7, 8]","[8, 10, 11]"


In [124]:

# Select reviews that have touchpoint 8 mentioned in negative_touchpoints

negative_reviews = reviews[reviews["negative_touchpoints"].str.contains("8")]

print(len(negative_reviews))
negative_reviews.head()

2851


Unnamed: 0,review_date,average_score,reviewer_nationality,negative_review,positive_review,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,positive_touchpoints,negative_touchpoints
0,8/3/2017,7.1,United Kingdom,The car park was small and unpleasant People ...,The location was excellent for getting to the O2,3,7.9,"[' Leisure trip ', ' Group ', ' Standard Doubl...",[1],[8]
1,8/3/2017,7.1,United Kingdom,We weren t told that the only spa facility op...,The house keeping lady made my boyfriends day...,3,8.3,"[' Leisure trip ', ' Couple ', ' Standard Doub...","[7, 8]",[8]
3,8/2/2017,7.1,United Kingdom,Hot stuffy room air con not working properly ...,The bed was OK,2,5.4,"[' Couple ', ' Standard Twin Room ', ' Stayed ...",[8],[8]
4,8/2/2017,7.1,United Kingdom,Although the price seems like it is cheap you...,The Receptionists and Cleaners are very polit...,4,6.3,"[' Leisure trip ', ' Family with young childre...","[4, 7, 8]","[8, 10, 11]"
5,8/2/2017,7.1,Ireland,The hotel has seen better days including the ...,My daughter and I were upgraded free of charg...,4,7.5,"[' Leisure trip ', ' Group ', ' Standard Doubl...","[2, 8, 10]",[8]


In [125]:
# Select reviews that have a reviewer_score markedly below the average_score

average_score = negative_reviews["average_score"].iloc[0] # 7.1 is the average score for all entries

negative_reviews = negative_reviews[negative_reviews["reviewer_score"] < (average_score - 1)]

print(len(reviews))
reviews.head()

4789


Unnamed: 0,review_date,average_score,reviewer_nationality,negative_review,positive_review,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,positive_touchpoints,negative_touchpoints
0,8/3/2017,7.1,United Kingdom,The car park was small and unpleasant People ...,The location was excellent for getting to the O2,3,7.9,"[' Leisure trip ', ' Group ', ' Standard Doubl...",[1],[8]
1,8/3/2017,7.1,United Kingdom,We weren t told that the only spa facility op...,The house keeping lady made my boyfriends day...,3,8.3,"[' Leisure trip ', ' Couple ', ' Standard Doub...","[7, 8]",[8]
2,8/2/2017,7.1,United Kingdom,I asked how far the O2 was and got told a 7 m...,No Positive,1,6.3,"[' Leisure trip ', ' Solo traveler ', ' Standa...",,
3,8/2/2017,7.1,United Kingdom,Hot stuffy room air con not working properly ...,The bed was OK,2,5.4,"[' Couple ', ' Standard Twin Room ', ' Stayed ...",[8],[8]
4,8/2/2017,7.1,United Kingdom,Although the price seems like it is cheap you...,The Receptionists and Cleaners are very polit...,4,6.3,"[' Leisure trip ', ' Family with young childre...","[4, 7, 8]","[8, 10, 11]"


In [126]:
negative_reviews = negative_reviews["negative_review"].tolist()

# Print the first 10 reviews
for review in negative_reviews[:10]:
    print(review)
    print()

 Hot stuffy room air con not working properly Filthy windows Wallpaper ripped and worn bedside furniture tatty carpets worn and damaged bathroom door latch worn making door difficult to open from the inside Shower from another era controlled by bath taps Poor shower curtain too resulting in wet floor Carpets in corridors worn out and tatty nasty musty aroma to these areas too All in all this place is around ten years overdue a complete renovation 

 Construction going on all around the hotel so many paths closed Had to pay extra to use hotel pool and spa which was a health hazard in itself Stairs leading to changing rooms and pool were filthy pool tiles didn t look like they had ever been cleaned wood was rotted in the sauna floor had food and hair on it outside The shower outside sauna had a hot and a cold button had to press both at same time in order not to scald or freeze yourself Pasta in pizzaria was microwaved Staff were incredibly rude in all areas of the hotel at one point my 

In [127]:
review_text = " ".join(negative_reviews)
print(len(review_text))

# Tokenize the text
tokens = nltk.word_tokenize(review_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
tokens = [w.lower() for w in tokens if len(w) > 1 and w not in stop_words and w.isalpha()]

print(len(tokens))

tokens[:25]

284491
29250


['hot',
 'stuffy',
 'room',
 'air',
 'con',
 'working',
 'properly',
 'filthy',
 'windows',
 'wallpaper',
 'ripped',
 'worn',
 'bedside',
 'furniture',
 'tatty',
 'carpets',
 'worn',
 'damaged',
 'bathroom',
 'door',
 'latch',
 'worn',
 'making',
 'door',
 'difficult']

In [128]:
# Lemmatize the tokens

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

counter = 0
for i in range(len(tokens)):
    if tokens[i] != lemmatized_tokens[i]:
        counter += 1
        print(tokens[i], lemmatized_tokens[i])
    
    if counter > 25:
        break

windows window
carpets carpet
taps tap
carpets carpet
corridors corridor
areas area
years year
paths path
stairs stair
rooms room
tiles tile
areas area
points point
vents vent
customers customer
beds bed
mice mouse
customers customer
areas area
mice mouse
occasions occasion
walls wall
toilets toilet
curtains curtain
sockets socket
phones phone


In [129]:
# Get the frequency distribution of the tokens
fdist = nltk.FreqDist(lemmatized_tokens)

fdist.most_common(50)

[('room', 1266),
 ('the', 738),
 ('hotel', 585),
 ('bed', 359),
 ('staff', 317),
 ('window', 277),
 ('old', 226),
 ('breakfast', 222),
 ('need', 199),
 ('night', 180),
 ('dirty', 173),
 ('poor', 171),
 ('wifi', 154),
 ('like', 150),
 ('would', 150),
 ('work', 147),
 ('could', 147),
 ('air', 143),
 ('bathroom', 143),
 ('tired', 140),
 ('one', 139),
 ('dated', 131),
 ('no', 129),
 ('stay', 124),
 ('free', 122),
 ('get', 117),
 ('shower', 115),
 ('time', 111),
 ('uncomfortable', 111),
 ('it', 111),
 ('carpet', 110),
 ('hot', 109),
 ('reception', 108),
 ('service', 107),
 ('pay', 105),
 ('day', 105),
 ('pool', 103),
 ('noisy', 98),
 ('very', 98),
 ('even', 96),
 ('also', 96),
 ('sleep', 94),
 ('cold', 89),
 ('water', 89),
 ('really', 86),
 ('door', 84),
 ('told', 83),
 ('u', 83),
 ('floor', 82),
 ('wall', 82)]

Relevant problems:
- Bedding and mattreses
- Windows
- Old style
- Wi-fi
- Cleaniness
- Noise
- Breakfast
- Pool

# Positive reviews

Let's look at the strong points of the facilities to get a clearer picture.

In [130]:
positive_reviews = reviews[reviews["positive_touchpoints"].str.contains("8")]

print(len(positive_reviews))
positive_reviews.head()

1815


Unnamed: 0,review_date,average_score,reviewer_nationality,negative_review,positive_review,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,positive_touchpoints,negative_touchpoints
1,8/3/2017,7.1,United Kingdom,We weren t told that the only spa facility op...,The house keeping lady made my boyfriends day...,3,8.3,"[' Leisure trip ', ' Couple ', ' Standard Doub...","[7, 8]",[8]
3,8/2/2017,7.1,United Kingdom,Hot stuffy room air con not working properly ...,The bed was OK,2,5.4,"[' Couple ', ' Standard Twin Room ', ' Stayed ...",[8],[8]
4,8/2/2017,7.1,United Kingdom,Although the price seems like it is cheap you...,The Receptionists and Cleaners are very polit...,4,6.3,"[' Leisure trip ', ' Family with young childre...","[4, 7, 8]","[8, 10, 11]"
5,8/2/2017,7.1,Ireland,The hotel has seen better days including the ...,My daughter and I were upgraded free of charg...,4,7.5,"[' Leisure trip ', ' Group ', ' Standard Doubl...","[2, 8, 10]",[8]
8,8/2/2017,7.1,United Kingdom,The guy that served us was not very professio...,The bed was nice you could smoke in the room ...,3,7.1,"[' Leisure trip ', ' Couple ', ' Standard Twin...","[2, 8]","[2, 8, 9]"


In [131]:
# Get the reviews that have a reviewer_score above the average_score

positive_reviews = positive_reviews[positive_reviews["reviewer_score"] > average_score]

print(len(positive_reviews))
positive_reviews.head()

1015


Unnamed: 0,review_date,average_score,reviewer_nationality,negative_review,positive_review,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,positive_touchpoints,negative_touchpoints
1,8/3/2017,7.1,United Kingdom,We weren t told that the only spa facility op...,The house keeping lady made my boyfriends day...,3,8.3,"[' Leisure trip ', ' Couple ', ' Standard Doub...","[7, 8]",[8]
5,8/2/2017,7.1,Ireland,The hotel has seen better days including the ...,My daughter and I were upgraded free of charg...,4,7.5,"[' Leisure trip ', ' Group ', ' Standard Doubl...","[2, 8, 10]",[8]
9,8/2/2017,7.1,United Kingdom,The building site next door Dirty windows spo...,Room overlooking Thames good bed and room Bre...,1,7.5,"[' Solo traveler ', ' Standard Double Room wit...","[2, 8, 10]",[8]
10,8/2/2017,7.1,United Kingdom,Access was a little tricky due to local const...,Excellent value for money Friendly staff lots...,1,8.8,"[' Business trip ', ' Solo traveler ', ' Stand...","[1, 8, 9, 10, 12]","[5, 6, 8]"
13,8/2/2017,7.1,United Kingdom,The lifts were badly adjusted with dangerous ...,Great location and spacious room but the whol...,23,7.5,"[' Business trip ', ' Solo traveler ', ' Stand...",[8],[8]


In [132]:
positive_reviews = positive_reviews["positive_review"].tolist()

# Print the first 10 reviews
for review in positive_reviews[:10]:
    print(review)
    print()

 The house keeping lady made my boyfriends day with how funny she was

 My daughter and I were upgraded free of charge to an executive room with a balcony over the water We enjoyed the quality and price in the Spice restaurant and I was delighted to find porridge for breakfast in the Jenny restaurant The electric fan was a welcome addition to the room 

 Room overlooking Thames good bed and room Breakfast was substantial and well prepared Location was perfect for my needs 

 Excellent value for money Friendly staff lots of amenities a broad range of dining options a Spa downstairs all just a stone s throw from Canary Wharf Have already recommended this to family friends for last minute work stopovers short breaks in case they need to stay in this part of London 

 Great location and spacious room but the whole place looks a bit tired 

 We liked everything about this hotel and was so so pleased you still have smoking rooms so good as my partner didn t have to leave me to go outside for

In [133]:
review_text = " ".join(positive_reviews)
print(len(review_text))

# Tokenize the text
tokens = nltk.word_tokenize(review_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
tokens = [w.lower() for w in tokens if len(w) > 1 and w not in stop_words and w.isalpha()]

print(len(tokens))

tokens[:25]

104027
11206


['the',
 'house',
 'keeping',
 'lady',
 'made',
 'boyfriends',
 'day',
 'funny',
 'my',
 'daughter',
 'upgraded',
 'free',
 'charge',
 'executive',
 'room',
 'balcony',
 'water',
 'we',
 'enjoyed',
 'quality',
 'price',
 'spice',
 'restaurant',
 'delighted',
 'find']

In [134]:
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

counter = 0
for i in range(len(tokens)):
    if tokens[i] != lemmatized_tokens[i]:
        counter += 1
        print(tokens[i], lemmatized_tokens[i])
    
    if counter > 25:
        break

boyfriends boyfriend
needs need
lots lot
amenities amenity
options option
friends friend
stopovers stopover
breaks break
looks look
rooms room
views view
needs need
carpets carpet
corners corner
drinks drink
us u
facilities facility
choices choice
sleeps sleep
days day
lines line
lots lot
works work
beds bed
shops shop
restaurants restaurant


In [135]:
# Get the frequency distribution of the tokens
fdist = nltk.FreqDist(lemmatized_tokens)

fdist.most_common(50)

[('room', 699),
 ('the', 337),
 ('good', 326),
 ('location', 301),
 ('hotel', 287),
 ('staff', 269),
 ('clean', 248),
 ('great', 228),
 ('view', 196),
 ('nice', 193),
 ('comfortable', 162),
 ('bed', 131),
 ('friendly', 115),
 ('excellent', 99),
 ('lovely', 95),
 ('canary', 91),
 ('wharf', 91),
 ('value', 90),
 ('stay', 88),
 ('window', 86),
 ('helpful', 85),
 ('price', 83),
 ('facility', 83),
 ('breakfast', 73),
 ('spacious', 69),
 ('restaurant', 65),
 ('really', 63),
 ('we', 62),
 ('upgraded', 61),
 ('money', 61),
 ('very', 60),
 ('size', 59),
 ('tube', 58),
 ('london', 56),
 ('comfy', 56),
 ('would', 55),
 ('big', 54),
 ('bar', 54),
 ('close', 52),
 ('well', 51),
 ('amazing', 49),
 ('bathroom', 45),
 ('area', 44),
 ('one', 42),
 ('station', 42),
 ('large', 39),
 ('pool', 38),
 ('easy', 38),
 ('beautiful', 38),
 ('everything', 37)]

Strong Points:

- Staff
- Views/location
- Price