In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
from wordcloud import WordCloud
import nltk
# NLTK data fetched up front: the stop-word list, WordNet plus its 'omw-1.4'
# companion (used by the lemmatizer demo) and the VADER lexicon (used by the
# sentiment analyzer). 'punkt' is fetched too, although no cell visible here
# appears to need it.
nltk_resources = [
    'stopwords',
    'punkt',
    'wordnet',
    'omw-1.4',
    'vader_lexicon',
]
nltk.download(nltk_resources)
%matplotlib inline

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Load the raw review table (relative path, resolved against the kernel's
# working directory) and preview the first rows.
reviews_csv = 'tourist_accommodation_reviews.csv'
tourist_accommodation_reviews = pd.read_csv(reviews_csv)
tourist_accommodation_reviews.head()

In [None]:
# A token is a run of ASCII letters, digits and apostrophes; every other
# character acts as a separator.
token_pattern = '[a-zA-Z0-9\']+'
tokenizer = nltk.tokenize.RegexpTokenizer(token_pattern)

In [None]:
# NLTK's English stop-word list, printed in full for inspection.
stopword_corpus = nltk.corpus.stopwords
stop_words = stopword_corpus.words('english')
print(stop_words)

In [None]:
# Side-by-side demo: WordNet lemmatization (given a part-of-speech tag per
# word) versus Porter stemming on the same six sample words.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()

words = ['cacti', 'sings', 'hopped', 'rocks', 'better', 'easily']
pos = ['n', 'v', 'v', 'n', 'a', 'r']  # POS tag passed along with the word at the same position

lemmatized_words = [
    lemmatizer.lemmatize(sample, pos=tag) for sample, tag in zip(words, pos)
]
stemmed_words = list(map(stemmer.stem, words))

print("Lemmatized words:", lemmatized_words)
print("Stemmed words:", stemmed_words)

In [None]:
# Built once and shared by every call; the previous version created a new
# tokenizer per review and a new stemmer per word.
_REVIEW_TOKENIZER = nltk.tokenize.RegexpTokenizer('[a-zA-Z0-9\']+')
_REVIEW_STEMMER = nltk.stem.PorterStemmer()
# Snapshot of the notebook-level `stop_words` list as a set for O(1) lookups.
_STOP_WORD_SET = frozenset(stop_words)


def preprocess_text(text):
    """Tokenize, lower-case, stop-word-filter and Porter-stem one review.

    Args:
        text: Raw review text. Any non-string value (presumably the NaN that
            pandas produces for an empty CSV field) yields an empty list
            instead of raising inside the tokenizer.

    Returns:
        list[str]: Stemmed, lower-cased tokens in their original order, with
        every token that appears in `stop_words` removed.
    """
    if not isinstance(text, str):
        return []
    lowered_tokens = (token.lower() for token in _REVIEW_TOKENIZER.tokenize(text))
    return [
        _REVIEW_STEMMER.stem(token)
        for token in lowered_tokens
        if token not in _STOP_WORD_SET
    ]

In [None]:
# Store the stemmed token list of every raw review in a new 'Review_Text'
# column, then preview the frame.
raw_reviews = tourist_accommodation_reviews['Review']
tourist_accommodation_reviews['Review_Text'] = raw_reviews.apply(preprocess_text)
tourist_accommodation_reviews.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the preprocessed reviews.
vectorizer = CountVectorizer()
# CountVectorizer takes one string per document, so each token list is
# re-joined with single spaces before fitting.
review_documents = tourist_accommodation_reviews['Review_Text'].map(' '.join)
term_counts = vectorizer.fit_transform(review_documents)
# NOTE: .toarray() materialises the full documents x vocabulary matrix in
# memory; acceptable for a modest corpus, costly for a large one.
x = pd.DataFrame(term_counts.toarray(), columns=vectorizer.get_feature_names_out())
x.head()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single VADER analyzer instance; the cells below call
# `sentiment.polarity_scores(...)` on each raw review.
sentiment = SentimentIntensityAnalyzer()

In [None]:
# Summary statistics of the review table before the sentiment score columns
# are added in the next cell.
tourist_accommodation_reviews.describe()

In [None]:
# Score every raw review exactly once with VADER, then fan the resulting
# dicts out into four columns. The previous version re-ran the analyzer
# separately for each of the four columns.
vader_scores = [
    sentiment.polarity_scores(review)
    for review in tourist_accommodation_reviews['Review']
]
for score_name in ('compound', 'neg', 'neu', 'pos'):
    # Plain lists are assigned positionally, exactly as before.
    tourist_accommodation_reviews[score_name] = [
        scores[score_name] for scores in vader_scores
    ]

In [None]:
# Preview again: the frame now also carries the 'Review_Text' token lists and
# the 'compound', 'neg', 'neu' and 'pos' score columns.
tourist_accommodation_reviews.head()

In [None]:
# Descriptive statistics restricted to the four VADER score columns.
vader_columns = ['compound', 'neg', 'neu', 'pos']
tourist_accommodation_reviews[vader_columns].describe()

In [None]:
# Histogram of the compound score over all reviews.
compound_scores = tourist_accommodation_reviews['compound']
sns.histplot(compound_scores)

In [None]:
# Histogram of the 'pos' score over all reviews.
positive_scores = tourist_accommodation_reviews['pos']
sns.histplot(positive_scores)

In [None]:
# Histogram of the 'neg' score over all reviews.
negative_scores = tourist_accommodation_reviews['neg']
sns.histplot(negative_scores)

In [None]:
# Per venue: number of reviews whose compound score is zero or below.
is_non_positive = tourist_accommodation_reviews['compound'] <= 0
venue_names = tourist_accommodation_reviews['Hotel/Restaurant name']
is_non_positive.groupby(venue_names).sum()

In [None]:
# Share (in %) of each venue's reviews whose compound score is <= 0. Note that
# exactly-neutral reviews (compound == 0) are counted as negative here.
# The mean of a boolean flag is that share, so a single groupby replaces the
# former sum/count pair, and the column is named explicitly instead of relying
# on the DataFrame constructor accepting an unnamed Series.
percent_negative = (
    (tourist_accommodation_reviews['compound'] <= 0)
    .groupby(tourist_accommodation_reviews['Hotel/Restaurant name'])
    .mean()
    .mul(100)
    .rename('% negative reviews')
    .to_frame()
    .sort_values(by='% negative reviews')
)

percent_negative

In [None]:
# Horizontal bars, one per venue (the frame's index), with bar length equal to
# the percentage of non-positive reviews.
venue_labels = percent_negative.index
sns.barplot(
    data=percent_negative,
    x='% negative reviews',
    y=venue_labels,
    color='c',
)

In [None]:
# Split one venue's reviews by the sign of the compound score:
# > 0 goes to the positive subset, <= 0 (neutral included) to the negative one.
selected_venue = 'Buffalo Steak House - Kata Plaza'
is_selected_venue = tourist_accommodation_reviews['Hotel/Restaurant name'] == selected_venue

tourist_accommodation_reviews_positive_subset = tourist_accommodation_reviews.loc[
    is_selected_venue & (tourist_accommodation_reviews['compound'] > 0), :
]
tourist_accommodation_reviews_negative_subset = tourist_accommodation_reviews.loc[
    is_selected_venue & (tourist_accommodation_reviews['compound'] <= 0), :
]

tourist_accommodation_reviews_positive_subset.head()

In [None]:
# Flatten the stemmed token lists of the selected venue's non-positive reviews.
neg_tokens = [
    word
    for review in tourist_accommodation_reviews_negative_subset['Review_Text']
    for word in review
]

# Bound to `neg_wordcloud` so the name `wordcloud` is no longer rebound from
# the package to a WordCloud instance; the `WordCloud` class itself comes from
# the import cell at the top of the notebook.
neg_wordcloud = WordCloud(background_color='white').generate_from_text(
    ' '.join(neg_tokens))

plt.figure(figsize=(12,12))
plt.imshow(neg_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Flatten the stemmed token lists of the selected venue's positive reviews.
pos_tokens = [
    word
    for review in tourist_accommodation_reviews_positive_subset['Review_Text']
    for word in review
]

# Fix: the cloud is built from `pos_tokens`. It was previously generated from
# `neg_tokens`, so this cell simply redrew the negative-review cloud.
pos_wordcloud = WordCloud(background_color='white').generate_from_text(
    ' '.join(pos_tokens))

plt.figure(figsize=(12,12))
plt.imshow(pos_wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
from nltk.probability import FreqDist
# Frequency distribution over the stemmed tokens of the positive subset;
# tabulate(10) prints the ten most frequent tokens with their counts.
pos_freqdist = FreqDist(pos_tokens)
pos_freqdist.tabulate(10)

In [None]:
from nltk.probability import FreqDist
# Frequency distribution over the stemmed tokens of the non-positive subset;
# tabulate(10) prints the ten most frequent tokens with their counts.
neg_freqdist = FreqDist(neg_tokens)
neg_freqdist.tabulate(10)

In [None]:
# Plot the 30 most frequent tokens of the positive subset, most frequent first.
pos_freqdist.plot(30)

In [None]:
# Plot the 30 most frequent tokens of the non-positive subset, most frequent first.
neg_freqdist.plot(30)