## Import

In [None]:
import pandas as pd

# Load the processed hotel reviews into a DataFrame. The path is relative to
# the notebook's working directory. The tokenization cell reads the
# 'review_text' column of this frame.
data = pd.read_json('../data/processed/hotel_reviews_all.json')

# Preview the first rows to check that the file was parsed as expected.
data.head()

## Word segmentation

In [None]:
from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Fetch the tokenizer models and the stopword lists used below.
# NOTE(review): recent NLTK releases appear to load the sentence tokenizer
# from the 'punkt_tab' resource rather than 'punkt' -- confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

# Built once, as a set: the membership test below runs for every token of
# every review, so a per-review list rebuild and linear scan add up quickly.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))

# Translation table that turns '.', ',', '<' and '>' into spaces in one pass.
punctuation_to_space = str.maketrans({mark: ' ' for mark in '.,<>'})

# word -> number of occurrences over all reviews. Keys are case-sensitive:
# only the stopword test lowercases the token.
words_in_reviews = Counter()

# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for review_text in data['review_text']:
    if not isinstance(review_text, str):
        # Missing review (None / NaN): nothing to tokenize.
        continue

    for sentence in sent_tokenize(review_text, "english"):
        sentence = sentence.translate(punctuation_to_space)
        # Count every token of the sentence that is not an English stopword.
        words_in_reviews.update(
            word
            for word in word_tokenize(sentence, "english")
            if word.lower() not in stopwords
        )

# One row per distinct word, in first-seen order.
word_counts = pd.DataFrame(
    list(words_in_reviews.items()),
    columns=["word", "count"],
)

# NOTE: the content is CSV despite the '.json' extension. The file name is
# kept as is because the analysis cell reads it back from this exact path.
word_counts.to_csv('../data/analysis/words.json', index=False)


# Analysis

In [None]:

# Reload the per-word counts written by the word segmentation cell (a CSV file
# despite its '.json' extension).
#
# keep_default_na=False is required here: by default read_csv turns tokens
# such as "NA", "N/A", "null" or "nan" into float NaN, which would corrupt the
# 'word' column and break any later string operation on it.
words = pd.read_csv(
    '../data/analysis/words.json',
    dtype={"word": str, "count": int},
    keep_default_na=False,
)

# Preview the first rows.
words.head()

In [None]:
# (word, count) pairs ordered from the most to the least frequent word.
# Zipping the two columns avoids a label lookup per row and does not rely on
# the frame's index labels. The sort is stable, so words with equal counts
# keep their order from the file.
word_count = sorted(
    zip(words["word"], words["count"]),
    key=lambda pair: pair[1],
    reverse=True,
)

# The 10 most common words
print(word_count[:10])

## Plot Word Cloud


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Number of distinct words (taken from the top of the ranking) fed to the cloud
limit = 200

# WordCloud.generate() expects raw text, so every selected word is repeated
# as many times as it was counted.
cloud_text = ' '.join(
    (word + " ") * count for word, count in word_count[:limit]
)

cloud_builder = WordCloud(
    width=800,
    height=800,
    collocations=False,
    background_color='white',
    min_font_size=10,
)
wordcloud = cloud_builder.generate(cloud_text)

# Draw the cloud on a square figure with no axes and no padding
fig = plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

# Saved through the figure reference, so this still works after show()
fig.savefig('../data/plots/wordcloud.png')