# Data

### Libraries

In [None]:
# import libraries

import nltk
import re
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

# fetch the NLTK resources requested by this notebook
nltk.download('stopwords')  # loaded below through stopwords.words('english')
nltk.download('punkt')  # NOTE(review): no tokenizer call is visible in this notebook - confirm it is still needed
nltk.download('wordnet')  # presumably the data backing WordNetLemmatizer - verify

### Data preparation

In [None]:
def load_filtered_data(city):
    '''
    Read every filtered CSV table of a city into a dictionary of dataframes

    city: prefix of the files stored in ../filtered_cities/ (e.g. 'St. Louis')
    returns: dict with the keys 'business', 'checkin', 'review', 'tip' and 'user',
             each mapped to the dataframe read from '<city>_<key>.csv'
    '''
    tables = ('business', 'checkin', 'review', 'tip', 'user')
    return {
        table: pd.read_csv(f'../filtered_cities/{city}_{table}.csv')
        for table in tables
    }

In [None]:
# load data: read all filtered tables of the chosen city into `data`
# (a dict of dataframes keyed by table name, see load_filtered_data)

city = 'St. Louis'
data = load_filtered_data(city)

In [None]:
# review data: print a summary (columns, dtypes, non-null counts) of the review table

data['review'].info()

In [None]:
# distribution of review ratings: one bar per star value, ordered by star value

data['review']['stars'].value_counts().sort_index().plot(kind='bar')
plt.title('Distribution of Review Ratings')
plt.show()

In [None]:
def sentiment(rating):
    '''
    Map a star rating to a sentiment value

    rating: star rating of a review
    returns: 1 for 4 or 5 stars (positive), 0 for 3 stars (neutral),
             -1 for 1 or 2 stars (negative), None for any other value
    '''
    if rating in (4, 5):
        return 1
    if rating == 3:
        return 0
    if rating in (1, 2):
        return -1
    # anything outside the 1-5 scale has no sentiment
    return None

In [None]:
# review table extended with a sentiment column derived from the star rating;
# `assign` returns a new dataframe, so data['review'] itself is left unchanged

reviews = data['review'].assign(
    sentiment=lambda frame: frame['stars'].apply(sentiment)
)

# keep only the columns needed for the sentiment analysis
review_df = reviews.loc[:, ['review_id', 'text', 'sentiment']]

review_df

In [None]:
# distribution of sentiment values

# value_counts() only lists the classes that actually occur; reindexing to the
# three expected values (missing ones counted as 0) guarantees that the bars
# always appear in the order negative / neutral / positive and therefore match
# the fixed tick labels below
sentiment_counts = review_df['sentiment'].value_counts().reindex([-1, 0, 1], fill_value=0)

sentiment_counts.plot(kind='bar')
plt.title('Distribution of Review Sentiments')
plt.xticks(ticks=[0, 1, 2], labels=['negative', 'neutral', 'positive'])
plt.show()

In [None]:
# save review data (review_id, text, sentiment) without the dataframe index;
# the path is a plain string since it contains no placeholders

review_df.to_csv('data/review_sentiment.csv', index=False)

### Text processing

In [None]:
# show the raw text of the first five reviews, each preceded by a separator line

for review in review_df['text'].head(5).values:
    print('-' * 50, review, sep='\n')

In [None]:
# common negation words, used by process_review to tag the word that follows them
# NOTE(review): forms such as "doesn't", "shouldn't" or "weren't" are not listed - confirm whether that is intended

negation_words = [
    "don't", "no", "none", "never", "nowhere",
    "neither", "nor", "not", "cannot", "didn't",
    "couldn't", "wouldn't", "won't", "can't", "aren't",
    "isn't", "wasn't", "haven't", "hasn't", "hadn't",
]

In [None]:
def process_review(text, lemmatizer, sw, negation_handling=True, negations=None):
    '''
    Process a review text: keep only letters, apostrophes and exclamation marks,
    convert to lowercase, remove stopwords, lemmatize and handle negation

    text: raw review text
    lemmatizer: object with a lemmatize(word) method (e.g. WordNetLemmatizer)
    sw: collection of stopwords to remove
    negation_handling: if True, the word following a negation word is prefixed with 'NOT_'
    negations: negation words to use; defaults to the module-level negation_words
    returns: the processed tokens joined by single spaces

    Negation words are removed from the result in every case; each exclamation
    mark is kept as a separate '!' token.
    '''
    if negations is None:
        negations = negation_words
    # set for constant-time membership tests
    negations = set(negations)

    text = re.sub(r"[^a-zA-Z!']", ' ', text)
    # surround exclamation marks with spaces so that each becomes its own token
    text = text.replace('!', ' ! ')
    words = text.lower().split()

    # negation words must survive the stopword filter: common stopword lists
    # contain many of them ("not", "no", "don't", ...), and removing them here
    # would leave nothing for the negation handling below to act on
    words = [word for word in words if word in negations or word not in sw]
    # negation words are left untouched so that they still match the list
    words = [word if word in negations else lemmatizer.lemmatize(word) for word in words]

    if negation_handling:
        # the last word has no successor, so it can never trigger a tag
        for i in range(len(words) - 1):
            if words[i] in negations:
                words[i + 1] = 'NOT_' + words[i + 1]

    # the negation words themselves are dropped; their meaning is carried by the tag
    words = [word for word in words if word not in negations]

    return ' '.join(words)
# TODO: investigate whether there are better ways to process reviews (e.g. maintain exclamation marks?) and to handle negation

In [None]:
# stemmer, lemmatizer and stopwords

ps = PorterStemmer()  # NOTE(review): not used by any cell visible here - confirm it is still needed
lemmatizer = WordNetLemmatizer()  # passed to process_review below
sw = set(stopwords.words('english'))  # set of NLTK's English stopwords, passed to process_review below

In [None]:
# text processing: build the processed corpus, overall and per sentiment class

corpus = []
corpus_negative = []
corpus_neutral = []
corpus_positive = []

# iterate over the two columns in lockstep instead of indexing by position:
# this does not depend on the dataframe having a 0..n-1 index, and the loop
# variable is called `label` so that it does not overwrite the sentiment()
# function defined above
for raw_text, label in zip(review_df['text'], review_df['sentiment']):
    review = process_review(raw_text, lemmatizer, sw)
    corpus.append(review)
    if label == -1:
        corpus_negative.append(review)
    elif label == 0:
        corpus_neutral.append(review)
    elif label == 1:
        corpus_positive.append(review)

In [None]:
# show the first five processed reviews, each preceded by a separator line

for review in corpus[:5]:
    print('-' * 50, review, sep='\n')

In [None]:
# save corpus data: one processed review per line

with open('data/corpus.txt', 'w') as f:
    f.writelines(processed + '\n' for processed in corpus)

### Wordclouds

In [None]:
# global word cloud: built with default WordCloud settings from all processed reviews joined into one string

wordcloud = WordCloud().generate(" ".join(corpus))

# show the word cloud as an image in a new figure, without axes
plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# word cloud for negative reviews (sentiment -1), built with default WordCloud settings

wordcloud = WordCloud().generate(" ".join(corpus_negative))

# show the word cloud as an image in a new figure, without axes
plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# word cloud for neutral reviews (sentiment 0), built with default WordCloud settings

wordcloud = WordCloud().generate(" ".join(corpus_neutral))

# show the word cloud as an image in a new figure, without axes
plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# word cloud for positive reviews (sentiment 1), built with default WordCloud settings

wordcloud = WordCloud().generate(" ".join(corpus_positive))

# show the word cloud as an image in a new figure, without axes
plt.figure()
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

### Frequency distributions

In [None]:
# frequency distribution of words: flatten each corpus into its list of
# tokens, then count the tokens, overall and per sentiment class

all_words = [token for review_text in corpus for token in review_text.split()]
negative_words = [token for review_text in corpus_negative for token in review_text.split()]
neutral_words = [token for review_text in corpus_neutral for token in review_text.split()]
positive_words = [token for review_text in corpus_positive for token in review_text.split()]

fd = nltk.FreqDist(all_words)
fd_negative = nltk.FreqDist(negative_words)
fd_neutral = nltk.FreqDist(neutral_words)
fd_positive = nltk.FreqDist(positive_words)

In [None]:
# ten most common words across all processed reviews

fd.most_common(10)

In [None]:
# ten most common words in the processed negative reviews

fd_negative.most_common(10)

In [None]:
# ten most common words in the processed neutral reviews

fd_neutral.most_common(10)

In [None]:
# ten most common words in the processed positive reviews

fd_positive.most_common(10)