## Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

from textblob import TextBlob
from sklearn.metrics import accuracy_score

## Reading in the cleaned corpus

In [2]:
# Load the review corpus and normalise the two columns used below:
# "Date" is parsed to datetime and every "Review" is coerced to str.
df = pd.read_csv("data_sentiment.csv")
df = df.assign(
    Date=pd.to_datetime(df["Date"]),
    Review=df["Review"].map(str),
)

# NOTE: the unexecuted cells in the next section read a `reviews` frame that
# is only ever assigned by this commented-out line.
# reviews = pd.read_csv("./classified_data.csv")

# Getting a DataFrame of reviews by restaurants

In [None]:
# Unique restaurant names, in order of first appearance.
# dict.fromkeys de-duplicates in one pass while keeping insertion order,
# instead of re-scanning the growing list for every review row.
restaurant_names = list(dict.fromkeys(reviews["Restaurant"]))

print(len(restaurant_names))  # number of restaurants

In [None]:
# Map each restaurant name to a list of review records, one dict per review
# with the keys 'Review', 'Date', 'Label' and 'Stars'.
review_records = (
    reviews[["Review_clean", "Date", "Label", "Stars"]]
    .rename(columns={"Review_clean": "Review"})
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)

# Every known restaurant starts with an empty list, so a name that matches no
# row still gets an entry; the lists are then filled one group at a time
# rather than re-filtering the whole frame once per restaurant.
reviewsByRestaurant = {rest: [] for rest in restaurant_names}
for rest, group in review_records.groupby(reviews["Restaurant"], sort=False):
    if rest in reviewsByRestaurant:
        reviewsByRestaurant[rest] = group.to_dict("records")

# Print a size summary only: dumping the whole mapping floods the cell output.
print(
    f"{len(reviewsByRestaurant)} restaurants, "
    f"{sum(len(records) for records in reviewsByRestaurant.values())} reviews"
)

In [None]:
# Flatten the per-restaurant review records into one DataFrame, one row per review.
# Each record is copied with its restaurant name added, so the dicts stored in
# reviewsByRestaurant are not modified as a side effect of building the frame.
rows = [
    {**review_info, "Restaurant": rest}
    for rest in restaurant_names
    for review_info in reviewsByRestaurant[rest]
]

# Passing `columns` fixes the column order and keeps the frame well-formed
# (empty, but with the expected columns) when there are no reviews at all.
df = pd.DataFrame(rows, columns=["Restaurant", "Review", "Date", "Label", "Stars"])

df.head()

In [232]:
# Checking datatype for each column
df.dtypes

Restaurant            object
Review                object
Date          datetime64[ns]
Label                  int64
Stars                  int64
dtype: object

In [102]:
# Viewing dataset
df.head()

Unnamed: 0,Restaurant,Review,Date,Label,Stars
0,Happy Tummy,fresh ingredient friendly peep and so much che...,2016-04-06,1,5
1,Cibo Italiano,a small selection of italian wine by the glass...,2015-12-24,1,4
2,Cibo Italiano,cultural relevant singaporean cuisine in very ...,2018-08-19,4,5
3,Cibo Italiano,generous with the clam,2016-11-28,1,5
4,Yan kee Noodle House,the plus point is that the price remains the s...,2018-12-28,1,4


In [502]:
# Viewing specifications of dataset
df.shape

(1926, 9)

# Sentiment Analysis

In [3]:
from textblob import TextBlob
import numpy as np
import math
from sklearn.metrics import accuracy_score

In [4]:
# Sentiment polarity of each review, as scored by TextBlob.
# The text is selected by column name: a positional lookup such as row[1]
# silently reads a different field if the column order ever changes.
df["sentiment_polarity"] = df["Review"].apply(lambda text: TextBlob(text).sentiment.polarity)

In [5]:
# Viewing dataframe with sentiment scores
df.head()

Unnamed: 0,Restaurant,Review,Date,Label,Stars,sentiment_polarity
0,Happy Tummy,fresh ingredient friendly peep and so much che...,2016-04-06,1,5,0.000714
1,Cibo Italiano,a small selection of italian wine by the glass...,2015-12-24,1,4,0.15
2,Cibo Italiano,cultural relevant singaporean cuisine in very ...,2018-08-19,4,5,0.26375
3,Cibo Italiano,generous with the clam,2016-11-28,1,5,0.0
4,Yan kee Noodle House,the plus point is that the price remains the s...,2018-12-28,1,4,0.0


In [6]:
# Rescale a sentiment score so that -1 maps to 0 and +1 maps to 1
def adj_sentiment(sentiment):
    """Shift ``sentiment`` up by one and halve it: ``(sentiment + 1) / 2``.

    -1 becomes 0, 0 becomes 0.5 and 1 becomes 1.
    """
    shifted = sentiment + 1
    return shifted / 2

In [7]:
# Rescale every polarity score with adj_sentiment.
# The column is addressed by name rather than by position (row[5]), and the
# helper is applied to the whole column at once because it is plain arithmetic.
df["adjusted_sentiment"] = adj_sentiment(df["sentiment_polarity"])

In [8]:
# Viewing dataframe with adjusted sentiment scores
df.head()

Unnamed: 0,Restaurant,Review,Date,Label,Stars,sentiment_polarity,adjusted_sentiment
0,Happy Tummy,fresh ingredient friendly peep and so much che...,2016-04-06,1,5,0.000714,0.500357
1,Cibo Italiano,a small selection of italian wine by the glass...,2015-12-24,1,4,0.15,0.575
2,Cibo Italiano,cultural relevant singaporean cuisine in very ...,2018-08-19,4,5,0.26375,0.631875
3,Cibo Italiano,generous with the clam,2016-11-28,1,5,0.0,0.5
4,Yan kee Noodle House,the plus point is that the price remains the s...,2018-12-28,1,4,0.0,0.5


# Creating Date weights

In [9]:
df.shape

(1926, 7)

In [10]:
# Rank the reviews inside each (Restaurant, Label) group, newest first, so that
# rank 1 is the most recent review in the group. Sorting on Date explicitly
# makes the ranking independent of the row order of the input file.
df = df.sort_values(
    ['Restaurant', 'Label', 'Date'],
    ascending=[True, True, False],
)

# cumcount numbers the rows of each group 0, 1, 2, ... in frame order and
# returns one value per row of df; adding 1 makes the ranks 1-based.
rank = (df.groupby(['Restaurant', 'Label']).cumcount() + 1).tolist()
len(rank)

1926

In [11]:
# Weight of each review = reciprocal of its rank within its (Restaurant, Label) group
df["weights"] = 1.0 / np.asarray(rank)

In [12]:
df.head()

Unnamed: 0,Restaurant,Review,Date,Label,Stars,sentiment_polarity,adjusted_sentiment,weights
293,10 at Claymore,they had a small but good selection of dessert,2017-03-25,1,4,0.225,0.6125,1.0
294,10 at Claymore,and expensive tea you know the kind where the ...,2014-10-02,1,4,0.05,0.525,0.5
295,10 at Claymore,freshly made piping hot and generous filling n...,2014-10-02,2,4,0.4,0.7,1.0
297,10 at Claymore,the perfectly cooked meat gave way to a moist ...,2013-04-18,2,4,0.364416,0.682208,0.5
298,10 at Claymore,i love their penang style prawn noodle that co...,2012-08-19,2,3,0.158333,0.579167,0.333333


# Star Prediction for each Label

In [13]:
# Weighted average of `values` within each group defined by `by`
def grouped_weighted_avg(values, weights, by):
    """Return the per-group weighted mean: sum(values * weights) / sum(weights).

    Parameters
    ----------
    values : pandas.Series
        Numbers to average.
    weights : pandas.Series
        Weight of each value, aligned with ``values``.
    by : Series, label, or list/tuple of these
        Grouping key(s), handed to ``Series.groupby``. A tuple is converted
        to a list so that it always means "several keys".

    Returns
    -------
    pandas.Series
        One weighted mean per group, indexed by the group key(s). A group
        whose usable weights sum to zero (for example, every value in it is
        missing) yields NaN.
    """
    # groupby does not reliably read a tuple as a list of keys, so normalise it.
    if isinstance(by, tuple):
        by = list(by)

    # Drop the weight of rows whose value is missing; otherwise those rows add
    # nothing to the numerator but still inflate the denominator.
    observed_weights = weights.where(pd.notna(values))

    weighted_sum = (values * observed_weights).groupby(by).sum()
    weight_total = observed_weights.groupby(by).sum()
    return weighted_sum / weight_total

In [14]:
# Keep only the rows whose Label is not 5 (5 is not a preassigned class)
clean_data = df.loc[df["Label"].ne(5)]

In [15]:
# Weighted average of the adjusted sentiment for each (Restaurant, Label) pair,
# using the per-review "weights" column (1 / rank).
# The grouping keys are passed as a list: a tuple of keys is not a supported
# way to give several keys to groupby (note the warning under this cell).
label_sentiment = grouped_weighted_avg(
    clean_data["adjusted_sentiment"],
    clean_data["weights"],
    [clean_data["Restaurant"], clean_data["Label"]],
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [16]:
# Convert the [0, 1] sentiment average into a whole-star prediction.
# ceil(5 * s) is 0 when s is exactly 0, so the result is floored at 1 star
# to stay on the 1-5 scale.
label_sentiment = label_sentiment.apply(lambda score: max(1, math.ceil(score * 5)))

In [17]:
# Viewing the predicted stars for each predetermined aspect
label_sentiment.head()

Restaurant        Label
10 at Claymore    1        3
                  2        4
                  3        4
126 Eating House  3        5
2it & Drink       2        5
dtype: int64

In [18]:
# Viewing the shape of the predicted-stars Series (one entry per Restaurant/Label pair)
label_sentiment.shape

(824,)

In [19]:
# Predicted stars per (Restaurant, Label) pair as a plain list, in Series order
pred_label = list(label_sentiment)

In [20]:
# Weighted average of the star ratings for each (Restaurant, Label) pair,
# using the per-review "weights" column (1 / rank).
# The grouping keys are passed as a list: a tuple of keys is not a supported
# way to give several keys to groupby (note the warning under this cell).
label_star = grouped_weighted_avg(
    clean_data["Stars"],
    clean_data["weights"],
    [clean_data["Restaurant"], clean_data["Label"]],
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
label_star = label_star.apply((lambda x: math.ceil(x)))

In [22]:
label_star.head()

Restaurant        Label
10 at Claymore    1        4
                  2        4
                  3        5
126 Eating House  3        4
2it & Drink       2        5
dtype: int64

In [23]:
label_star.shape

(824,)

In [24]:
# Reference stars per (Restaurant, Label) pair as a plain list, in Series order
label_truth = list(label_star)

In [25]:
# Accuracy of sentiment predicted stars and the average stars for each label
accuracy_score(label_truth, pred_label)

0.316747572815534

# Overall Restaurant Star Prediction

In [44]:
# Weighted average of the adjusted sentiment per restaurant,
# using the per-review "weights" column
overall_sentiment = grouped_weighted_avg(
    values=clean_data["adjusted_sentiment"],
    weights=clean_data["weights"],
    by=clean_data["Restaurant"],
)

In [45]:
# Convert the [0, 1] sentiment average into a whole-star prediction.
# ceil(5 * s) is 0 when s is exactly 0, so the result is floored at 1 star
# to stay on the 1-5 scale.
overall_sentiment = overall_sentiment.apply(lambda score: max(1, math.ceil(score * 5)))

In [46]:
overall_sentiment.head()

Restaurant
10 at Claymore      4
126 Eating House    5
2it & Drink         5
328 Katong Laksa    4
8 Korean BBQ        4
dtype: int64

In [47]:
# Predicted stars per restaurant as a plain list, in Series order
pred_overall = list(overall_sentiment)

In [48]:
# Weighted average of the star ratings per restaurant,
# using the per-review "weights" column
overall_star = grouped_weighted_avg(
    values=clean_data["Stars"],
    weights=clean_data["weights"],
    by=clean_data["Restaurant"],
)

In [49]:
# Round each weighted star average up to a whole number of stars
overall_star = overall_star.apply(math.ceil)

In [50]:
overall_star.head()

Restaurant
10 at Claymore      5
126 Eating House    4
2it & Drink         5
328 Katong Laksa    3
8 Korean BBQ        4
dtype: int64

In [51]:
# Reference stars per restaurant as a plain list, in Series order
overall_truth = list(overall_star)

In [52]:
# Accuracy of sentiment predicted stars and the average stars overall
accuracy_score(overall_truth, pred_overall)

0.2916666666666667