## Data processing

In [41]:
import pandas as pd
cafe_reviews = pd.read_parquet("cafe_reviews.parquet")

In [42]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.casual import TweetTokenizer
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize, WhitespaceTokenizer, RegexpTokenizer
from nltk.corpus import words
from nltk.tokenize import SyllableTokenizer

# download package
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('words')

# upload stopwords and string
punctuations = set(string.punctuation)
english_words = set(words.words())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [43]:
cafe_reviews.head()

Unnamed: 0,review_id,business_id,user_id,stars,text,date,name,review_count,yelping_since,cafe_review
26696,uLsgFiVJJ25kL210IZ_ubA,oT5Bidkfa7cGOp1806ryXQ,Ne3DSl7bmpvjqEeXD2BQ9A,4,Good Starbucks location and easy to get in and...,2016-07-15 21:29:03,Keshia,139.0,2011-10-22 10:54:54,True
34514,JHlC1TR7n-pDaNkjQb7jdA,oT5Bidkfa7cGOp1806ryXQ,cLz_xjT_-5EcOShmBB-4gQ,5,Wonderful staff here really make the visit eve...,2017-08-09 11:38:02,Jason,20.0,2013-10-30 19:46:03,True
40368,qK416ZpKOiPobkvbcwxY6w,oT5Bidkfa7cGOp1806ryXQ,s31DfJUYKTWyPtXmTFx0NQ,1,Ordered 2 coffees and a muffin. Got one coffee...,2017-11-19 16:20:16,Robin,32.0,2016-06-10 02:52:08,True
72933,Ju52qzbADpR7EsS6xLUclg,KPnJ3hdHljTWbsq_mx5oZQ,pNgfY8VRnXRqJswwU99gnQ,2,Thanks to extremely poor customer service that...,2012-10-15 17:20:19,Kris,233.0,2012-09-23 03:41:57,True
79525,yqqhQ6ex6ZdbotwnYaDadg,oT5Bidkfa7cGOp1806ryXQ,q1XsSG9XtLoIKBFOgfej_w,5,This is probably my favorite Starbucks. It's r...,2017-03-31 14:23:56,Leah,177.0,2013-06-26 15:57:38,True


In [44]:
cafe_reviews.drop(columns=["review_id", "business_id", "user_id", "date", "name", "review_count", "yelping_since", "cafe_review"], inplace=True)
cafe_reviews.head()

Unnamed: 0,stars,text
26696,4,Good Starbucks location and easy to get in and...
34514,5,Wonderful staff here really make the visit eve...
40368,1,Ordered 2 coffees and a muffin. Got one coffee...
72933,2,Thanks to extremely poor customer service that...
79525,5,This is probably my favorite Starbucks. It's r...


In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cafe_reviews["text"], cafe_reviews["stars"], test_size=0.2, random_state=42)

## Text Cleaner

In [46]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize, TreebankWordDetokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
import string
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def cleanup_text(sentence):
  # First, word tokenize.
  tokenized_sms_messages = word_tokenize(sentence)

  # Lower case
  tokenized_sms_messages = [word.lower() for word in tokenized_sms_messages]

  # Remove punctuation
  tokenized_sms_messages = [word for word in tokenized_sms_messages if word not in string.punctuation]

  # stem
  # tokenized_sms_messages = [ps.stem(word) for word in tokenized_sms_messages]
    
  # Remove stop words
  tokenized_sms_messages = [word for word in tokenized_sms_messages if word not in stop_words]

  # put it back together
  tokenized_sms_messages = TreebankWordDetokenizer().detokenize(tokenized_sms_messages)

  return tokenized_sms_messages

## NLP Model：TextBlob to directly tell the sentiment

In [48]:
cafe_reviews["cleaned_text"] = cafe_reviews["text"].apply(cleanup_text)

In [49]:
cafe_reviews.head()

Unnamed: 0,stars,text,cleaned_text
26696,4,Good Starbucks location and easy to get in and...,good starbucks location easy get location larg...
34514,5,Wonderful staff here really make the visit eve...,wonderful staff really make visit every time c...
40368,1,Ordered 2 coffees and a muffin. Got one coffee...,ordered 2 coffees muffin got one coffee muffin...
72933,2,Thanks to extremely poor customer service that...,thanks extremely poor customer service driven ...
79525,5,This is probably my favorite Starbucks. It's r...,probably favorite starbucks's really chill loc...


### Apply TextBlob Sentiment Analysis

In [50]:
!pip install textblob



In [51]:
from textblob import TextBlob

cafe_reviews["sentiment"] = cafe_reviews["cleaned_text"].apply(lambda review: TextBlob(review).sentiment.polarity)
cafe_reviews['Sentiment_Category'] = cafe_reviews["sentiment"].apply(
    lambda polarity: 'Positive' if polarity > 0.1 else ('Negative' if polarity < 0.05 else 'Neutral'))

In [52]:
cafe_reviews.head()

Unnamed: 0,stars,text,cleaned_text,sentiment,Sentiment_Category
26696,4,Good Starbucks location and easy to get in and...,good starbucks location easy get location larg...,0.441667,Positive
34514,5,Wonderful staff here really make the visit eve...,wonderful staff really make visit every time c...,0.510185,Positive
40368,1,Ordered 2 coffees and a muffin. Got one coffee...,ordered 2 coffees muffin got one coffee muffin...,-0.044444,Negative
72933,2,Thanks to extremely poor customer service that...,thanks extremely poor customer service driven ...,0.0825,Neutral
79525,5,This is probably my favorite Starbucks. It's r...,probably favorite starbucks's really chill loc...,0.428571,Positive


## Training KNN Sentiment Analysis Model to help future analysis

### Clean up the contradictory comments （ low stars but positive /. high stars but negative)

In [53]:
df_valid_review = cafe_reviews.copy()
df_valid_review.drop(columns=["text"], inplace=True)
df_valid_review.head()

Unnamed: 0,stars,cleaned_text,sentiment,Sentiment_Category
26696,4,good starbucks location easy get location larg...,0.441667,Positive
34514,5,wonderful staff really make visit every time c...,0.510185,Positive
40368,1,ordered 2 coffees muffin got one coffee muffin...,-0.044444,Negative
72933,2,thanks extremely poor customer service driven ...,0.0825,Neutral
79525,5,probably favorite starbucks's really chill loc...,0.428571,Positive


In [54]:
df_valid_review["Sentiment_Category"] = df_valid_review["Sentiment_Category"].astype(str)

df_valid_review = df_valid_review[~(
    ((df_valid_review["Sentiment_Category"] == "Positive") & (df_valid_review["stars"] < 3)) |
    ((df_valid_review["Sentiment_Category"] == "Negative") & (df_valid_review["stars"] > 3))
)]
df_valid_review.head()

Unnamed: 0,stars,cleaned_text,sentiment,Sentiment_Category
26696,4,good starbucks location easy get location larg...,0.441667,Positive
34514,5,wonderful staff really make visit every time c...,0.510185,Positive
40368,1,ordered 2 coffees muffin got one coffee muffin...,-0.044444,Negative
72933,2,thanks extremely poor customer service driven ...,0.0825,Neutral
79525,5,probably favorite starbucks's really chill loc...,0.428571,Positive


### Train model

In [55]:
df_valid_review.head()

Unnamed: 0,stars,cleaned_text,sentiment,Sentiment_Category
26696,4,good starbucks location easy get location larg...,0.441667,Positive
34514,5,wonderful staff really make visit every time c...,0.510185,Positive
40368,1,ordered 2 coffees muffin got one coffee muffin...,-0.044444,Negative
72933,2,thanks extremely poor customer service driven ...,0.0825,Neutral
79525,5,probably favorite starbucks's really chill loc...,0.428571,Positive


In [56]:
df_valid_review = df_valid_review.drop(['sentiment', 'Sentiment_Category'], axis=1)
df_valid_review.head(1)

Unnamed: 0,stars,cleaned_text
26696,4,good starbucks location easy get location larg...


In [57]:
df_valid_review["sentiment"] = df_valid_review["stars"].apply(lambda x: "positive" if x >=3 else "negative") # different criteria may apply here

In [58]:
import gensim.downloader as api

# Load the pretrained model
pretrained_model = api.load('glove-wiki-gigaword-50')

import numpy as np

vector_size = pretrained_model.vector_size  # Get the embedding size

tokenized_reviews = df_valid_review["cleaned_text"]

embeddings = list(map(lambda tokenized_review: pretrained_model.get_mean_vector(tokenized_review) if len(tokenized_review) > 0 else np.zeros(vector_size), tokenized_reviews))
     

In [59]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pandas as pd

X= df_valid_review["cleaned_text"]
y= df_valid_review["sentiment"]
def assess_model(df, X, y_column):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  # train the model
  classifier = KNeighborsClassifier()
  classifier.fit(X_train, y_train)

  # Predict on the test data
  y_pred = classifier.predict(X_test)

  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  f1_score = sklearn.metrics.f1_score(y_test, y_pred, pos_label="positive")
  print(f"Accuracy: {accuracy}")
  print(f"f1_score: {f1_score}")
  print(sklearn.metrics.classification_report(y_test,y_pred))
  display(pd.DataFrame(confusion_matrix(y_test, y_pred, normalize='true'), columns=classifier.classes_, index=classifier.classes_ ))


In [60]:
assess_model(cafe_reviews, embeddings, "sentiment")

Accuracy: 0.6342857142857142
f1_score: 0.7217391304347827
              precision    recall  f1-score   support

    negative       0.49      0.44      0.47        63
    positive       0.70      0.74      0.72       112

    accuracy                           0.63       175
   macro avg       0.60      0.59      0.59       175
weighted avg       0.63      0.63      0.63       175



Unnamed: 0,negative,positive
negative,0.444444,0.555556
positive,0.258929,0.741071
