<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Libraries" data-toc-modified-id="Import-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Libraries</a></span></li><li><span><a href="#Train-the-model-on-twitter-data" data-toc-modified-id="Train-the-model-on-twitter-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Train the model on twitter data</a></span></li><li><span><a href="#Sentiment-Analysis-for-The-Bottom-Line" data-toc-modified-id="Sentiment-Analysis-for-The-Bottom-Line-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentiment Analysis for The Bottom Line</a></span></li><li><span><a href="#Result" data-toc-modified-id="Result-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Result</a></span></li><li><span><a href="#Restuarant-reviews" data-toc-modified-id="Restuarant-reviews-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Restuarant reviews</a></span></li><li><span><a href="#Future-work" data-toc-modified-id="Future-work-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Future work</a></span></li></ul></div>

# Import Libraries

In [135]:
import numpy as np 
import pandas as pd

import nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import FreqDist
import random
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

# Train the model on twitter data

In [2]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [3]:
positive_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [4]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]
print(tweet_tokens)

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [5]:
print(pos_tag(tweet_tokens))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [6]:
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

In [7]:
print(lemmatize_sentence(tweet_tokens))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [8]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [9]:
stop_words = stopwords.words('english')

print(remove_noise(tweet_tokens, stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [10]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [11]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [12]:
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [13]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)


In [14]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset
len(dataset)
random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]


In [15]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.994
Most Informative Features
                      :( = True           Negati : Positi =   2050.1 : 1.0
                      :) = True           Positi : Negati =   1001.2 : 1.0
                     sad = True           Negati : Positi =     24.7 : 1.0
                     bam = True           Positi : Negati =     22.6 : 1.0
                follower = True           Positi : Negati =     22.5 : 1.0
                 welcome = True           Positi : Negati =     20.4 : 1.0
                    sick = True           Negati : Positi =     20.1 : 1.0
                     via = True           Positi : Negati =     17.2 : 1.0
                     x15 = True           Negati : Positi =     16.1 : 1.0
                      aw = True           Negati : Positi =     14.8 : 1.0
None


In [16]:
custom_tweet = "Enjoying this Costa Rica evokes the pleasure of walking into a bakery in winter having any aromas of warming spices and rich cocoa"

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

Positive


# Sentiment Analysis for The Bottom Line

In [32]:
df = pd.read_csv('cleaned_coffee_reviews.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Coffee Name,Roaster Name,Roaster Location,Coffee Origin,Rating,Roast Level,Review Date,Price,...,Flavor,Body,Aroma,Aftertaste,Acidity,Bottom Line,Prices Per Oz,Agtron Whole,Agtron Ground,Coffee Country
0,0,0,Costa Rica Luis Campos Anaerobic,Revel Coffee,"Billings, Montana","Tarrazu, Costa Rica",94,Medium-Light,April 2020,$23.00/12 ounces,...,9,9,9,8,9.0,Enjoying this Costa Rica evokes the pleasures ...,1.92,58,76,Costa Rica
1,1,1,Colombia Granja La Esperanza Tres Dragones,PT's Coffee Roasting Co.,"Topeka, Kansas","Valle del Cauca growing region, Colombia",94,Medium-Light,April 2020,$23.50/12 ounces,...,9,9,9,8,9.0,A bombshell of a coffee with enough personalit...,1.96,60,74,Colombia
2,2,3,Colombia Finca La Loma Microlot,BeanFruit Coffee Co.,"Jackson, Mississippi","Huila, Colombia",94,Medium-Light,March 2020,$16.50/12 ounces,...,9,9,9,8,9.0,An exceptional Colombia cup with sweet and tar...,1.38,58,76,Colombia
3,3,5,Dukunde Kawa Rwanda,JBC Coffee Roasters,"Madison, Wisconsin","Musasa, Rwanda",93,Medium-Light,March 2020,$16.25/12 ounces,...,9,9,9,8,8.0,A deep yet delicate cup redolent with crisp fr...,1.35,56,74,Rwanda
4,4,6,Buzira Burundi Natural,JBC Coffee Roasters,"Madison, Wisconsin","Murutra, Kayanza Province, Burundi",92,Medium-Light,March 2020,$18.20/12 ounces,...,9,8,9,8,8.0,An exceptionally cleanly expressed natural-pro...,1.52,58,76,Burundi


In [33]:
#  remember there were 6 null values! 
# print(df.loc[[184]])

In [34]:
df = df.dropna(axis=0, subset=['Bottom Line'])

In [41]:
sentiments = []

for i, blurb in df['Bottom Line'].items():
    custom_tokens = remove_noise(word_tokenize(blurb))
    sentiment = classifier.classify(dict([token, True] for token in custom_tokens))
    sentiments.append(sentiment)
#     print(i, sentiment)

In [42]:
# len(sentiments) 2189
print("Positive Reviews: ", sentiments.count("Positive"))
print("Negative Reviews: ", sentiments.count("Negative"))

Positive Reviews:  1831
Negative Reviews:  358


In [83]:
# The list of sentiments

# sentiments

In [84]:
# The index of the first negative sentiment

sentiments.index('Negative')

7

In [81]:
# print(df.loc[[7]])

# The results:

#    Unnamed: 0.1  Unnamed: 0.1.1               Coffee Name  \
# 7             7              11  Birambo Village DR Congo   

#           Roaster Name    Roaster Location               Coffee Origin  \
# 7  JBC Coffee Roasters  Madison, Wisconsin  South Kivu Province, Congo   

#    Rating   Roast Level Review Date             Price       ...       Flavor  \
# 7      91  Medium-Light  March 2020  $17.90/12 ounces       ...            9   

#    Body  Aroma  Aftertaste  Acidity  \
# 7     8      8           8      8.0   

#                                          Bottom Line Prices Per Oz  \
# 7  A balanced, sweetly savory DR Congo cup with d...          1.49   

#    Agtron Whole  Agtron Ground  Coffee Country  
# 7            56             72           Congo  

In [77]:
# this review was given as negative and doesn't seem accurate

for i, blurb in df['Bottom Line'].items():
    if "balanced, sweetly savory" in blurb:
        print(i, df.at[i, 'Bottom Line']) 

7 A balanced, sweetly savory DR Congo cup with dried fruit and chocolate notes throughout.


In [96]:
# There are 358 reviews that scored negatively on the sentiment analysis.  

# # count of the 358 negative reviews 
count = 0 

# track the index number of the list
index_count = 0

for sentiment in sentiments: 
    if sentiment == "Negative":
        count += 1
        print("Index: ", index_count, " Rating: ", df.at[index_count, 'Rating'],df.at[index_count, 'Bottom Line'])
        print(" ")
    index_count += 1


# # This prints out the index number
# # for i in range(len(sentiments)): 
# #     print(i)
# #     if i == "Negative":
# #         print(i)

Index:  7  Rating:  91 A balanced, sweetly savory DR Congo cup with dried fruit and chocolate notes throughout.
 
Index:  8  Rating:  92 A pretty, richly sweet Kona cup that leads with deep floral and nut tones.
 
Index:  12  Rating:  91 A pleasingly balanced cup, quiet, accessible and subtly original.
 
Index:  15  Rating:  92 A solid Kona cup that leads with nut tones and bright, yet accessible, acidity.
 
Index:  21  Rating:  92 Sweet flowers and intense sandalwood power this silky-bodied, fragrant Yirgacheffe.
 
Index:  22  Rating:  94 A quietly original, resonant cup, exquisitely balanced and complete.
 
Index:  27  Rating:  90 A richly expressed, crisply bittersweet microlot from a tiny, woman-owned and operated farm in Peru.
 
Index:  29  Rating:  92 A friendly, straightforward washed Ethiopia cup with both organic and fair trade credentials.
 
Index:  30  Rating:  92 A washed Sidamo cup with a developed roast profile that amplifies chocolate and floral notes; deeply and richly 

In [155]:
p_count = 0
n_count = 0
total = 0

for i, rating in df['Rating'].items():
    total += 1
    if rating <= 89:
        n_count += 1
#         print("Index: ", i, " Rating: ", df.at[i, 'Rating'],df.at[i, 'Bottom Line'])
#         print("")
#         print(df.at[i, 'Rating'])
    if rating >= 95:
        p_count += 1
#         print("Index: ", i, " Rating: ", df.at[i, 'Rating'],df.at[i, 'Bottom Line'])
#         print("")


print("Number of low ratings: ", n_count, " ", round(n_count/total * 100,2),"%")
print("Number of high ratings: ", p_count, '', round(p_count/total * 100, 2),"%")
print("Number of total ratings: ", total)

Number of low ratings:  176   8.04 %
Number of high ratings:  222  10.14 %
Number of total ratings:  2189


In [None]:
# for sentiment in sentiments: 
#     if sentiment == "Negative":
#         print("Index: ", index_count, " Rating: ", df.at[index_count, 'Rating'],df.at[index_count, 'Bottom Line'])
# for i, rating in df['Rating'].items():
#     if rating <= 87:

# Result
It doesn't seem that using the tweets to train a model was very successful on coffee data.  Many of the negative reviews seem quite positive.

# Restuarant reviews

In [134]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t')

In [136]:
corpus = [] 

In [137]:
for i in range(0, 1000):  
      
    # column : "Review", row ith 
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])  
      
    # convert all cases to lower cases 
    review = review.lower()  
      
    # split to array(default delimiter is " ") 
    review = review.split()  
      
    # creating PorterStemmer object to 
    # take main stem of each word 
    ps = PorterStemmer()  
      
    # loop for stemming each word 
    # in string array at ith row     
    review = [ps.stem(word) for word in review 
                if not word in set(stopwords.words('english'))]  
                  
    # rejoin all string array elements 
    # to create back into a string 
    review = ' '.join(review)   
      
    # append each string to create 
    # array of clean text  
    corpus.append(review)  

In [138]:
# Creating the Bag of Words model 
from sklearn.feature_extraction.text import CountVectorizer 
  
# To extract max 1500 feature. 
# "max_features" is attribute to 
# experiment with to get better results 
cv = CountVectorizer(max_features = 1500)  
  
# X contains corpus (dependent variable) 
X = cv.fit_transform(corpus).toarray()  
  
# y contains answers if review 
# is positive or negative 
y = dataset.iloc[:, 1].values  

In [142]:
# Splitting the dataset into 
# the Training set and Test set 
# from sklearn.cross_validation import train_test_split 
from sklearn.model_selection import train_test_split
  
# experiment with "test_size" 
# to get better results 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [143]:
# Fitting Random Forest Classification 
# to the Training set 
from sklearn.ensemble import RandomForestClassifier 
  
# n_estimators can be said as number of 
# trees, experiment with n_estimators 
# to get better results  
model = RandomForestClassifier(n_estimators = 501, 
                            criterion = 'entropy') 
                              
model.fit(X_train, y_train)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=501, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [144]:
y_pred = model.predict(X_test) 
  
y_pred 

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0])

In [146]:
# custom_tweet = "Enjoying this Costa Rica evokes the pleasure of walking into a bakery in winter having any aromas of warming spices and rich cocoa"


# print(model.predict(custom_tweet))