In [61]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')
import nltk
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
df = pd.read_csv("beerreviews.csv")

# Task B

### Review Column Handling

In [63]:
# preprocessing reviews and extract rate for each review

df = df[["name","reviews"]]
df["rate"] = df["reviews"].map(lambda x : re.findall("\n\n[\d.]+",x)[0].lstrip("\n\n"))
df["review"] = df["reviews"].map(lambda x : "".join(re.findall("[\d.]+\n\n[^'']+",x)[0].split("\n")[1:-1]))
df = df[["name", "review","rate"]]
df

Unnamed: 0,name,review,rate
0,KBS - Maple Mackinac Fudge,12oz. bottle served in my Spiegelau tulip. Pou...,3.83
1,Doppelganger,"Juicy, dank and delicious. This beer is an imp...",4.64
2,The Abyss,"This barrel aged imperial stout, is rich, smoo...",4.58
3,Schaarbeekse Kriek,750ml bottle served in a Mort Subite chalice.B...,4.61
4,Black Tuesday - Rum Barrel-Aged,L: Midnight obsidian and mahogany hues with a ...,4.44
...,...,...,...
6214,King Julius,Solid beer Treehouse does right. I find this o...,4.67
6215,Abrasive Ale,This is a lovely looking brew. Golden liquid w...,4.53
6216,Double Citra®,,4.59
6217,Beer Geek Vanilla Shake - Bourbon Barrel-Aged,"Very good head production, and good retention....",4.65


In [64]:
## create beer list for unique 250 beers

beer_list = df["name"].unique()

## lower, tokenize, remove stopwords

In [65]:
## tokenize reviews

stop = stopwords.words('english')
trans=str.maketrans({key: None for key in string.punctuation})

df["review"] = df["review"].astype(str)
df["reviews_tokens"] = df["review"].apply(lambda x: x.translate(trans))
df["reviews_tokens"] = df["reviews_tokens"].apply(lambda x: word_tokenize(x.lower()))
df["reviews_tokens"] =df["reviews_tokens"].apply(lambda words_list: [x for x in words_list if x not in stop])

df

Unnamed: 0,name,review,rate,reviews_tokens
0,KBS - Maple Mackinac Fudge,12oz. bottle served in my Spiegelau tulip. Pou...,3.83,"[12oz, bottle, served, spiegelau, tulip, pours..."
1,Doppelganger,"Juicy, dank and delicious. This beer is an imp...",4.64,"[juicy, dank, delicious, beer, imperial, versi..."
2,The Abyss,"This barrel aged imperial stout, is rich, smoo...",4.58,"[barrel, aged, imperial, stout, rich, smooth, ..."
3,Schaarbeekse Kriek,750ml bottle served in a Mort Subite chalice.B...,4.61,"[750ml, bottle, served, mort, subite, chaliceb..."
4,Black Tuesday - Rum Barrel-Aged,L: Midnight obsidian and mahogany hues with a ...,4.44,"[l, midnight, obsidian, mahogany, hues, thin, ..."
...,...,...,...,...
6214,King Julius,Solid beer Treehouse does right. I find this o...,4.67,"[solid, beer, treehouse, right, find, one, del..."
6215,Abrasive Ale,This is a lovely looking brew. Golden liquid w...,4.53,"[lovely, looking, brew, golden, liquid, consid..."
6216,Double Citra®,,4.59,[]
6217,Beer Geek Vanilla Shake - Bourbon Barrel-Aged,"Very good head production, and good retention....",4.65,"[good, head, production, good, retention, body..."


## Word Frequency

In [66]:
from collections import Counter 

## create an empty list to store tokenized words
word_list = []

## append each word list in 'Filtered comments' in to a new list
for i in df['reviews_tokens']:
    word_list.extend(i)
    
## generate word frequency
word_frequencies = Counter(w for w in word_list)

In [67]:
## ckeck top word frequency words
df_word_frequencies = pd.DataFrame([word_frequencies]).T
df_word_frequencies = df_word_frequencies.reset_index()
df_word_frequencies.columns = ['Word','Frequency']
df_word_frequencies = df_word_frequencies.sort_values(by=['Frequency'],ascending=False)
df_word_frequencies[:30]

Unnamed: 0,Word,Frequency
13,head,2877
49,beer,2832
20,dark,2118
222,chocolate,2042
199,’,1941
24,taste,1682
124,sweet,1628
223,bourbon,1502
28,coffee,1493
156,vanilla,1447


## Determine Assumed three attributes

Even though random selection of attributes is possible, we want to see the recommendation results based upon most frequently mentioned attributes. Many words can convey neutral meaning and looking through all the contexts is little bit out of interest at this moment. Hence we've arbitrarily selected some attributes, depending on the less context-reliant words.

First, We have adopted the Aggressive, as the reviews we are interested in have been written on the craft beers, which basically provide user with idiosyncratic and characterful experience compared to the experience from what people used to have. This attribute  can easily be changed with the 'Balanced' as one might delve into more general taste-having consumers. Then, cetering on the key-words such as carbonation, fruit, thick, and etc, Crisp and Robust were choosen.

# Task C
**Three Attributes:**  
Aggressive (Boldly assertive aroma and/or taste);  
Crisp: Highly carbonated; effervescent;  
Robust: Rich and full-bodied;  

## Preprossessing for similarity analysis

In [68]:
## building new df
df2 = df[['name','review','rate','reviews_tokens']]

def joinwords(comment):   
    return " ".join(comment) 

df2['joined_review'] = df2['reviews_tokens'].map(joinwords)
df2

Unnamed: 0,name,review,rate,reviews_tokens,joined_review
0,KBS - Maple Mackinac Fudge,12oz. bottle served in my Spiegelau tulip. Pou...,3.83,"[12oz, bottle, served, spiegelau, tulip, pours...",12oz bottle served spiegelau tulip pours color...
1,Doppelganger,"Juicy, dank and delicious. This beer is an imp...",4.64,"[juicy, dank, delicious, beer, imperial, versi...",juicy dank delicious beer imperial version alt...
2,The Abyss,"This barrel aged imperial stout, is rich, smoo...",4.58,"[barrel, aged, imperial, stout, rich, smooth, ...",barrel aged imperial stout rich smooth mediumf...
3,Schaarbeekse Kriek,750ml bottle served in a Mort Subite chalice.B...,4.61,"[750ml, bottle, served, mort, subite, chaliceb...",750ml bottle served mort subite chalicebottle ...
4,Black Tuesday - Rum Barrel-Aged,L: Midnight obsidian and mahogany hues with a ...,4.44,"[l, midnight, obsidian, mahogany, hues, thin, ...",l midnight obsidian mahogany hues thin sandy h...
...,...,...,...,...,...
6214,King Julius,Solid beer Treehouse does right. I find this o...,4.67,"[solid, beer, treehouse, right, find, one, del...",solid beer treehouse right find one delightful...
6215,Abrasive Ale,This is a lovely looking brew. Golden liquid w...,4.53,"[lovely, looking, brew, golden, liquid, consid...",lovely looking brew golden liquid considerable...
6216,Double Citra®,,4.59,[],
6217,Beer Geek Vanilla Shake - Bourbon Barrel-Aged,"Very good head production, and good retention....",4.65,"[good, head, production, good, retention, body...",good head production good retention body deep ...


## Caculate cosine similarity

Basically, the cosine similarity can be calculated on the vectors which have same length( does not mean the norm). In other words, the two vectors in question should have same number of components. It is, however, not expectable to have that dataset in real world, we need to create a sparse matrix. This sparse matrix comprise of vector elements, which can be expressed as an union set between two vectors.

In [69]:
## calculate similarity using bag of words
attribute = 'Aggressive (Boldly assertive aroma and/or taste);Crisp: Highly carbonated; effervescent;Robust: Rich and full-bodied'

def out_of_word(review):
    
    # Make the review and attribute together
    documents =[review, attribute]
    
    # Prepare for vectorizing
    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(documents)
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names(), index=['x', 'y'])
    
    # Calculate the cosine similarity
    return cosine_similarity(df, df)[0,1]

df2['Bag-of-Words_similarity'] = df2['joined_review'].map(out_of_word)
df2[['name','review','Bag-of-Words_similarity']]

Unnamed: 0,name,review,Bag-of-Words_similarity
0,KBS - Maple Mackinac Fudge,12oz. bottle served in my Spiegelau tulip. Pou...,0.043033
1,Doppelganger,"Juicy, dank and delicious. This beer is an imp...",0.000000
2,The Abyss,"This barrel aged imperial stout, is rich, smoo...",0.061546
3,Schaarbeekse Kriek,750ml bottle served in a Mort Subite chalice.B...,0.000000
4,Black Tuesday - Rum Barrel-Aged,L: Midnight obsidian and mahogany hues with a ...,0.000000
...,...,...,...
6214,King Julius,Solid beer Treehouse does right. I find this o...,0.000000
6215,Abrasive Ale,This is a lovely looking brew. Golden liquid w...,0.024845
6216,Double Citra®,,0.000000
6217,Beer Geek Vanilla Shake - Bourbon Barrel-Aged,"Very good head production, and good retention....",0.000000


# Task D

## Sentiment Analysis

In [70]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [76]:
## calculate sentiment score
analyser = SentimentIntensityAnalyzer()

def get_sentiment(review):
    scores = analyser.polarity_scores(review)
    return scores['compound']

df2['sentiment_sore'] = df2['joined_review'].map(get_sentiment)
df2

Unnamed: 0,name,review,rate,reviews_tokens,joined_review,Bag-of-Words_similarity,sentiment_score,sentiment_sore
0,KBS - Maple Mackinac Fudge,12oz. bottle served in my Spiegelau tulip. Pou...,3.83,"[12oz, bottle, served, spiegelau, tulip, pours...",12oz bottle served spiegelau tulip pours color...,0.043033,0.5423,0.5423
1,Doppelganger,"Juicy, dank and delicious. This beer is an imp...",4.64,"[juicy, dank, delicious, beer, imperial, versi...",juicy dank delicious beer imperial version alt...,0.000000,0.8779,0.8779
2,The Abyss,"This barrel aged imperial stout, is rich, smoo...",4.58,"[barrel, aged, imperial, stout, rich, smooth, ...",barrel aged imperial stout rich smooth mediumf...,0.061546,0.2960,0.2960
3,Schaarbeekse Kriek,750ml bottle served in a Mort Subite chalice.B...,4.61,"[750ml, bottle, served, mort, subite, chaliceb...",750ml bottle served mort subite chalicebottle ...,0.000000,0.9565,0.9565
4,Black Tuesday - Rum Barrel-Aged,L: Midnight obsidian and mahogany hues with a ...,4.44,"[l, midnight, obsidian, mahogany, hues, thin, ...",l midnight obsidian mahogany hues thin sandy h...,0.000000,0.6597,0.6597
...,...,...,...,...,...,...,...,...
6214,King Julius,Solid beer Treehouse does right. I find this o...,4.67,"[solid, beer, treehouse, right, find, one, del...",solid beer treehouse right find one delightful...,0.000000,0.9274,0.9274
6215,Abrasive Ale,This is a lovely looking brew. Golden liquid w...,4.53,"[lovely, looking, brew, golden, liquid, consid...",lovely looking brew golden liquid considerable...,0.024845,0.9903,0.9903
6216,Double Citra®,,4.59,[],,0.000000,0.0000,0.0000
6217,Beer Geek Vanilla Shake - Bourbon Barrel-Aged,"Very good head production, and good retention....",4.65,"[good, head, production, good, retention, body...",good head production good retention body deep ...,0.000000,0.7003,0.7003


# Task E

## Recommendation 

The score for evaluation which will be used to recommend craft beers will be the total value of cosine similarity and sentiment score. Thus, If a sentiment indicates negative feagure its value in terms of recommendation will drop.

In [77]:
df2['evaluation_score'] = df2['Bag-of-Words_similarity'] + df2['sentiment_sore'] 
df2.sort_values(by='evaluation_score', ascending=False)['name'][:3]

1988    Affogato - Bourbon Barrel-Aged
388                      Double Citra®
3929                 Moment Of Clarity
Name: name, dtype: object

# Task F

## Bag of Words Versus Word Embeddings

In [84]:
import spacy 
import en_core_web_lg
nlp = en_core_web_lg.load()

ModuleNotFoundError: No module named 'en_core_web_lg'

In [None]:
## calculate similarity score
attribute = nlp('Aggressive (Boldly assertive aroma and/or taste);Crisp: Highly carbonated; effervescent;Robust: Rich and full-bodied')

def cal_similarity(review):
    review = nlp(review)
    return review.similarity(attribute)

df2['Word-Vector_similarity'] = df2['joined_review'].map(cal_similarity)
df2

In [None]:
df2['evaluation_score2'] = df2['Word-Vector_similarity'] + df2['sentiment_sore'] 
df2.sort_values(by='evaluation_score2', ascending=False)['name'][:3]

In [None]:
## analyzing % of reviews that mention a preferred attribute.
def mention(i):
    for j in i:
        j = str(j)
        if j in str(attribute):
            return 1
        else:
            return 0
        
def getpercentage(i):
    return df2['name'].value_counts()[i]

## using Bag-of-Words_similarity

df3 = df2.loc[(df2['name'] == 'Affogato - Bourbon Barrel-Aged')|(df2['name'] == 'Double Citra®')|(df2['name'] == 'Moment Of Clarity')]       

df3['mention'] = df2['reviews_tokens'].map(mention)
df3.fillna(0, inplace=True)
df4 = df3[['name','mention']].groupby(['name']).sum().reset_index()

df4['percentage']= df4['mention']/df4['name'].map(getpercentage)
df4

In [None]:
## using Word_vector_similarity

df3 = df2.loc[(df2['name'] == 'Pseudo Sue')|(df2['name'] == 'Keene Idea')|(df2['name'] == 'Smooth')]       

df3['mention'] = df2['reviews_tokens'].map(mention)
df3.fillna(0, inplace=True)
df4 = df3[['name','mention']].groupby(['name']).sum().reset_index()

df4['percentage']= df4['mention']/df4['name'].map(getpercentage)
df4

## Rationale for the difference

As it has been observed from the developmental history of the natural language processing, word embedding is a newer and evolved approach utilizing not just a word itself but also the likely context of the word. This property makes win in most cases the embedding-based methods over vanilla bag-of-words methods. However, whether the result keeps same in all situation is debatable. Especially in the fields such as recommendation system in a specific context, a review would be highly likely to contain the word/topic pertaining to that context. That can result in a relatively less informative recommendation, if the word embedding, which considers which word might be in the vicinity of a word in question. Hence, in the situation in which the meaning of document pairs sharing similar context should be discerned, the simple WoB approach would benefit. 

# Task G

In [None]:
# Recommending 3 best products based on user rating in the entire dataset
df2.sort_values(by='rate', ascending=False)['name'][:3]

In [None]:
df_top3 = df2.loc[(df2['name'] == 'Omega Point')|(df2['name'] == 'Maman')|(df2['name'] == 'Ground State')]       
df_top3[['name','Bag-of-Words_similarity','Word-Vector_similarity','sentiment_sore']].groupby(['name']).mean()

These products will not meet the requirements of the user looking for recommendation because of relatively low evaluation scores calculated by both spaCy_similarity and Bag_of_words similarity. The top 3 brands are recommended based on products rating instead of the top 3 most desirable attributes across all users. The rating of individual may be inaccurate since individual preference is different, and such approach to get the top 3 brand fail to look at overall user's and instead focus on individual preference, which is not what we want.