In [1]:
import numpy as np
import urllib
import nltk
from nltk.stem.porter import *
from sklearn import linear_model

import string
import math
import random
import re
import time
from collections import Counter

In [2]:
def parseData(fname):
    for l in urllib.request.urlopen(fname):
        yield eval(l)

all_data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))

data = all_data[:5000]
corpus = [d['review/text'] for d in data]

In [3]:
##########################
# 1
##########################

punctuation = set(string.punctuation)

def puncFilter(text, rmPunc):
    if rmPunc:
        return ''.join([c for c in text.lower() if c not in punctuation])
    else:
        return ' '.join(re.findall(r"\w+|[^\w\s]", text.lower()))

def text2bigrams(text, rmPunc=True):
    return nltk.bigrams(puncFilter(text, rmPunc).split())

bigrams_cnt = Counter()

for text in corpus:
    bigrams_cnt += Counter(text2bigrams(text))
    
for i in bigrams_cnt.most_common(5):
    print(i)

(('with', 'a'), 4587)
(('in', 'the'), 2595)
(('of', 'the'), 2245)
(('is', 'a'), 2056)
(('on', 'the'), 2033)


In [4]:
##########################
# 2
##########################

def word_cnt_feature(top_grams, text_grams):
    feat = [0] * len(top_grams)
    for bi in text_grams:
        try:
            feat[top_grams.index(bi)] += 1
        except:
            pass
    feat.append(1)
    return feat


def calMSE(X, y, lamda=1.0):
    clf = linear_model.Ridge(lamda, fit_intercept=False)
    clf.fit(X, y)
    predictions = clf.predict(X)
    return np.mean((y - predictions)**2)

bigrams = [entry[0] for entry in bigrams_cnt.most_common(1000)]
X = [word_cnt_feature(bigrams, text2bigrams(text)) for text in corpus]
y = [d['review/overall'] for d in data]

mse = calMSE(X, y)
print('MSE: %f' % (mse))

MSE: 0.343317


In [7]:
##########################
# 3
##########################

def text2unigram(text, rmPunc=True):
    return puncFilter(text, rmPunc).split()


class NGramTFIDF:
    def __init__(self, grams_list, cnvt, rmPunc=True):
        self.IDF_D = {}
        
        self.cnvt = cnvt
        self.rmPunc = rmPunc
        self.grams_list = grams_list
        
    def calTF(self, word, gram):
        cnt = Counter(gram)
        try:
            return cnt[word]
        except:
            return 0

    def calIDF(self, word):
        try:
            idf = self.IDF_D[word]
        except:
            numer = len(self.textGrams)
            denom = sum([1 if self.calTF(word, text) else 0 for text in self.textGrams])
            denom = denom if denom !=0 else 1e-7 # avoid division by 0
            idf = math.log10(numer / denom)
            self.IDF_D[word] = idf
        return idf

    def calTF_IDF(self, word, textGram):
        tf = self.calTF(word, textGram)
        idf = self.calIDF(word)
        return tf * idf
    
    def train(self, corpus):
        self.TF_IDF = []
        self.textGrams = [self.cnvt(text, self.rmPunc) for text in corpus]
        
        for i , text in enumerate(self.textGrams):
            tf_idf = [self.calTF_IDF(gram, text) for gram in self.grams_list]
            self.TF_IDF.append(tf_idf)

In [8]:
word_list = ['foam', 'smell', 'banana', 'lactic', 'tart']

ngram_tf_idf = NGramTFIDF(word_list, text2unigram)
ngram_tf_idf.train(corpus)

for word, tf_idf in zip(word_list, ngram_tf_idf.TF_IDF[0]):
    print('"%s" tf-idf score: %f' % (word, tf_idf))

"foam" tf-idf score: 2.275737
"smell" tf-idf score: 0.537902
"banana" tf-idf score: 3.355561
"lactic" tf-idf score: 5.841638
"tart" tf-idf score: 1.806875


In [9]:
##########################
# 4
##########################

def cosine_similarity(x1, x2):
    x1 = np.array(x1)
    x2 = np.array(x2)
    numer = x1.dot(x2)
    denom = np.linalg.norm(x1) * np.linalg.norm(x2)
    denom = denom if denom !=0 else 1e-7 # avoid division by 0
    return numer / denom

unigrams_cnt = Counter()

for text in corpus:
    unigrams_cnt += Counter(text2unigram(text))
    
unigrams = [entry[0] for entry in unigrams_cnt.most_common(1000)]

ngram_tf_idf = NGramTFIDF(unigrams, text2unigram)
ngram_tf_idf.train(corpus)

tf_idf1 = ngram_tf_idf.TF_IDF[0]
tf_idf2 = ngram_tf_idf.TF_IDF[1]

cos_sim = cosine_similarity(tf_idf1, tf_idf2)
print('Cosine similarity: %f' %(cos_sim))

Cosine similarity: 0.106130


In [10]:
##########################
# 5
##########################

cos_sim = {}
for k, v in enumerate(ngram_tf_idf.TF_IDF):
    cos_sim[k] = cosine_similarity(ngram_tf_idf.TF_IDF[0], v)

In [11]:
idx = sorted(cos_sim.items(), key=lambda x: x[1], reverse=True)[1][0]
data[idx]

print('beerID: %s' % data[idx]['beer/beerId'])
print('profileName: %s' % data[idx]['beer/name'])
data[idx]['review/text']

beerID: 52211
profileName: Frog's Hollow Double Pumpkin Ale


'Poured from a 22oz bottle to a Dogfish Head Snifter.\t\tColor: Slight hazy orange with an off white head.\t\tSmell: Cinnamon, banana, pumpkin and nutmeg.\t\tTaste: Alcohol, pumpkin, nutmeg, allspice and a hint of banana.\t\tMouthfeel: Medium carbonation, smooth, medium dryness on the palate.\t\tOverall: The smell is GREAT! The banana was a huge surprise for me. The taste had too much alcohol presence. Seemed to overpower the other flavors. Cheers!'

In [12]:
##########################
# 6
##########################

X = ngram_tf_idf.TF_IDF
y = [d['review/overall'] for d in data]

mse = calMSE(X, y)
print('MSE: %f' % (mse))

MSE: 1.135513


In [37]:
##########################
# 7
##########################

data = all_data
corpus = [d['review/text'] for d in data]

def split_data(X, Y, n_train, n_val, n_test, shuffle=False):
    m = len(X)
    n_val += n_train
    n_test += n_val

    if shuffle:
        r = list(zip(X, Y))
        random.shuffle(r)
        X, Y = list(zip(*r))

    return (X[:n_train], Y[:n_train]), (X[n_train:n_val], Y[n_train:n_val]), \
            (X[n_val:n_test], Y[n_val:n_test])

# X = [feature(text) for text in corpus]
y = [d['review/overall'] for d in data]

trainData, valData, testData = split_data(corpus, y, 5000, 5000, 5000, shuffle=True)

trainCorpus, trainY = trainData
valCorpus, valY = valData
testCorpus, testY = testData

In [38]:
class Model:
    def __init__(self, top_grams, cnvt, rmPunc, use_tf_idf):
        self.top_grams = top_grams
        self.cnvt = cnvt
        self.rmPunc = rmPunc
        self.use_tf_idf = use_tf_idf
        ngram_tf_idf = NGramTFIDF(top_grams, cnvt, rmPunc)
        
    def get_feature(self, corpus):
        if self.use_tf_idf:
            ngram_tf_idf.train(corpus)
            self.TF_IDF = ngram_tf_idf.TF_IDF
            return self.TF_IDF
        
        else:
            return [word_cnt_feature(self.top_grams, self.cnvt(text, self.rmPunc)) 
                    for text in corpus]
    
    def train(self, X, y, lamda):
        self.clf = linear_model.Ridge(lamda, fit_intercept=False)
        self.clf.fit(X, y)
        
    def validation(self, X, y, lamdas):
        best_mse = math.inf
        best_lamda = None
        best_clf = None
        
        for lamda in lamdas:
            self.train(X, y, lamda)
            mse = self.test(X, y)
            
            if mse < best_mse:
                best_mse = mse
                best_lamda = lamda
                
        self.best_lamda = best_lamda
    
    def test(self, X, y):
        predictions = self.predict(X)
        return np.mean((y - predictions)**2)
    
    def predict(self, X):
        return self.clf.predict(X)

In [39]:
lamdas = [0.01, 0.1, 1, 10, 100]

unigrams_cnt = Counter()
bigrams_cnt = Counter()
unigrams_cnt_ps = Counter()
bigrams_cnt_ps = Counter()

for text in trainCorpus:
    unigrams_cnt += Counter(text2unigram(text))
    bigrams_cnt += Counter(text2bigrams(text))
    unigrams_cnt_ps += Counter(text2unigram(text, False))
    bigrams_cnt_ps += Counter(text2bigrams(text, False))
    
unigrams = [entry[0] for entry in unigrams_cnt.most_common(1000)]
bigrams = [entry[0] for entry in bigrams_cnt.most_common(1000)]
unigrams_ps = [entry[0] for entry in unigrams_cnt_ps.most_common(1000)]
bigrams_ps = [entry[0] for entry in bigrams_cnt_ps.most_common(1000)]

In [40]:
def print_time(startTime):
    ts = time.time() - startTime
    print('%dm, %ds' % (ts/60, ts%60))

def pipeline(top_grams, cnvt, rmPunc, use_tf_idf):
    startTime = time.time()

    model = Model(top_grams, cnvt, rmPunc, use_tf_idf)
    
    trainX = model.get_feature(trainCorpus)
    valX = model.get_feature(valCorpus)
    testX = model.get_feature(testCorpus)

    model.validation(valX, valY, lamdas)
    model.train(trainX, trainY, model.best_lamda)
    mse = model.test(testX, testY)
    print_time(startTime)

    return mse, model.best_lamda

In [41]:
p1 = pipeline(unigrams, text2unigram, rmPunc=True, use_tf_idf=True)
p2 = pipeline(bigrams, text2bigrams, rmPunc=True, use_tf_idf=True)

p3 = pipeline(unigrams_ps, text2unigram, rmPunc=False, use_tf_idf=True)
p4 = pipeline(bigrams_ps, text2bigrams, rmPunc=False, use_tf_idf=True)

p5 = pipeline(unigrams, text2unigram, rmPunc=True, use_tf_idf=False)
p6 = pipeline(bigrams, text2bigrams, rmPunc=True, use_tf_idf=False)

p7 = pipeline(unigrams_ps, text2unigram, rmPunc=False, use_tf_idf=False)
p8 = pipeline(bigrams_ps, text2bigrams, rmPunc=False, use_tf_idf=False)

2m, 30s
2m, 29s
2m, 32s
2m, 26s
0m, 12s
0m, 51s
0m, 12s
0m, 55s


In [42]:
res = [p1, p2, p3, p4, p5, p6, p7, p8]

model_desc = ['Unigrams, Remove, tfidf',
'Bigrams, Remove, tfidf',
'Unigrams, Perserve, tfidf',
'Bigrams, Perserve, tfidf',
'Unigrams, Remove, word counts',
'Bigrams, Remove, word counts',
'Unigrams, Perserve, word counts',
'Bigrams, Perserve, word counts']

print ("{:<35} {:<8} {:<8}".format('model desc','mse','lambda'))
for m, r in zip(model_desc, res):
    print ("{:<35} {:<8.2f} {:<8.2f}".format(m, r[0], r[1]))

model desc                          mse      lambda  
Unigrams, Remove, tfidf             2.02     0.01    
Bigrams, Remove, tfidf              2.02     0.01    
Unigrams, Perserve, tfidf           2.02     0.01    
Bigrams, Perserve, tfidf            2.02     0.01    
Unigrams, Remove, word counts       0.43     0.01    
Bigrams, Remove, word counts        0.49     0.01    
Unigrams, Perserve, word counts     0.43     0.01    
Bigrams, Perserve, word counts      0.47     0.01    
