In [1]:
import numpy as np
import urllib
import nltk
from nltk.stem.porter import *
from sklearn import linear_model

import string
import math
import random
import re
from collections import Counter

In [2]:
def parseData(fname):
    """Yield one parsed record per non-blank line of the resource at ``fname``.

    Parameters
    ----------
    fname : str
        URL understood by ``urllib.request.urlopen``.

    Yields
    ------
    object
        The result of evaluating each line as a Python expression.
    """
    # `import urllib` does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the connection once the generator finishes.
    with urllib.request.urlopen(fname) as response:
        for l in response:
            if not l.strip():
                continue  # a blank line would make eval() raise SyntaxError
            # SECURITY: eval() executes arbitrary code contained in the
            # downloaded data. Only use this with a source you trust.
            yield eval(l)

# Download every record once; later cells re-slice `all_data` as needed.
all_data = [record for record in
            parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json")]

# The first 5000 records and their review texts.
data = all_data[0:5000]
corpus = [review['review/text'] for review in data]

In [3]:
##########################
# 1
##########################

punctuation = set(string.punctuation)

def puncFilter(text, rmPunc):
    """Lower-case ``text`` and either strip or isolate its punctuation.

    Parameters
    ----------
    text : str
        Raw text.
    rmPunc : bool
        If true, every character in ``string.punctuation`` is deleted.
        If false, punctuation is kept and each punctuation character becomes
        its own whitespace-separated token.

    Returns
    -------
    str
        The normalised text, ready for ``str.split()``.
    """
    lowered = text.lower()
    if rmPunc:
        return ''.join(c for c in lowered if c not in punctuation)
    # Join with a space so that a later split() recovers the individual tokens.
    return ' '.join(re.findall(r"\w+|[^\w\s]", lowered))

def text2bigrams(text, rmPunc=True):
    """Return the bigrams of ``text`` as produced by ``nltk.bigrams``.

    The text is first normalised by ``puncFilter(text, rmPunc)`` and split
    on whitespace.
    """
    tokens = puncFilter(text, rmPunc).split()
    return nltk.bigrams(tokens)

# Bigram frequencies over the whole corpus.
bigrams_cnt = Counter()

for text in corpus:
    # update() adds the new counts in place. `+=` additionally re-scans the
    # whole accumulated counter on every document, which is quadratic overall.
    bigrams_cnt.update(text2bigrams(text))

# The five most frequent bigrams with their counts.
for i in bigrams_cnt.most_common(5):
    print(i)

(('with', 'a'), 4587)
(('in', 'the'), 2595)
(('of', 'the'), 2245)
(('is', 'a'), 2056)
(('on', 'the'), 2033)


In [4]:
##########################
# 2
##########################

def word_cnt_feature(top_grams, text_grams):
    """Count how often each of ``top_grams`` occurs in ``text_grams``.

    Parameters
    ----------
    top_grams : sequence
        Vocabulary; position ``j`` of the result counts ``top_grams[j]``.
        Entries must be hashable (strings or tuples).
    text_grams : iterable
        Grams of one document; may be a one-shot iterator.

    Returns
    -------
    list
        ``len(top_grams)`` counts followed by a constant ``1`` (offset term).
    """
    # Gram -> first position it occupies, matching what list.index() returns,
    # but built once instead of searched linearly for every gram.
    position = {}
    for idx, gram in enumerate(top_grams):
        position.setdefault(gram, idx)

    feat = [0] * len(top_grams)
    for gram in text_grams:
        idx = position.get(gram)
        if idx is not None:  # grams outside the vocabulary are ignored
            feat[idx] += 1
    feat.append(1)
    return feat


def calMSE(X, y, lamda=1.0):
    """Fit a ridge regression without intercept on (X, y) and return its MSE on the same data.

    ``lamda`` is passed to ``linear_model.Ridge`` as the regularisation strength.
    """
    ridge = linear_model.Ridge(alpha=lamda, fit_intercept=False)
    ridge.fit(X, y)
    residuals = y - ridge.predict(X)
    return np.mean(residuals ** 2)

# Vocabulary: the 1000 most frequent bigrams.
bigrams = [gram for gram, _count in bigrams_cnt.most_common(1000)]

# One count vector per review, and the overall rating as the target.
X = [word_cnt_feature(bigrams, text2bigrams(review)) for review in corpus]
y = [record['review/overall'] for record in data]

mse = calMSE(X, y)
print('MSE: %f' % mse)

MSE: 0.342262


In [5]:
##########################
# 3
##########################

def calTF(word, gram):
    """Return how many items of ``gram`` are equal to ``word``.

    Parameters
    ----------
    word : object
        The term to count (a string for unigrams, a tuple for bigrams).
    gram : iterable or None
        The tokens of one document; may be a one-shot iterator.
        ``None`` is treated as an empty document.

    Returns
    -------
    int
        Number of occurrences; ``0`` when the term is absent.
    """
    if gram is None:
        return 0
    # A single pass is enough; there is no need to count every other token.
    return sum(1 for token in gram if token == word)
    
def calIDF(word, cnvt, rmPunc, docs=None, cache=None):
    """Return the (cached) inverse document frequency of ``word``.

    Parameters
    ----------
    word : hashable
        The term.
    cnvt : callable
        ``cnvt(text, rmPunc)`` returns the grams of one document.
    rmPunc : bool
        Forwarded to ``cnvt``.
    docs : sequence of str, optional
        Documents to compute the IDF over. Defaults to the notebook-level
        ``corpus``.
    cache : dict, optional
        Memo mapping term -> idf. Defaults to the notebook-level ``IDF_D``.
        The cache is keyed by the term only, so it must be reset whenever the
        documents, ``cnvt`` or ``rmPunc`` change.

    Returns
    -------
    float
        ``log10(len(docs) / document_frequency)``; a document frequency of 0
        is replaced by ``1e-7`` to avoid dividing by zero.
    """
    if docs is None:
        docs = corpus
    if cache is None:
        cache = IDF_D

    # Explicit membership test instead of a bare `except`, which would also
    # hide unrelated errors such as a missing cache.
    if word in cache:
        return cache[word]

    numer = len(docs)
    denom = sum(1 for text in docs if word in cnvt(text, rmPunc))
    if denom == 0:
        denom = 1e-7  # avoid division by 0
    idf = math.log10(numer / denom)
    cache[word] = idf
    return idf
    
def calTF_IDF(word, text, cnvt, rmPunc=True):
    """Return the tf-idf score of ``word`` in ``text``.

    The term frequency comes from ``calTF`` on ``cnvt(text, rmPunc)`` and the
    inverse document frequency from ``calIDF``.
    """
    term_frequency = calTF(word, cnvt(text, rmPunc))
    inverse_doc_frequency = calIDF(word, cnvt, rmPunc)
    return term_frequency * inverse_doc_frequency

def text2unigram(text, rmPunc=True):
    """Return the whitespace-separated tokens of ``puncFilter(text, rmPunc)`` as a list."""
    normalised = puncFilter(text, rmPunc)
    return normalised.split()

In [6]:
# Reset the IDF cache: calIDF memoises by word only, so stale entries from a
# different tokeniser or corpus must not be reused.
IDF_D = {}
word_list = ['foam', 'smell', 'banana', 'lactic', 'tart']
# Score the query words against the first review.
text = corpus[0]
for word in word_list:
    tf_idf = calTF_IDF(word, text, text2unigram)
    print('"%s" tf-idf score: %f' % (word, tf_idf))

"foam" tf-idf score: 2.275737
"smell" tf-idf score: 0.537902
"banana" tf-idf score: 3.355561
"lactic" tf-idf score: 5.841638
"tart" tf-idf score: 1.806875


In [7]:
##########################
# 4
##########################

def cosine_similarity(x1, x2):
    """Return the cosine of the angle between vectors ``x1`` and ``x2``.

    When the product of the two norms is exactly zero, ``1e-7`` is used as
    the denominator instead, so no division by zero occurs.
    """
    dot_product = x1.dot(x2)
    norm_product = np.linalg.norm(x1) * np.linalg.norm(x2)
    if norm_product == 0:
        norm_product = 1e-7  # avoid division by 0
    return dot_product / norm_product

# Unigram frequencies over the whole corpus.
unigrams_cnt = Counter()

for text in corpus:
    # update() adds the new counts in place. `+=` additionally re-scans the
    # whole accumulated counter on every document, which is quadratic overall.
    unigrams_cnt.update(text2unigram(text))

# Vocabulary: the 1000 most frequent unigrams.
unigrams = [entry[0] for entry in unigrams_cnt.most_common(1000)]

# Compare the first two reviews.
text1, text2 = corpus[0], corpus[1]

# Start from an empty IDF cache before scoring the vocabulary.
IDF_D = {}
tf_idf1 = np.array([calTF_IDF(term, text1, text2unigram) for term in unigrams])
tf_idf2 = np.array([calTF_IDF(term, text2, text2unigram) for term in unigrams])

cos_sim = cosine_similarity(tf_idf1, tf_idf2)
print('Cosine similarity: %f' % cos_sim)

Cosine similarity: 0.106130


In [8]:
##########################
# 5
##########################

# Reset the IDF cache, then build one tf-idf vector per review.
# TF_IDF maps the review's index in `corpus` to a numpy array with one entry
# per word in `unigrams`.
IDF_D = {}
TF_IDF = {}
for i, text in enumerate(corpus):
    # calTF_IDF re-tokenises `text` for every word, so this cell is slow.
    TF_IDF[i] = np.array([calTF_IDF(w, text, text2unigram) for w in unigrams])

In [9]:
# Cosine similarity of every review's tf-idf vector to that of review 0.
# The entry for key 0 is review 0 compared with itself.
cos_sim_res = {}
for k, v in TF_IDF.items():
    cos_sim_res[k] = cosine_similarity(TF_IDF[0], v)

In [10]:
# Index of the review most similar to review 0. Review 0 is excluded
# explicitly instead of assuming that it sorts first, which can fail when
# another review ties with it. Ties are resolved in favour of the lowest index.
idx = max((key for key in cos_sim_res if key != 0), key=cos_sim_res.get)

print('beerID: %s' % data[idx]['beer/beerId'])
# The value printed is the 'beer/name' field, so it is labelled as such.
# NOTE(review): if the reviewer's profile name is what is wanted, print that
# field instead - confirm its key against the dataset.
print('beerName: %s' % data[idx]['beer/name'])

beerID: 52211
profileName: Frog's Hollow Double Pumpkin Ale


In [11]:
##########################
# 6
##########################

def tf_idf_feature(TF_IDF,i):
    """Return the entry stored under key/index ``i`` of ``TF_IDF``."""
    feature_vector = TF_IDF[i]
    return feature_vector

# tf-idf vectors as features, overall rating as target.
X = [tf_idf_feature(TF_IDF, row) for row in range(len(data))]
y = [record['review/overall'] for record in data]

mse = calMSE(X, y)
print('MSE: %f' % mse)

MSE: 1.133687


In [12]:
##########################
# 7
##########################

# From here on `data` and `corpus` refer to ALL downloaded records, not the
# first 5000. Objects built by earlier cells (TF_IDF, cos_sim_res, X, the
# vocabularies) still describe the old subset. calIDF reads the global
# `corpus`, so any IDF computed after this point is taken over the full set.
data = all_data
corpus = [d['review/text'] for d in data]

def split_data(X, Y, n_train, n_val, n_test, shuffle=False, seed=None):
    """Split parallel sequences ``X`` and ``Y`` into train / validation / test parts.

    Parameters
    ----------
    X, Y : sequence
        Inputs and targets; element ``k`` of ``X`` belongs to element ``k`` of ``Y``.
    n_train, n_val, n_test : int
        Sizes of the three consecutive parts. Parts are shorter when the
        input runs out.
    shuffle : bool, optional
        Shuffle the (x, y) pairs together before splitting.
    seed : int, optional
        If given, the shuffle uses a private ``random.Random(seed)`` and is
        therefore reproducible; otherwise the global ``random`` state is used.

    Returns
    -------
    tuple
        ``((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))``. The parts
        are slices of the inputs, or tuples when ``shuffle`` is true.
    """
    val_end = n_train + n_val
    test_end = val_end + n_test

    if shuffle:
        pairs = list(zip(X, Y))
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(pairs)
        # zip(*[]) yields nothing to unpack, so handle empty input explicitly.
        X, Y = zip(*pairs) if pairs else ((), ())

    return (X[:n_train], Y[:n_train]), (X[n_train:val_end], Y[n_train:val_end]), \
            (X[val_end:test_end], Y[val_end:test_end])

y = [d['review/overall'] for d in data]

# Seed the global RNG so the shuffled split, and every result that depends on
# it, is the same on each Restart & Run All.
random.seed(0)
trainData, valData, testData = split_data(corpus, y, 5000, 5000, 5000, shuffle=True)

trainCorpus, trainY = trainData
valCorpus, valY = valData
testCorpus, testY = testData

In [13]:
class Model:
    """Ridge regression (no intercept) on word-count or tf-idf text features.

    Parameters
    ----------
    top_grams : sequence
        Vocabulary; one feature per entry.
    cnvt : callable
        ``cnvt(text, rmPunc)`` returns the grams of one document.
    rmPunc : bool
        Forwarded to ``cnvt``.
    use_tf_idf : bool
        Use tf-idf features instead of raw counts.
    TF_IDF : mapping, optional
        Precomputed tf-idf table. It is stored on the instance but not used
        for look-ups: it is keyed by position in ONE corpus, so indexing it
        with positions from another corpus would return the wrong rows.
    """

    # Regularisation strengths tried by validation() when none are given.
    DEFAULT_LAMDAS = (0.01, 0.1, 1, 10, 100)

    def __init__(self, top_grams, cnvt, rmPunc, use_tf_idf, TF_IDF=None):
        self.top_grams = top_grams
        self.cnvt = cnvt
        self.rmPunc = rmPunc
        self.use_tf_idf = use_tf_idf

        self.TF_IDF = TF_IDF

        self.clf = None
        self.best_lamda = None
        # Remembered by train() so that validation() can refit on them.
        self._train_X = None
        self._train_y = None

    def get_feature(self, corpus):
        """Return one feature vector (a list) per text in ``corpus``."""
        if self.use_tf_idf:
            # Features are computed from the texts actually passed in.
            # Each text is tokenised once; the IDF comes from calIDF and its cache.
            # NOTE(review): unlike the count features, these vectors carry no
            # constant offset term although the regression has no intercept -
            # confirm whether that is intended.
            features = []
            for text in corpus:
                counts = Counter(self.cnvt(text, self.rmPunc))
                features.append([counts[gram] * calIDF(gram, self.cnvt, self.rmPunc)
                                 for gram in self.top_grams])
            return features
        return [word_cnt_feature(self.top_grams, self.cnvt(text, self.rmPunc))
                for text in corpus]

    def train(self, X, y, lamda=1.0):
        """Fit the regression on (X, y) with regularisation strength ``lamda``."""
        self._train_X, self._train_y = X, y
        self.clf = linear_model.Ridge(alpha=lamda, fit_intercept=False)
        self.clf.fit(X, y)

    def validation(self, X, y, lamdas=DEFAULT_LAMDAS):
        """Pick the ``lamda`` with the lowest MSE on the validation data (X, y).

        Each candidate is fitted on the data last passed to ``train()`` and
        scored on (X, y). The winning estimator becomes ``self.clf`` and its
        strength is stored in ``self.best_lamda``.

        Raises
        ------
        RuntimeError
            If ``train()`` has not been called yet.
        """
        if getattr(self, '_train_X', None) is None:
            raise RuntimeError('train() must be called before validation()')

        targets = np.asarray(y)
        best_mse = math.inf
        best_lamda = None
        best_clf = None

        for lamda in lamdas:
            clf = linear_model.Ridge(alpha=lamda, fit_intercept=False)
            clf.fit(self._train_X, self._train_y)
            mse = np.mean((targets - clf.predict(X)) ** 2)

            if mse < best_mse:
                best_mse = mse
                best_lamda = lamda
                best_clf = clf

        self.best_lamda = best_lamda
        if best_clf is not None:  # keep the current model if no candidate was tried
            self.clf = best_clf

    def test(self, X, y):
        """Return the mean squared error of the current model on (X, y)."""
        predictions = self.predict(X)
        return np.mean((np.asarray(y) - predictions) ** 2)

    def predict(self, X):
        """Return the current model's predictions for ``X``.

        Raises
        ------
        RuntimeError
            If no model has been fitted yet.
        """
        if self.clf is None:
            raise RuntimeError('the model has not been trained yet')
        return self.clf.predict(X)

In [14]:
# Regularisation strengths to choose from.
lamda = [0.01, 0.1, 1, 10, 100]

# Vocabularies are built from the training split only.
unigrams_cnt = Counter()
bigrams_cnt = Counter()

for text in trainCorpus:
    # update() adds the new counts in place. `+=` additionally re-scans the
    # whole accumulated counter on every document, which is quadratic overall.
    unigrams_cnt.update(text2unigram(text))
    bigrams_cnt.update(text2bigrams(text))

unigrams = [entry[0] for entry in unigrams_cnt.most_common(1000)]
bigrams = [entry[0] for entry in bigrams_cnt.most_common(1000)]

In [15]:
IDF_D = {}
def tf_idf_calculator(grams, corpus, cnvt, rmPunc=True):
    """Return the tf-idf table of ``corpus`` for the vocabulary ``grams``.

    Parameters
    ----------
    grams : sequence of hashable
        Vocabulary; column ``j`` of every row belongs to ``grams[j]``.
    corpus : iterable of str
        Documents. Both the term and the document frequencies are taken from
        these documents only.
    cnvt : callable
        ``cnvt(text, rmPunc)`` returns the grams of one document.
    rmPunc : bool, optional
        Forwarded to ``cnvt``.

    Returns
    -------
    dict
        ``{document index: [tf-idf of grams[0], tf-idf of grams[1], ...]}``.

    Side effects
    ------------
    The module-level cache ``IDF_D`` is cleared and refilled with the IDF of
    every gram, so later look-ups use values consistent with this table.
    """
    IDF_D.clear()

    # Tokenise and count every document exactly once.
    doc_counts = [Counter(cnvt(text, rmPunc)) for text in corpus]
    n_docs = len(doc_counts)
    if n_docs == 0:
        return {}

    for gram in grams:
        doc_freq = sum(1 for counts in doc_counts if gram in counts)
        denom = doc_freq if doc_freq != 0 else 1e-7  # avoid division by 0
        IDF_D[gram] = math.log10(n_docs / denom)

    return {i: [counts[gram] * IDF_D[gram] for gram in grams]
            for i, counts in enumerate(doc_counts)}

import time

def print_time(startTime):
    """Print the wall-clock time elapsed since ``startTime`` as minutes and seconds."""
    elapsed = time.time() - startTime
    minutes, seconds = elapsed / 60, elapsed % 60
    print('%dm, %ds' % (minutes, seconds))

def pipeline(top_grams, cnvt, rmPunc, use_tf_idf, lamdas=None):
    """Build features, choose the regularisation strength and report the test MSE.

    Reads the notebook-level splits ``trainCorpus``/``trainY``,
    ``valCorpus``/``valY`` and ``testCorpus``/``testY``.

    Parameters
    ----------
    top_grams : sequence
        Vocabulary used for the features.
    cnvt : callable
        ``cnvt(text, rmPunc)`` returns the grams of one document.
    rmPunc : bool
        Forwarded to ``cnvt``.
    use_tf_idf : bool
        Use tf-idf features instead of raw counts.
    lamdas : iterable of float, optional
        Candidate regularisation strengths. Defaults to the notebook-level
        ``lamda`` list.

    Returns
    -------
    tuple
        ``(test MSE, chosen regularisation strength)``.

    Raises
    ------
    ValueError
        If ``lamdas`` is empty.
    """
    if lamdas is None:
        lamdas = lamda

    startTime = time.time()
    if use_tf_idf:
        TF_IDF = tf_idf_calculator(top_grams, trainCorpus, cnvt, rmPunc)
    else:
        TF_IDF = None

    print_time(startTime)

    model = Model(top_grams, cnvt, rmPunc, use_tf_idf, TF_IDF)

    trainX = model.get_feature(trainCorpus)
    valX = model.get_feature(valCorpus)
    testX = model.get_feature(testCorpus)
    print_time(startTime)

    # Fit each candidate on the training split and score it on the
    # validation split; the test split is never used for selection.
    best_lamda = None
    best_mse = math.inf
    for candidate in lamdas:
        model.train(trainX, trainY, candidate)
        val_mse = model.test(valX, valY)
        if val_mse < best_mse:
            best_mse = val_mse
            best_lamda = candidate
    if best_lamda is None:
        raise ValueError('lamdas must contain at least one value')

    # Refit with the winner, since the loop leaves the last candidate fitted.
    model.train(trainX, trainY, best_lamda)
    model.best_lamda = best_lamda

    mse = model.test(testX, testY)
    print_time(startTime)

    print('Best Lambda: %f' % model.best_lamda)
    print('MSE: %f\n' % mse)
    return mse, model.best_lamda

In [None]:
# Each call evaluates one configuration and returns (test MSE, best lambda).
# Only the first configuration is enabled; the others are commented out, so
# p2..p8 are not defined unless their lines are re-enabled.
p1 = pipeline(unigrams, text2unigram, rmPunc=True, use_tf_idf=True)
# p2 = pipeline(bigrams, text2bigrams, rmPunc=True, use_tf_idf=True)

# p3 = pipeline(unigrams, text2unigram, rmPunc=False, use_tf_idf=True)
# p4 = pipeline(bigrams, text2bigrams, rmPunc=False, use_tf_idf=True)

# p5 = pipeline(unigrams, text2unigram, rmPunc=True, use_tf_idf=False)
# p6 = pipeline(bigrams, text2bigrams, rmPunc=True, use_tf_idf=False)

# p7 = pipeline(unigrams, text2unigram, rmPunc=False, use_tf_idf=False)
# p8 = pipeline(bigrams, text2bigrams, rmPunc=False, use_tf_idf=False)

In [None]:
# Collect whichever pipeline results exist. A configuration that was not run
# has no variable, so it is looked up by name and reported as None.
res = [globals().get(name)
       for name in ('p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8')]

# Descriptions, in the same order as `res`.
model_desc = []
model_desc.append('Unigrams, Remove, tfidf')
model_desc.append('Bigrams, Remove, tfidf')
model_desc.append('Unigrams, Perserve, tfidf')
model_desc.append('Bigrams, Perserve, tfidf')
model_desc.append('Unigrams, Remove, word counts')
model_desc.append('Bigrams, Remove, word counts')
model_desc.append('Unigrams, Perserve, word counts')
model_desc.append('Bigrams, Perserve, word counts')

print("{:<30} {:<8} {:<8}".format('model desc', 'mse', 'lambda'))
for m, r in zip(model_desc, res):
    if r is None:
        continue  # this configuration was not run
    print("{:<30} {:<8} {:<8}".format(m, r[0], r[1]))