In [13]:
import numpy as np
import urllib
import nltk
from nltk.stem.porter import *
from sklearn import linear_model

import string
import math
import random
from collections import Counter

In [2]:
def parseData(fname):
    for l in urllib.request.urlopen(fname):
        yield eval(l)

all_data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))

data = all_data[:5000]
corpus = [d['review/text'] for d in data]

In [3]:
##########################
# 1
##########################

punctuation = set(string.punctuation)

def puncFilter(text):
    return ''.join([c for c in text.lower() if c not in punctuation])

def text2bigrams(text, rmPunc=True):
    if rmPunc:
        return nltk.bigrams(puncFilter(text).split())
    else:
        return nltk.bigrams(text.split())

bigrams_cnt = Counter()

for text in corpus:
    bigrams_cnt += Counter(text2bigrams(text))
    
for i in bigrams_cnt.most_common(5):
    print(i)

(('with', 'a'), 4587)
(('in', 'the'), 2595)
(('of', 'the'), 2245)
(('is', 'a'), 2056)
(('on', 'the'), 2033)


In [4]:
##########################
# 2
##########################

def word_cnt_feature(top_grams, text_grams):
    feat = [0] * len(top_grams)
    for bi in text_grams:
        try:
            feat[top_grams.index(bi)] += 1
        except:
            pass
    feat.append(1)
    return feat


def calMSE(X, y, lamda=1.0):
    clf = linear_model.Ridge(lamda, fit_intercept=False)
    clf.fit(X, y)
    predictions = clf.predict(X)
    return np.mean((y - predictions)**2)

bigrams = [entry[0] for entry in bigrams_cnt.most_common(1000)]
X = [word_cnt_feature(bigrams, text2bigrams(text)) for text in corpus]
y = [d['review/overall'] for d in data]

mse = calMSE(X, y)
print('MSE: %f' % (mse))

MSE: 0.342991


In [5]:
##########################
# 3
##########################

IDF_D = {}

def text2unigram(text, rmPunc=True):
    if rmPunc:
        return puncFilter(text).split()
    else:
        text.split()

def calTF(word, gram):
    cnt = Counter(gram)
    try:
        return cnt[word]
    except:
        return 0
    
def calIDF(word, cnvt, rmPunc):
    try:
        idf = IDF_D[word]
    except:
        numer = len(corpus)
        denom = sum([word in cnvt(text, rmPunc) for text in corpus])
        denom = denom if denom !=0 else 1e-7 # avoid division by 0
        idf = math.log10(numer / denom)
        IDF_D[word] = idf
    return idf
    
def calTF_IDF(word, text, cnvt, rmPunc=True):
    gram = cnvt(text, rmPunc)
    tf = calTF(word, gram)
    idf = calIDF(word, cnvt, rmPunc)
    return tf * idf


word_list = ['foam', 'smell', 'banana', 'lactic', 'tart']
text = corpus[0]
for word in word_list:
    tf_idf = calTF_IDF(word, text, text2unigram)
    print('"%s" tf-idf score: %f' % (word, tf_idf))

"foam" tf-idf score: 2.275737
"smell" tf-idf score: 0.537902
"banana" tf-idf score: 3.355561
"lactic" tf-idf score: 5.841638
"tart" tf-idf score: 1.806875


In [6]:
##########################
# 4
##########################

def cosine_similarity(x1, x2):
    numer = x1.dot(x2)
    denom = np.linalg.norm(x1) * np.linalg.norm(x2)
    denom = denom if denom !=0 else 1e-7 # avoid division by 0
    return numer / denom

unigrams_cnt = Counter()

for text in corpus:
    unigrams_cnt += Counter(text2unigram(text))
    
unigrams = [entry[0] for entry in unigrams_cnt.most_common(1000)]

text1 = corpus[0]
text2 = corpus[1]

tf_idf1 = np.array([calTF_IDF(w, text1, text2unigram) for w in unigrams])
tf_idf2 = np.array([calTF_IDF(w, text2, text2unigram) for w in unigrams])

cos_sim = cosine_similarity(tf_idf1, tf_idf2)
print('Cosine similarity: %f' %(cos_sim))

Cosine similarity: 0.106130


In [8]:
##########################
# 5
##########################

tf_idf_res = {}
for i, text in enumerate(corpus):
    tf_idf_res[i] = np.array([calTF_IDF(w, text, text2unigram) for w in unigrams])

In [9]:
cos_sim_res = {}
for k, v in tf_idf_res.items():
    cos_sim_res[k] = cosine_similarity(tf_idf_res[0], v)

  


In [10]:
sorted(cos_sim_res.items(), key=lambda x: x[1], reverse=True)[:2]

[(0, 1.0), (4003, 0.3173276600263313)]

In [16]:
##########################
# 6
##########################

def tf_idf_feature(TF_IDF,i):
    return TF_IDF[i]

X = [tf_idf_feature(tf_idf_res, i) for i in range(len(data))]
y = [d['review/overall'] for d in data]

mse = calMSE(X, y)
print('MSE: %f' % (mse))

KeyError: 5000

In [21]:
##########################
# 7
##########################

data = all_data
corpus = [d['review/text'] for d in data]

def split_data(X, Y, n_train, n_val, n_test, shuffle=False):
    m = len(X)
    n_val += n_train
    n_test += n_val

    if shuffle:
        r = list(zip(X, Y))
        random.shuffle(r)
        X, Y = list(zip(*r))

    return (X[:n_train], Y[:n_train]), (X[n_train:n_val], Y[n_train:n_val]), \
            (X[n_val:n_test], Y[n_val:n_test])

# X = [feature(text) for text in corpus]
y = [d['review/overall'] for d in data]

trainData, valData, testData = split_data(corpus, y, 5000, 5000, 5000, shuffle=True)

In [28]:
class Model:
    def __init__(self, top_grams, corpus, cnvt, rmPunc, use_tf_idf, TF_IDF=None):
        self.top_grams = top_grams
        self.corpus = corpus
        self.cnvt = cnvt
        self.rmPunc = rmPunc
        self.use_tf_idf = use_tf_idf
        
        self.TF_IDF = TF_IDF
        
    def get_feature(self, corpus):
        if self.use_tf_idf:
            return [tf_idf_feature(self.TF_IDF, i) for i in range(len(corpus))]
        else:
            return [word_cnt_feature(self.top_grams, self.cnvt(text, self.rmPunc)) 
                    for text in corpus]
    
    def train(self, X, y, lamda):
        self.clf = linear_model.Ridge(lamda, fit_intercept=False)
        self.clf.fit(X, y)
    
    def test(self, X, y=None):
        predictions = clf.predict(X)
        if y is not None:
            return np.mean((y - predictions)**2)
        else:
            return predictions

In [None]:
lamda = [0.01, 0.1, 1, 10, 100]

In [17]:
TF_IDF1 = None
TF_IDF2 = None
TF_IDF3 = None
TF_IDF4 = None

In [23]:
trainCorpus, trainY = trainData

unigrams_cnt = Counter()
bigrams_cnt = Counter()

for text in trainCorpus:
    unigrams_cnt += Counter(text2unigram(text))
    bigrams_cnt += Counter(text2bigrams(text))
    
unigrams = [entry[0] for entry in unigrams_cnt.most_common(1000)]
bigrams = [entry[0] for entry in bigrams_cnt.most_common(1000)]

In [30]:
m1 = Model(unigrams, trainCorpus, text2unigram, rmPunc=True, use_tf_idf=True, TF_IDF=TF_IDF1)
m2 = Model(bigrams, trainCorpus, text2bigrams, rmPunc=True, use_tf_idf=True, TF_IDF=TF_IDF2)

m3 = Model(unigrams, trainCorpus, text2unigram, rmPunc=False, use_tf_idf=True, TF_IDF=TF_IDF3)
m4 = Model(bigrams, trainCorpus, text2bigrams, rmPunc=False, use_tf_idf=True, TF_IDF=TF_IDF4)

m5 = Model(unigrams, trainCorpus, text2unigram, rmPunc=True, use_tf_idf=False)
m6 = Model(bigrams, trainCorpus, text2bigrams, rmPunc=True, use_tf_idf=False)

m7 = Model(unigrams, trainCorpus, text2unigram, rmPunc=False, use_tf_idf=False)
m8 = Model(bigrams, trainCorpus, text2bigrams, rmPunc=False, use_tf_idf=False)