In [409]:
import gensim
from collections import Counter
import numpy as np
import os
from sklearn.metrics import confusion_matrix
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import remove_stopwords
import math
from sklearn.metrics import *
nltk.download('stopwords')
stop_words = stopwords.words('english')
class MyCorpus(object):
    """An interator that yields sentences (lists of str)."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __iter__(self):
        print("reading file {0}...this may take a while".format(self.dataset+"/"+self.dataset+"_corpus.linked.txt"))
        with open(self.dataset, 'rb') as f:
            for i, line in enumerate(f):

                if (i % 10000 == 0):
                    print("read {0} reviews".format(i))
                # do some pre-processing and return list of words for each review
                # text
                line = remove_stopwords(line)
                yield gensim.utils.simple_preprocess(line)

def main():
    documents = MyCorpus('train.txt')
    model = gensim.models.Word2Vec(
            documents,
            size=200,
            min_count=2,
            iter=20)
    model.wv.save_word2vec_format("restaurant_stop.200d.txt", binary=False)
    return model

def cosSim(vec1, vec2):
    return np.dot(vec1, vec2)/(np.linalg.norm(vec1) * np.linalg.norm(vec2))

aspect_kw = {'location': ['street', 'block', 'avenue', 'river', 'convenient'],
             'drinks': ['drinks', 'beverage', 'wines', 'margarita', 'sake'],
             'food': ['food', 'spicy', 'sushi', 'pizza', 'tasty'],
             'ambience': ['romantic', 'atmosphere', 'room', 'seating', 'small'],
             'service': ['tips', 'manager', 'wait', 'waitress', 'servers'],
             }

aspect2ind = {k:i for i,k in enumerate(aspect_kw)}
ind2aspect = {v:k for k,v in aspect2ind.items()}

aspect_kw_enhance = {'location_n': ['street', 'parking', 'avenue', 'river', 'view'],
                    'location_adj': ['convenient', 'near'],
                    'drinks_n': ['drinks', 'beverage', 'wines', 'margarita', 'sake'],
                    'drinks_adj': ['alcoholic', 'iced', 'bottled'],
                    'food_n': ['food', 'pizza', 'tuna', 'sushi', 'burger'],
                    'food_adj': ['spicy', 'tasty', 'delicious', 'bland', 'savory'],
                    'ambience_n': ['atmosphere', 'room', 'decor', 'music', 'space'],
                    'ambience_adj': ['romantic', 'small', 'cozy', 'tiny'],
                    'service_n': ['tips', 'manager', 'wait', 'waitress', 'servers'],
                    'service_adj': ['rude', 'attentive', 'friendly'],
                    }

aspect_kw_1 = {'location': ['street', 'parking', 'avenue', 'river', 'view', 'convenient', 'near'],
                'drinks': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'alcoholic', 'iced', 'bottled'],
                'food': ['food', 'pizza', 'tuna', 'sushi', 'burger', 'spicy', 'tasty', 'delicious', 'bland', 'savory'],
                'ambience': ['atmosphere', 'room', 'decor', 'music', 'space', 'romantic', 'small', 'cozy', 'tiny'],
                'service': ['tips', 'manager', 'wait', 'waitress', 'servers', 'rude', 'attentive', 'friendly']
              }
               
aspect_kw_enhance_expand = {'location_n': ['street', 'parking', 'avenue', 'river', 'view', 'breathtaking', 'meters', '7th', 'overlooking', 'garage', 'valet', 'epicenter', 'eiffel_tower', 'storefront', 'access'], 
                    'location_adj': ['convenient', 'near', 'uw', 'campus', 'rental', 'highway', 'staying', 'close', 'location', 'exit', 'at', 'theatre'], 
                    'drinks_n': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'drink', 'wine', 'sakes', 'beers', 'beer', 'mixed_drinks', 'sauvignon', 'cocktails', 'dessert_wine', 'stiff'], 
                    'drinks_adj': ['alcoholic', 'iced', 'bottled', 'tea', 'teas', 'hefeweizen', 'jug', 'arnold', 'drink', 'brewed', 'chai', 'bellini', 'float'], 
                    'food_n': ['food', 'pizza', 'tuna', 'sushi', 'burger', 'maguro', 'burgers', 'gari', 'ahi', 'sashimi', ',', 'salmon', 'roll', 'tataki'], 
                    'food_adj': ['spicy', 'tasty', 'delicious', 'bland', 'savory', 'flavorful', 'flavor', 'chicken', 'sauce', 'barbecued', 'good', 'salty', 'notably'], 
                    'ambience_n': ['atmosphere', 'room', 'decor', 'music', 'space', 'loungy', 'rooftop', 'overhead', 'fixtures', 'chandeliers', 'furniture', 'ambiance', 'lighting', 'architecture'], 
                    'ambience_adj': ['romantic', 'small', 'cozy', 'tiny', 'intimate', 'atomosphere', 'cramped', 'atmoshere', 'spacious', 'dimly', 'couch', 'low_key', 'rooftop', 'suits'], 
                    'service_n': ['tips', 'manager', 'wait', 'waitress', 'servers', 'server', 'us', 'waiter', 'she', 'barbi', 'our', 'busser', 'sincere', 'acknowledging', 'even'], 
                    'service_adj': ['rude', 'attentive', 'friendly', 'helpful', 'staff', 'service', 'professional', 'outgoing', 'enthusiastic', 'responsive', 'polite', 'informative', 'server']}

sub2aspect = {i: i//2 for i in range(len(aspect_kw_enhance))}

def softmax(x):
    summ = sum(math.e**(xi) for xi in x if xi>0)
    y = [math.e**(xi)/summ if xi >0 else 0 for xi in x ]
    return y

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhangliji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [382]:
def read_vec(file):
    with open(file) as f:
        embs = f.readlines()
    wv = dict()
    for line in embs[1:]:
        line = line.strip().split()
        word = line[0]
        vec = np.array([float(x) for x in line[1:]])
        wv[word] = vec
    return wv

def calculate_topic_emb(aspect_kw, embedding_dict):
    as_topic_emb = dict()
    for asp, words in aspect_kw.items():
        asp_embs = list()
        for word in words:
            if not word in embedding_dict:
                continue
            vec = embedding_dict[word]
            asp_embs.append(vec.tolist())
        tmp_asp_emb = np.array(asp_embs).mean(axis=0)
        as_topic_emb[asp] = tmp_asp_emb / np.linalg.norm(tmp_asp_emb)

    as_topic_mat = np.array([as_topic_emb[aspect] for aspect in aspect_kw])
    return as_topic_mat

In [249]:
wv = read_vec("restaurant.200d.txt")
topic_mat = calculate_topic_emb(aspect_kw, wv)

In [319]:
current_kw, topic_mat_update = expand(aspect_kw)

{'location': ['street', 'block', 'avenue', 'river', 'convenient', 'ave', 'road', 'located', 'central', 'fremont', 'north', 'corner', 'park', 'canal', 'mills'], 'drinks': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'cocktails', 'beers', 'appetizers', 'sangria', 'champagne', 'cocktail', 'beverages', 'wine', 'desserts', 'beer'], 'food': ['food', 'spicy', 'sushi', 'pizza', 'tasty', 'flavorful', 'fried_rice', 'curries', 'sashimi', 'chinese_food', 'pad_see_ew', 'seafood', 'pad', 'plentiful', 'portion_size'], 'ambience': ['romantic', 'atmosphere', 'room', 'seating', 'small', 'intimate', 'space', 'cozy', 'quiet', 'dining_room', 'spacious', 'patio', 'outdoor', 'fireplace', 'comfy'], 'service': ['tips', 'manager', 'wait', 'waitress', 'servers', 'bartender', 'server', 'waiter', 'hostess', 'host', 'wait_staff', 'cashier', 'hostesses', 'waitstaff', 'waiters']}
location 0.9398451795699558
drinks 0.9414686957910438
food 0.9134188614410902
ambience 0.9300946344083338
service 0.946500886754547

In [201]:
current_kw, topic_mat_update = expand(aspect_kw_enhance,k=5)

{'location_n': ['street', 'parking', 'avenue', 'river', 'view', 'bellagio', 'corner', 'pool', 'located', 'building'], 'location_adj': ['convenient', 'near', 'close', 'towards', 'located', 'north', 'mall'], 'drinks_n': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'cocktails', 'beers', 'appetizers', 'sangria', 'champagne'], 'drinks_adj': ['alcoholic', 'iced', 'bottled', 'tea', 'teas', 'peach', 'hibiscus', 'chai'], 'food_n': ['food', 'pizza', 'tuna', 'sushi', 'burger', 'chinese_food', 'burgers', 'hamburger', 'sandwich', 'fare'], 'food_adj': ['spicy', 'tasty', 'delicious', 'bland', 'savory', 'flavorful', 'yummy', 'watery', 'oily', 'salty'], 'ambience_n': ['atmosphere', 'room', 'decor', 'music', 'space', 'ambience', 'vibe', 'ambiance', 'environment', 'lighting'], 'ambience_adj': ['romantic', 'small', 'cozy', 'tiny', 'intimate', 'tight', 'spacious', 'classy', 'cramped'], 'service_n': ['tips', 'manager', 'wait', 'waitress', 'servers', 'bartender', 'server', 'waiter', 'hostess', 'host'

In [173]:
current_kw, topic_mat_update = expand(aspect_kw_enhance,ite=True)

{'location_n': ['street', 'parking', 'avenue', 'river', 'view', 'bellagio', 'pool', 'corner', 'located', 'center', 'shopping', 'mall', 'overlooking', 'casino', 'north'], 'location_adj': ['convenient', 'near', 'close', 'closer', 'closest', 'connected', 'mall', 'located', 'center', 'conveniently', 'shopping', 'north'], 'drinks_n': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'cocktails', 'beers', 'appetizers', 'desserts', 'beverages', 'martinis', 'drink_specials', 'sangria', 'apps', 'specialty'], 'drinks_adj': ['alcoholic', 'iced', 'bottled', 'tea', 'teas', 'chai', 'peach', 'lemonade', 'mango', 'strawberry', 'milk', 'green_tea', 'latte'], 'food_n': ['food', 'pizza', 'tuna', 'sushi', 'burger', 'chinese_food', 'fare', 'cuisine', 'mexican_food', 'foods', 'comfort_food', 'fast_food', 'breakfast_place', 'fusion', 'mex'], 'food_adj': ['spicy', 'tasty', 'delicious', 'bland', 'savory', 'flavorful', 'yummy', 'watery', 'oily', 'salty', 'rich', 'creamy', 'chewy', 'tender', 'moist'], 'ambien

In [244]:
def expand(aspect_kw=aspect_kw, k=10, thres=1.0,use_center=True, ite=False, wv=wv):
    aspect2ind = {k:i for i,k in enumerate(aspect_kw)}
    ind2aspect = {v:k for k,v in aspect2ind.items()}
    current_kw ={}
    for aspect in aspect_kw:
        current_kw[aspect]=aspect_kw[aspect].copy()
    topic_mat = calculate_topic_emb(aspect_kw, wv)
    topic_mat_update = topic_mat.copy()
    if ite:
        ran = k
        k = 1
    else:
        ran = 1
    for i in range(ran):
        dis_topic = defaultdict(dict)
        for asp in current_kw:
            for w in wv:
                if w not in current_kw[asp]:
                    asp_scores = [max(cosSim(topic_mat_update[aspect2ind[asp]],wv[w]),0) for asp in current_kw]
                    ordered = sorted(asp_scores,reverse=True)
                    if ind2aspect[np.argmax(asp_scores)]==asp and ordered[0]/(ordered[1]+1e-5)<thres:
                        dis_topic[asp][w]=0.0
                    else:
                        dis_topic[asp][w]=cosSim(topic_mat_update[aspect2ind[asp]],wv[w])
        '''
        for w in wv:
            if use_center:
                asp_scores = [max(cosSim(topic_mat[aspect2ind[asp]],wv[w]),0) for asp in current_kw]
            else:
                asp_scores=[]
                for aspect in current_kw:
                    ind_score = [cosSim(wv[key], wv[w]) for key in current_kw[aspect]]
                    asp_scores.append(max(ind_score))
            ordered = sorted(asp_scores,reverse=True)
            dis_topic[ind2aspect[np.argmax(asp_scores)]][w]=ordered[0]/(ordered[1]+1e-5) #ordered[0]-ordered[1])/ordered[0]
        '''
        for asp in current_kw:        
            top_words = sorted(dis_topic[asp],key=dis_topic[asp].get, reverse=True)[:k]
            current_kw[asp].extend(top_words)
        topic_mat_update = calculate_topic_emb(current_kw, wv)
    print(current_kw)
    for asp in current_kw:
        print(asp, cosSim(topic_mat[aspect2ind[asp]],topic_mat_update[aspect2ind[asp]]))
        '''
        for asp2 in current_kw:
            if asp != asp2:
                print('before',asp, asp2,cosSim(topic_mat[aspect2ind[asp]],topic_mat[aspect2ind[asp2]]))
                print('after',asp, asp2,cosSim(topic_mat_update[aspect2ind[asp]],topic_mat_update[aspect2ind[asp2]]))
        '''
    return current_kw, topic_mat_update

In [404]:
def eval(embedding_dict, as_topic_mat,filename='test_score.txt',soft=False, use_center=True,
         enhance=False,thres=0.0,current_kw=current_kw):
    with open(os.path.join('test.txt')) as f:
        test_cont = f.readlines()

    asp_labels, senti_labels, docs = list(), list(), list()
    for line in test_cont:
        _, as_label, senti_label, doc = line.strip().split('\t')
        asp_labels.append(int(as_label))
        senti_labels.append(int(senti_label))
        docs.append(doc)
    emb_size = 200
    doc_embs = list()
    doc_weights= []
    doc_choices = []
    for doc in docs:
        doc_vec = np.zeros(emb_size)
        sen_weights = []
        sen_choices = [] 
        norm = 0
        for word in gensim.utils.simple_preprocess(doc):
            if word not in embedding_dict:
                continue
            word_vec = embedding_dict[word]
            norm_word_vec = word_vec / np.linalg.norm(word_vec)
            if use_center:
                product = np.dot(as_topic_mat, norm_word_vec.reshape(-1,1))
            else:
                product=[]
                for aspect in current_kw:
                    ind_score = [cosSim(embedding_dict[w], norm_word_vec) for w in current_kw[aspect]]
                    product.append(max(ind_score))
                product = np.array(product)
            if soft: 
                word_weight = np.max(softmax(product.reshape(-1)))* np.max(product)
            else:
                word_weight = np.max(product)
            norm += word_weight
            doc_vec += word_vec * word_weight
            word_choice = np.argmax(product)
            sen_weights.append(word_weight)
            sen_choices.append(word_choice)
        doc_embs.append(doc_vec/norm)
        doc_weights.append(sen_weights)
        doc_choices.append(sen_choices)

    doc_embs = np.array(doc_embs)
    norm_doc_embs = np.array([vec/np.linalg.norm(vec) for vec in doc_embs])
    as_scores = np.dot(norm_doc_embs, as_topic_mat.T)
    raw_labels = np.argmax(as_scores, axis=1)
    if enhance:
        as_pseudo_labels = np.array([sub2aspect[l] for l in raw_labels])
    else:
        as_pseudo_labels = raw_labels
    # subsample according to softmax confidence
    as_scores = np.array([softmax(score) for score in as_scores])
    confi_scores =  np.max(as_scores, axis=1)
    print(sum(confi_scores>thres))
    as_pseudo_labels_sub = as_pseudo_labels[confi_scores>thres]
    asp_labels_sub = np.array(asp_labels)[confi_scores>thres]
    
    print(confusion_matrix(asp_labels_sub, as_pseudo_labels_sub))
    p = precision_score(asp_labels_sub, as_pseudo_labels_sub,average='macro')
    r = recall_score(asp_labels_sub, as_pseudo_labels_sub, average='macro')
    f1_mac = f1_score(asp_labels_sub, as_pseudo_labels_sub, average='macro')
    p_mic = precision_score(asp_labels_sub, as_pseudo_labels_sub,average='micro')
    r_mic = recall_score(asp_labels_sub, as_pseudo_labels_sub, average='micro')
    f1_mic = f1_score(asp_labels_sub, as_pseudo_labels_sub, average='micro')
    print('mac {} {} {}'.format(p, r, f1_mac))
    print('mic {} {} {}'.format(p_mic, r_mic, f1_mic))
    with open(filename,'w') as f:
        for i in range(len(test_cont)):
            if thres >= 0.0:
                #if confi_scores[i]>thres:
                f.write(str(asp_labels[i])+'\t')
                f.write(str(as_pseudo_labels[i])+'\t')
                #f.write(str(confi_scores[i])+ '\t')
                if enhance:
                    partial_sum =0
                    for index, s in enumerate(as_scores[i]):
                        partial_sum +=s
                        if index%2 ==1:
                            f.write(str(partial_sum)+ '\t')
                            partial_sum =0
                else:
                    for s in as_scores[i]:
                        f.write(str(s)+ '\t')
                        
                f.write(docs[i]+'\n')
            else:
                if as_pseudo_labels[i] != asp_labels[i]:
                    f.write(test_cont[i])
                    for s in doc_weights[i]:
                        f.write(format(s, '.2f')+' ')
                    f.write('\n')
                    for c in doc_choices[i]:
                        f.write(str(c)+' ')
                    f.write('\n')
                    f.write(str(as_pseudo_labels[i])+'\n')
    return np.sum(as_pseudo_labels_sub == asp_labels_sub)/len(asp_labels_sub)

In [252]:
eval(wv, topic_mat,'test_base.txt',use_center=True)

[[  1   0   6   2   2]
 [  0  22  25   0  10]
 [  1   2 281   4  16]
 [  2   0  11  54  10]
 [  1   1  22   7 163]]


0.8102643856920684

In [321]:
#thres = 0.3
#for thres in [0.30.35,0.37,0.4]:
#    eval(wv, topic_mat_update,'pseudo_{}_{}'.format(k, thres),use_center=False, soft=True, thres=thres)
eval(wv, topic_mat_update,'pseudo_{}'.format(k),use_center=False, soft=True, thres=0.0,current_kw=current_kw)
# thres number acc
# 0.4   175 0.954
# 0.37  237 0.932
# 0.35  278 0.896
# 0.30  428 0.878 

642
[[  2   0   1   2   0]
 [  0  21   3   0   1]
 [  7  28 282  10  17]
 [  1   1   2  57   6]
 [  2  13   9   8 169]]
precision 0.6130804716296944 recall 0.7502619460835358 f1 mac 0.6483953453797643


0.8271028037383178

In [158]:
eval(wv, topic_mat_update,'test_ind_{}'.format(k),use_center=False)

[[  1   1  11   2   2]
 [  0  22  48   1  12]
 [  1   1 237   1   3]
 [  2   0  13  55   5]
 [  1   1  36   8 179]]


0.7682737169517885

In [196]:
eval(wv, topic_mat_update,'test_{}_enhance_ite'.format(k),use_center=True, enhance=True)

[[  3   0   4   6   4]
 [  0  21  45   2  12]
 [  0   2 262   1   7]
 [  2   0   7  51   5]
 [  0   2  27   7 173]]


0.7931570762052877

In [205]:
eval(wv, topic_mat_update,'mistake_{}_enhance'.format(k),use_center=False, enhance=True)

[[  3   0   7   8   7]
 [  0  21  24   1  11]
 [  0   3 279   1   9]
 [  2   0  10  52   6]
 [  0   1  25   5 168]]


0.8133748055987559

In [316]:
wv_stop = read_vec("restaurant_stop.200d.txt")
current_kw_stop, topic_mat_update_stop = expand(aspect_kw,wv=wv_stop)
eval(wv_stop, topic_mat_update_stop,'stop_test_{}'.format(k),use_center=False, soft=False)

{'location': ['street', 'block', 'avenue', 'river', 'convenient', 'fremont', 'central', 'shopping', 'roosevelt', 'convention', 'ave', 'brien', 'smith', 'road', 'near'], 'drinks': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'cocktails', 'beverages', 'beers', 'whiskey', 'cocktail', 'martini', 'pitcher', 'great_wine', 'gin', 'alcohol'], 'food': ['food', 'spicy', 'sushi', 'pizza', 'tasty', 'edamame', 'orange_chicken', 'teriyaki', 'gyoza', 'sashimi', 'decently', 'lunch_specials', 'budae', 'exceptionally', 'fishes'], 'ambience': ['romantic', 'atmosphere', 'room', 'seating', 'small', 'intimate', 'setting', 'cozy', 'private', 'quiet', 'booths', 'layout', 'cramped', 'comfy', 'bar_area'], 'service': ['tips', 'manager', 'wait', 'waitress', 'servers', 'hostesses', 'waiter', 'server', 'hostess', 'bartender', 'gentleman', 'waitresses', 'apologized', 'acknowledge', 'politely']}
location 0.9677821275739239
drinks 0.9666134917109935
food 0.932038885494072
ambience 0.9548992844172887
service 0.

KeyError: '3-6pm'

In [213]:
for asp in current_kw:
    print(asp, cosSim(topic_mat[aspect2ind[asp]],wv['best']))

location 0.040803956469103946
drinks -0.0431516091044525
food 0.17099126274739176
ambience -0.012356832791683724
service -0.07536520211261215


In [334]:
wv3 = read_vec("wv110.txt")
topic_mat3 = calculate_topic_emb(aspect_kw, wv3)

In [335]:
current_kw3, topic_mat_update3 = expand(aspect_kw,wv=wv3,k=10)

{'location': ['street', 'block', 'avenue', 'river', 'convenient', 'highway', '7th', 'strip', 'storefront', 'shoppes', 'fremont', 'ave', 'breathtaking', 'epicenter', 'roosevelt'], 'drinks': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'drink', 'wine', 'sakes', 'beers', 'beer', 'mixed_drinks', 'sauvignon', 'cocktails', 'dessert_wine', 'stiff'], 'food': ['food', 'spicy', 'sushi', 'pizza', 'tasty', 'good', '.', ',', 'but', 'and', 'delicious', 'it', 'is', 'was', 'which'], 'ambience': ['romantic', 'atmosphere', 'room', 'seating', 'small', 'rooftop', 'intimate', 'cozy', 'outdoor', 'patio', 'atomosphere', 'space', 'quieter', 'couch', 'indoor'], 'service': ['tips', 'manager', 'wait', 'waitress', 'servers', 'server', 'us', 'waiter', 'she', 'barbi', 'our', 'busser', 'sincere', 'acknowledging', 'even']}
location 0.8776883985179151
drinks 0.8929628854691214
food 0.8862285898981843
ambience 0.9127673688783818
service 0.8866611147381297


In [332]:
eval(wv2, topic_mat_update2,'pseudo2_{}'.format(k),use_center=False, soft=True, thres=0.0,current_kw=current_kw2)



642
[[  3   0   1   1   0]
 [  0  21   2   0   2]
 [  3  12 277  17  35]
 [  1   1   1  55   9]
 [  2  11   7   9 172]]
precision 0.6443056177121405 recall 0.7843698947124841 f1 mac 0.6928822254488667


0.822429906542056

In [324]:
eval(wv2, topic_mat2,'pseudo2_{}'.format(k),thres=0.0,current_kw=current_kw2)

642
[[  2   0   0   2   1]
 [  0  23   1   1   0]
 [ 10  12 284  16  22]
 [  1   1   3  56   6]
 [  2   9  13  13 164]]
precision 0.6148141216269906 recall 0.759464537776235 f1 mac 0.6585661933975155




0.82398753894081

In [336]:
eval(wv3, topic_mat_update3,'pseudo3_{}'.format(k),use_center=False, soft=True, thres=0.0,current_kw=current_kw3)



642
[[  1   0   4   0   0]
 [  0   8  16   0   1]
 [  0   0 342   2   0]
 [  0   0  58   9   0]
 [  0   0 159   0  42]]
precision 0.8771199158715728 recall 0.371493925720236 f1 mac 0.4268550198358557


0.6261682242990654

In [337]:
eval(wv3, topic_mat3,'pseudo3_{}'.format(k),thres=0.0,current_kw=current_kw3)

642
[[  1   0   4   0   0]
 [  0  11  11   1   2]
 [  1   1 331   4   7]
 [  0   0  19  45   3]
 [  0   2  71   6 122]]
precision 0.75178155748127 recall 0.5761632534999422 f1 mac 0.6317200649126387




0.794392523364486

In [384]:
wv3 = read_vec("wv110.txt")
topic_mat3_enhance = calculate_topic_emb(aspect_kw_enhance, wv3)
eval(wv3, topic_mat3_enhance,'pseudo3_{}'.format(k),enhance=True,thres=0.0)

642
[[  2   0   1   1   1]
 [  1  17   4   0   3]
 [  1   4 308   6  25]
 [  2   0   1  56   8]
 [  1   3  16   7 174]]
precision 0.7104051004287971 recall 0.735368274904547 f1 mac 0.7206671828481547 mic 0.867601246105919




0.867601246105919

In [355]:
wv3 = read_vec("wv110.txt")
current_kw3, topic_mat_update3 = expand(aspect_kw_enhance,wv=wv3,k=2)
eval(wv3, topic_mat_update3,'pseudo3_{}'.format(k),use_center=True, soft=False, enhance=True, thres=0.0,current_kw=current_kw3)

{'location_n': ['street', 'parking', 'avenue', 'river', 'view', 'breathtaking', 'meters'], 'location_adj': ['convenient', 'near', 'uw', 'campus'], 'drinks_n': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'drink', 'wine'], 'drinks_adj': ['alcoholic', 'iced', 'bottled', 'tea', 'teas'], 'food_n': ['food', 'pizza', 'tuna', 'sushi', 'burger', '.', 'maguro'], 'food_adj': ['spicy', 'tasty', 'delicious', 'bland', 'savory', 'flavorful', 'flavor'], 'ambience_n': ['atmosphere', 'room', 'decor', 'music', 'space', 'loungy', 'rooftop'], 'ambience_adj': ['romantic', 'small', 'cozy', 'tiny', 'intimate', 'atomosphere'], 'service_n': ['tips', 'manager', 'wait', 'waitress', 'servers', 'server', 'us'], 'service_adj': ['rude', 'attentive', 'friendly', 'helpful', 'staff']}
location_n 0.9711896629861697
location_adj 0.8534451075664973
drinks_n 0.9742740020508132
drinks_adj 0.9304048498657731
food_n 0.9637818291098436
food_adj 0.9658801868683542
ambience_n 0.9527380714825309
ambience_adj 0.94126106224



0.8582554517133957

In [407]:
wv4 = read_vec("wv113.txt")
topic_mat4 = calculate_topic_emb(aspect_kw, wv4)
eval(wv4, topic_mat4,'pseudo5',thres=0.0,use_center=True, soft=True,current_kw = aspect_kw)



642
[[  1   0   4   0   0]
 [  0   9  14   1   1]
 [  1   0 336   3   4]
 [  0   0  16  48   3]
 [  0   3  77   4 117]]
mac 0.75896414189837 0.5670503297466157 0.6240075020491319
mic 0.7959501557632399 0.7959501557632399 0.7959501557632399


0.7959501557632399

In [356]:
current_kw4, topic_mat_update4 = expand(aspect_kw,wv=wv4,k=2)
eval(wv4, topic_mat_update4,'pseudo4_{}'.format(k),use_center=False, soft=True, thres=0.0,current_kw=current_kw4)

{'location': ['street', 'block', 'avenue', 'river', 'convenient', 'highway', '7th'], 'drinks': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'drink', 'sakes'], 'food': ['food', 'spicy', 'sushi', 'pizza', 'tasty', 'good', '.'], 'ambience': ['romantic', 'atmosphere', 'room', 'seating', 'small', 'rooftop', 'intimate'], 'service': ['tips', 'manager', 'wait', 'waitress', 'servers', 'server', 'us']}
location 0.954659470749324
drinks 0.9746646478734772
food 0.9813661384906037
ambience 0.956346554669606
service 0.972572788121977




642
[[  1   0   4   0   0]
 [  0   6  17   0   2]
 [  1   0 337   2   4]
 [  0   0  46  19   2]
 [  0   4 108   1  88]]
precision 0.7077012310606061 recall 0.4282088395233137 f1 mac 0.48710269817378915


0.7024922118380063

In [412]:
topic_mat4_enhance = calculate_topic_emb(aspect_kw_enhance, wv4)
eval(wv4, topic_mat4_enhance,'mis_pseudo4',enhance=True,use_center=True, soft=False,thres=-2.0,current_kw = current_kw4)



643
[[  2   0   1   1   1]
 [  1  17   6   0   1]
 [  3   5 309   6  22]
 [  2   0   2  57   6]
 [  1   3  17   3 177]]
mac 0.7060858028697095 0.7413990914990267 0.7185862285212592
mic 0.8740279937791602 0.8740279937791602 0.8740279937791602


0.8740279937791602

In [372]:
current_kw4, topic_mat_update4 = expand(aspect_kw_enhance,wv=wv4,k=2)
eval(wv4, topic_mat_update4,'pseudo4_{}'.format(k),use_center=True, soft=False, enhance=True, thres=0.0,current_kw=current_kw4)

{'location_n': ['street', 'parking', 'avenue', 'river', 'view', 'breathtaking', '7th'], 'location_adj': ['convenient', 'near', 'campus', 'uw'], 'drinks_n': ['drinks', 'beverage', 'wines', 'margarita', 'sake', 'drink', 'sakes'], 'drinks_adj': ['alcoholic', 'iced', 'bottled', 'tea', 'teas'], 'food_n': ['food', 'pizza', 'tuna', 'sushi', 'burger', '.', 'sashimi'], 'food_adj': ['spicy', 'tasty', 'delicious', 'bland', 'savory', 'flavorful', 'flavor'], 'ambience_n': ['atmosphere', 'room', 'decor', 'music', 'space', 'loungy', 'rooftop'], 'ambience_adj': ['romantic', 'small', 'cozy', 'tiny', 'intimate', 'atomosphere'], 'service_n': ['tips', 'manager', 'wait', 'waitress', 'servers', 'server', 'us'], 'service_adj': ['rude', 'attentive', 'friendly', 'helpful', 'staff']}
location_n 0.9610417382931088
location_adj 0.85452770287294
drinks_n 0.9746646478734772
drinks_adj 0.9326204257430823
food_n 0.9574452809989803
food_adj 0.967577217104486
ambience_n 0.9554058171097197
ambience_adj 0.941270969093756



0.8489096573208723

In [395]:
wv6 = read_vec("wv116.txt")
topic_mat6 = calculate_topic_emb(aspect_kw_enhance_expand, wv6)
eval(wv6, topic_mat6,'pseudo6',thres=0.0,use_center=True, soft=True, enhance=True, current_kw = aspect_kw_enhance_expand )



642
[[  4   0   0   1   0]
 [  0  17   5   0   3]
 [ 10   6 260   6  62]
 [  5   1   0  47  14]
 [  4   2   8   2 185]]
precision 0.6640366879497315 recall 0.7715409001504108 f1 mac 0.6710190403444863
precision 0.7990654205607477 recall 0.7990654205607477 f1 micc 0.7990654205607477


0.7990654205607477

In [410]:
wv7 = read_vec("wv120.txt")
topic_mat7 = calculate_topic_emb(aspect_kw_1, wv7)
eval(wv7, topic_mat7,'pseudo7',thres=0.0,use_center=True, soft=True,current_kw = aspect_kw_1)



642
[[  2   0   2   1   0]
 [  0  13  10   0   2]
 [  2   2 319   8  13]
 [  3   0   6  53   5]
 [  0   2  31   3 165]]
mac 0.7249089002861382 0.6918531759805623 0.7012801901868676
mic 0.8598130841121495 0.8598130841121495 0.8598130841121495


0.8598130841121495

In [417]:
"n't" in wv

False