In [16]:
# imports needed libs
import gensim 
import pandas as pd
import pickle
import itertools
import spacy

In [17]:
nlp = spacy.load('en_core_web_sm', disable=["ner"])

In [2]:
def read_topic_data():
    return pd.read_csv('processed_data/cleaned_reviews.csv', usecols = ["business_id", "name", "review_id","text","topic_text","sentiment_text"])

In [3]:
data = read_topic_data()

In [37]:
stop_words = [line.rstrip('\n') for line in open('config/stopwords.txt', 'r', encoding='utf-8')] 
neg_words  = [line.rstrip('\n') for line in open('config/negations.txt', 'r', encoding='utf-8')]
stopwords = stop_words + neg_words

In [5]:
def list_diff(list1,list2):
    return list(itertools.filterfalse(set(list2).__contains__, list1)) 

In [6]:
def split_phrases(str_list):
    new_list = []
    for tok in str_list:
        if '-' in tok:
            new_list += tok.split('-')
        new_list.append(tok)
    return list_diff(new_list,stop_words)

In [7]:
topic_set = [split_phrases((u''+str(txt)).replace('.','').split()) for txt in data.topic_text]

In [8]:
senti_set = [split_phrases((u''+str(txt)).replace('.','').split()) for txt in data.sentiment_text]

In [None]:
%%time
#build vocabulary and train model
topic_model = gensim.models.Word2Vec(
        topic_set,
        size=100,
        window=10,
        min_count=100,
        workers=100,
        iter=10)

#  Stats
## NC (0250 MB): Wall time:   14.8 s /   22 s
## ALL         : 4min 34s

In [None]:
%%time
#build vocabulary and train model
senti_model = gensim.models.Word2Vec(
        senti_set,
        size=100,
        window=15,
        min_count=100,
        workers=100,
        iter=20)

#  Stats
## NC (0250 MB): Wall time:   14.8 s /   22 s
## ALL         : Wall time: 17min 19s

In [None]:
#with open('pickles/aspect_topic_model.pk', 'wb') as fin:
#    pickle.dump(topic_model, fin)
#with open('pickles/aspect_senti_model.pk', 'wb') as fin:
#    pickle.dump(senti_model, fin)

In [7]:
with open('pickles/aspect_topic_model.pk', 'rb') as fin:
    topic_model = pickle.load(fin)
with open('pickles/aspect_senti_model.pk', 'rb') as fin:
    senti_model = pickle.load(fin)

In [4]:
topic_model.wv.most_similar(positive='cusine',topn=5)

[('cuisine', 0.7098367810249329),
 ('thailand', 0.6114474534988403),
 ('laos', 0.607699453830719),
 ('authenticity', 0.5937446355819702),
 ('region', 0.5881584882736206)]

In [5]:
senti_model.wv.most_similar(positive='far',topn=2)

[('superior-food', 0.6407245397567749),
 ('superior-service', 0.5888615846633911)]

In [7]:
senti_model.wv.similarity(w1='location',w2='far')

0.22458847

In [66]:
senti_model.wv.similarity(w1='staff',w2='far')

0.016833797

In [68]:
senti_model.wv.similarity(w1='atmosphere',w2='far')

0.0422838

In [69]:
senti_model.wv.similarity(w1='price',w2='far')

0.05009885

In [8]:
senti_model.wv.similarity(w1='service',w2='far')

0.073056154

In [9]:
Aspect_Terms = ['~Ambiance','~Food','~Staff','~Service','~Location','~Value','~Cusine','~Street','~Area','~Cusine','~Employee','Lunch','Dinner','Breakfast','Chef','Manager','Waiter','Bartender','Attitude','behavior',]

In [8]:
def get_all_topics_map():
    with open('config/all_topics.txt', 'r') as f:
         ret = {line.split(';')[0]: line.split(';')[1].replace('\n','') for line in f.readlines()}
    return ret

In [9]:
topic_map = get_all_topics_map()

In [15]:
vocab = []
for x in topic_map.values():
    for y in x.split('/'):
        vocab.append(y)

In [10]:
def get_aspect_map():
    with open('config/aspect_map.txt', 'r') as f:
        ret = {line.split(';')[0]: line.split(';')[1].replace('\n','') for line in f.readlines()}
    return ret

In [68]:
aspect_map = get_aspect_map()

In [69]:
aspect_map.keys()

dict_keys(['food', 'atmosphere', 'staff', 'value', 'cent', 'location', 'appetizer', 'area', 'attentive', 'bacon', 'bartender', 'beer', 'bill', 'bland', 'bread', 'breakfast', 'brunch', 'buffet', 'burger', 'burrito', 'caesar', 'charge', 'check', 'cheese', 'chef', 'chicken', 'clean', 'coffee', 'cook', 'couple', 'crust', 'customer', 'service', 'delicious', 'dessert', 'dinner', 'dish', 'dollar', 'dress', 'drink', 'drive', 'employee', 'fast', 'fish', 'flavor', 'friendly', 'friendly-staff', 'great-atmosphere', 'great-customer', 'great-food', 'great-service', 'grill', 'happy-hour', 'helpful', 'hotel', 'hour', 'inside', 'lobster', 'lunch', 'manager', 'meal', 'meat', 'minute', 'noodle', 'not-wait', 'offer', 'option', 'outside', 'pancake', 'pasta', 'people', 'pizza', 'portion', 'price', 'bad', 'overprice', 'money', 'quality', 'refill', 'reservation', 'review', 'selection', 'server', 'slow', 'special', 'star', 'street', 'tasty', 'town', 'parking', 'kitchen', 'table', 'chair', 'vegas', 'visit', 'wa

In [74]:
def assign_aspect(sent,aspect_map):
    dicty = {}
    
    for word in nlp(sent.lower()):
        if word.lemma_ not in stopwords:
            score = 0
            aspec = None
            for asp in aspect_map.keys():
                try:
                    sco = senti_model.wv.similarity(w1=asp,w2=word.lemma_)
                    if sco > score:
                        score=sco
                        aspec=asp
                except:
                    continue
            dicty[aspec]= dicty.setdefault(asp, 0) + score    
    if len(dicty)==0:
        return None
    
    ret = max(dicty.items(), key=lambda k: k[1])
    
    return aspect_map[ret[0]]

In [None]:
lst = []
start = time.time()
total = len(data)
for index,txt in enumerate(data.text[:1000000]):
    mydict = {}
    for x in txt.replace('\\n','.').replace('\n','.').split('.'):
        if(len(x)>0):
            y = assign_aspect(x,aspect_map)
            mydict[y] = mydict.setdefault(y, '') + x +'\n' 
    mydict.pop(None, None)
    lst.append(mydict)
    
    if index%10000 == 0 and index > 0:
        end = time.time()
        printTS(f'Clustered [{index+1:>{5}}/{total:>{5}} ] - {str(end-start):>{9.6}} secs')
        start = time.time()

2019-12-03 03:57:05: Clustered [10001/3527902 ] -    797.92 secs
2019-12-03 04:10:35: Clustered [20001/3527902 ] -    810.40 secs
2019-12-03 04:23:30: Clustered [30001/3527902 ] -    775.14 secs
2019-12-03 04:37:25: Clustered [40001/3527902 ] -    835.21 secs
2019-12-03 04:51:53: Clustered [50001/3527902 ] -    867.99 secs
2019-12-03 05:05:32: Clustered [60001/3527902 ] -    818.44 secs
2019-12-03 05:18:26: Clustered [70001/3527902 ] -    774.64 secs
2019-12-03 05:32:18: Clustered [80001/3527902 ] -    831.49 secs
2019-12-03 05:45:05: Clustered [90001/3527902 ] -    767.17 secs
2019-12-03 05:58:22: Clustered [100001/3527902 ] -    796.67 secs
2019-12-03 06:10:10: Clustered [110001/3527902 ] -    708.43 secs
2019-12-03 06:22:45: Clustered [120001/3527902 ] -    755.26 secs
2019-12-03 06:36:43: Clustered [130001/3527902 ] -    837.67 secs
2019-12-03 06:50:19: Clustered [140001/3527902 ] -    815.43 secs
2019-12-03 07:03:58: Clustered [150001/3527902 ] -    819.07 secs
2019-12-03 07:18:08

In [78]:
len(data)

3527902