In [1]:
# imports needed libs
import gensim 
import pandas as pd
import pickle
import itertools
import spacy
import time

In [2]:
nlp = spacy.load('en_core_web_sm', disable=["ner"])

In [3]:
def read_topic_data():
    return pd.read_csv('processed_data/cleaned_reviews.csv', usecols = ["business_id", "name", "review_id","text","topic_text","sentiment_text","review_stars"])

In [4]:
data = read_topic_data()

In [5]:
stop_words = [line.rstrip('\n') for line in open('config/stopwords.txt', 'r', encoding='utf-8')] 
neg_words  = [line.rstrip('\n') for line in open('config/negations.txt', 'r', encoding='utf-8')]
stopwords = stop_words + neg_words

In [6]:
def list_diff(list1,list2):
    return list(itertools.filterfalse(set(list2).__contains__, list1)) 

In [7]:
def split_phrases(str_list):
    new_list = []
    for tok in str_list:
        if '-' in tok:
            new_list += tok.split('-')
        new_list.append(tok)
    return list_diff(new_list,stop_words)

In [8]:
topic_set = [split_phrases((u''+str(txt)).replace('.','').split()) for txt in data.topic_text]

In [9]:
senti_set = [split_phrases((u''+str(txt)).replace('.','').split()) for txt in data.sentiment_text]

In [None]:
%%time
#build vocabulary and train model
topic_model = gensim.models.Word2Vec(
        topic_set,
        size=100,
        window=10,
        min_count=100,
        workers=100,
        iter=10)

#  Stats
## NC (0250 MB): Wall time:   14.8 s /   22 s
## ALL         : 4min 34s

In [None]:
%%time
#build vocabulary and train model
senti_model = gensim.models.Word2Vec(
        senti_set,
        size=100,
        window=15,
        min_count=100,
        workers=100,
        iter=20)

#  Stats
## NC (0250 MB): Wall time:   14.8 s /   22 s
## ALL         : Wall time: 17min 19s

In [None]:
#with open('pickles/aspect_topic_model.pk', 'wb') as fin:
#    pickle.dump(topic_model, fin)
#with open('pickles/aspect_senti_model.pk', 'wb') as fin:
#    pickle.dump(senti_model, fin)

In [23]:
stopwords.remove('restaurant')

ValueError: list.remove(x): x not in list

In [8]:
senti_model = None
topic_model = None
with open('pickles/aspect_topic_model.pk', 'rb') as fin:
    topic_model = pickle.load(fin)
with open('pickles/aspect_senti_model.pk', 'rb') as fin:
    senti_model = pickle.load(fin)

In [22]:
topic_model.wv.most_similar(positive='location',topn=5)

[('dana_park', 0.6781737208366394),
 ('warm_spre', 0.6394859552383423),
 ('pv_mall', 0.6201270222663879),
 ('stapley', 0.6192067265510559),
 ('greenway', 0.6060971021652222)]

In [10]:
senti_model.wv.most_similar(positive='far',topn=2)

[('superior-food', 0.6407245993614197),
 ('superior-service', 0.5888616442680359)]

In [11]:
senti_model.wv.similarity(w1='location',w2='far')

0.22458845

In [12]:
senti_model.wv.similarity(w1='staff',w2='far')

0.016833797

In [13]:
senti_model.wv.similarity(w1='atmosphere',w2='far')

0.042283814

In [14]:
senti_model.wv.similarity(w1='price',w2='far')

0.050098836

In [15]:
senti_model.wv.similarity(w1='service',w2='far')

0.07305616

In [9]:
Aspect_Terms = ['food','service','staff','location','value']

In [10]:
def get_all_topics_map():
    with open('config/all_topics.txt', 'r') as f:
         ret = {line.split(';')[0]: line.split(';')[1].replace('\n','') for line in f.readlines()}
    return ret

In [11]:
topic_map = get_all_topics_map()

In [12]:
vocab = []
for x in topic_map.values():
    for y in x.split('/'):
        vocab.append(y)

In [13]:
def get_aspect_map():
    with open('config/aspect_map.txt', 'r') as f:
        ret = {line.split(';')[0]: line.split(';')[1].replace('\n','') for line in f.readlines()}
    return ret

In [14]:
aspect_map = get_aspect_map()

In [15]:
aspect_map.keys()

dict_keys(['food', 'atmosphere', 'staff', 'value', 'cent', 'location', 'appetizer', 'area', 'attentive', 'bacon', 'bar', 'bartender', 'beer', 'bill', 'bland', 'bread', 'breakfast', 'brunch', 'buffet', 'burger', 'burrito', 'caesar', 'charge', 'check', 'cheese', 'chef', 'chicken', 'clean', 'coffee', 'cook', 'couple', 'crust', 'customer', 'service', 'delicious', 'dessert', 'dinner', 'dish', 'dollar', 'dress', 'drink', 'drive', 'employee', 'fast', 'fish', 'flavor', 'friendly', 'friendly-staff', 'great-atmosphere', 'great-customer', 'great-food', 'great-service', 'grill', 'happy-hour', 'helpful', 'hotel', 'hour', 'inside', 'lobster', 'lunch', 'manager', 'meal', 'meat', 'minute', 'noodle', 'not-wait', 'offer', 'option', 'outside', 'pancake', 'pasta', 'people', 'pizza', 'portion', 'price', 'bad', 'overprice', 'money', 'quality', 'refill', 'reservation', 'review', 'selection', 'server', 'slow', 'special', 'star', 'street', 'tasty', 'town', 'parking', 'kitchen', 'table', 'chair', 'vegas', 'visi

In [16]:
def assign_aspect(sent,aspect_map):
    dicty = {}
    
    for word in nlp(sent.lower()):
        if word.lemma_ not in stopwords:
            score = 0
            aspec = None
            for asp in aspect_map.keys():
                try:
                    sco = senti_model.wv.similarity(w1=asp,w2=word.lemma_)
                    if sco > score:
                        score=sco
                        aspec=asp
                        dicty[aspec]= dicty.setdefault(asp, 0) + score    
                except:
                    continue
            
    if len(dicty)==0:
        return None
    
    ret = max(dicty.items(), key=lambda k: k[1])
    if ret[1]<0.3:
        return None
    
    return aspect_map[ret[0]]

In [None]:
lst = []
start = time.time()
end = time.time()
total = len(data)
for index,txt in enumerate(data.text):
    if index < 2250001:
        continue;
        
    mydict = {}
    for w in Aspect_Terms:
        mydict[w] = mydict.setdefault(w, '')  
    for x in txt.replace('\\n','.').replace('\n','.').replace('restaurant','location').split('.'):
        if(len(x)>0):
            y = assign_aspect(x,aspect_map)
            mydict[y] = mydict.setdefault(y, '') + x +'.\n' 
    mydict.pop(None, None)
    lst.append(mydict)
    
    if index%1000 == 0 and index > 0:
        print(f".", end='')
        
    if index%10000 == 0 and index > 0:
        end = time.time()
        print(f' Clustered [{index+1:>{5}}/{total:>{5}} ] - {str(end-start):>{9.6}} secs')
        start = time.time()
        with open('pickles/'+str(index)+'.pk', 'wb') as fin:
            pickle.dump(lst, fin)
        lst = []
        
print(f'Clustered [{total:>{5}}] - {str(end-start):>{9.6}} secs')

with open('pickles/3530000.pk', 'wb') as fin:
    pickle.dump(lst, fin)

.......... Clustered [2260001/3527902 ] -    2223.6 secs
.......... Clustered [2270001/3527902 ] -    3284.8 secs
.......... Clustered [2280001/3527902 ] -    2046.9 secs
.......... Clustered [2290001/3527902 ] -    1981.5 secs
.......... Clustered [2300001/3527902 ] -    1826.4 secs
.......... Clustered [2310001/3527902 ] -    1800.4 secs
.......... Clustered [2320001/3527902 ] -    1703.6 secs
.......... Clustered [2330001/3527902 ] -    1841.2 secs
.......... Clustered [2340001/3527902 ] -    1809.3 secs
.......... Clustered [2350001/3527902 ] -    1886.8 secs
.......... Clustered [2360001/3527902 ] -    2033.6 secs
.......... Clustered [2370001/3527902 ] -    1913.6 secs
.......... Clustered [2380001/3527902 ] -    1719.1 secs
.......... Clustered [2390001/3527902 ] -    1852.0 secs
.......... Clustered [2400001/3527902 ] -    2061.2 secs
.......... Clustered [2410001/3527902 ] -    1688.8 secs
.......... Clustered [2420001/3527902 ] -    1759.9 secs
.......... Clustered [2430001/3

In [None]:
lst = []
for index in range(len(data)):
    if index%10000 == 0 and index > 0:
        with open('pickles/'+index+'.pk', 'rb') as fin:
            lst += pickle.load(fin)
with open('pickles/353.pk', 'rb') as fin:
    lst += pickle.load(fin)

In [None]:
aspect_df = pd.DataFrame(lst)

In [None]:
aspects_df = data[['review_id','text','review_stars']]

In [None]:
for asp in Aspect_Terms:
    aspects_df[asp] = aspect_df[asp]

In [None]:
aspects_df.to_csv('aspects_df.csv')