In [84]:
# imports needed libs
import gensim 
import pandas as pd
import pickle
import itertools
import spacy
import time

In [85]:
nlp = spacy.load('en_core_web_sm', disable=["ner"])

In [86]:
def read_topic_data():
    return pd.read_csv('processed_data/cleaned_reviews.csv', usecols = ["business_id", "name", "review_id","text","topic_text","sentiment_text","review_stars"])

In [87]:
data = read_topic_data()

In [88]:
stop_words = [line.rstrip('\n') for line in open('config/stopwords.txt', 'r', encoding='utf-8')] 
neg_words  = [line.rstrip('\n') for line in open('config/negations.txt', 'r', encoding='utf-8')]
stopwords = stop_words + neg_words

In [89]:
def list_diff(list1,list2):
    return list(itertools.filterfalse(set(list2).__contains__, list1)) 

In [90]:
def split_phrases(str_list):
    new_list = []
    for tok in str_list:
        if '-' in tok:
            new_list += tok.split('-')
        new_list.append(tok)
    return list_diff(new_list,stop_words)

In [8]:
topic_set = [split_phrases((u''+str(txt)).replace('.','').split()) for txt in data.topic_text]

In [9]:
senti_set = [split_phrases((u''+str(txt)).replace('.','').split()) for txt in data.sentiment_text]

In [None]:
%%time
#build vocabulary and train model
topic_model = gensim.models.Word2Vec(
        topic_set,
        size=100,
        window=10,
        min_count=100,
        workers=100,
        iter=10)

#  Stats
## NC (0250 MB): Wall time:   14.8 s /   22 s
## ALL         : 4min 34s

In [None]:
%%time
#build vocabulary and train model
senti_model = gensim.models.Word2Vec(
        senti_set,
        size=100,
        window=15,
        min_count=100,
        workers=100,
        iter=20)

#  Stats
## NC (0250 MB): Wall time:   14.8 s /   22 s
## ALL         : Wall time: 17min 19s

In [None]:
#with open('pickles/aspect_topic_model.pk', 'wb') as fin:
#    pickle.dump(topic_model, fin)
#with open('pickles/aspect_senti_model.pk', 'wb') as fin:
#    pickle.dump(senti_model, fin)

In [8]:
senti_model = None
topic_model = None
with open('pickles/aspect_topic_model.pk', 'rb') as fin:
    topic_model = pickle.load(fin)
with open('pickles/aspect_senti_model.pk', 'rb') as fin:
    senti_model = pickle.load(fin)

In [9]:
topic_model.wv.most_similar(positive='cusine',topn=5)

[('cuisine', 0.7098367810249329),
 ('thailand', 0.6114474534988403),
 ('laos', 0.607699453830719),
 ('authenticity', 0.5937446355819702),
 ('region', 0.5881584882736206)]

In [10]:
senti_model.wv.most_similar(positive='far',topn=2)

[('superior-food', 0.6407245397567749),
 ('superior-service', 0.5888615846633911)]

In [11]:
senti_model.wv.similarity(w1='location',w2='far')

0.22458847

In [12]:
senti_model.wv.similarity(w1='staff',w2='far')

0.016833797

In [13]:
senti_model.wv.similarity(w1='atmosphere',w2='far')

0.0422838

In [14]:
senti_model.wv.similarity(w1='price',w2='far')

0.05009885

In [15]:
senti_model.wv.similarity(w1='service',w2='far')

0.073056154

In [91]:
Aspect_Terms = ['food','service','staff','location','value']

In [92]:
def get_all_topics_map():
    with open('config/all_topics.txt', 'r') as f:
         ret = {line.split(';')[0]: line.split(';')[1].replace('\n','') for line in f.readlines()}
    return ret

In [93]:
topic_map = get_all_topics_map()

In [94]:
vocab = []
for x in topic_map.values():
    for y in x.split('/'):
        vocab.append(y)

In [95]:
def get_aspect_map():
    with open('config/aspect_map.txt', 'r') as f:
        ret = {line.split(';')[0]: line.split(';')[1].replace('\n','') for line in f.readlines()}
    return ret

In [96]:
aspect_map = get_aspect_map()

In [97]:
aspect_map.keys()

dict_keys(['food', 'atmosphere', 'staff', 'value', 'cent', 'location', 'appetizer', 'area', 'attentive', 'bacon', 'bar', 'bartender', 'beer', 'bill', 'bland', 'bread', 'breakfast', 'brunch', 'buffet', 'burger', 'burrito', 'caesar', 'charge', 'check', 'cheese', 'chef', 'chicken', 'clean', 'coffee', 'cook', 'couple', 'crust', 'customer', 'service', 'delicious', 'dessert', 'dinner', 'dish', 'dollar', 'dress', 'drink', 'drive', 'employee', 'fast', 'fish', 'flavor', 'friendly', 'friendly-staff', 'great-atmosphere', 'great-customer', 'great-food', 'great-service', 'grill', 'happy-hour', 'helpful', 'hotel', 'hour', 'inside', 'lobster', 'lunch', 'manager', 'meal', 'meat', 'minute', 'noodle', 'not-wait', 'offer', 'option', 'outside', 'pancake', 'pasta', 'people', 'pizza', 'portion', 'price', 'bad', 'overprice', 'money', 'quality', 'refill', 'reservation', 'review', 'selection', 'server', 'slow', 'special', 'star', 'street', 'tasty', 'town', 'parking', 'kitchen', 'table', 'chair', 'vegas', 'visi

In [98]:
def assign_aspect(sent,aspect_map):
    dicty = {}
    
    for word in nlp(sent.lower()):
        if word.lemma_ not in stopwords:
            score = 0
            aspec = None
            for asp in aspect_map.keys():
                try:
                    sco = senti_model.wv.similarity(w1=asp,w2=word.lemma_)
                    if sco > score:
                        score=sco
                        aspec=asp
                        dicty[aspec]= dicty.setdefault(asp, 0) + score    
                except:
                    continue
            
    if len(dicty)==0:
        return None
    
    ret = max(dicty.items(), key=lambda k: k[1])
    if ret[1]<0.3:
        return None
    
    return aspect_map[ret[0]]

In [None]:
%%time
lst = []
start = time.time()
end = time.time()
total = len(data)
for index,txt in enumerate(data.text):
    mydict = {}
    for w in Aspect_Terms:
        mydict[w] = mydict.setdefault(w, '')  
    for x in txt.replace('\\n','.').replace('\n','.').split('.'):
        if(len(x)>0):
            y = assign_aspect(x,aspect_map)
            mydict[y] = mydict.setdefault(y, '') + x +'.\n' 
    mydict.pop(None, None)
    lst.append(mydict)
    
    if index%1000 == 0 and index > 0:
        print(f".", end='')
            
    if index%10000 == 0 and index > 0:
        end = time.time()
        print(f' Clustered [{index+1:>{5}}/{total:>{5}} ] - {str(end-start):>{9.6}} secs')
        start = time.time()
        with open('pickles/IL/'+str(index)+'.pk', 'wb') as fin:
            pickle.dump(lst, fin)
        lst = []
        
print(f'\n Clustered [{total:>{5}} ] - {str(end-start):>{9.6}} secs')

with open('pickles/IL/3530000.pk', 'wb') as fin:
    pickle.dump(lst, fin)

........... Clustered [10001/3527902 ] -    877.50 secs
.......... Clustered [20001/3527902 ] -    884.31 secs
.......... Clustered [30001/3527902 ] -    861.31 secs
.......... Clustered [40001/3527902 ] -    915.41 secs
.......... Clustered [50001/3527902 ] -    964.49 secs
.......... Clustered [60001/3527902 ] -    985.38 secs
.......... Clustered [70001/3527902 ] -    853.13 secs
.......... Clustered [80001/3527902 ] -    9260.8 secs
.......... Clustered [90001/3527902 ] -    820.37 secs
.......... Clustered [100001/3527902 ] -    877.12 secs
.......... Clustered [110001/3527902 ] -    778.26 secs
.......... Clustered [120001/3527902 ] -    849.11 secs
.......... Clustered [130001/3527902 ] -    973.29 secs
.......... Clustered [140001/3527902 ] -    902.66 secs
.......... Clustered [150001/3527902 ] -    867.14 secs
.......... Clustered [160001/3527902 ] -    911.16 secs
.......... Clustered [170001/3527902 ] -    884.26 secs
.......... Clustered [180001/3527902 ] -    929.08 secs


In [None]:
lst = []
for index in range(len(data)):
    if index%10000 == 0 and index > 0:
        with open('pickles/IL/'+str(index)+'.pk', 'rb') as fin:
            lst += pickle.load(fin)
        print(index)
with open('pickles/IL/3530000.pk', 'rb') as fin:
    lst += pickle.load(fin)

In [None]:
aspect_df = pd.DataFrame(lst)

In [79]:
aspect_df

Unnamed: 0,atmosphere,food,location,service,staff,value
0,,This is a disgusting restaurant.\n You can fi...,,,This restaurant should really be inspected by...,I would have given this restaurant zero stars ...
1,I quietly walked back to my seat to retrieve ...,Before I picked up a plate to start in on the...,I will be calling the Champaign-Urbana Public...,,"I alerted a waitress, who acted as if it was ...",
2,,The best food I want to the a lot of the buffe...,,,,
3,It's impressive how filthy this place is.\n So...,The only worthwhile feature is that the food ...,,,You would think the staff would pick up trash...,The quality ranges from poor to mediocre.\n I...
4,,And they suggested that I try this buffet.\n...,I was visiting the Champaign area with my fami...,,,
5,,absolutely disgusting.\n arrived for lunch on ...,,,,this is by far the worst sushi I have ever se...
6,I have no idea what sort of food was being pr...,I was hungry and against my better judgement ...,Another restaurants make the DNV list in Champ...,It was a weeknight so luckily there was no li...,Even the water that they were boiled in looke...,Restaurant review:.\n This is bad news.\n Okay...
7,,It is a Chinese buffet!! They have a lot of su...,They have a wide range of things to eat and th...,,,I was looking for reviews and I saw how bad th...
8,,"Not necessarily the best Chinese food in town,...",,,,
9,,My friend took me there.\n,,,,Price was ok.\n Everything is just within the...


In [None]:
aspects_df = data[['review_id','text','review_stars']]

In [None]:
for asp in Aspect_Terms:
    aspects_df[asp] = aspect_df[asp]

In [82]:
aspects_df

Unnamed: 0,review_id,text,review_stars,food,service,staff,location,value
0,8MTptiOpUeuPUFZgtfk9Vw,I would have given this restaurant zero stars ...,1.0,This is a disgusting restaurant.\n You can fi...,,This restaurant should really be inspected by...,,I would have given this restaurant zero stars ...
1,ofpfzn8LV4nJ2pE6IlTVdg,So...no. Just...no. Before I picked up a plate...,1.0,Before I picked up a plate to start in on the...,,"I alerted a waitress, who acted as if it was ...",I will be calling the Champaign-Urbana Public...,
2,O3yApCw95tbA0kodflQrpA,The best food I want to the a lot of the buffe...,5.0,The best food I want to the a lot of the buffe...,,,,
3,fFTVZE7EENdl66SX9cgc0A,It's impressive how filthy this place is. Some...,1.0,The only worthwhile feature is that the food ...,,You would think the staff would pick up trash...,,The quality ranges from poor to mediocre.\n I...
4,pDlIw6DLLjZ5G1vJnMzr9A,I was visiting the Champaign area with my fami...,5.0,And they suggested that I try this buffet.\n...,,,I was visiting the Champaign area with my fami...,
5,xwJizYjeafDbsJVmPmBSgQ,absolutely disgusting. arrived for lunch on a ...,1.0,absolutely disgusting.\n arrived for lunch on ...,,,,this is by far the worst sushi I have ever se...
6,mvAvUVOkSuEr6EHZ97sg0Q,Restaurant review:\nAnother restaurants make t...,1.0,I was hungry and against my better judgement ...,It was a weeknight so luckily there was no li...,Even the water that they were boiled in looke...,Another restaurants make the DNV list in Champ...,Restaurant review:.\n This is bad news.\n Okay...
7,ticGaTnIjhkijFqWmww5uw,I was looking for reviews and I saw how bad th...,5.0,It is a Chinese buffet!! They have a lot of su...,,,They have a wide range of things to eat and th...,I was looking for reviews and I saw how bad th...
8,4ftuoeacaUQ1EyiW_xt1QA,"Not necessarily the best Chinese food in town,...",4.0,"Not necessarily the best Chinese food in town,...",,,,
9,aDCXNKvZpoD9s4GICWdqUg,My friend took me there. Price was ok. Everyth...,3.0,My friend took me there.\n,,,,Price was ok.\n Everything is just within the...


In [None]:
aspects_df.to_csv('processed_data/aspect_review.csv',encoding='utf-8')

In [55]:
assign_aspect('A bar with bar food. There were four of us.',aspect_map)

{'atmosphere': 0.5312161445617676, 'area': 0.8184992074966431, 'bartender': 1.1844112873077393, 'food': 1.0}


'staff'

In [53]:
senti_model.wv.similarity(w1='bar',w2='food')

-0.030885737

In [None]:
#END