# Example of category- and frequency-based aspect detection

In [1]:
import os

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import json

In [3]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.engine import reflection
from sqlalchemy.schema import Table, MetaData

In [4]:
# Connection settings are taken from the standard libpq environment variables
# when they are set; the literals are only fallbacks for the local lab
# database. Export PGPASSWORD rather than editing a password into the notebook.
connstr = "postgresql+psycopg2://{user}:{pwd}@{ipaddress}/{dbname}".format(
    user=os.environ.get('PGUSER', 'postgres'),
    pwd=os.environ.get('PGPASSWORD', 'flintpsql'),
    ipaddress=os.environ.get('PGHOST', 'localhost'),
    dbname=os.environ.get('PGDATABASE', 'bdlab'),
)
engine = create_engine(connstr)

In [5]:
def read(sql, engine):
    """Run a SQL query and return the result as a DataFrame.

    Parameters
    ----------
    sql : str
        Query text handed to ``pd.read_sql``.
    engine : object
        Anything exposing ``connect()`` that returns a connection accepted by
        ``pd.read_sql`` (in this notebook, the SQLAlchemy engine).

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    conn = engine.connect()
    try:
        # Close the connection even when the query fails, so a bad statement
        # does not leave a connection open.
        return pd.read_sql(sql, conn)
    finally:
        conn.close()

## Get reviews

In [6]:
# Sample of reviews: business id, business name and review text for businesses
# with `review > 500` (presumably the business's review count — confirm).
# NOTE(review): the join on yelp.reviewer selects no columns; it only drops
# reviews whose author has no reviewer row — confirm that is intended.
# NOTE(review): `limit` without `order by` lets the database choose which
# 10000 rows are returned, so the sample may differ between runs.
sql = """
select b.id, b.name, r.content
from yelp.textclip as r
join yelp.business as b on r.business = b.id
join yelp.reviewer as u on r.author = u.id
where b.review > 500
limit 10000
"""
R = read(sql, engine=engine)

In [7]:
# Row/column count of the sample; the query selects three columns and caps rows at 10000.
R.shape

(10000, 3)

In [8]:
# Preview the first rows (business id, business name, review text).
R.head()

Unnamed: 0,id,name,content
0,AfN3Z1U6QPEgAb5F2CQm8w,Casey Moore's Oyster House,everything was great......service was outstand...
1,64dfRmMmUsOdLnkBOtzp4w,Gallo Blanco,You can't go wrong here. Some of the best gua...
2,d4qwVw4PcN-_2mK2o1Ro1g,Pink Taco,The Decor was really cool but the food was jus...
3,mU3vlAVzTxgmZUu6F4XixA,Momofuku Las Vegas,Oysters were tiny but deep. It was delicious b...
4,C9ImzBi5fn742ZcAYDww2A,Rise & Shine - A Steak & Egg Place,"Awesome food, good people. Love steak and egg...."


## Get categories

In [9]:
# Load every row of yelp.incat; its preview shows a `business` and a `category` column.
K = read("select * from yelp.incat", engine)

In [10]:
# Show two rows to check the column layout.
K.head(2)

Unnamed: 0,business,category
0,mmazCP1ZH0QsUqDS6OivFA,Seafood
1,Zh7k_33xMSPwm0UU7LsEIw,Education


In [11]:
def category(x):
    """Return the list of category labels stored in ``K`` for business id ``x``."""
    matching_rows = K[K.business == x]
    return list(matching_rows.category.values)

In [12]:
# Business ids present in the review sample, each mapped to its category list.
sample_cat = R.id.unique()
category_map = {business_id: category(business_id) for business_id in sample_cat}

In [13]:
# Spot-check the mapping for one business id from the review sample.
category_map['AfN3Z1U6QPEgAb5F2CQm8w']

['Bars', 'Nightlife', 'Seafood', 'Pubs', 'Irish', 'Restaurants']

## Indexing
We extract n-grams from the review text and index them by business category.

In [None]:
import spacy
# English spaCy pipeline; `tokenize` reads lemmas, part-of-speech tags and
# sentence boundaries from the documents it produces.
nlp = spacy.load("en_core_web_sm")

In [None]:
from collections import defaultdict
import time

In [None]:
def tokenize(text):
    """Split ``text`` into sentences of ``(lemma, part-of-speech)`` pairs.

    Runs the notebook-level spaCy pipeline ``nlp`` over ``text`` and returns
    one list per sentence in ``doc.sents``; each element of a sentence list is
    ``(token.lemma_, token.pos_)``.
    """
    return [
        [(tok.lemma_, tok.pos_) for tok in sent]
        for sent in nlp(text).sents
    ]

In [None]:
def shift_ngrams(text, window=3):
    """Extract noun unigrams and (context, noun) pairs from ``text``.

    For every token tagged ``NOUN`` the noun lemma itself is emitted, followed
    by one ``(context_lemma, noun_lemma)`` tuple for each token in the same
    sentence that lies at most ``window`` positions before or after the noun,
    is tagged NOUN, ADJ, VERB or ADV, and has a lemma different from the noun's.

    Parameters
    ----------
    text : str
        Raw text, tokenised with ``tokenize``.
    window : int, default 3
        Number of positions inspected on each side of the noun.

    Returns
    -------
    list
        Noun lemmas (``str``) interleaved with ``(context, noun)`` tuples, in
        order of appearance.
    """
    context_tags = ('NOUN', 'ADJ', 'VERB', 'ADV')
    grams = []
    for sent in tokenize(text):
        for idx, (head, tag) in enumerate(sent):
            if tag != 'NOUN':
                continue
            grams.append(head)
            # Clamp the left edge so a noun near the sentence start does not
            # produce a negative slice index.
            start = max(idx - window, 0)
            neighbours = sent[start:idx + window + 1]
            grams.extend(
                (lemma, head)
                for lemma, lemma_tag in neighbours
                if lemma_tag in context_tags and lemma != head
            )
    return grams

In [None]:
# Use the first sampled review as a quick example input.
test = R.content.values[0]
print(test)

In [None]:
# n-grams extracted from the example review with the default window of 3.
shift_ngrams(test)

### Note: this is slow, so we pre-compute and save indexes

In [None]:
# Nested counters keyed by business category:
#   unigram[category][term]               -> number of occurrences of term
#   bigram[category][noun][context_term]  -> co-occurrences within the window
unigram = defaultdict(lambda: defaultdict(int))
bigram = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

business_units = R.id.values
reviews = R.content.values

for row, review_text in tqdm(list(enumerate(reviews))):
    review_business = business_units[row]
    for gram in shift_ngrams(review_text):
        is_pair = isinstance(gram, tuple)
        # Every gram is counted once under each category of the business.
        for cat in category_map[review_business]:
            if not is_pair:
                unigram[cat][gram] += 1
                continue
            # A pair is (context_term, noun): count both members as unigrams
            # and file the context term under its noun.
            for member in gram:
                unigram[cat][member] += 1
            bigram[cat][gram[1]][gram[0]] += 1

# Convert the nested defaultdicts into plain dicts.
U = {cat: dict(term_counts) for cat, term_counts in unigram.items()}
B = {
    cat: {noun: dict(context_counts) for noun, context_counts in nouns.items()}
    for cat, nouns in bigram.items()
}

In [None]:
# Persist both indexes as JSON so the slow indexing step can be skipped later.
for path, index in (('data/unigram.json', U), ('data/bigram.json', B)):
    with open(path, 'w') as out:
        json.dump(index, out)

## Aspect detection

In [14]:
# Reload the pre-computed indexes from disk.
# U: {category: {term: count}}
with open('data/unigram.json', 'r') as unigram_file:
    U = json.load(unigram_file)
# B: {category: {head word: {context term: count}}}
with open('data/bigram.json', 'r') as bigram_file:
    B = json.load(bigram_file)

In [15]:
# Peek at a few (term, count) entries of the 'Restaurants' unigram index.
list(U['Restaurants'].items())[:4]

[('service', 10463), ('friendly', 824), ('classy', 45), ('staff', 3491)]

In [16]:
# For the first few head words, show some of their (context term, count) entries.
for k, d in list(B['Restaurants'].items())[:4]:
    print(k, list(d.items())[:4])

service [('friendly', 123), ('classy', 2), ('nice', 42), ('downside', 2)]
staff [('great', 74), ('upscale', 1), ('food', 34), ('delivery', 1)]
experience [('really', 26), ('enhance', 5), ('personal', 2), ('consist', 1)]
butter [('salad', 4), ('so', 5), ('lot', 3), ('salt', 5)]


## Kullback–Leibler approach
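$$
KL_t = p(t)\log \frac{p(t)}{q(t)}
$$

Here $p(t)$ is the relative frequency of term $t$ within the category and $q(t)$ is its relative frequency over all categories together.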

In [18]:
from collections import defaultdict

In [19]:
# Background distribution: each term's count summed over all categories
# (global_u) and the grand total of those counts (global_s).
global_u = defaultdict(int)
for term_counts in U.values():
    for term, count in term_counts.items():
        global_u[term] += count
global_s = sum(global_u.values())

In [20]:
def kl_unigram(category, unigram, background=None, background_total=None):
    """Score each term of a category by its pointwise KL contribution.

    For every term ``t`` counted under ``unigram[category]`` the score is
    ``p(t) * log(p(t) / q(t))``, where ``p`` is the term's relative frequency
    inside the category and ``q`` its relative frequency in the background
    counts.

    Parameters
    ----------
    category : hashable
        Key of the category to score in ``unigram``.
    unigram : mapping
        ``{category: {term: count}}``.
    background : mapping, optional
        ``{term: count}`` used for ``q``. When omitted, the notebook-level
        ``global_u`` is used, so existing calls behave as before.
    background_total : number, optional
        Normaliser for the background counts. Defaults to ``global_s`` when
        ``background`` is omitted, otherwise to ``sum(background.values())``.

    Returns
    -------
    dict
        ``{term: score}`` for every term of the category. Every term must have
        a non-zero count in the background.
    """
    if background is None:
        # Backward-compatible default: fall back to the notebook globals.
        background = global_u
        if background_total is None:
            background_total = global_s
    elif background_total is None:
        background_total = sum(background.values())

    counts = unigram[category]
    s = sum(counts.values())
    kl = {}
    for k, v in counts.items():
        p_k = v / s
        q_k = background[k] / background_total
        kl[k] = p_k * np.log(p_k / q_k)
    return kl

In [21]:
# Score every term counted under 'Restaurants' against the global distribution.
klu = kl_unigram('Restaurants', U)

In [22]:
# (term, score) pairs ordered from the highest score to the lowest.
candidates = sorted(klu.items(), key=lambda item: -item[1])

In [23]:
# The twenty highest-scoring terms.
candidates[:20]

[('food', 0.004403045406201249),
 ('place', 0.0017752309694621638),
 ('sauce', 0.0014525373906889188),
 ('dish', 0.00141628627400457),
 ('pizza', 0.001366669843011592),
 ('order', 0.0012087844243553719),
 ('chicken', 0.0011693420928505058),
 ('menu', 0.0011279902530636132),
 ('meal', 0.0010401526364132345),
 ('meat', 0.0010135126439660752),
 ('fry', 0.0009900395473505152),
 ('restaurant', 0.0009406684703899171),
 ('burger', 0.0009324358813403639),
 ('buffet', 0.0008908107917991969),
 ('salad', 0.000890510424238707),
 ('pork', 0.0008711928367037539),
 ('cheese', 0.000859032948638822),
 ('server', 0.0008485966145150159),
 ('potato', 0.0008292769121777076),
 ('dessert', 0.0008271286861555889)]

## Probability for bigrams
$$
p(i \mid j) \approx \frac{count(i, j)}{\sum\limits_{k} count(k, j)} \approx \frac{count(i, j)}{2 \cdot window \cdot count(j)}
$$

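$$
pmi(a, b) = p(a, b)\log \frac{p(a, b)}{p(a)p(b)}
$$

In `pab` below, $p(a, b)$ is estimated by the frequency of $b$ among the context words of $a$ (the conditional estimate above), and the logarithm is weighted by that probability, so the score is a weighted variant of pointwise mutual information.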

In [89]:
# Unigram and bigram indexes restricted to the 'Restaurants' category.
kU = U['Restaurants']
kB = B['Restaurants']

In [94]:
def pab(w1, w2, idx, uidx):
    """Weighted PMI-style association between head word ``w1`` and context ``w2``.

    Parameters
    ----------
    w1 : str
        Head word; first-level key of ``idx``.
    w2 : str
        Context word looked up under ``idx[w1]``.
    idx : mapping
        ``{head: {context: count}}`` co-occurrence counts.
    uidx : mapping
        ``{term: count}`` unigram counts.

    Returns
    -------
    float or int
        ``p_ab * log(p_ab / (p(w1) * p(w2)))`` where ``p_ab`` is the share of
        ``w2`` among the contexts of ``w1`` and the marginals are unigram
        relative frequencies; ``0`` when any lookup raises ``KeyError``.
    """
    total = sum(uidx.values())
    try:
        contexts = idx[w1]
        pair_count = contexts[w2]
        p_ab = pair_count / sum(contexts.values())
        independent = (uidx[w1] / total) * (uidx[w2] / total)
        return p_ab * np.log(p_ab / independent)
    except KeyError:
        # Unknown head word, unseen pair, or a word missing from the unigrams.
        return 0

In [95]:
# Example: score of 'food' as a context word of 'service' in 'Restaurants'.
pab('service', 'food', kB, kU)

0.29759997674368627

## Exercise: suggest ideas on how to aggregate these terms into aspects

In [96]:
from nltk.corpus import wordnet as wn

### Example 1: WordNet

In [97]:
def h_context(word):
    """Collect the noun synsets of ``word`` together with their hypernyms.

    Each noun synset of ``word`` is stored with value 0. Every synset yielded
    by its hypernym closure is stored with its 1-based position in that
    enumeration; when a synset is reached more than once the largest position
    is kept.

    Returns
    -------
    dict
        ``{synset: int}``.
    """
    hypernyms_of = lambda synset: synset.hypernyms()
    H = {}
    for sense in wn.synsets(word, pos=wn.NOUN):
        # A direct sense of the word is reset to 0 even if an earlier sense
        # already recorded it as a hypernym.
        H[sense] = 0
        for rank, ancestor in enumerate(sense.closure(hypernyms_of), start=1):
            H[ancestor] = max(H.get(ancestor, 0), rank)
    return H

def containement(context_a, context_b):
    """List the keys shared by two contexts with both of their values.

    Returns a list of ``(key, context_a[key], context_b[key])`` tuples, in the
    iteration order of ``context_a``, for every key also found in ``context_b``.
    """
    shared = []
    for key, value_a in context_a.items():
        if key in context_b:
            shared.append((key, value_a, context_b[key]))
    return shared

In [38]:
# Hypernym contexts of two example words, then the synsets they have in common.
c_a = h_context('pizza')
c_b = h_context('food')
containement(c_a, c_b)

[(Synset('food.n.01'), 3, 0),
 (Synset('substance.n.07'), 4, 1),
 (Synset('matter.n.03'), 5, 2),
 (Synset('physical_entity.n.01'), 6, 3),
 (Synset('entity.n.01'), 7, 5)]

In [39]:
# Keep only the terms (not the scores) of the 20 best-scoring candidates.
k = [x for x, _ in candidates[:20]]

In [42]:
# Compute each candidate's hypernym context once; the original recomputed both
# contexts for every ordered pair of words.
contexts = {w: h_context(w) for w in k}

# For every ordered pair of distinct candidates, print the first shared synset
# (if any) with its value in each word's context.
for w1 in k:
    for w2 in k:
        if w1 != w2:
            cont = containement(contexts[w1], contexts[w2])
            if len(cont) > 0:
                print(w1, w2, cont[0])

food place (Synset('physical_entity.n.01'), 3, 7)
food sauce (Synset('food.n.01'), 0, 5)
food dish (Synset('food.n.01'), 0, 2)
food pizza (Synset('food.n.01'), 0, 3)
food order (Synset('entity.n.01'), 5, 8)
food chicken (Synset('matter.n.03'), 2, 6)
food menu (Synset('food.n.01'), 0, 2)
food meal (Synset('food.n.01'), 0, 2)
food meat (Synset('matter.n.03'), 2, 3)
food fry (Synset('physical_entity.n.01'), 3, 5)
food restaurant (Synset('physical_entity.n.01'), 3, 6)
food burger (Synset('food.n.01'), 0, 5)
food buffet (Synset('food.n.01'), 0, 3)
food salad (Synset('food.n.01'), 0, 3)
food pork (Synset('matter.n.03'), 2, 4)
food cheese (Synset('food.n.01'), 0, 5)
food server (Synset('physical_entity.n.01'), 3, 8)
food potato (Synset('food.n.01'), 0, 7)
food dessert (Synset('food.n.01'), 0, 3)
place food (Synset('physical_entity.n.01'), 7, 3)
place sauce (Synset('physical_entity.n.01'), 7, 8)
place dish (Synset('object.n.01'), 6, 11)
place pizza (Synset('physical_entity.n.01'), 7, 6)
place 

potato dish (Synset('food.n.01'), 7, 2)
potato pizza (Synset('food.n.01'), 7, 3)
potato order (Synset('entity.n.01'), 13, 8)
potato chicken (Synset('food.n.02'), 8, 4)
potato menu (Synset('food.n.01'), 7, 2)
potato meal (Synset('foodstuff.n.02'), 5, 1)
potato meat (Synset('food.n.02'), 8, 1)
potato fry (Synset('physical_entity.n.01'), 12, 5)
potato restaurant (Synset('physical_entity.n.01'), 12, 6)
potato burger (Synset('food.n.01'), 7, 5)
potato buffet (Synset('food.n.01'), 7, 3)
potato salad (Synset('food.n.01'), 7, 3)
potato pork (Synset('food.n.02'), 8, 2)
potato cheese (Synset('foodstuff.n.02'), 5, 3)
potato server (Synset('physical_entity.n.01'), 12, 8)
potato dessert (Synset('food.n.01'), 7, 3)
dessert food (Synset('food.n.01'), 3, 0)
dessert place (Synset('physical_entity.n.01'), 6, 7)
dessert sauce (Synset('food.n.01'), 3, 5)
dessert dish (Synset('nutriment.n.01'), 2, 1)
dessert pizza (Synset('nutriment.n.01'), 2, 2)
dessert order (Synset('entity.n.01'), 7, 8)
dessert chicken 

### Example 2: word context

In [98]:
# Unigram and bigram indexes for 'Restaurants'; these repeat the assignments
# made earlier in the notebook.
kU = U['Restaurants']
kB = B['Restaurants']

In [99]:
# V: every term of the 'Restaurants' unigram index.
# k: the terms of the 20 best-scoring candidates.
V = list(kU.keys())
k = [x for x, _ in candidates[:20]]

In [101]:
# Term -> column lookup. `V.index` was a linear scan per context word, and it
# raises ValueError (not IndexError) for an unknown term, so the original
# `except IndexError` could never skip anything.
col_of = {term: j for j, term in enumerate(V)}

# m[i, j] is the pab score of vocabulary term V[j] as a context of candidate k[i].
m = np.zeros((len(k), len(V)))
for i, w in enumerate(k):
    # A candidate without bigram entries keeps an all-zero row.
    for w_con in kB.get(w, {}):
        j = col_of.get(w_con)
        if j is None:
            # Context word absent from the vocabulary: leave it out.
            continue
        m[i, j] = pab(w, w_con, kB, kU)

In [102]:
from sklearn.metrics.pairwise import cosine_similarity

In [103]:
# Pairwise cosine similarity between the rows of m.
sigma = cosine_similarity(m, m)

In [104]:
# Row of 'pizza' among the candidates, then every candidate index paired with
# its similarity to 'pizza', highest similarity first.
ip = k.index('pizza')
most_sim = sorted(enumerate(sigma[ip]), key=lambda pair: -pair[1])

In [105]:
# Print each candidate term with its similarity to 'pizza', rounded to 3 decimals.
for wi, w_score in most_sim:
    print(k[wi], round(w_score, 3))

pizza 1.0
burger 0.73
food 0.677
buffet 0.577
meal 0.538
place 0.533
dish 0.519
dessert 0.516
restaurant 0.485
salad 0.469
meat 0.456
sauce 0.341
fry 0.327
chicken 0.317
menu 0.3
server 0.287
cheese 0.259
order 0.139
pork 0.134
potato 0.132


In [106]:
from sklearn.cluster import KMeans

In [113]:
# Fix the seed so cluster labels are reproducible: later cells address
# cluster 1 by its label, which would otherwise change between runs.
kmeans = KMeans(n_clusters=6, random_state=0)
clusters = kmeans.fit_predict(m)

In [114]:
# Distinct cluster labels returned by fit_predict.
cii = set(clusters)

In [120]:
# Group the candidate terms by their assigned cluster: {label: [terms]}.
vectors = {
    label: [k[pos] for pos, assigned in enumerate(clusters) if assigned == label]
    for label in cii
}

In [123]:
# Stack the rows of m that belong to the terms of cluster 1.
m_1 = np.array([m[k.index(w)] for w in vectors[1]])

In [125]:
# (number of terms in cluster 1, vocabulary size)
m_1.shape

(11, 12392)

In [127]:
# Cosine similarity between the centre of cluster 1 (as a single row) and each row of m_1.
sim_c = cosine_similarity(kmeans.cluster_centers_[1].reshape(1, -1), m_1)

In [128]:
# One row (the cluster centre) by one column per term of cluster 1.
sim_c.shape

(1, 11)

In [130]:
# Terms of cluster 1 ordered from most to least similar to the cluster centre.
for pos, values in sorted(enumerate(sim_c[0]), key=lambda x: -x[1]):
    print(vectors[1][pos], round(values, 3))

food 0.808
burger 0.803
buffet 0.784
pizza 0.783
restaurant 0.762
place 0.759
meal 0.727
dish 0.705
dessert 0.697
server 0.564
menu 0.552
