# Example of category- and frequency-based aspect detection

In [1]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import json

In [3]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.engine import reflection
from sqlalchemy.schema import Table, MetaData

In [4]:
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited (or committed) inside the notebook.
# The fallbacks reproduce the previous hard-coded local setup.
# NOTE(review): the password fallback is still a literal in this file; drop it
# once BDLAB_DB_PASSWORD is set in every environment that runs the notebook.
connstr = "postgresql+psycopg2://{user}:{pwd}@{ipaddress}/{dbname}".format(
    user=os.environ.get('BDLAB_DB_USER', 'postgres'),
    # URL-escape the password so characters such as '@' or '/' do not break
    # the connection URL.
    pwd=quote_plus(os.environ.get('BDLAB_DB_PASSWORD', 'flintpsql')),
    ipaddress=os.environ.get('BDLAB_DB_HOST', 'localhost'),
    dbname=os.environ.get('BDLAB_DB_NAME', 'bdlab'),
)
engine = create_engine(connstr)

In [5]:
def read(sql, engine):
    """Run a SQL query and return the result as a pandas DataFrame.

    Parameters
    ----------
    sql : str
        Query text handed to ``pandas.read_sql``.
    engine : object
        Anything exposing ``connect()`` that returns a connection usable by
        ``pandas.read_sql`` and offering ``close()``.

    Returns
    -------
    pandas.DataFrame
        The rows produced by the query.

    Raises
    ------
    Exception
        Whatever ``pandas.read_sql`` raises is propagated unchanged; the
        connection is closed first.
    """
    conn = engine.connect()
    try:
        return pd.read_sql(sql, conn)
    finally:
        # Always hand the connection back, even when the query fails;
        # otherwise every failed cell run would leak one connection.
        conn.close()

## Get reviews

In [6]:
# Pull a sample of reviews: one row per review with the business id, the
# business name and the review text.
# - The join on yelp.reviewer contributes no selected column; being an inner
#   join it only keeps reviews whose author has a row in that table.
# - NOTE(review): `b.review > 500` presumably filters on the business' review
#   count -- confirm against the schema.
# - NOTE(review): `limit` without `order by` does not fix which 10000 rows are
#   returned, so the sample may differ between runs.
sql = """
select b.id, b.name, r.content
from yelp.textclip as r
join yelp.business as b on r.business = b.id
join yelp.reviewer as u on r.author = u.id
where b.review > 500
limit 10000
"""
R = read(sql, engine=engine)

In [7]:
R.shape

(10000, 3)

In [8]:
R.head()

Unnamed: 0,id,name,content
0,J4CATH00YZrq8Bne2S4_cw,CUT by Wolfgang Puck,"Amazing, friendly and classy service. Great st..."
1,bpRo8L8dkhgbJhdIKa9mwA,STK Las Vegas,Often requires a reservation. Cool atmosphere ...
2,zEaGcSVPDQipnRdEZp-F6g,Dom DeMarco's Pizzeria & Bar,AMAZING! Great food and outdoor seating is bea...
3,eduRavkml8awmPachSZXuw,Kona Grill,"Horrible food, slow, over priced and rude! No ..."
4,elqbBhBfElMNSrjFqW3now,Pin Kaow Thai Restaurant,I LOVE LOVE LOVE this place. The staff is alwa...


## Get categories

In [9]:
# Load the whole business/category table; it is filtered per business below.
K = read("select * from yelp.incat", engine)

In [10]:
K.head(2)

Unnamed: 0,business,category
0,mmazCP1ZH0QsUqDS6OivFA,Seafood
1,Zh7k_33xMSPwm0UU7LsEIw,Education


In [11]:
def category(x):
    """Return the categories recorded in ``K`` for business id ``x``.

    Parameters
    ----------
    x : str
        Business id, compared against the ``business`` column of ``K``.

    Returns
    -------
    list
        Values of the ``category`` column for the matching rows, in table
        order; empty when the business has no row in ``K``.
    """
    return list(K[K.business == x].category.values)

In [12]:
sample_cat = R.id.unique()
# Map every sampled business id to its list of categories (empty list when the
# business has no row in K). A single pass over K replaces one full
# boolean-mask scan of the table per business; per-business category order
# still follows the row order of K.
category_map = {b: [] for b in sample_cat}
for business_id, label in zip(K.business.values, K.category.values):
    if business_id in category_map:
        category_map[business_id].append(label)

## Indexing
We extract n-grams from the review text and index them by category.

In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [7]:
from collections import defaultdict
import time

In [15]:
def tokenize(text):
    """Split ``text`` into sentences of ``(lemma, part-of-speech)`` pairs.

    The text is parsed with the notebook-level ``nlp`` pipeline. The result
    holds one inner list per sentence in ``doc.sents``, with one
    ``(lemma_, pos_)`` tuple per token of that sentence.
    """
    parsed = nlp(text)
    return [
        [(word.lemma_, word.pos_) for word in sent]
        for sent in parsed.sents
    ]

In [16]:
def shift_ngrams(text, window=3):
    """Collect noun unigrams and (context word, noun) pairs from ``text``.

    Every token tagged ``NOUN`` is emitted on its own. It is followed by one
    ``(neighbour, noun)`` tuple for each token at most ``window`` positions
    to its left or right, within the same sentence, that is tagged NOUN, ADJ,
    VERB or ADV and whose lemma differs from the noun's.

    Parameters
    ----------
    text : str
        Raw text, tokenized with ``tokenize``.
    window : int
        Number of positions inspected on each side of a noun.

    Returns
    -------
    list
        Strings (nouns) and 2-tuples ``(neighbour, noun)`` in reading order.
    """
    context_tags = ('NOUN', 'ADJ', 'VERB', 'ADV')
    collected = []
    for sentence in tokenize(text):
        for position, (head, head_tag) in enumerate(sentence):
            if head_tag != 'NOUN':
                continue
            collected.append(head)
            start = max(position - window, 0)
            stop = position + window + 1
            collected.extend(
                (neighbour, head)
                for neighbour, tag in sentence[start:stop]
                if tag in context_tags and neighbour != head
            )
    return collected

### Note: this is slow, so we pre-compute and save indexes

In [17]:
# Per-category counters:
#   unigram[category][term]               -> occurrences of the term
#   bigram[category][noun][context_word]  -> co-occurrences inside the window
unigram = defaultdict(lambda: defaultdict(int))
bigram = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

business_units = R.id.values
reviews = R.content.values

for idx, review_text in tqdm(list(enumerate(reviews))):
    owner = business_units[idx]
    for gram in shift_ngrams(review_text):
        is_pair = isinstance(gram, tuple)
        # A review counts once for every category of its business.
        for cat in category_map[owner]:
            if not is_pair:
                unigram[cat][gram] += 1
                continue
            context_word, noun = gram
            # Both members of a pair also feed the unigram counts.
            unigram[cat][context_word] += 1
            unigram[cat][noun] += 1
            bigram[cat][noun][context_word] += 1

# Freeze the nested defaultdicts into plain dicts.
U = {cat: dict(counts) for cat, counts in unigram.items()}
B = {
    cat: {noun: dict(context) for noun, context in by_noun.items()}
    for cat, by_noun in bigram.items()
}

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [18]:
# Create the target directory first so the slow indexing run is not followed
# by a FileNotFoundError when `data/` does not exist yet.
os.makedirs('data', exist_ok=True)
with open('data/unigram.json', 'w') as uo:
    json.dump(U, uo)
with open('data/bigram.json', 'w') as bo:
    json.dump(B, bo)

## Aspect detection

In [3]:
# Reload the saved indexes so this section can run without redoing the slow
# indexing step.
with open('data/unigram.json', 'r') as unigram_file:
    U = json.loads(unigram_file.read())
with open('data/bigram.json', 'r') as bigram_file:
    B = json.loads(bigram_file.read())

In [4]:
list(U['Restaurants'].items())[:4]

[('service', 10463), ('friendly', 824), ('classy', 45), ('staff', 3491)]

In [5]:
# Peek at the first four nouns and, for each, its first four context words.
for noun, context_counts in list(B['Restaurants'].items())[:4]:
    print(noun, list(context_counts.items())[:4])

service [('friendly', 123), ('classy', 2), ('nice', 42), ('downside', 2)]
staff [('great', 74), ('upscale', 1), ('food', 34), ('delivery', 1)]
experience [('really', 26), ('enhance', 5), ('personal', 2), ('consist', 1)]
butter [('salad', 4), ('so', 5), ('lot', 3), ('salt', 5)]


## Kullback–Leibler approach
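$$
KL_t = p(t)\log \frac{p(t)}{q(t)}
$$

Here $p(t)$ is the relative frequency of term $t$ within the selected category and $q(t)$ is its relative frequency over all categories.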

In [8]:
# Background distribution: term counts summed over every category, plus the
# grand total of all counts.
global_u = defaultdict(int)
global_s = 0
for counts in U.values():
    for term, freq in counts.items():
        global_u[term] += freq
    global_s += sum(counts.values())

In [12]:
def kl_unigram(category, unigram):
    """Score each term of ``category`` by its Kullback-Leibler contribution.

    For a term ``t`` the score is ``p(t) * log(p(t) / q(t))``, where ``p`` is
    the relative frequency of ``t`` inside ``category`` and ``q`` is its
    relative frequency over all categories of ``unigram``.

    The background distribution ``q`` is computed from the ``unigram``
    argument itself, so the function does not depend on notebook globals
    being defined or up to date.

    Parameters
    ----------
    category : str
        Key of ``unigram`` to score.
    unigram : dict
        Mapping ``category -> {term: count}``.

    Returns
    -------
    dict
        Mapping ``term -> score`` for every term of ``category``; empty when
        the category has no terms.

    Raises
    ------
    KeyError
        If ``category`` is not a key of ``unigram``.
    """
    in_category = unigram[category]

    # Term counts summed over all categories.
    background = {}
    for counts in unigram.values():
        for term, count in counts.items():
            background[term] = background.get(term, 0) + count
    background_total = sum(background.values())
    category_total = sum(in_category.values())

    kl = {}
    for term, count in in_category.items():
        if count == 0:
            # By convention 0 * log(0) = 0; evaluating it numerically gives NaN.
            kl[term] = 0.0
            continue
        p_k = count / category_total
        q_k = background[term] / background_total
        kl[term] = p_k * np.log(p_k / q_k)
    return kl

In [13]:
# Score every term seen in the 'Restaurants' category.
klu = kl_unigram('Restaurants', U)

In [15]:
# Terms ranked by decreasing score, as (term, score) tuples.
candidates = sorted(klu.items(), key=lambda pair: -pair[1])

In [18]:
candidates[:20]

[('food', 0.004403045406201249),
 ('place', 0.0017752309694621638),
 ('sauce', 0.0014525373906889188),
 ('dish', 0.00141628627400457),
 ('pizza', 0.001366669843011592),
 ('order', 0.0012087844243553719),
 ('chicken', 0.0011693420928505058),
 ('menu', 0.0011279902530636132),
 ('meal', 0.0010401526364132345),
 ('meat', 0.0010135126439660752),
 ('fry', 0.0009900395473505152),
 ('restaurant', 0.0009406684703899171),
 ('burger', 0.0009324358813403639),
 ('buffet', 0.0008908107917991969),
 ('salad', 0.000890510424238707),
 ('pork', 0.0008711928367037539),
 ('cheese', 0.000859032948638822),
 ('server', 0.0008485966145150159),
 ('potato', 0.0008292769121777076),
 ('dessert', 0.0008271286861555889)]

## Exercise: suggest ideas on how to aggregate these terms into aspects

## Probability for bigrams
$$
p(i \mid j) \approx \frac{count(i, j)}{\sum\limits_{k} count(k, j)} \approx \frac{count(i, j)}{2 \cdot window \cdot count(j)}
$$