# A concrete example of using the Vector Space Model (VSM)

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

## Yelp reviews
[YELP dataset](https://www.yelp.com/dataset)

In [2]:
import json
from urllib.parse import quote

from sqlalchemy import create_engine

In [3]:
# Load the local database settings; the 'psw' entry is read when the engine is built.
# JSON is UTF-8 by specification, so the encoding is passed explicitly instead of
# relying on the platform's default locale encoding.
# NOTE(review): this is an absolute, machine-specific path - consider making it configurable.
with open('/Users/flint/Data/postgresql/conf.json', 'r', encoding='utf-8') as conf_file:
    conf = json.load(conf_file)

In [4]:
# Percent-encode the password before embedding it in the connection URL:
# characters such as '@', ':', '/' or '%' would otherwise be interpreted as URL
# delimiters/escapes when SQLAlchemy parses the string. Purely alphanumeric
# passwords are left unchanged by the encoding.
engine = create_engine(
    'postgresql+psycopg2://postgres:{}@localhost/nlp'.format(
        quote(str(conf['psw']), safe='')
    )
)

In [22]:
# Pull a sample of review texts by joining each text clip to its review record.
# NOTE(review): there is no ORDER BY, so which 2000 rows come back is left to the
# database and may differ between runs.
sql = (
    "SELECT * FROM yelp.textclip T "
    "JOIN yelp.review R ON T.id = R.clip "
    "LIMIT 2000"
)
reviews = pd.read_sql(sql, con=engine)

In [23]:
# Plain Python list with the text of every loaded review.
docs = [text for text in reviews['content'].values]

In [24]:
# Show the first review text as a quick sanity check of the loaded data.
print(docs[0])

Red, white and bleu salad was super yum and a great addition to the menu! This location was clean with great service and food served at just the right temps! Kids pizza is always a hit too with lots of great side dish options for the kiddos! When I'm on this side of town, this will definitely be a spot I'll hit up again!


In [25]:
from nltk import word_tokenize
from collections import Counter

In [47]:
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and lower-case each one.

    Lower-casing happens after tokenisation, so the tokeniser sees the
    original casing of the text.

    Returns:
        list of lower-cased token strings, in order of appearance.
    """
    return list(map(str.lower, word_tokenize(text)))

def tf(tokens):
    """Compute max-normalised term frequencies for one document.

    Each term's raw count is divided by the count of the most frequent term,
    so the most frequent term scores 1.0 and every other term falls in (0, 1].

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        dict mapping each distinct token to its normalised frequency, with keys
        inserted from most to least frequent. Empty when ``tokens`` is empty.
    """
    counts = Counter(tokens).most_common()
    if not counts:
        # An empty document has no terms; indexing counts[0] would raise IndexError.
        return {}
    # most_common() sorts by descending count, so the first entry holds the maximum.
    max_count = counts[0][1]
    return {term: n / max_count for term, n in counts}

def tfidf(tokens, idf):
    """Compute TF-IDF weights for one document.

    The term frequency is max-normalised (raw count divided by the count of the
    most frequent term in the document) and then multiplied by the term's
    inverse document frequency.

    Args:
        tokens: iterable of hashable tokens.
        idf: mapping from token to its inverse document frequency.

    Returns:
        dict mapping each distinct token to its TF-IDF weight, with keys
        inserted from most to least frequent. Empty when ``tokens`` is empty.

    Raises:
        KeyError: if a token in ``tokens`` has no entry in ``idf``.
    """
    counts = Counter(tokens).most_common()
    if not counts:
        # An empty document has no terms; indexing counts[0] would raise IndexError.
        return {}
    # most_common() sorts by descending count, so the first entry holds the maximum.
    max_count = counts[0][1]
    return {term: (n / max_count) * idf[term] for term, n in counts}

In [27]:
# Map each document's position in `docs` to its term-frequency dictionary.
I = {i: tf(tokenize(doc)) for i, doc in enumerate(docs)}

In [30]:
# Document-term matrix: one row per document, one column per token;
# tokens that do not occur in a document get weight 0.
M = pd.DataFrame(data=I).transpose().fillna(value=0)

In [31]:
# Preview the first rows of the document-term matrix (rows: documents, columns: tokens).
M.head()

Unnamed: 0,!,and,a,great,the,this,",",was,with,hit,...,teleflora\/1-800,thro,abundance,omar,yolks,makers,googling,palette,maine,gimlet
0,1.0,0.75,0.75,0.75,0.75,0.75,0.5,0.5,0.5,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.333333,0.166667,0.0,0.0,1.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5,0.5,0.5,1.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.333333,0.666667,0.0,0.666667,0.333333,0.666667,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# (number of documents, number of distinct tokens)
M.shape

(2000, 14337)

In [35]:
# Pick the document inspected in the following cells and show its text.
docid = 0
print(docs[docid])

Red, white and bleu salad was super yum and a great addition to the menu! This location was clean with great service and food served at just the right temps! Kids pizza is always a hit too with lots of great side dish options for the kiddos! When I'm on this side of town, this will definitely be a spot I'll hit up again!


In [38]:
# Ten highest term-frequency weights for the selected document.
M.loc[docid].sort_values(ascending=False).head(10)

!        1.00
a        0.75
great    0.75
the      0.75
this     0.75
and      0.75
hit      0.50
i        0.50
of       0.50
side     0.50
Name: 0, dtype: float64

In [52]:
# Document frequency: in how many documents each token has a positive weight.
# Computed in one vectorised pass instead of a Python loop over every column
# and every row of M.
doc_freq = (M > 0).sum(axis=0)
# Inverse document frequency: idf(t) = ln(N / df(t)), with N the number of documents.
# Every column of M comes from a token seen in at least one document, so df(t) >= 1.
idf = np.log(M.shape[0] / doc_freq).to_dict()

In [53]:
# Spot-check the IDF of one token (the exclamation mark).
idf['!']

0.7011793522572095

In [54]:
# TF-IDF weights per document, keyed by the document's position in `docs`.
TfIdf = {i: tfidf(tokenize(doc), idf) for i, doc in enumerate(docs)}
# Document-term matrix of TF-IDF weights; absent tokens get weight 0.
T = pd.DataFrame(data=TfIdf).transpose().fillna(value=0)

In [56]:
# Twenty highest TF-IDF weights for the selected document.
T.loc[docid].sort_values(ascending=False).head(20)

hit         2.052197
kiddos      1.900226
temps       1.553652
bleu        1.497866
side        1.436757
addition    1.139095
yum         1.105712
white       1.011389
lots        0.965808
red         0.937689
served      0.932425
kids        0.917269
dish        0.893888
great       0.859278
options     0.820854
town        0.807864
pizza       0.783749
spot        0.780891
salad       0.772511
super       0.727389
Name: 0, dtype: float64