In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from flair.embeddings import WordEmbeddings
from flair.data import Sentence



# TODO: Folium map of business locations, etc.
# word similarity with food

## Read JSON Data

In [2]:
df = pd.read_json("yelp_dataset/yelp_academic_dataset_business.json", lines=True)
print(df.head)

<bound method NDFrame.head of                                                   address  \
0                                       1314 44 Avenue NE   
1                                                           
2                                     1335 rue Beaubien E   
3                                         211 W Monroe St   
4                                     2005 Alyth Place SE   
5                         20235 N Cave Creek Rd, Ste 1115   
6                                          631 Bloor St W   
7                             3417 Derry Road E, Unit 103   
8                                      1440 N. Dysart Ave   
9                                         209 Oakland Ave   
10                                       4568 Highway 7 E   
11                                     595 Markham Street   
12                                        2801 N 15th Ave   
13                                4216 Saint-Laurent Boul   
14                                      4404 14 Street 

[188593 rows x 15 columns]>


## What are all of the food related words?

In [3]:
categories = df['categories']
s = set()
for i in range(0, len(categories)):
    text = categories[i]
    if text:
        for word in text.split(","):
            s.add(word)
sorted(s)

[' & Probates',
 ' 3D Printing',
 ' ATV Rentals/Tours',
 ' Acai Bowls',
 ' Accessories',
 ' Accountants',
 ' Acne Treatment',
 ' Active Life',
 ' Acupuncture',
 ' Addiction Medicine',
 ' Adoption Services',
 ' Adult',
 ' Adult Education',
 ' Adult Entertainment',
 ' Advertising',
 ' Aerial Fitness',
 ' Aerial Tours',
 ' Aestheticians',
 ' Afghan',
 ' African',
 ' Air Duct Cleaning',
 ' Aircraft Dealers',
 ' Aircraft Repairs',
 ' Airlines',
 ' Airport Lounges',
 ' Airport Shuttles',
 ' Airport Terminals',
 ' Airports',
 ' Airsoft',
 ' Allergists',
 ' Alternative Medicine',
 ' Amateur Sports Teams',
 ' American (New)',
 ' American (Traditional)',
 ' Amusement Parks',
 ' Anesthesiologists',
 ' Animal Assisted Therapy',
 ' Animal Physical Therapy',
 ' Animal Shelters',
 ' Antiques',
 ' Apartment Agents',
 ' Apartments',
 ' Appliances',
 ' Appliances & Repair',
 ' Appraisal Services',
 ' Aquarium Services',
 ' Aquariums',
 ' Arabian',
 ' Arcades',
 ' Archery',
 ' Architects',
 ' Architectur

## Glove Word Embeddings

In [5]:
glove_embedding = WordEmbeddings('glove')

In [8]:
for category in s:
    text = Sentence(category)
    glove_embedding.embed(text)

In [6]:
# create sentence.
sentence = Sentence('The grass is green .')

# embed a sentence using glove.
glove_embedding.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding)

Token: 1 The
tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  

In [33]:
import torch
import numpy as np
from numpy import linalg as LA

In [55]:
t1 = Sentence("food")
t2 = Sentence("pencil")
glove_embedding.embed(t1)
glove_embedding.embed(t2)
# input1 = torch.randn(100, 128)
# input2 = input1
input1, input2 = t1[0].embedding, t2[0].embedding
a = np.asarray(input1)
b = np.asarray(input2)
dot = np.dot(a.flatten(), b.flatten())

In [56]:
a_mag = LA.norm(a)
b_mag = LA.norm(b)

In [57]:
sim = dot / a_mag / b_mag

In [58]:
print("Similarity", sim)

Similarity 0.009144417


## Where are the businesses located?

In [3]:
food_columns = df[df['categories'].str.contains("Food")==True]
print("Number of restaurants", len(food_columns))
df['city'].value_counts()

Number of restaurants 33488


Las Vegas                     28865
Phoenix                       18633
Toronto                       18233
Charlotte                      9204
Scottsdale                     8822
Calgary                        7384
Pittsburgh                     6804
Mesa                           6239
MontrÃ©al                      6045
Henderson                      4815
Tempe                          4492
Chandler                       4272
Madison                        3509
Cleveland                      3506
Glendale                       3469
Gilbert                        3397
Mississauga                    2954
Peoria                         1868
Markham                        1699
North Las Vegas                1508
Champaign                      1243
Scarborough                    1175
North York                     1140
Surprise                       1119
Richmond Hill                   978
Concord                         975
Brampton                        929
Vaughan                     