In [1]:
from collections import OrderedDict
import json
import csv
import numpy as np
import matplotlib as plt
import pandas as pd
from scipy.sparse import linalg
from scipy import dot
import logging, gensim, bz2
from gensim import corpora
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def get_business_categories(business_file):
    
    all_categories = {'NONE': 0}
    businesses = OrderedDict()

    with open(business_file) as all_business_data:

        for single_business_data in all_business_data:

            business = json.loads(single_business_data)
            b_id = business.pop('business_id')
            
            atts_to_remove = [
                'attributes',
                'hours',
                'name',
                'hours',
                'neighborhood',
                'address',
                'state',
                'is_open',
                'type'
            ]
            
            keys = list(business.keys())
            for key in keys:
                if key in atts_to_remove:
                    del business[key]
            
            categories = business['categories']
            businesses[b_id] = business

            if categories:
                for category in categories:
                    if category not in all_categories.keys():
                        all_categories[category] = 1
                    else:
                        all_categories[category] += 1
            else:
                business['categories'] = ['NONE']
                all_categories['NONE'] += 1

    all_categories = OrderedDict([ (tup[0], [i, tup[1]]) for i, tup in enumerate(sorted(all_categories.items(), key=lambda t: t[1], reverse=True))])
    
    return businesses, all_categories

def build_business_cat_vectors(biz_list, list_of_catlists, cat_idxs):
    cat_vectors = []
    vec_length = len(cat_idxs.keys())
    for cat_list in list_of_catlists:
        cat_vector = [0] * vec_length
        for cat in cat_list:
            idx = cat_idxs[cat][0]
            cat_vector[idx] = 1
        cat_vectors.append(tuple(cat_vector))
            
    # returns a {biz_id : (cat_vector)} dictionary
    return OrderedDict(zip(biz_list, cat_vectors))

business_json = '../yelp_data/yelp_academic_dataset_business.json'

bizs, cats = get_business_categories(business_json)

In [14]:
cat_frame = {}
count=0
for cat, data in cats.items():
    cat_frame[cat] = [data[1]]
    if count>10:
        break
    count += 1 
    
# print(sorted(cat_frame, key=lambda x: {y:x}))
cdf = pd.DataFrame(cat_frame)
cdf.head()

Unnamed: 0,Active Life,Automotive,Bars,Beauty & Spas,Event Planning & Services,Food,Health & Medical,Home Services,Local Services,Nightlife,Restaurants,Shopping
0,6722,8554,9087,13711,7224,21189,10476,11241,8133,10524,48485,22466


In [4]:
bizcattuples = [(b_id, biz['categories']) for b_id, biz in bizs.items()]
bizidlist, cat_lists = zip(*bizcattuples)
biz_cat_vectors = build_business_cat_vectors(bizidlist, cat_lists, cats)

In [5]:
dictionary = corpora.Dictionary(cat_lists)

2017-05-01 15:37:51,009 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-05-01 15:37:51,231 : INFO : adding document #10000 to Dictionary(887 unique tokens: ['Desserts', 'Flight Instruction', 'Internet Service Providers', 'Spanish', 'Pet Adoption']...)
2017-05-01 15:37:51,417 : INFO : adding document #20000 to Dictionary(991 unique tokens: ['Desserts', 'Flight Instruction', 'Internet Service Providers', 'Spanish', 'Pet Adoption']...)
2017-05-01 15:37:51,594 : INFO : adding document #30000 to Dictionary(1031 unique tokens: ['Desserts', 'Flight Instruction', 'Internet Service Providers', 'Spanish', 'Pet Adoption']...)
2017-05-01 15:37:51,773 : INFO : adding document #40000 to Dictionary(1078 unique tokens: ['Desserts', 'Flight Instruction', 'Internet Service Providers', 'Spanish', 'Pet Adoption']...)
2017-05-01 15:37:51,959 : INFO : adding document #50000 to Dictionary(1111 unique tokens: ['Desserts', 'Flight Instruction', 'Internet Service Providers', 'Spanish', 'Pet 

In [6]:
gensim_corpus = [dictionary.doc2bow(text) for text in cat_lists]

In [7]:
mm = gensim.corpora.MmCorpus('../corpus.mm')

2017-05-01 15:38:01,280 : INFO : loaded corpus index from ../corpus.mm.index
2017-05-01 15:38:01,281 : INFO : initializing corpus reader from ../corpus.mm
2017-05-01 15:38:01,283 : INFO : accepted corpus with 144072 documents, 1192 features, 527929 non-zero entries


In [8]:
lsi = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=dictionary, num_topics=50, onepass=False)

2017-05-01 15:38:02,144 : INFO : using serial LSI version on this node
2017-05-01 15:38:02,146 : INFO : updating model with new documents
2017-05-01 15:38:02,147 : INFO : using 100 extra samples and 2 power iterations
2017-05-01 15:38:02,148 : INFO : 1st phase: constructing (1192, 150) action matrix
2017-05-01 15:38:02,740 : INFO : PROGRESS: at document #0
2017-05-01 15:38:03,295 : INFO : PROGRESS: at document #20000
2017-05-01 15:38:03,923 : INFO : PROGRESS: at document #40000
2017-05-01 15:38:04,455 : INFO : PROGRESS: at document #60000
2017-05-01 15:38:05,043 : INFO : PROGRESS: at document #80000
2017-05-01 15:38:05,581 : INFO : PROGRESS: at document #100000
2017-05-01 15:38:06,119 : INFO : PROGRESS: at document #120000
2017-05-01 15:38:06,397 : INFO : PROGRESS: at document #140000
2017-05-01 15:38:06,462 : INFO : running power iteration #1
2017-05-01 15:38:06,801 : INFO : PROGRESS: at document #0/144072
2017-05-01 15:38:07,217 : INFO : PROGRESS: at document #20000/144072
2017-05-01

In [203]:
def assign_topics(biz_dict, lsi_model):
    for biz_id, biz in biz_dict.items():
        vector = dictionary.doc2bow( biz['categories'] )
        topic = max( lsi_model[vector], key=lambda tup: tup[1] )
        biz['topic'] = topic[0]
        biz.pop('categories')

assign_topics(bizs, lsi)
        

In [17]:
biz_out = '../yelp_data/business_compressed.csv'

with open(biz_out, 'w') as csvfile:
    names = ['business_id', 'city', 'longitude', 'latitude', 'postal_code', 'review_count', 'stars', 'topic']
    writer = csv.DictWriter(csvfile, fieldnames=names)
    
    writer.writeheader()
    
    for biz_id, biz in bizs.items():
        biz['business_id'] = biz_id
        writer.writerow(biz)

ValueError: dict contains fields not in fieldnames: 'categories'

In [9]:
_kwargs = dict(formatted=0, num_words=20)
# tups = [tups for tups in lsi.show_topics(**_kwargs)]
topic_words = [[t[0] for t in tups[1]] for tups in lsi.show_topics(**_kwargs)]

In [10]:
topic_words[1]

['Tobacco Shops',
 'Leather Goods',
 'Food',
 'Permanent Makeup',
 'Home & Garden',
 'Professional Services',
 'Self Storage',
 'Italian',
 "Women's Clothing",
 'General Dentistry',
 'Cosmetics & Beauty Supply',
 'Accessories',
 'Department Stores',
 'Massage',
 'Shoe Stores',
 'Flowers & Gifts',
 'Sporting Goods',
 'Event Planning & Services',
 'Mattresses',
 'Office Equipment']

In [11]:
framed_words = []
for i in range(0,20):
        row = {}
        for j in range(0, len(topic_words)):
            row[j+1] = topic_words[j][i]
        framed_words.append(row)

In [12]:
frame = pd.DataFrame.from_records(framed_words)

In [13]:
frame.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
0,Italian,Tobacco Shops,Food,Permanent Makeup,Nightlife,Professional Services,General Dentistry,Wheel & Rim Repair,Caterers,Active Life,...,Active Life,Auto Repair,Ice Cream & Frozen Yogurt,Flowers & Gifts,Cafes,Flowers & Gifts,Sporting Goods,Seafood,Doctors,Day Spas
1,Food,Leather Goods,Coffee & Tea,Massage,Pubs,Real Estate,Doctors,Auto Repair,Hotels,Fitness & Instruction,...,Fitness & Instruction,Wheel & Rim Repair,Bakeries,Cosmetics & Beauty Supply,Ice Cream & Frozen Yogurt,Doctors,Women's Clothing,Steakhouses,Day Spas,Desserts
2,Nightlife,Food,Tobacco Shops,General Dentistry,Italian,Self Storage,Permanent Makeup,Professional Services,Active Life,Caterers,...,Gyms,Oil Change Stations,Auto Repair,Hair Removal,Mediterranean,Florists,Sports Wear,American (New),Electronics,Department Stores
3,Pubs,Permanent Makeup,Italian,Nail Salons,Food,Wheel & Rim Repair,Dentists,Oil Change Stations,Hotels & Travel,Hotels,...,Life Insurance,Event Planning & Services,Wheel & Rim Repair,Day Spas,Bakeries,Active Life,Electronics,Desserts,Computers,American (New)
4,American (Traditional),Home & Garden,Nightlife,Tobacco Shops,American (New),Tobacco Shops,Health & Medical,Tires,Fitness & Instruction,Gyms,...,Sporting Goods,Gas & Service Stations,Oil Change Stations,Florists,Coffee & Tea,Fabric Stores,Mobile Phones,Mediterranean,Optometrists,Doctors
5,Breakfast & Brunch,Professional Services,Pubs,Waxing,Sports Bars,Real Estate Services,Massage,Automotive,Arts & Entertainment,Hotels & Travel,...,Trainers,Bakeries,Tires,Massage,Seafood,Nail Salons,Active Life,Dance Clubs,Mobile Phones,Mediterranean
6,Chicken Wings,Self Storage,Grocery,Skin Care,Fast Food,Leather Goods,Active Life,Used Car Dealers,Venues & Event Spaces,Trainers,...,Flowers & Gifts,Tires,Mexican,Ice Cream & Frozen Yogurt,Chinese,Women's Clothing,Computers,Greek,Seafood,Seafood
7,Fast Food,Italian,Bakeries,Day Spas,Arts & Entertainment,Contractors,Periodontists,Real Estate,Gyms,General Dentistry,...,Parks,Convenience Stores,Canadian (New),Fabric Stores,Mexican,Sporting Goods,Bikes,Lebanese,General Dentistry,Men's Hair Salons
8,Pizza,Women's Clothing,Event Planning & Services,Hair Removal,American (Traditional),Auto Repair,Nail Salons,Body Shops,General Dentistry,Yoga,...,Banks & Credit Unions,Mexican,Sushi Bars,Nail Salons,Sushi Bars,Massage,Accessories,Doctors,Jewelry,Women's Clothing
9,Canadian (New),General Dentistry,Desserts,Doctors,Dance Clubs,Home Services,Fitness & Instruction,Gas & Service Stations,Specialty Food,Self Storage,...,Bikes,Ice Cream & Frozen Yogurt,Gas & Service Stations,Doctors,Lebanese,Day Spas,Parks,Event Planning & Services,Waxing,Convenience Stores
