In [1]:
from collections import OrderedDict
import json
import csv
import numpy as np
import matplotlib as plt
import pandas as pd
from scipy.sparse import linalg
from scipy import dot
import logging, gensim, bz2
from gensim import corpora
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def get_business_categories(business_file):
    
    all_categories = {'NONE': 0}
    businesses = OrderedDict()

    with open(business_file) as all_business_data:

        for single_business_data in all_business_data:

            business = json.loads(single_business_data)
            b_id = business.pop('business_id')
            
            atts_to_remove = [
                'attributes',
                'hours',
                'name',
                'hours',
                'neighborhood',
                'address',
                'state',
                'is_open',
                'type'
            ]
            
            keys = list(business.keys())
            for key in keys:
                if key in atts_to_remove:
                    del business[key]
            
            categories = business['categories']
            businesses[b_id] = business

            if categories:
                for category in categories:
                    if category not in all_categories.keys():
                        all_categories[category] = 1
                    else:
                        all_categories[category] += 1
            else:
                business['categories'] = ['NONE']
                all_categories['NONE'] += 1

    all_categories = OrderedDict([ (tup[0], [i, tup[1]]) for i, tup in enumerate(sorted(all_categories.items(), key=lambda t: t[1], reverse=True))])
    
    return businesses, all_categories

def build_business_cat_vectors(biz_list, list_of_catlists, cat_idxs):
    cat_vectors = []
    vec_length = len(cat_idxs.keys())
    for cat_list in list_of_catlists:
        cat_vector = [0] * vec_length
        for cat in cat_list:
            idx = cat_idxs[cat][0]
            cat_vector[idx] = 1
        cat_vectors.append(tuple(cat_vector))
            
    # returns a {biz_id : (cat_vector)} dictionary
    return OrderedDict(zip(biz_list, cat_vectors))

business_json = '../yelp_data/yelp_academic_dataset_business.json'

bizs, cats = get_business_categories(business_json)

In [3]:
cat_frame = {}
count=0
for cat, data in cats.items():
    cat_frame[cat] = [data[1]]
    if count>10:
        break
    count += 1 
    
print(sorted(cat_frame, key=lambda x: {y:x}))
cdf = pd.DataFrame(cat_frame)
cdf.head()

NameError: name 'y' is not defined

In [4]:
bizcattuples = [(b_id, biz['categories']) for b_id, biz in bizs.items()]
bizidlist, cat_lists = zip(*bizcattuples)
biz_cat_vectors = build_business_cat_vectors(bizidlist, cat_lists, cats)

In [10]:
dictionary = corpora.Dictionary(cat_lists)

2017-05-01 14:52:00,416 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-05-01 14:52:00,604 : INFO : adding document #10000 to Dictionary(887 unique tokens: ['Auto Loan Providers', 'Recycling Center', 'Nail Salons', 'Amateur Sports Teams', 'Breakfast & Brunch']...)
2017-05-01 14:52:00,789 : INFO : adding document #20000 to Dictionary(991 unique tokens: ['Auto Loan Providers', 'Recycling Center', 'Nail Salons', 'Amateur Sports Teams', 'Breakfast & Brunch']...)
2017-05-01 14:52:00,975 : INFO : adding document #30000 to Dictionary(1031 unique tokens: ['Auto Loan Providers', 'Recycling Center', 'Nail Salons', 'Amateur Sports Teams', 'Breakfast & Brunch']...)
2017-05-01 14:52:01,154 : INFO : adding document #40000 to Dictionary(1078 unique tokens: ['Auto Loan Providers', 'Recycling Center', 'Nail Salons', 'Amateur Sports Teams', 'Breakfast & Brunch']...)
2017-05-01 14:52:01,346 : INFO : adding document #50000 to Dictionary(1111 unique tokens: ['Auto Loan Providers', 'Recy

In [11]:
gensim_corpus = [dictionary.doc2bow(text) for text in cat_lists]

In [12]:
mm = gensim.corpora.MmCorpus('../corpus.mm')

2017-05-01 14:52:08,323 : INFO : loaded corpus index from ../corpus.mm.index
2017-05-01 14:52:08,324 : INFO : initializing corpus reader from ../corpus.mm
2017-05-01 14:52:08,325 : INFO : accepted corpus with 144072 documents, 1192 features, 527929 non-zero entries


In [13]:
lsi = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=dictionary, num_topics=50, onepass=False)

2017-05-01 14:52:09,296 : INFO : using serial LSI version on this node
2017-05-01 14:52:09,297 : INFO : updating model with new documents
2017-05-01 14:52:09,298 : INFO : using 100 extra samples and 2 power iterations
2017-05-01 14:52:09,299 : INFO : 1st phase: constructing (1192, 150) action matrix
2017-05-01 14:52:09,903 : INFO : PROGRESS: at document #0
2017-05-01 14:52:10,472 : INFO : PROGRESS: at document #20000
2017-05-01 14:52:11,027 : INFO : PROGRESS: at document #40000
2017-05-01 14:52:11,605 : INFO : PROGRESS: at document #60000
2017-05-01 14:52:12,225 : INFO : PROGRESS: at document #80000
2017-05-01 14:52:12,811 : INFO : PROGRESS: at document #100000
2017-05-01 14:52:13,387 : INFO : PROGRESS: at document #120000
2017-05-01 14:52:13,658 : INFO : PROGRESS: at document #140000
2017-05-01 14:52:13,722 : INFO : running power iteration #1
2017-05-01 14:52:14,067 : INFO : PROGRESS: at document #0/144072
2017-05-01 14:52:14,746 : INFO : PROGRESS: at document #20000/144072
2017-05-01

In [203]:
def assign_topics(biz_dict, lsi_model):
    for biz_id, biz in biz_dict.items():
        vector = dictionary.doc2bow( biz['categories'] )
        topic = max( lsi_model[vector], key=lambda tup: tup[1] )
        biz['topic'] = topic[0]
        biz.pop('categories')

assign_topics(bizs, lsi)
        

In [17]:
biz_out = '../yelp_data/business_compressed.csv'

with open(biz_out, 'w') as csvfile:
    names = ['business_id', 'city', 'longitude', 'latitude', 'postal_code', 'review_count', 'stars', 'topic']
    writer = csv.DictWriter(csvfile, fieldnames=names)
    
    writer.writeheader()
    
    for biz_id, biz in bizs.items():
        biz['business_id'] = biz_id
        writer.writerow(biz)

ValueError: dict contains fields not in fieldnames: 'categories'

In [14]:
_kwargs = dict(formatted=0, num_words=20)
# tups = [tups for tups in lsi.show_topics(**_kwargs)]
topic_words = [[t[0] for t in tups[1]] for tups in lsi.show_topics(**_kwargs)]

In [15]:
topic_words[1]

['Vape Shops',
 'Accessories',
 'Food',
 'Beauty & Spas',
 'Home & Garden',
 'Professional Services',
 'Local Services',
 'Chicken Wings',
 'Gift Shops',
 'Dentists',
 'Cosmetics & Beauty Supply',
 'Leather Goods',
 'Department Stores',
 'Hair Removal',
 'Shoe Stores',
 'Florists',
 'Sporting Goods',
 'Grocery',
 'Home Decor',
 'Furniture Stores']

In [207]:
framed_words = []
for i in range(0,20):
        row = {}
        for j in range(0, len(topic_words)):
            row[j+1] = topic_words[j][i]
        framed_words.append(row)

In [208]:
frame = pd.DataFrame.from_records(framed_words)

In [211]:
frame.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
0,Chicken Wings,Tobacco Shops,Party & Event Planning,Hair Extensions,Nightlife,Professional Services,Health & Medical,Wheel & Rim Repair,Caterers,Sports Clubs,...,Sports Clubs,Oil Change Stations,Ice Cream & Frozen Yogurt,Florists,Cafes,Florists,Sporting Goods,Seafood,Sports Medicine,Medical Spas
1,Party & Event Planning,Leather Goods,Desserts,Hair Removal,American (New),Apartments,Sports Medicine,Oil Change Stations,Hotels & Travel,Fitness & Instruction,...,Fitness & Instruction,Wheel & Rim Repair,Bakeries,Cosmetics & Beauty Supply,Ice Cream & Frozen Yogurt,Sports Medicine,Gift Shops,Steakhouses,Medical Spas,Coffee & Tea
2,Nightlife,Party & Event Planning,Tobacco Shops,Health & Medical,Chicken Wings,Self Storage,Hair Extensions,Professional Services,Sports Clubs,Caterers,...,Gyms,Auto Repair,Oil Change Stations,Waxing,Turkish,Flowers & Gifts,Sports Wear,Bars,Computers,Department Stores
3,American (New),Hair Extensions,Chicken Wings,Nail Salons,Party & Event Planning,Wheel & Rim Repair,Dentists,Auto Repair,Hotels,Hotels & Travel,...,Life Insurance,Specialty Food,Wheel & Rim Repair,Medical Spas,Bakeries,Sports Clubs,Electronics,Coffee & Tea,Mobile Phones,Sports Medicine
4,American (Traditional),Cabinetry,Nightlife,Tobacco Shops,Bars,Tobacco Shops,General Dentistry,Auto Parts & Supplies,Fitness & Instruction,Gyms,...,Sporting Goods,Bakeries,Auto Repair,Flowers & Gifts,Seafood,Gift Shops,Computers,Turkish,Medical Centers,Turkish
5,Sandwiches,Professional Services,American (New),Permanent Makeup,Sports Bars,Real Estate,Hair Removal,Automotive,Performing Arts,Hotels,...,Trainers,Gas & Service Stations,Auto Parts & Supplies,Ice Cream & Frozen Yogurt,Desserts,Nail Salons,Sports Clubs,Dance Clubs,Electronics,Bars
6,Italian,Self Storage,Event Planning & Services,Skin Care,Tex-Mex,Leather Goods,Sports Clubs,Car Buyers,Venues & Event Spaces,Trainers,...,Florists,Auto Parts & Supplies,Fast Food,Hair Removal,Chinese,Fabric Stores,Mobile Phones,Middle Eastern,Health & Medical,Seafood
7,Tex-Mex,Chicken Wings,Bakeries,Medical Spas,Performing Arts,Telecommunications,Cosmetic Dentists,Apartments,Gyms,Health & Medical,...,Parks,Convenience Stores,Canadian (New),Fabric Stores,Fast Food,Sporting Goods,Bikes,Wine Bars,Jewelry,Gift Shops
8,Pizza,Gift Shops,Specialty Food,Waxing,American (Traditional),Oil Change Stations,Nail Salons,Body Shops,Health & Medical,Yoga,...,Banks & Credit Unions,Fast Food,Gas & Service Stations,Nail Salons,Sushi Bars,Computers,Fashion,Indian,Seafood,Men's Hair Salons
9,Canadian (New),Health & Medical,Coffee & Tea,Sports Medicine,Dance Clubs,Home Services,Fitness & Instruction,Gas & Service Stations,Grocery,Self Storage,...,Bikes,Ice Cream & Frozen Yogurt,Sushi Bars,Sports Medicine,Middle Eastern,Medical Spas,Parks,Persian/Iranian,Permanent Makeup,Convenience Stores


AttributeError: 'LsiModel' object has no attribute 'ascarray'