In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [147]:
user_n_business = pd.read_csv('dataset/yelp_academic_dataset_tip_reduced_100.csv')

In [148]:
business_categories = pd.read_csv('dataset/yelp_academic_dataset_business_55000.csv')

In [149]:
business_categories.head()

Unnamed: 0,business_id,Auto Repair,Roofing,Self-defense Classes,Falafel,Motorcycle Dealers,Armenian,Eyebrow Services,Interval Training Gyms,Public Transportation,...,Pool Halls,Olive Oil,Car Buyers,Dry Cleaning & Laundry,Clock Repair,Cambodian,Farmers Market,Pop-up Shops,Electronics,Scavenger Hunts
0,Apn5Q_b6Nz61Tq4XzPdf9A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AjEbIBw6ZFfln7ePHha9PA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,O8S5hYJ1SMc8fA4QBtVujA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,bFzdJJ3wp3PZssNEsyU23g,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8USyCYqpScwiNEb58Bt6CA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Defining new dataframe

The new dataframe will be a data set that relates users with how many interactions they have with each category.

In [176]:
index = set(user_n_business['user_id']) # getting only unique user ids

In [177]:
columns = business_categories.columns
columns = columns.drop(['business_id']) # take out 'business_id'

In [178]:
df = pd.DataFrame(index=index, columns=columns)

In [179]:
df = df.fillna(0) #Fill NaNs with 0s

In [180]:
df.head()

Unnamed: 0,Auto Repair,Roofing,Self-defense Classes,Falafel,Motorcycle Dealers,Armenian,Eyebrow Services,Interval Training Gyms,Public Transportation,Party & Event Planning,...,Pool Halls,Olive Oil,Car Buyers,Dry Cleaning & Laundry,Clock Repair,Cambodian,Farmers Market,Pop-up Shops,Electronics,Scavenger Hunts
OVpt8ITCEWyYBjn0N3CRwg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ulQ8Nyj7jCUR8M83SUMoRQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o9fbX0eNRqaX0K-PmuXWnQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M5n8mMFoeXjnJEV2gxyGGA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
QGgWWhEi5R4SLAKN-xwtNQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Filling new dataframe with the sum of interactions

In [181]:
# Count how many business that the user interacted was found
found_n, notFound_n = 0, 0

# Finding out wich categories of business the user interacted
for index, row in user_n_business.iterrows():
    user_id = str(row['user_id'])
    business_id = str(row['business_id'])
    business_index = business_categories.index[business_categories['business_id'] == business_id].tolist()
    
    #If the business exists, increment the number of interactions at its categories in the dataframe
    if business_index:
        print("found business " + business_id + " with index: " + str(business_index[0]))
        found_n = found_n + 1
        
        for column in df.columns:
            new_value = df.loc[user_id][column] + business_categories.loc[business_index][column]
            df.at[user_id, column] = new_value
        
    else:
        print("coudnt find business with id: " + business_id)
        notFound_n = notFound_n + 1
        
print("Business found: " + str(found_n) + "; Not found: " + str(notFound_n))

found business tJRDll5yqpZwehenzE2cSg with index: 52561
found business 2ThtBbeDqFkVi6LugUOcVA with index: 12391
coudnt find business with id: jH19V2I9fIslnNhDzPmdkA
coudnt find business with id: dAa0hB2yrnHzVmsCkN4YvQ
coudnt find business with id: dAa0hB2yrnHzVmsCkN4YvQ
coudnt find business with id: ESzO3Av0b1_TzKOiqzbQYQ
coudnt find business with id: k7WRPbDd7rztjHcGGkEjlw
coudnt find business with id: k7WRPbDd7rztjHcGGkEjlw
coudnt find business with id: SqW3igh1_Png336VIb5DUA
coudnt find business with id: KNpcPGqDORDdvtekXd348w
coudnt find business with id: KNpcPGqDORDdvtekXd348w
coudnt find business with id: KNpcPGqDORDdvtekXd348w
coudnt find business with id: 8qNOI6Q1-rJrvWWD5Btz6w
coudnt find business with id: 8qNOI6Q1-rJrvWWD5Btz6w
found business A73Dp0lo9s_Ci9NFDqbhqw with index: 32308
found business --ujyvoQlwVoBgMYtADiLA with index: 23232
coudnt find business with id: FCUVjQf762no86Uzcbv1Tg
coudnt find business with id: Tl_jT2a0bRMac5_YW65GPg
coudnt find business with id: Tl_j

In [182]:
# Dataframe populated
df.head()

Unnamed: 0,Auto Repair,Roofing,Self-defense Classes,Falafel,Motorcycle Dealers,Armenian,Eyebrow Services,Interval Training Gyms,Public Transportation,Party & Event Planning,...,Pool Halls,Olive Oil,Car Buyers,Dry Cleaning & Laundry,Clock Repair,Cambodian,Farmers Market,Pop-up Shops,Electronics,Scavenger Hunts
OVpt8ITCEWyYBjn0N3CRwg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ulQ8Nyj7jCUR8M83SUMoRQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
o9fbX0eNRqaX0K-PmuXWnQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M5n8mMFoeXjnJEV2gxyGGA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
QGgWWhEi5R4SLAKN-xwtNQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [183]:
df.loc['zcTZk7OG8ovAmh_fenH21g','Event Planning & Services']

1