In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
user_n_business = pd.read_csv('dataset/yelp_academic_dataset_tip_reduced_1000.csv')

In [3]:
business_categories = pd.read_csv('dataset/yelp_academic_dataset_business_55000.csv')

In [4]:
business_categories.head()

Unnamed: 0,business_id,Auto Repair,Roofing,Self-defense Classes,Falafel,Motorcycle Dealers,Armenian,Eyebrow Services,Interval Training Gyms,Public Transportation,...,Pool Halls,Olive Oil,Car Buyers,Dry Cleaning & Laundry,Clock Repair,Cambodian,Farmers Market,Pop-up Shops,Electronics,Scavenger Hunts
0,Apn5Q_b6Nz61Tq4XzPdf9A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AjEbIBw6ZFfln7ePHha9PA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,O8S5hYJ1SMc8fA4QBtVujA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,bFzdJJ3wp3PZssNEsyU23g,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8USyCYqpScwiNEb58Bt6CA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Defining new dataframe

The new dataframe will be a data set that relates users with how many interactions they have with each category.

In [5]:
index = set(user_n_business['user_id']) # getting only unique user ids

In [6]:
columns = business_categories.columns
columns = columns.drop(['business_id']) # take out 'business_id'

In [7]:
df = pd.DataFrame(index=index, columns=columns)

In [8]:
df = df.fillna(0) #Fill NaNs with 0s

In [9]:
df.head()

Unnamed: 0,Auto Repair,Roofing,Self-defense Classes,Falafel,Motorcycle Dealers,Armenian,Eyebrow Services,Interval Training Gyms,Public Transportation,Party & Event Planning,...,Pool Halls,Olive Oil,Car Buyers,Dry Cleaning & Laundry,Clock Repair,Cambodian,Farmers Market,Pop-up Shops,Electronics,Scavenger Hunts
JE2qFjL4BaUbiI-cT5MSBw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4rYg5UywNqmKywRWJXOp4g,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ulQ8Nyj7jCUR8M83SUMoRQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZcLKXikTHYOnYt5VYRO5sg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PZNMPWCViVX8JLsn10MSnQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Filling new dataframe with the sum of interactions

In [10]:
# Count how many business that the user interacted was found
found_n, notFound_n = 0, 0

# Finding out wich categories of business the user interacted
for index, row in user_n_business.iterrows():
    user_id = str(row['user_id'])
    business_id = str(row['business_id'])
    business_index = business_categories.index[business_categories['business_id'] == business_id].tolist()
    
    #If the business exists, increment the number of interactions at its categories in the dataframe
    if business_index:
        print("found business " + business_id + " with index: " + str(business_index[0]))
        found_n = found_n + 1
        
        for column in df.columns:
            new_value = df.loc[user_id][column] + business_categories.loc[business_index][column]
            df.at[user_id, column] = new_value
        
    else:
        print("coudnt find business with id: " + business_id)
        notFound_n = notFound_n + 1
        
print("Business found: " + str(found_n) + "; Not found: " + str(notFound_n))

found business tJRDll5yqpZwehenzE2cSg with index: 52561
found business 2ThtBbeDqFkVi6LugUOcVA with index: 12391
coudnt find business with id: jH19V2I9fIslnNhDzPmdkA
coudnt find business with id: dAa0hB2yrnHzVmsCkN4YvQ
coudnt find business with id: dAa0hB2yrnHzVmsCkN4YvQ
coudnt find business with id: ESzO3Av0b1_TzKOiqzbQYQ
coudnt find business with id: k7WRPbDd7rztjHcGGkEjlw
coudnt find business with id: k7WRPbDd7rztjHcGGkEjlw
coudnt find business with id: SqW3igh1_Png336VIb5DUA
coudnt find business with id: KNpcPGqDORDdvtekXd348w
coudnt find business with id: KNpcPGqDORDdvtekXd348w
coudnt find business with id: KNpcPGqDORDdvtekXd348w
coudnt find business with id: 8qNOI6Q1-rJrvWWD5Btz6w
coudnt find business with id: 8qNOI6Q1-rJrvWWD5Btz6w
found business A73Dp0lo9s_Ci9NFDqbhqw with index: 32308
found business --ujyvoQlwVoBgMYtADiLA with index: 23232
coudnt find business with id: FCUVjQf762no86Uzcbv1Tg
coudnt find business with id: Tl_jT2a0bRMac5_YW65GPg
coudnt find business with id: Tl_j

coudnt find business with id: CTiWs36A_sX3eahdqrYxUQ
found business WY5Nn8--uHauNleE4JAPrw with index: 53462
coudnt find business with id: Vq4wz0wm6PrCDhw23AgJ9w
coudnt find business with id: KPgyqG3MyFUDK7GRbUg51A
coudnt find business with id: KNpcPGqDORDdvtekXd348w
coudnt find business with id: KNpcPGqDORDdvtekXd348w
found business xkgX0lzCY-S62e4Q1mvgiA with index: 34574
coudnt find business with id: 41f1utKc483Fw4mP4AFfJA
coudnt find business with id: nxo1C5te8Aug-Lf5D_0b1A
coudnt find business with id: uYzaARioENSfB9Y9Gk2PDA
coudnt find business with id: yCXYIz0MxVfT05TRwr1EiQ
found business NxFuteLIIgD96vLSowo-3w with index: 3676
found business NxFuteLIIgD96vLSowo-3w with index: 3676
found business NxFuteLIIgD96vLSowo-3w with index: 3676
found business NxFuteLIIgD96vLSowo-3w with index: 3676
found business hsRjct8UiUHU54B983GmcA with index: 694
found business 3GuqcEYaF1d1WXcwOX5xPw with index: 36493
coudnt find business with id: RxNPIAim-2fIw43IilR62g
coudnt find business with id

found business nlVjdQq9FzdQ3bfy-8y80g with index: 54197
coudnt find business with id: _9PUJYKVEspmGfMjdTxlcQ
coudnt find business with id: CK-Gv3vqIlWOrKP4fhT8_g
found business 1GaooxqCWHzulI2Ub3CXEw with index: 9481
found business 1GaooxqCWHzulI2Ub3CXEw with index: 9481
found business 1GaooxqCWHzulI2Ub3CXEw with index: 9481
found business 1qxmO1zpYegBxV8XDLE8ZA with index: 28025
found business 1qxmO1zpYegBxV8XDLE8ZA with index: 28025
coudnt find business with id: dMLGSQlmo6MhLD18grRENA
coudnt find business with id: dMLGSQlmo6MhLD18grRENA
coudnt find business with id: dMLGSQlmo6MhLD18grRENA
coudnt find business with id: dMLGSQlmo6MhLD18grRENA
coudnt find business with id: dMLGSQlmo6MhLD18grRENA
coudnt find business with id: dMLGSQlmo6MhLD18grRENA
coudnt find business with id: Aror3Q3exo1uWxsH4eWkzA
coudnt find business with id: Aror3Q3exo1uWxsH4eWkzA
coudnt find business with id: azE1DNVQFBU8boVbaJhj7w
coudnt find business with id: TVSk_tmVnnqS6ZCp7oJj1g
found business nwjIvTxJt2GpMoKz

found business Bma7RgPLdDAFvycubjdVOA with index: 22781
found business Bma7RgPLdDAFvycubjdVOA with index: 22781
found business Bma7RgPLdDAFvycubjdVOA with index: 22781
coudnt find business with id: h_k2iPNWpsy2kOt8_jN5QA
coudnt find business with id: h_k2iPNWpsy2kOt8_jN5QA
found business F-lUIN9jFeiX2kXpiA2gTA with index: 39922
found business d0GmiP5cai20VZr0a_bNnQ with index: 53461
found business WPRLdtddBtD-jfVYQFvOhQ with index: 5177
coudnt find business with id: txJkOk-oY900paWJQVGdVw
coudnt find business with id: BBkQ99cFPNJ5op-7IVqWNA
coudnt find business with id: BBkQ99cFPNJ5op-7IVqWNA
coudnt find business with id: FCUVjQf762no86Uzcbv1Tg
coudnt find business with id: mseBBgYcqyv6T1sfrXI-XQ
coudnt find business with id: ZHrdgKBR8v3uUrKNlW30ZA
found business nc5uuDeM3EA9WJycGDeg1w with index: 23374
found business u8ndOBCi3qVK0QISbIzmuw with index: 11891
found business XItYW5ul3OW_AqpT2nDbBQ with index: 4146
found business _R1jBQQieKpNGMBqmrLRyA with index: 22249
coudnt find busine

coudnt find business with id: KPvEo0QeT6b3x_UC6WbsvQ
coudnt find business with id: KPvEo0QeT6b3x_UC6WbsvQ
coudnt find business with id: KPvEo0QeT6b3x_UC6WbsvQ
coudnt find business with id: KPvEo0QeT6b3x_UC6WbsvQ
found business eIedjt0mHVKDFmfhtfKSAQ with index: 32165
found business dwAtKEutQKgCvb7wSSU8Kw with index: 33677
coudnt find business with id: LM3DK2j8W8mmD5_zEEku3Q
coudnt find business with id: Dyn5YDHXOmQSItz4JstHrA
coudnt find business with id: Dyn5YDHXOmQSItz4JstHrA
coudnt find business with id: Dyn5YDHXOmQSItz4JstHrA
coudnt find business with id: Dyn5YDHXOmQSItz4JstHrA
coudnt find business with id: B4lI3DPbEVeTRKVjLLklXQ
coudnt find business with id: WfQojfN1xWzm58gIY_Fa_w
coudnt find business with id: Q8tTVs1pTC27iYmOBkdPXw
found business 8kL_Z36p2HtSe6-lh0naEQ with index: 50845
found business 8kL_Z36p2HtSe6-lh0naEQ with index: 50845
found business PBYfM9Y8lROS4r-V3WjKhw with index: 54613
found business PBYfM9Y8lROS4r-V3WjKhw with index: 54613
coudnt find business with id

found business WXd7r6Yvjxhyo1L6R3lQMA with index: 11427
coudnt find business with id: yPcAdikNrXEsfbHNBQUjkQ
coudnt find business with id: yPcAdikNrXEsfbHNBQUjkQ
coudnt find business with id: og04KLFbOsvu7vTy-uchTA
coudnt find business with id: w-YZ755_a3SswPSeUwCdSg
found business d7mNEMA_o7TFYAehHzueDw with index: 14512
coudnt find business with id: E5feyjrD8L4kmpMQ2SJV8Q
coudnt find business with id: o1ZAzn5IZOLUhpEg2_rK6Q
coudnt find business with id: o1ZAzn5IZOLUhpEg2_rK6Q
coudnt find business with id: o1ZAzn5IZOLUhpEg2_rK6Q
coudnt find business with id: HxCwoHzTLzNWjazHs_9O0w
coudnt find business with id: HxCwoHzTLzNWjazHs_9O0w
coudnt find business with id: BfIcrMq0Me0D7BBSYij7Fg
found business SvCjBtbN1cKElDKPTw9dOA with index: 6834
coudnt find business with id: WKPm5nCLjmF3ph1kyhyBTA
coudnt find business with id: YfKZh6Op-PmuKyzZu_DMGQ
coudnt find business with id: TUbnEXJg1pYtrtvgxymVjw
coudnt find business with id: p_wPhw_MXIJhLEzkxtQt3Q
coudnt find business with id: kT9SuRVT

found business FgNgBLayRFm6H6Qr66ecbQ with index: 35650
found business FgNgBLayRFm6H6Qr66ecbQ with index: 35650
found business FgNgBLayRFm6H6Qr66ecbQ with index: 35650
found business FgNgBLayRFm6H6Qr66ecbQ with index: 35650
found business FgNgBLayRFm6H6Qr66ecbQ with index: 35650
coudnt find business with id: rLZDtOjCPhXZx1ldZKFzJA
coudnt find business with id: BMR_AsSBzTQHqW-SQabI4w
coudnt find business with id: BMR_AsSBzTQHqW-SQabI4w
coudnt find business with id: TvL29lvPQU5GtPEN-J0wIA
coudnt find business with id: 0NoHJtug9xefI2OnsANaMA
found business XItYW5ul3OW_AqpT2nDbBQ with index: 4146
coudnt find business with id: QYIXeru0ptw9v9sShZwaVg
coudnt find business with id: 6pKR-h3KN7AwgGOOYBbE2A
found business l0pf71mxpV1f_po1W5w1Lw with index: 42587
found business l0pf71mxpV1f_po1W5w1Lw with index: 42587
found business l0pf71mxpV1f_po1W5w1Lw with index: 42587
found business l0pf71mxpV1f_po1W5w1Lw with index: 42587
coudnt find business with id: 7ULRdJfVo2-Og5ZoGOECFg
coudnt find busin

# About the data...

In [12]:
df.loc['zcTZk7OG8ovAmh_fenH21g','Event Planning & Services']

1

In [15]:
business_categories.loc[business_categories["null"] == 1]

Unnamed: 0,business_id,Auto Repair,Roofing,Self-defense Classes,Falafel,Motorcycle Dealers,Armenian,Eyebrow Services,Interval Training Gyms,Public Transportation,...,Pool Halls,Olive Oil,Car Buyers,Dry Cleaning & Laundry,Clock Repair,Cambodian,Farmers Market,Pop-up Shops,Electronics,Scavenger Hunts
288,EBzr465prEffkpmE8Mk5AA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
603,CN3BLZwfG4eqZjvKrIZoAg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1091,Y76jGd_f889OT4oz9pLgZA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1179,yo8TQab_kYgAZm_DJkRjfA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1404,s_0zH6HExuWJeJjiZAtckA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1887,v7BcP2B6IGR59ZrY4QUaYA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2683,FXY6bj0GTWZoiA_uRng6xQ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3802,-J5SYRlH2hpv4uDs07lkqA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,nR6wsc5ld4bx6NTa0ogDwQ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4752,IiJP9wxsnJBzUGLJKavHag,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, JE2qFjL4BaUbiI-cT5MSBw to rGuevqz7WCka0f4UfkHurQ
Columns: 2217 entries,  Auto Repair to  Scavenger Hunts
dtypes: int64(2217)
memory usage: 521.1+ KB


## Dropping unwanted columns

Droping columns with all the rows equal to 0

In [54]:
df = df.replace(0,np.nan)

In [55]:
df = df.dropna(how='all', axis=1)

In [56]:
df = df.replace(np.nan,0)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, JE2qFjL4BaUbiI-cT5MSBw to rGuevqz7WCka0f4UfkHurQ
Columns: 228 entries,  Auto Repair to  Farmers Market
dtypes: float64(228)
memory usage: 54.9+ KB


# Save in a CSV

In [58]:
df.to_csv('dataset/df.csv', encoding='utf-8')