In [1]:
# importing libraries

import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
style.use('ggplot')

# machine learning
from sklearn.cluster import KMeans

In [2]:
# load data
df = pd.read_csv("./yelp_data/yelp_review.csv", nrows=100)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [3]:
users = df['user_id'].count()
restaurants = df['business_id'].count()
stars = df['stars'].count()

print('User count:', users)
print('restaurants count:', restaurants)
print('stars count:', stars)

User count: 100
restaurants count: 100
stars count: 100


In [4]:
from scipy.sparse import csr_matrix

user_u = list(sorted(df.user_id.unique()))
item_u = list(sorted(df.business_id.unique()))

row = df.user_id.astype('category', categories=user_u).cat.codes
col = df.business_id.astype('category', categories=item_u).cat.codes

data = df['stars'].tolist()

sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(item_u)))

df = pd.SparseDataFrame([ pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=0) 
                              for i in np.arange(sparse_matrix.shape[0]) ], 
                        index=user_u, columns=item_u, default_fill_value=0)
df.head()

  
  import sys


Unnamed: 0,0-yj2jtzLUHG2b7PpEHyog,0H8PL4trSvZFYgPpvSOCjQ,0Rni7ocMC_Lg2UH0lDeKMQ,0W4lkclzZThpx3V65bVgig,0g7Pr8OWl_t_7DUeYJrrGw,13nKUHH-uEUXVZylgxchPA,1jNteKQ2JuF6Sk1SI9X23Q,28adZ4lsuUeVB2aWzohK9g,2QCZyAOB6xXKCdyFPvXXmw,3E5umUqaU5OZAV3jNLW3kQ,...,vyeQzjZFx6KoL2pJBQ2QFA,xNNAfZJkLZlAeS-I7-QwgA,xpJEBXTCQh5Ib8BJrjt9Bg,yXvyFDIdJTM0kLGSe7VYAw,yaViddk9vxi-7p8DnjoClw,yuFdJdrnfMp3cfXVwTXjjA,z8oIoCT1cXz7gZP5GeU5OA,zgQHtqX0gqMw1nlBZl2VnQ,zkU-WMio8g6dpRJ2Y2xqvQ,zxJlg4XCHNoFy78WZPv89w
_4iMDXbXZ1p1ONG297YEAQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
_L2SZSwf7A6YSrIHy_q4cw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,0
bv2nCi5Qv5vroFiqKGopiw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
nOTl4aPC4tKHK35T3bNauQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
nsOf58RZjMTn8V94EQYJog,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
