In [1]:
import sqlite3
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

# Load data

In [2]:
# Load only each user's most recent review of each business: the sub-query finds
# the latest date per (user_id, business_id) and the join keeps the matching rows.
conn = sqlite3.connect('yelp_dataset_reviews.db')
try:
    df_reviews = pd.read_sql("""SELECT ubt.*
                        FROM reviews ubt
                        INNER JOIN (
                         SELECT user_id, business_id, MAX(date) as max_date
                        FROM reviews
                        GROUP BY 1,2
) grouped_ubt ON ubt.user_id = grouped_ubt.user_id
              AND ubt.business_id = grouped_ubt.business_id
              AND ubt.date = grouped_ubt.max_date;
""", conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [3]:
# Never truncate long cell contents when a frame is displayed.
pd.options.display.max_colwidth = None

In [4]:
# Read the full business table into memory.
conn = sqlite3.connect('yelp_dataset_business.db')
try:
    df_business = pd.read_sql("""SELECT * FROM business""", conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [5]:
# Cast the numeric flag/count columns to int.
df_business = df_business.astype({'is_open': int, 'review_count': int})

# 'attributes' and 'hours' are evaluated as Python expressions; empty values
# become None.
# NOTE(review): eval() runs arbitrary code from the database contents. If the
# stored strings are plain literals, ast.literal_eval would be the safer
# parser -- confirm against the data before switching.
for literal_col in ('attributes', 'hours'):
    df_business[literal_col] = df_business[literal_col].apply(
        lambda raw: eval(raw) if raw else None
    )

In [6]:
# Only the id -> display-name mapping is needed from the users table.
conn = sqlite3.connect('yelp_dataset_users.db')
try:
    df_users = pd.read_sql("""SELECT user_id, name FROM users""", conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [7]:
# --- Flatten the per-business 'attributes' dicts into boolean columns ---------

# Distinct attribute keys across all businesses (explode yields each dict's keys).
attr = [key for key in df_business.attributes.explode().unique() if key is not None]

lst_of_attr_dict = []
for business_attrs in df_business.attributes:
    if business_attrs:
        # A string-valued BusinessParking is collapsed to a single flag: True
        # when the text contains 'True' anywhere.
        if 'BusinessParking' in business_attrs and type(business_attrs['BusinessParking']) is str:
            business_attrs['BusinessParking'] = 'True' in business_attrs['BusinessParking']
        lst_of_attr_dict.append(business_attrs)
    else:
        # Businesses without attributes contribute an all-missing row.
        lst_of_attr_dict.append({})
attr_df = pd.DataFrame(lst_of_attr_dict, columns=attr)

# Missing -> False, everything else -> its truthiness.
# NOTE(review): astype(bool) turns any non-empty string (including 'False' or
# 'None') into True -- confirm that is the intended encoding.
for attr_name in attr_df.columns:
    attr_df[attr_name] = attr_df[attr_name].fillna(False).astype(bool)

# Re-number the rows so the positional concat lines up with attr_df.
df_business = pd.concat([df_business.reset_index().drop(columns='index'), attr_df], axis=1)
df_business = df_business.drop(columns=['attributes'])

# --- Flatten the per-business 'hours' dicts into one column per key -----------
lst_of_time = [opening_hours if opening_hours else {} for opening_hours in df_business.hours]
time_df = pd.DataFrame(lst_of_time)
df_business = pd.concat([df_business, time_df], axis=1).drop(columns='hours')

In [8]:
# Parse the review date column into datetimes so it can be compared and ranked.
df_reviews = df_reviews.assign(date=pd.to_datetime(df_reviews['date']))

# Feature Engineering

In [9]:
# Frequency of each distinct Monday opening-hours string.
df_business.loc[:, 'Monday'].value_counts()

0:0-0:0        31362
8:0-17:0        4202
9:0-17:0        3910
11:0-22:0       3241
11:0-21:0       2938
               ...  
19:30-22:0         1
16:45-20:15        1
22:0-18:0          1
21:30-0:0          1
20:0-4:0           1
Name: Monday, Length: 1315, dtype: int64

In [10]:
# Encode boolean flags numerically: False -> 0 first, then True -> 1.
df_business = df_business.replace(False, 0).replace(True, 1)

In [11]:
# The per-day opening-hours strings are not used as features; remove them.
weekday_cols = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
df_business = df_business.drop(columns=weekday_cols)

# User-based Collaborative Filtering


### Select review table and reduce to manageable size

In [12]:
# Keep only the columns needed for collaborative filtering, and only reviews
# rated above 4 stars.
selected = ['review_id', 'user_id', 'business_id', 'stars', 'date']
df_reviews = df_reviews.loc[df_reviews['stars'] > 4, selected]

In [13]:
# Row label of the latest-dated review for every (user, business) pair.
idx = df_reviews.groupby(['user_id', 'business_id'])['date'].idxmax()
# Keep those rows, project to the rating triple, and cap at the first 30000 rows.
dis_rate = df_reviews.loc[idx, ['user_id', 'business_id', 'stars']].iloc[:30000]

### get user_business pivot table

In [79]:
# Users as rows, businesses as columns, star rating as the cell value;
# unrated pairs are filled with 0.
piv_rate = pd.pivot_table(
    dis_rate,
    index='user_id',
    columns='business_id',
    values='stars',
).fillna(0)
piv_rate

business_id,--ZVrH2X2QXBFdCilbirsw,--epgcb7xHGuJ-4PUeSLAw,-06ngMH_Ejkm_6HQBYxB7g,-0FX23yAacC4bbLaGPvyxw,-0TffRSXXIlBYVbb5AwfTg,-0Ym1Wg3bXd_TDz8JtvOQg,-0jzoPt3UeXn6FUXVQvyPg,-16EH6b1ho0xQqP0Bzm9Mg,-1B9pP_CrRBJYPICE5WbRA,-1MhPXk1FglglUAmuPLIGg,...,zxj_evoLcLDS29uktC8sFw,zxuVnNVAUm16Sro0dS_lmg,zyFn7oqokhaUaPPqd2FbbA,zyHP-oXgDkANEyQbJVKf8g,zyghhZzPgb1bRAIYB-oi1w,zymvTQ12tRy3bzmP4tPgOg,zyrhpLocbo60EbS57jBTLw,zzHtFjfM7NvuVM1HTsCLGA,zzO2zgfqP9ANmEWt-EZFWg,zzXDi0Pdv0s84M-oQaIa_g
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
---2PmXbF47D870stH1jqA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
---UgP94gokyCDuB5zUssA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
---r61b7EpVPkb4UVme5tA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
---zemaUC8WeJeWKqS6p9Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
-cWLKRScFMQGkq9MOzI_yw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-cWYVhfgW2UwjcFesbZbZA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-cX1cJ8fayYZi9XakiR_dg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-cX6CW0hntz6LUF-nJsOUQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
bus_id = piv_rate.columns

# Translate every business_id column into its business name with one indexed
# lookup instead of filtering df_business once per column. As before, the first
# matching row wins when a business_id occurs more than once.
name_by_id = (
    df_business[['business_id', 'name']]
    .drop_duplicates('business_id')
    .set_index('business_id')['name']
)
Names = name_by_id.loc[bus_id].tolist()
# NOTE(review): business names are not guaranteed to be unique, so different
# businesses that share a name become duplicate column labels here.
piv_rate.columns = Names

# Number of non-zero cells (ratings present) in each business column.
counts = piv_rate.apply(np.count_nonzero, axis=0)
number_ratings_df = pd.DataFrame(counts).reset_index()
number_ratings_df.columns = ['bus_name', 'numberRatings']

# Names of the 600 businesses with the most ratings.
top = number_ratings_df.sort_values('numberRatings', ascending=False).head(600).bus_name

In [81]:
# Restrict the user x business matrix to the most-rated businesses.
ratings = piv_rate.loc[:, list(top)]

## SVD

In [62]:
# Rank-50 truncated SVD of the user x business rating matrix.
svd_model = TruncatedSVD(n_components=50, random_state=671)
users_df_svd = svd_model.fit_transform(ratings)

sigma = svd_model.singular_values_
V = svd_model.components_
# Scale each projected component by its singular value to obtain U.
U = users_df_svd / svd_model.singular_values_

# checks
for label, value in (
    ("Shape of U:", U.shape),
    ("Length of Sigma:", len(sigma)),
    ("Shape of V:", V.shape),
):
    print(label, value)

Shape of U: (12575, 50)
Length of Sigma: 50
Shape of V: (50, 773)


In [63]:
# Group users by their latent-factor vectors (k=10 is an arbitrary choice).
clusters = KMeans(n_clusters=10, random_state=6).fit_predict(U)
# One row per user: the latent factors plus the assigned cluster label.
clustered_users_df = pd.DataFrame(U, index=piv_rate.index).assign(cluster=clusters)

## User exists in database

In [54]:
def knn_rec(user, number):
    """Recommend businesses rated highly by the users in ``user``'s cluster.

    Reads the module-level ``clustered_users_df`` (cluster label per user),
    ``df_users`` (user_id -> name) and ``ratings`` (user x business matrix
    where 0 means "not rated").

    Parameters
    ----------
    user : str
        User id; must be a label of ``clustered_users_df.index``.
    number : int
        How many recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``number`` rows with the highest ``avgClusterRating``, with columns
        ``movieName`` (the business label taken from ``ratings.columns``) and
        ``avgClusterRating``.

    Raises
    ------
    KeyError
        If ``user`` is not present in ``clustered_users_df``.
    """
    if user not in clustered_users_df.index:
        raise KeyError("user {!r} is not present in clustered_users_df".format(user))

    cluster_no = int(clustered_users_df.loc[user, 'cluster'])

    # Look up the name of the requested user (previously a fixed id was used).
    matching_names = df_users.loc[df_users.user_id == user, 'name']
    name = matching_names.iloc[0] if not matching_names.empty else 'unknown'
    print("User ID:{} \n User name:{} \n belongs to cluster {}".format(user, name, cluster_no))

    similar_users = list(clustered_users_df[clustered_users_df.cluster == cluster_no].index)

    # Treat 0 as "no rating" so it does not drag the averages down.
    cluster_df = ratings.loc[similar_users].replace(0, np.nan)

    avg_ratings_df = pd.DataFrame(cluster_df.mean()).reset_index()
    # 'movieName' is kept for backward compatibility; the values are business labels.
    avg_ratings_df.columns = ['movieName', 'avgClusterRating']
    return avg_ratings_df.sort_values('avgClusterRating', ascending=False).head(number)

## Similarity of business

In [66]:
# One latent-factor vector per business: rows of V transposed, labelled with
# the business column labels of `ratings`.
bus_vec = pd.DataFrame(V.T, index=ratings.columns)

In [85]:
def euclidean(arr1, arr2):
    """Return the 2-norm of the difference ``arr1 - arr2``."""
    difference = arr1 - arr2
    return np.linalg.norm(difference, ord=2)

In [86]:
def cosine(arr1, arr2):
    """Return the dot product of the inputs divided by the product of their norms."""
    numerator = np.dot(arr1, arr2)
    denominator = np.linalg.norm(arr1) * np.linalg.norm(arr2)
    return numerator / denominator

In [105]:
def calc_sim(df, method):
    """Compute the pairwise row-to-row matrix of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One numeric vector per row.
    method : str
        ``"Euc"`` for Euclidean distance (smaller means closer) or ``"cos"``
        for cosine similarity (larger means closer).

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are both ``df.index``; cell
        ``(i, j)`` holds the measure between row ``i`` and row ``j``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"Euc"`` nor ``"cos"``.
    """
    if method not in ("Euc", "cos"):
        # Fail up front instead of building empty rows that only break later.
        raise ValueError("method must be 'Euc' or 'cos', got {!r}".format(method))

    vectors = df.to_numpy(dtype=float)

    if method == "Euc":
        n_rows = vectors.shape[0]
        res = np.empty((n_rows, n_rows), dtype=float)
        for i in range(n_rows):
            # Distances from row i to every row at once.
            res[i] = np.linalg.norm(vectors - vectors[i], axis=1)
    else:
        norms = np.linalg.norm(vectors, axis=1)
        # dot(x_i, x_j) / (|x_i| * |x_j|); a zero-norm row yields NaN.
        res = (vectors @ vectors.T) / np.outer(norms, norms)

    return pd.DataFrame(res, columns=df.index, index=df.index)

### Euclidean

In [106]:
# Pairwise Euclidean distances between business latent vectors.
sim_euc = calc_sim(bus_vec, method="Euc")

In [None]:
# Display the Euclidean result computed by calc_sim(bus_vec, "Euc").
sim_euc

### Cosine

In [108]:
# Pairwise cosine similarities between business latent vectors.
sim_cos = calc_sim(bus_vec, method="cos")

In [None]:
# Display the cosine result computed by calc_sim(bus_vec, "cos").
sim_cos

### Test
Test case: "Hattie B’s Hot Chicken - Nashville"

In [120]:
business_name = input("Please enter the name of business:")
# Row(s) of the distance matrix labelled with the requested name.
vec_euc = sim_euc.loc[sim_euc.index == business_name]
# Smallest distances first; position 0 is skipped and the next ten are shown.
# NOTE(review): skipping position 0 presumably drops the business itself
# (distance 0) -- confirm for names that occur more than once.
nearest_first = vec_euc.iloc[0].sort_values(ascending=True)
nearest_first.iloc[1:11]

Gaylord Opryland Resort & Convention Center    0.911544
Five Daughters Bakery 12south                  0.918576
The Pharmacy                                   0.919489
Milk and Honey Nashville                       0.920842
Puckett's Grocery & Restaurant                 0.940808
Jeni's Splendid Ice Creams                     0.941769
Jeni's Splendid Ice Creams                     0.941769
Frothy Monkey                                  0.941940
Hattie B's Hot Chicken - Melrose               0.942893
Amerigo Italian Restaurant                     0.943560
Name: Hattie B’s Hot Chicken - Nashville, dtype: float64

In [119]:
# Row(s) of the cosine matrix labelled with the name entered in the Euclidean test cell.
vec_cos = sim_cos.loc[sim_cos.index == business_name]
# Largest similarities first; position 0 is skipped and the next ten are shown.
most_similar_first = vec_cos.iloc[0].sort_values(ascending=False)
most_similar_first.iloc[1:11]

Mitchell Delicatessen                          0.960361
Frothy Monkey                                  0.782917
Gaylord Opryland Resort & Convention Center    0.751826
Jeni's Splendid Ice Creams                     0.443022
Jeni's Splendid Ice Creams                     0.443022
Puckett's Grocery & Restaurant                 0.409007
Five Daughters Bakery 12south                  0.367252
Hattie B's Hot Chicken - Melrose               0.350186
K-POT Korean BBQ & Hot Pot                     0.335735
The Pharmacy                                   0.324259
Name: Hattie B’s Hot Chicken - Nashville, dtype: float64

## Test 
---2PmXbF47D870stH1jqA

In [None]:
user = input("Enter a user id: ")
# Keep the result and end the cell with it: a call nested inside `if` is not
# auto-displayed by the notebook, so the recommendations were never shown.
if user in clustered_users_df.index:
    recommendations = knn_rec(user, 10)
else:
    recommendations = None
    print("User ID {} was not found among the clustered users.".format(user))
recommendations
