In [15]:
import pandas as pd
import numpy as np


from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


In [74]:
# import the data (chunksize returns jsonReader for iteration)
businesses = pd.read_json("Data/yelp_academic_dataset_business.json", lines=True, orient='columns', chunksize=10000)
reviews = pd.read_json("Data/yelp_academic_dataset_review.json", lines=True, orient='columns', chunksize=10000)

In [75]:
# read the data
for business in businesses:
    business_chunk = business
    break

for review in reviews:
    review_chunk = review
    break

In [76]:
business_subset = business_chunk[['business_id','name','address', 'categories', 'attributes','stars']]
business_subset = business_subset[business_subset['categories'].str.contains('Restaurant.*')==True].reset_index()

In [77]:
df_review = review_chunk[['user_id','business_id','stars', 'date']]

In [78]:
df_restaurant = business_subset[['business_id', 'name', 'address']]
df_restaurant.head()

Unnamed: 0,business_id,name,address
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,


In [79]:
all_combined = pd.merge(df_review, df_restaurant, on='business_id')
all_combined.head()

Unnamed: 0,user_id,business_id,stars,date,name,address
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,2018-07-07 22:09:11,Turning Point of North Wales,1460 Bethlehem Pike
1,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2,2017-05-13 17:06:55,Turning Point of North Wales,1460 Bethlehem Pike
2,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4,2017-08-08 00:58:18,Turning Point of North Wales,1460 Bethlehem Pike
3,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3,2017-11-19 02:20:23,Turning Point of North Wales,1460 Bethlehem Pike
4,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,2015-01-04 00:01:03,Zaika,2481 Grant Ave


In [80]:
all_combined.shape

(4849, 6)

In [85]:
# create a user-item matrix
rating_crosstab = all_combined.pivot_table(values='stars', index='user_id', columns='name', fill_value=0)
rating_crosstab.head()

name,24,3 Sisters Café,30 Main,312 Pizza Company,365 Caffe Italiano,4 Rivers Smokehouse,5 Star Burgers,500 Degrees,51st Deli,5th St. Bakehouse,...,Zio's Italian Market,Zocalo,Zoes Kitchen,Zorba's Taverna,Zydeco's,fat Rooster diner,iCafe,iPho Vietnamese Restaurant,la Madeleine,sweetgreen
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--pvE2eu3WWwikKs1E2QDw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--vCeHrklS1DIep0QhorrA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0KrCHEsOcjJ6N4k_k1A9A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1WbN1Qd-opw8u3uEqs2Kg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2MXx9Fk3IiCg2y559iI8Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# Transpose the Utility matrix
X = rating_crosstab.values.T

In [101]:
def cosine_similarity(vec_a, vec_b):
    """Compute the cosine similarity between two vectors."""
    dot_product = np.dot(vec_a, vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    return dot_product / (norm_a * norm_b)

def compute_cosine_similarity_matrix(matrix):
    """Compute the cosine similarity matrix from a user-item matrix."""
    n_users = matrix.shape[0]
    similarity_matrix = np.zeros((n_users, n_users))

    for i in range(n_users):
        for j in range(i, n_users):
            similarity = cosine_similarity(matrix[i], matrix[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity  # The matrix is symmetric

    return similarity_matrix

In [107]:
def calculate_similarity(user_item_matrix):
    """
    Calculate the cosine similarity matrix from the user-item matrix
    """
    similarity = cosine_similarity(user_item_matrix)
    np.fill_diagonal(similarity, 0)
    return pd.DataFrame(similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Prediction of Ratings 

def predict_ratings(similarity, user_item_matrix, user_id):
    """
    Predict ratings for all items for a given user
    """
    total_similarity = similarity[user_id,:].sum()
    weighted_sum = np.dot(similarity[user_id,:], user_item_matrix.fillna(0))

    # Avoid division by zero
    if total_similarity == 0:
        total_similarity = 1

    predictions = weighted_sum / total_similarity
    predictions = pd.Series(predictions, index=user_item_matrix.columns)
    return predictions

def train_test_split_and_predict(data):
    """
    Split the data into train and test sets, predict ratings, and return the true and predicted ratings
    """
    train_user_item_matrix, test_user_item_matrix = train_test_split(data, test_size=0.02)

    similarity = compute_cosine_similarity_matrix(data.values.T)
    true_ratings = []
    pred_ratings = []

    for user_id in test_user_item_matrix.index:
        if user_id in test_user_item_matrix.index:
            true_rating = test_user_item_matrix.loc[user_id]
            pred_rating = predict_ratings(similarity, data, user_id)
            true_ratings.extend(true_rating[true_rating.notnull()])
            pred_ratings.extend(pred_rating[true_rating.notnull()])

    return true_ratings, pred_ratings

def evaluate_performance(data):
    """
    Evaluate the performance of the collaborative filtering algorithm
    """
    true_ratings, pred_ratings = train_test_split_and_predict(data)
    rmse = np.sqrt(mean_squared_error(true_ratings, pred_ratings))
    mae = mean_absolute_error(true_ratings, pred_ratings)
    return rmse, mae

In [108]:
compute_cosine_similarity_matrix(X)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [109]:
rmse, mae = evaluate_performance(rating_crosstab)
print(f'RMSE: {rmse}, MAE: {mae}')

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
def temp(dataset):
    unique_users = dataset.index.unique()
    numUsers = len(unique_users)
    testSet = 5
    random_user = np.random.choice(numUsers, testSet, replace=False)

    print(random_user)
    
    randomUsers = unique_users[random_user]

    # List of restaurants that user has not rated
    not_rated_restaurants = []
    for user in randomUsers:
        not_rated_restaurants.append(dataset.loc[user][dataset.loc[user] == 0].index)
        print(not_rated_restaurants)
temp(rating_crosstab)

In [97]:
def train_test_split_and_predict(data):
    """
    Split the data into train and test sets, predict ratings, and return the true and predicted ratings
    """
    train_user_item_matrix, test_user_item_matrix = train_test_split(data, test_size=0.02)

    similarity = calculate_similarity(data)
    true_ratings = []
    pred_ratings = []

    for user_id in test_user_item_matrix.index:
        if user_id in test_user_item_matrix.index:
            true_rating = test_user_item_matrix.loc[user_id]
            pred_rating = predict_ratings(similarity, data, user_id)
            true_ratings.extend(true_rating[true_rating.notnull()])
            pred_ratings.extend(pred_rating[true_rating.notnull()])

    return true_ratings, pred_ratings
tr, pr = train_test_split_and_predict(rating_crosstab)
print(tr)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [100]:
print([x for x in tr if x != 0])
print([x for x in pr if x != 0])

[5.0, 5.0, 5.0, 2.0, 2.0, 4.0, 5.0, 4.0, 5.0, 1.0, 5.0, 5.0, 4.0, 4.0, 4.0, 2.0, 5.0, 3.0, 2.0, 3.0, 3.0, 4.0, 3.0, 4.0, 4.0, 4.0, 2.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 1.0, 2.0, 4.0, 4.0, 5.0, 4.0, 2.0, 4.0, 4.0, 5.0, 4.0, 5.0, 4.0, 2.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, 5.0, 1.0, 3.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, 4.0, 4.0, 1.0, 4.0, 5.0, 2.0, 5.0, 1.0, 5.0, 1.0, 4.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 3.0, 5.0, 5.0, 5.0, 2.0, 4.0, 1.0, 5.0, 1.0, 4.0, 3.0, 4.0, 3.0, 4.0, 2.0]
[3.0, 3.2, 3.7857142857142856, 0.18258587692278186, 0.18258587692278186, 0.15037682633397198, 4.1670376141717105, 2.0, 4.809256430169454, 0.7629742793221846, 0.4955973723971816, 4.700880525520564, 0.14035087719298245, 4.473684210526316, 5.0, 0.3948853255440963, 4.535038224818634, 4.0, 3.730769230769231, 4.571428571428571, 4.0, 5.0, 0.058634115596513964, 0.06910096791195156, 4.571682627066012, 0.07630925339765814, 0.10091564835691057, 0.10091564835691057, 0.10402146936330792, 2.9846023585351142,