In [2]:
import pandas as pd 
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import numpy as np

In [3]:
data = pd.read_csv("GeneratedData/user_restaurant_ratings_corrected.csv")
data

Unnamed: 0,UserId,BusinessId,Rating,Review,CreatedAt
0,3079,831,4,Great food and experience!,4/1/2024
1,73,188,2,Great food and experience!,2/25/2023
2,8583,514,3,"An average experience, nothing special.",12/12/2022
3,6466,676,3,Great food and experience!,10/10/2024
4,7292,603,2,"Not well-seasoned, and the portions were small.",4/26/2023
...,...,...,...,...,...
499995,8387,209,1,Great food and experience!,6/13/2024
499996,723,82,4,"Juicy and well-cooked meat, really good.",2/23/2024
499997,7293,187,1,Great food and experience!,11/2/2024
499998,3041,289,4,Great coffee and a nice atmosphere.,1/15/2023


In [4]:
data = data.drop_duplicates(subset=["UserId", "BusinessId"])
data

Unnamed: 0,UserId,BusinessId,Rating,Review,CreatedAt
0,3079,831,4,Great food and experience!,4/1/2024
1,73,188,2,Great food and experience!,2/25/2023
2,8583,514,3,"An average experience, nothing special.",12/12/2022
3,6466,676,3,Great food and experience!,10/10/2024
4,7292,603,2,"Not well-seasoned, and the portions were small.",4/26/2023
...,...,...,...,...,...
499995,8387,209,1,Great food and experience!,6/13/2024
499996,723,82,4,"Juicy and well-cooked meat, really good.",2/23/2024
499997,7293,187,1,Great food and experience!,11/2/2024
499998,3041,289,4,Great coffee and a nice atmosphere.,1/15/2023


In [5]:
user_item_matrix = data.pivot(index='UserId', columns='BusinessId', values='Rating').fillna(0)

In [6]:
all_users = list(user_item_matrix.index)

train_users, test_users = train_test_split(all_users, test_size=0.2, random_state=42)

print(f"Train users: {len(train_users)}, Test users: {len(test_users)}")

Train users: 8000, Test users: 2000


In [7]:
actual_user_restaurants = {
    user: set(user_item_matrix.loc[user][user_item_matrix.loc[user] > 0].index) 
    for user in test_users if user in user_item_matrix.index 
}
actual_user_restaurants

{6253: {23,
  43,
  89,
  114,
  137,
  157,
  160,
  167,
  178,
  198,
  220,
  223,
  244,
  279,
  288,
  337,
  346,
  432,
  458,
  510,
  511,
  537,
  597,
  608,
  637,
  663,
  668,
  682,
  699,
  716,
  725,
  781,
  835,
  927,
  936,
  949,
  958,
  964,
  985,
  986},
 4685: {22,
  77,
  79,
  98,
  115,
  165,
  185,
  325,
  354,
  365,
  377,
  384,
  410,
  417,
  434,
  454,
  463,
  473,
  485,
  516,
  545,
  550,
  554,
  602,
  606,
  615,
  620,
  636,
  640,
  687,
  697,
  699,
  724,
  746,
  750,
  757,
  758,
  817,
  832,
  892,
  893,
  898,
  911,
  960,
  982,
  998},
 1732: {7,
  39,
  46,
  168,
  170,
  255,
  285,
  288,
  291,
  342,
  358,
  361,
  380,
  411,
  426,
  441,
  448,
  466,
  468,
  469,
  493,
  552,
  555,
  587,
  617,
  627,
  628,
  632,
  653,
  726,
  727,
  734,
  743,
  766,
  787,
  826,
  843,
  874,
  910,
  922,
  990,
  991},
 4743: {13,
  34,
  35,
  55,
  57,
  67,
  142,
  165,
  189,
  224,
  225,
  246,
  271,
  2

In [8]:
knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=5)
knn.fit(user_item_matrix)

In [9]:
def recommendRestaurantsToUser(userId: int, numOfRecommends: int = 6):
    if userId not in user_item_matrix.index:
        return "User Not Found"

    user_idx = user_item_matrix.index.get_loc(userId)
    distances, indecies = knn.kneighbors([user_item_matrix.iloc[user_idx]], n_neighbors=6)

    similar_users = user_item_matrix.iloc[indecies.flatten()[1:]]
    recommend_businesses = (similar_users.mean().sort_values(ascending=False)).index[:numOfRecommends]

    return recommend_businesses.tolist()

In [10]:
def hit_rate_at_k(model, k=6):
    hits = 0
    total = 0

    for user in tqdm(test_users):
        if user not in user_item_matrix.index:
            continue

        actual_items = actual_user_restaurants.get(user, set())
        if not actual_items:
            continue

        recommended = recommendRestaurantsToUser(user, numOfRecommends=k)
        hits += any(item in actual_items for item in recommended)
        total += 1

    return hits / total if total > 0 else 0

In [11]:
def mean_average_precision(recommend_fn, test_users, actual_user_restaurants, numOfRecommends=5):
    average_precisions = []
    for user in test_users:
        if user in actual_user_restaurants: 
            recommended = recommend_fn(user, numOfRecommends)
            relevant = actual_user_restaurants[user]
            score = sum([(i + 1) / (idx + 1) for idx, i in enumerate(recommended) if i in relevant])
            average_precisions.append(score / min(numOfRecommends, len(relevant)))
    return sum(average_precisions) / len(average_precisions) if average_precisions else 0


In [13]:
def apk(actual, predicted, k=5):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual_list, predicted_list, k=5):
    return np.mean([apk(a, p, k) for a, p in zip(actual_list, predicted_list)])


In [14]:
param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'metric': ['cosine', 'euclidean'],
    'algorithm': ['brute']
}

In [15]:
best_score = 0
best_params = None

for params in ParameterGrid(param_grid):
    print(f"Testing params: {params}")
    
    model = NearestNeighbors(**params)
    model.fit(user_item_matrix)

    # Redefine the function inline to use current model
    def recommend(userId, numOfRecommends=6):
        return recommendRestaurantsToUser(userId, model, numOfRecommends)

    # Evaluate performance
    score = hit_rate_at_k(model, k=6)

    print(f"Hit Rate@6: {score:.4f}")
    if score > best_score:
        best_score = score
        best_params = params

print(f"\nBest Params: {best_params}")
print(f"Best Hit Rate@6: {best_score:.4f}")

Testing params: {'algorithm': 'brute', 'metric': 'cosine', 'n_neighbors': 3}


100%|██████████| 2000/2000 [04:48<00:00,  6.93it/s]


Hit Rate@6: 1.0000
Testing params: {'algorithm': 'brute', 'metric': 'cosine', 'n_neighbors': 5}


100%|██████████| 2000/2000 [04:32<00:00,  7.35it/s]


Hit Rate@6: 1.0000
Testing params: {'algorithm': 'brute', 'metric': 'cosine', 'n_neighbors': 7}


100%|██████████| 2000/2000 [04:34<00:00,  7.27it/s]


Hit Rate@6: 1.0000
Testing params: {'algorithm': 'brute', 'metric': 'cosine', 'n_neighbors': 10}


100%|██████████| 2000/2000 [04:38<00:00,  7.18it/s]


Hit Rate@6: 1.0000
Testing params: {'algorithm': 'brute', 'metric': 'euclidean', 'n_neighbors': 3}


100%|██████████| 2000/2000 [05:01<00:00,  6.64it/s]


Hit Rate@6: 1.0000
Testing params: {'algorithm': 'brute', 'metric': 'euclidean', 'n_neighbors': 5}


100%|██████████| 2000/2000 [04:34<00:00,  7.27it/s]


Hit Rate@6: 1.0000
Testing params: {'algorithm': 'brute', 'metric': 'euclidean', 'n_neighbors': 7}


100%|██████████| 2000/2000 [04:43<00:00,  7.05it/s]


Hit Rate@6: 1.0000
Testing params: {'algorithm': 'brute', 'metric': 'euclidean', 'n_neighbors': 10}


100%|██████████| 2000/2000 [05:10<00:00,  6.45it/s]

Hit Rate@6: 1.0000

Best Params: {'algorithm': 'brute', 'metric': 'cosine', 'n_neighbors': 3}
Best Hit Rate@6: 1.0000



