In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

data = Path("/content/drive/MyDrive/merged_data.csv")

In [2]:
main_df = pd.read_csv(data)
main_df = main_df[main_df.state.isin(['PA', 'FL', 'LA'])] # Top three states: PA, FL, LA

In [3]:
# create a label encoder for user_id and business_id
(user_encoder, business_encoder) = (LabelEncoder(), LabelEncoder())

# fit the encoders to the data
user_encoder.fit(main_df['user_id'])
business_encoder.fit(main_df['business_id'])

# encode the user_id and business_id columns
main_df['user_id_encoded'] = user_encoder.transform(main_df['user_id'])
main_df['business_id_encoded'] = business_encoder.transform(main_df['business_id'])

In [4]:
main_df.head()

Unnamed: 0,user_id,business_id,name,state,stars,text,user_id_encoded,business_id_encoded
0,b-yqVomA1iK2mKQFnTuTbQ,uYMVRPYEqpmVXWN2xOs5KQ,Cooper's Hawk Winery & Restaurants - Tampa,FL,3,The wife and I went to Cooper's Hawk for the f...,45741,43151
1,utx0JQVAwsXaedVV-xPZjw,uYMVRPYEqpmVXWN2xOs5KQ,Cooper's Hawk Winery & Restaurants - Tampa,FL,4,Pros: the food is fantastic. The burnt broccol...,69065,43151
4,s_9uD6zqVU-9cnKO9pDKtg,jMi5SL9vb6nLJGRjw0HK3Q,Civera's Deli,PA,5,Civera's Deli is the cleanest deli in Delco. E...,66237,34858
5,p6qI5SY2ybs003-whBAnpA,2N97axWHko3rzyzOGCxxBA,URBAN Brew and BBQ,FL,1,Disappointing 2nd time experience. Waited 15-2...,62254,2551
6,JOrDiXIgpb0sjtd7Cr3CdA,RUfWgnSSQKjRNBpK0wSxfg,Isabella Pizza,PA,1,Yuck. I ordered here this morning for breakfas...,24022,20979


In [5]:
main_df.dtypes

user_id                object
business_id            object
name                   object
state                  object
stars                   int64
text                   object
user_id_encoded         int64
business_id_encoded     int64
dtype: object

In [6]:
# Number of unique users
{
    "n_users": len(main_df["user_id"].unique()),
    "n_businesses": len(main_df["business_id"].unique())
}

{'n_users': 75036, 'n_businesses': 47261}

In [7]:
# Extracting business
businesses_df = main_df[['business_id', 'business_id_encoded']].drop_duplicates()
businesses_df = pd.merge(businesses_df, main_df[['business_id', 'name']], on='business_id', how='left')
businesses_df = businesses_df.drop_duplicates()
businesses_df = businesses_df.set_index(['business_id_encoded', 'business_id'])
businesses_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name
business_id_encoded,business_id,Unnamed: 2_level_1
43151,uYMVRPYEqpmVXWN2xOs5KQ,Cooper's Hawk Winery & Restaurants - Tampa
34858,jMi5SL9vb6nLJGRjw0HK3Q,Civera's Deli
2551,2N97axWHko3rzyzOGCxxBA,URBAN Brew and BBQ
20979,RUfWgnSSQKjRNBpK0wSxfg,Isabella Pizza
6377,7WWLPXpOjrh_1EvjFuw3hQ,The Drake Tavern


## Raw Correlation (Cosine Similarity) - for baseline model

In [8]:
# Generating the user-item rating matrix - using raw user_id and raw business_id
rating_matrix = csr_matrix((main_df['stars'], (main_df['user_id_encoded'], main_df['business_id_encoded'])))
rating_matrix # n_users x n_businesses

<75036x47261 sparse matrix of type '<class 'numpy.int64'>'
	with 1559381 stored elements in Compressed Sparse Row format>

In [9]:
# Calculate the pairwise similarities between items using cosine similarity
# Item-Item similarity (n_items x n_items)
item_similarity = cosine_similarity(rating_matrix.T)
item_similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [10]:
def get_item_item_recommendations(user_id, rating_matrix, item_similarity, businesses_df, n=5):
    # get the target user's ratings
    target_user_ratings = rating_matrix[user_id, :]
    
    # convert the target user's ratings to a sparse matrix format
    target_user_rating_matrix = csr_matrix(target_user_ratings)
    
    # compute the dot product of the target user's ratings and the item similarity matrix
    weighted_sum = target_user_rating_matrix.dot(item_similarity)
    
    # convert the result to a dense numpy array
    predicted_ratings = weighted_sum.flatten()
    
    # get the indices of the top n recommendations
    top_item_indices = np.argsort(predicted_ratings)[::-1][:n]
    
    # get the corresponding business IDs and names
    return businesses_df.loc[top_item_indices, :]


In [11]:
main_df["user_id_encoded"].describe()

count    1.643235e+06
mean     3.744425e+04
std      2.170075e+04
min      0.000000e+00
25%      1.857400e+04
50%      3.741700e+04
75%      5.633600e+04
max      7.503500e+04
Name: user_id_encoded, dtype: float64

In [12]:
[i[0] for i in get_item_item_recommendations(0, rating_matrix, item_similarity, businesses_df, n=6).index]

[1654, 33308, 21295, 20128, 19346, 16308]

## Evaluation

### Coverage

Coverage can be calculate by determining the percentage of unique items that are recommended to at least one user. Higher coverage indicates a larger fraction of items that are being recommended, suggesting a better diversity in the recommendations

In [13]:
def coverage_score(rating_matrix, item_similarity, businesses_df, n_users, n_items, top_n):
    """
    Calculate the coverage score for the recommendation system

    :param rating_matrix: the user-business rating sparse matrix
    :param item_similarity: Item-Item similarity matrix
    :param businesses_df: Pandas Dataframe with business_id, business_id_encoded and name
    :param n_users: total number of users
    :param n_items: total number of businesses
    :param top_n: number of recomendations
    """

    recommended_items = set()
    for user_id in range(n_users):
      top_items = get_item_item_recommendations(
          user_id, rating_matrix, item_similarity, businesses_df, n=top_n)
      recommended_items.update([i[0] for i in top_items.index])

    print(f"Total items got {len(recommended_items)}")
    return len(recommended_items) / float(n_items)

In [16]:
baseline_cov = coverage_score(
    rating_matrix,
    item_similarity,
    businesses_df,
    main_df['user_id_encoded'].max(), # getting the total number of users
    main_df['business_id_encoded'].max(), # getting the total number of businesses
    3
)
print(f"Baseline Coverage Score: {baseline_cov}")

Total items got 44829
Baseline Coverage Score: 0.9485611510791367


### Mean Average Precision at k (MAP@k)

MAP@k measures how well the system ranks the recommended items for a user based on their true preferences. Higher MAP@k indicates a better ranking accuracy of the recommended items.

In [19]:
def apk(actual, predicted, k):
    """
    Calculates the average precision at k.
    """
    if k == 0:
        return 0.0
    
    predicted = predicted[:k]
    
    score = 0.0
    num_hits = 0.0
    
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    
    return score / min(len(actual), k)

In [21]:
top_n = 3
user_item_preferences = main_df.groupby('user_id_encoded')['business_id_encoded'].apply(list).to_dict()

mean_apk = 0.0
for user_id, true_items in user_item_preferences.items():
    top_items = get_item_item_recommendations(
      user_id, rating_matrix, item_similarity, businesses_df, n=top_n)
    predicted_items = [i[0] for i in top_items.index if i[0] in true_items]
    apk_score = apk(true_items, predicted_items, top_n)
    mean_apk += apk_score

mean_apk /= float(len(user_item_preferences))

mean_apk

0.9966949197718414

### Mean Average Recall at k (MAR@k)

MAR@k measures how well the system recommends the relevant items to a user. Higher MAR@k indicates a better ability of the system to recommend items that the user would prefer.

In [25]:
def recall(actual, predicted, k):
    """
    Calculates the recall.
    """
    if len(actual) == 0:
        return 0.0
    
    predicted = predicted[:k]
    
    score = 0.0
    
    for p in predicted:
        if p in actual:
            score += 1.0
    
    return score / float(len(actual))

In [26]:
top_n = 3
user_item_preferences = main_df.groupby('user_id_encoded')['business_id_encoded'].apply(list).to_dict()

mean_ar = 0.0
for user_id, true_items in user_item_preferences.items():
    top_items = get_item_item_recommendations(
      user_id, rating_matrix, item_similarity, businesses_df, n=top_n)
    top_items = [i[0] for i in top_items.index]
    ar_score = recall(true_items, top_items, top_n)
    mean_ar += ar_score

mean_ar /= len(user_item_preferences)

mean_ar

0.30535801222129094