In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse

import pickle

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
train_df = pd.read_csv("train_df.csv")

### Popularity Based Model

In [4]:
def get_popularity_based_recommendation(user_id, k):
    return pd.DataFrame(columns=['business_id'])

### Content Based Model

In [6]:
cb_matrix = sparse.load_npz("cb_matrix.npz")
item_df = pd.read_csv("item_df.csv")

text_features = ['categories', 'editorial_summary',
                 'RestaurantsAttire', 'Ambience',
                 'NoiseLevel', 'Music', 'city']

list_features = ['GoodForMeal', 'BusinessParking']

bool_features = [
    'RestaurantsTakeOut', 'RestaurantsDelivery', 'RestaurantsReservations', 'OutdoorSeating',
    'RestaurantsGoodForGroups', 'GoodForKids', 'DogsAllowed', 'dine_in', 'serves_beer', 'serves_wine',
    'serves_cocktails', 'good_for_watching_sports', 'serves_coffee', 'menu_for_children',
    'BusinessAcceptsCreditCards', 'HasTV', 'BikeParking', 'Caters', 'RestaurantsTableService',
    'WheelchairAccessible', 'HappyHour', 'BusinessAcceptsBitcoin'
]

num_features = [
    'yelp_rating', 'number_of_photos', 'dist_highway',
    'rural_urban_continuum_code_2023', 'adjusted_gross_income',
    'unemployment_rate_2023', 'user_rating_count',
    'google_rating', 'price_level'
]

features = ['business_id', 'name', 'description'] + \
    text_features + list_features + bool_features + num_features

def get_content_based_recommendation(user_id, k):
    past_restaurants = train_df.loc[(train_df['user_id'] == user_id)]
    restaurant_indices = []
    aligned_ratings = []

    for _, row in past_restaurants.iterrows():
        business_id = row['business_id']
        rating = row['stars']
        idx = item_df[item_df['business_id'] == business_id].index
        if len(idx) > 0:
            restaurant_indices.append(idx[0])
            aligned_ratings.append(rating)

    vectors = cb_matrix[restaurant_indices].toarray()
    weights = np.array(aligned_ratings) - 3  # center ratings at neutral (3)
    weights = weights.reshape(-1, 1)

    weighted_embedding = (vectors * weights).sum(axis=0)
    query_embedding = weighted_embedding.reshape(1, -1)

    similarity_scores = cosine_similarity(
        query_embedding, cb_matrix).flatten()

    similarity_scores[similarity_scores >= 0.99] = 0

    top_k_indices = np.argsort(similarity_scores.flatten())[::-1][:k]
    top_k_scores = similarity_scores.flatten()[top_k_indices]
    recommended_ids = item_df.iloc[top_k_indices]['business_id'].values

    recommended_df = pd.DataFrame({
        'business_id': recommended_ids,
        'similarity_score': top_k_scores
    })

    recommended_df['similarity_score'] = recommended_df['similarity_score'].round(
        3)

    return recommended_df.reset_index(drop=True)

### Collaborative Filtering Model

In [8]:
cf_matrix = sparse.load_npz("cf_matrix.npz")

with open('user_encoder.pkl', 'rb') as f:
    user_encoder = pickle.load(f)

with open('item_encoder.pkl', 'rb') as f:
    item_encoder = pickle.load(f)

with open('user_decoder.pkl', 'rb') as f:
    user_decoder = pickle.load(f)

users_df = pd.read_csv("users_df.csv")


def get_collaborative_filtering_recommendation(user_id, k, top_k_indices, similarity_scores):
    # user_idx = user_encoder[user_id]
    # query_vector = cf_matrix[user_idx].reshape(1, -1)

    # similarity_scores = cosine_similarity(
    #     query_vector, cf_matrix).flatten()
    # similarity_scores[user_idx] = 0  # remove self-similarity

    # # Get top-k similar users
    # top_k_indices = np.argsort(similarity_scores)[::-1][:k]
    top_k_user_ids = [users_df["user_id"][i] for i in top_k_indices]
    top_k_similarities = similarity_scores[top_k_indices]

    # Get ratings from these similar users
    sim_user_ratings = train_df[train_df['user_id'].isin(
        top_k_user_ids)].copy()
    sim_score_dict = dict(zip(top_k_user_ids, top_k_similarities))
    sim_user_ratings['weight'] = sim_user_ratings['user_id'].map(
        sim_score_dict)
    sim_user_ratings['weighted_ratings'] = sim_user_ratings['stars'] * \
        sim_user_ratings['weight']

    # Aggregate predictions
    agg_df = sim_user_ratings.groupby('business_id').agg(
        predicted_score=('weighted_ratings', 'sum'),
        total_weight=('weight', 'sum')
    )
    agg_df['predicted_score'] = agg_df['predicted_score'] / \
        agg_df['total_weight']

    # Select top-k
    top_recs = agg_df.sort_values(
        by='predicted_score', ascending=False).head(k).reset_index()

    return top_recs[['business_id', 'predicted_score']]

### Matrix Factorization Model

In [10]:
class MF(nn.Module):
    def __init__(self, n_users, n_items, user_feat_dim, item_feat_dim, embedding_dim=32):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, embedding_dim)
        self.item_emb = nn.Embedding(n_items, embedding_dim)

        # Side feature projection layers
        self.user_feat_proj = nn.Linear(user_feat_dim, embedding_dim)
        self.item_feat_proj = nn.Linear(item_feat_dim, embedding_dim)

        # Bias terms
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_bias = nn.Parameter(torch.tensor([0.0]))

        self.dropout = nn.Dropout(0.3)

        # Initialization
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        nn.init.xavier_uniform_(self.user_feat_proj.weight)
        nn.init.xavier_uniform_(self.item_feat_proj.weight)

    def forward(self, users, items, user_features, item_features):
        # User latent + side
        u_id = self.dropout(self.user_emb(users))
        u_feat = self.user_feat_proj(user_features[users])
        u = u_id + u_feat

        # Item latent + side
        i_id = self.dropout(self.item_emb(items))
        i_feat = self.item_feat_proj(item_features[items])
        i = i_id + i_feat

        # Dot product + bias
        dot = (u * i).sum(1, keepdim=True)
        bias = self.user_bias(users) + self.item_bias(items) + self.global_bias

        return (dot + bias).squeeze(1)  
    

num_users = 269461
num_items = 8069
user_features_tensor = torch.load('user_features_tensor.pt')
item_features_tensor = torch.load('item_features_tensor.pt')
all_item_indices=list(range(item_features_tensor.shape[0]))
mf_model = MF(num_users, num_items, 3, 386)
mf_model.load_state_dict(torch.load('MF_model.pth'))
mf_model.eval()

  user_features_tensor = torch.load('user_features_tensor.pt')
  item_features_tensor = torch.load('item_features_tensor.pt')
  mf_model.load_state_dict(torch.load('MF_model.pth'))


MF(
  (user_emb): Embedding(269461, 32)
  (item_emb): Embedding(8069, 32)
  (user_feat_proj): Linear(in_features=3, out_features=32, bias=True)
  (item_feat_proj): Linear(in_features=386, out_features=32, bias=True)
  (user_bias): Embedding(269461, 1)
  (item_bias): Embedding(8069, 1)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [11]:
with open('user_mf_encoder.pkl', 'rb') as f:
    user_le = pickle.load(f)

with open('item_mf_encoder.pkl', 'rb') as f:
    item_le = pickle.load(f)

@torch.no_grad()
def get_mf_recommendation(model, user_idx, all_item_indices, user_features_tensor, item_features_tensor, device='cpu', top_k=10):
    model.eval()

    user_tensor = torch.tensor([user_idx] * len(all_item_indices), dtype=torch.long).to(device)
    item_tensor = torch.tensor(all_item_indices, dtype=torch.long).to(device)

    user_features_tensor = user_features_tensor.to(device)
    item_features_tensor = item_features_tensor.to(device)

    # Predict scores for all items for this user
    scores = model(user_tensor, item_tensor, user_features_tensor, item_features_tensor)
    scores = torch.clamp(scores, 1, 5)
    scores = scores.cpu().numpy()

    # Rank items by score
    ranked_indices = np.argsort(scores)[::-1][:top_k]
    top_items = [(all_item_indices[i], scores[i]) for i in ranked_indices]

    item_indices = [item_idx for item_idx, _ in top_items]
    original_business_ids = item_le.inverse_transform(item_indices)  
    top_recs = pd.DataFrame({'business_id': original_business_ids})
    return top_recs

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### Combine All Models

In [13]:
def get_recommendation(user_id, k, visualize=True):
    # If user is not found aka "new user"
    if user_id not in user_encoder: 
        recommendations = get_popularity_based_recommendation(user_id, k)
        recommendations = recommendations.merge(
            item_df[features], on='business_id', how='left')
        return recommendations
    
    # For "existing user", find top-k similar users
    user_idx = user_encoder[user_id]
    query_vector = cf_matrix[user_idx].reshape(1, -1)
    similarity_scores = cosine_similarity(
        query_vector, cf_matrix).flatten()
    similarity_scores[user_idx] = 0  # remove self-similarity
    top_k_indices = np.argsort(similarity_scores)[::-1][:k]

    # If there are similar users
    if top_k_indices.size > 0:
        cb_rec = get_content_based_recommendation(user_id, k)[["business_id"]]
        cf_rec = get_collaborative_filtering_recommendation(user_id, k, top_k_indices, similarity_scores)[
            ["business_id"]]
        mf_rec = get_mf_recommendation(mf_model, user_idx, all_item_indices, user_features_tensor, item_features_tensor, device='cpu', top_k=k)
        cb_rec["rank"] = cb_rec.index + 1
        cf_rec["rank"] = cf_rec.index + 1
        mf_rec["rank"] = mf_rec.index + 1
        recommendations = pd.concat([mf_rec,cb_rec, cf_rec])
        
    # If there are no similar users    
    else:
        cb_rec = get_content_based_recommendation(user_id, k)[["business_id"]]
        mf_rec = get_mf_recommendation(mf_model, user_idx, all_item_indices, user_features_tensor, item_features_tensor, device='cpu', top_k=k)
        cb_rec["rank"] = cb_rec.index + 1
        mf_rec["rank"] = mf_rec.index + 1
        recommendations = pd.concat([mf_rec, cb_rec])
    
    # Tidy up recommendations
    recommendations = recommendations.groupby("business_id")["rank"].mean().reset_index().sort_values("rank").head(k)
    recommendations = recommendations.merge(
        item_df[features], on='business_id', how='left')

    return recommendations

### Get Recommendations

In [15]:
get_recommendation('mh_-eMZ6K5RLWhZyISBhwA', 10)

Unnamed: 0,business_id,rank,name,description,categories,editorial_summary,RestaurantsAttire,Ambience,NoiseLevel,Music,...,BusinessAcceptsBitcoin,yelp_rating,number_of_photos,dist_highway,rural_urban_continuum_code_2023,adjusted_gross_income,unemployment_rate_2023,user_rating_count,google_rating,price_level
0,NvOMeOZp6SC702ZqOLM4Cg,1.0,Nudy's Café - West Chester,restaurant breakfast brunch salad burger casua...,Restaurants Breakfast & Brunch Salad Burgers,Casual breakfast & lunch spot with a large men...,casual,classy casual,average,,...,False,3.5,10.0,3.431273,1.0,3458385.0,2.9,726.0,4.3,2.0
1,WL-0PLW5IzdnyUHGmiOrgQ,1.0,Keswick Tavern,bar greek restaurant nightlife beer wine spiri...,Bars Greek Restaurants Nightlife Beer Win...,Contemporary tavern with multiple TVs for spor...,casual,classy casual,average,live,...,,3.5,10.0,0.687538,1.0,1739529.0,2.8,656.0,4.2,2.0
2,u50hTvPV_W_Hx625ytvLYw,1.0,Hatboro Pizza,salad food delivery service italian event plan...,Salad Food Delivery Services Italian Event ...,Quaint family-run pizzeria serves traditional ...,casual,casual,average,,...,,3.5,10.0,1.633556,1.0,877828.0,2.9,357.0,4.4,1.0
3,8n_BlTxfALO08FWdb6Tnbg,2.0,Pholosophy,restaurant vietnamese restaurant vietnamese ca...,Restaurants Vietnamese,Restaurants Vietnamese,,casual,average,,...,False,4.0,10.0,1.757572,1.0,1143777.0,2.8,186.0,4.5,1.0
4,_-8TAMmIbDGkZCXAqSkFMg,2.0,Slack's Hoagie Shack,restaurant caterer pizza event planning servic...,Restaurants Caterers Pizza Event Planning &...,Chain sandwich specialist turning out hoagies ...,casual,casual,quiet,,...,,3.5,10.0,1.554484,1.0,1237238.0,3.2,294.0,4.2,1.0
5,xdG_RW_QsAGJbXp46M_V8w,2.0,MOD Pizza,pizza restaurant fast food chain restaurant kn...,Pizza Restaurants Fast Food,Counter-serve industrial-chic chain restauran...,casual,casual,average,,...,False,4.0,10.0,2.099933,1.0,3836907.0,2.866667,599.0,4.3,1.0
6,OTwwIvLkrS0zc8f1lNRstw,3.0,Dasiwa,sushi bar restaurant intimate bistro providing...,Sushi Bars Restaurants,Intimate bistro providing sushi & other Japane...,casual,classy casual,quiet,,...,False,4.5,10.0,0.930093,1.0,1650111.0,4.2,222.0,4.5,2.0
7,-rwfGlw6T5czqQO4uZGWYw,3.0,Moonlight Diner,breakfast brunch diner mediterranean restauran...,Breakfast & Brunch Diners Mediterranean Res...,Relaxed neighborhood nook providing classic A...,casual,casual,average,,...,,4.0,10.0,0.687538,1.0,1739529.0,2.8,736.0,4.3,1.0
8,AtDtwv66pG52TehGR3f6tQ,3.0,Randazzo's Pizzeria,pizza restaurant pizza restaurant casual casua...,Pizza Restaurants,Pizza Restaurants,casual,casual,average,,...,,4.0,10.0,1.322643,1.0,713838.0,2.8,344.0,4.2,1.0
9,sYgyAxvuDP1799oiGXqE_A,4.0,MOD Pizza,fast food restaurant pizza chain restaurant kn...,Fast Food Restaurants Pizza,Counter-serve industrial-chic chain restauran...,casual,trendy casual,average,,...,False,4.0,10.0,1.322643,1.0,713838.0,2.8,901.0,4.4,1.0
