In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse

import pickle

In [2]:
# Interaction log shared by every recommender below. Later cells read its
# `user_id`, `business_id`, `stars` and `interaction` columns.
train_df = pd.read_csv("train_df.csv")

### Popularity-Based Model

In [3]:
def get_popularity_based_recommendation(user_id, k):
    """Placeholder popularity recommender that yields no recommendations yet.

    Args:
        user_id: Identifier of the requesting user (currently ignored).
        k: Number of recommendations requested (currently ignored).

    Returns:
        pd.DataFrame: An empty frame whose only column is ``business_id``.
    """
    empty_recommendations = pd.DataFrame(columns=['business_id'])
    return empty_recommendations

### Content-Based Model

In [4]:
# Item feature matrix and item metadata table. The content-based recommender
# uses the same integer positions to index rows of `cb_matrix` and rows of
# `item_df`, so the two must stay in the same row order.
cb_matrix = sparse.load_npz("cb_matrix.npz")
item_df = pd.read_csv("item_df.csv")

# Column-name groups. In this notebook they are only concatenated into
# `features` below; the grouping presumably mirrors how each column was
# encoded when `cb_matrix` was built -- TODO confirm against the build step.
text_features = ['categories', 'editorial_summary',
                 'RestaurantsAttire', 'Ambience',
                 'NoiseLevel', 'Music', 'city']

list_features = ['GoodForMeal', 'BusinessParking']

bool_features = [
    'RestaurantsTakeOut', 'RestaurantsDelivery', 'RestaurantsReservations', 'OutdoorSeating',
    'RestaurantsGoodForGroups', 'GoodForKids', 'DogsAllowed', 'dine_in', 'serves_beer', 'serves_wine',
    'serves_cocktails', 'good_for_watching_sports', 'serves_coffee', 'menu_for_children',
    'BusinessAcceptsCreditCards', 'HasTV', 'BikeParking', 'Caters', 'RestaurantsTableService',
    'WheelchairAccessible', 'HappyHour', 'BusinessAcceptsBitcoin'
]

num_features = [
    'yelp_rating', 'number_of_photos', 'dist_highway',
    'rural_urban_continuum_code_2023', 'adjusted_gross_income',
    'unemployment_rate_2023', 'user_rating_count',
    'google_rating', 'price_level'
]

# Columns of `item_df` that get_recommendation attaches to the final
# recommendation list via a left merge on `business_id`.
features = ['business_id', 'name', 'description'] + \
    text_features + list_features + bool_features + num_features

def get_content_based_recommendation(user_id, k):
    """Recommend items whose feature vectors resemble the user's rated items.

    A profile vector is built as the sum of the `cb_matrix` rows of the
    businesses the user rated in `train_df`, each weighted by ``stars - 3``
    (so ratings above 3 pull the profile towards an item and ratings below 3
    push it away). Every row of `cb_matrix` is then scored by cosine
    similarity against that profile.

    Args:
        user_id: Value matched against ``train_df['user_id']``.
        k: Maximum number of recommendations to return.

    Returns:
        pd.DataFrame: Columns ``business_id`` and ``similarity_score``
        (rounded to 3 decimals), ordered by decreasing score with a fresh
        0..n-1 index. The frame is empty when the user has no rated business
        present in `item_df`, or when the profile vector is all zeros (for
        example every rating equals 3); in those cases every cosine score
        would be 0 and any "top k" would be arbitrary.
    """
    no_recommendations = pd.DataFrame({
        'business_id': pd.Series(dtype=object),
        'similarity_score': pd.Series(dtype=float),
    })

    history = train_df.loc[train_df['user_id'] == user_id,
                           ['business_id', 'stars']]

    # Resolve every rated business to the *position* of its first row in
    # item_df with one indexed lookup instead of scanning item_df once per
    # rated row. Positions (not index labels) are what cb_matrix and
    # item_df.iloc expect.
    catalog_ids = pd.Index(item_df['business_id'])
    first_occurrence = ~catalog_ids.duplicated(keep='first')
    unique_ids = catalog_ids[first_occurrence]
    unique_positions = np.flatnonzero(first_occurrence)

    lookup = unique_ids.get_indexer(pd.Index(history['business_id']))
    matched = lookup >= 0  # -1 marks businesses missing from item_df
    restaurant_indices = unique_positions[lookup[matched]]
    aligned_ratings = history['stars'].to_numpy()[matched]

    if restaurant_indices.size == 0:
        return no_recommendations

    vectors = cb_matrix[restaurant_indices].toarray()
    weights = (aligned_ratings - 3).reshape(-1, 1)  # center ratings at neutral (3)

    query_embedding = (vectors * weights).sum(axis=0).reshape(1, -1)
    if not np.any(query_embedding):
        # Zero profile: cosine similarity is 0 for every item.
        return no_recommendations

    similarity_scores = cosine_similarity(
        query_embedding, cb_matrix).flatten()

    # Near-exact matches are zeroed out, which removes an item that is
    # (almost) identical to the profile itself.
    # NOTE(review): already-rated items are not otherwise excluded here,
    # unlike in the collaborative filtering model -- confirm this is intended.
    similarity_scores[similarity_scores >= 0.99] = 0

    top_k_indices = np.argsort(similarity_scores)[::-1][:k]

    recommended_df = pd.DataFrame({
        'business_id': item_df.iloc[top_k_indices]['business_id'].values,
        'similarity_score': similarity_scores[top_k_indices],
    })
    recommended_df['similarity_score'] = recommended_df['similarity_score'].round(3)

    return recommended_df.reset_index(drop=True)

### Collaborative Filtering Model

In [5]:
# Sparse matrix used for user-to-user cosine similarity: get_recommendation
# takes one row per user (row position = user_encoder[user_id]).
cf_matrix = sparse.load_npz("cf_matrix.npz")

# SECURITY: pickle.load executes arbitrary code embedded in the file. Only
# load these artifacts if they were produced by a trusted pipeline.
# Maps a user_id to its row position in `cf_matrix`.
with open('user_encoder.pkl', 'rb') as f:
    user_encoder = pickle.load(f)

# NOTE(review): item_encoder is loaded but not referenced in the visible cells.
with open('item_encoder.pkl', 'rb') as f:
    item_encoder = pickle.load(f)

# NOTE(review): user_decoder is loaded but not referenced in the visible
# cells; row positions are translated back to user ids through `users_df`.
with open('user_decoder.pkl', 'rb') as f:
    user_decoder = pickle.load(f)

# The collaborative filtering model reads users_df["user_id"] at a cf_matrix
# row position, so this table presumably shares cf_matrix's row order --
# TODO confirm against the encoder.
users_df = pd.read_csv("users_df.csv")


def get_collaborative_filtering_recommendation(user_id, k, top_k_indices, similarity_scores):
    """Recommend unseen businesses from the interactions of similar users.

    Each candidate business is scored with the similarity-weighted average of
    the neighbours' ``interaction`` values:
    ``sum(interaction * similarity) / sum(similarity)``.

    Args:
        user_id: Target user; businesses this user already has in `train_df`
            are excluded from the result.
        k: Maximum number of recommendations to return.
        top_k_indices: Integer row positions of the neighbour users. They
            index both `similarity_scores` and ``users_df["user_id"]``.
        similarity_scores: 1-D array of similarities between the target user
            and every user, indexed by the same row positions.

    Returns:
        pd.DataFrame: Columns ``business_id`` and ``predicted_score``, sorted
        by decreasing score, at most `k` rows. Businesses whose neighbours'
        similarities sum to zero are dropped, because their weighted average
        is undefined (0/0) and would otherwise surface as NaN scores.
    """
    top_k_indices = np.asarray(top_k_indices, dtype=int)

    # Positional lookup: the indices are row positions, not index labels.
    # NOTE(review): assumes users_df rows follow the same order as the rows
    # of the similarity matrix -- confirm against user_encoder.
    top_k_user_ids = users_df["user_id"].iloc[top_k_indices].to_numpy()
    top_k_similarities = np.asarray(similarity_scores)[top_k_indices]
    sim_score_dict = dict(zip(top_k_user_ids, top_k_similarities))

    # Interactions of the neighbour users, each weighted by that neighbour's
    # similarity to the target user.
    sim_user_ratings = train_df[train_df['user_id'].isin(
        top_k_user_ids)].copy()
    sim_user_ratings['weight'] = sim_user_ratings['user_id'].map(
        sim_score_dict)
    sim_user_ratings['weighted_interaction'] = sim_user_ratings['interaction'] * \
        sim_user_ratings['weight']

    agg_df = sim_user_ratings.groupby('business_id').agg(
        predicted_score=('weighted_interaction', 'sum'),
        total_weight=('weight', 'sum')
    )

    # Drop businesses the target user has already interacted with, and those
    # with no similarity mass behind them, before normalising.
    seen_items = train_df.loc[train_df['user_id'] == user_id,
                              'business_id'].unique()
    unseen = ~agg_df.index.isin(seen_items)
    has_weight = (agg_df['total_weight'] != 0).to_numpy()
    # .copy() so the column assignment below writes to an independent frame.
    agg_df = agg_df[unseen & has_weight].copy()

    agg_df['predicted_score'] = agg_df['predicted_score'] / \
        agg_df['total_weight']

    top_recs = agg_df.sort_values(
        by='predicted_score', ascending=False).head(k).reset_index()

    return top_recs[['business_id', 'predicted_score']]

### Neural Collaborative Filtering (NCF) Model / Matrix Factorization (MF) Model

In [6]:
def get_ncf_recommendation(user_id, k):
    """Placeholder NCF / MF recommender that yields no recommendations yet.

    Args:
        user_id: Identifier of the requesting user (currently ignored).
        k: Number of recommendations requested (currently ignored).

    Returns:
        pd.DataFrame: An empty frame whose only column is ``business_id``.
    """
    empty_recommendations = pd.DataFrame(columns=['business_id'])
    return empty_recommendations

### Combine All Models

In [9]:
def get_recommendation(user_id, k, visualize=True):
    """Blend the individual recommenders into one ranked list for a user.

    Users absent from `user_encoder` ("new" users) get the popularity-based
    list. For known users, each model's list is turned into 1-based ranks,
    the ranks are averaged per business over the models that returned it,
    and the `k` businesses with the lowest mean rank are kept.

    Args:
        user_id: Identifier of the user to recommend for.
        k: Number of neighbour users to consider and maximum number of
            recommendations to return.
        visualize: Currently unused; kept so existing callers keep working.

    Returns:
        pd.DataFrame: One row per recommended business, with the item
        metadata columns listed in `features` left-joined from `item_df`.
        Known users additionally get a ``rank`` column (mean rank).
    """
    # If user is not found aka "new user"
    if user_id not in user_encoder:
        recommendations = get_popularity_based_recommendation(user_id, k)
        return recommendations.merge(
            item_df[features], on='business_id', how='left')

    # For "existing user", find top-k similar users
    user_idx = user_encoder[user_id]
    query_vector = cf_matrix[user_idx].reshape(1, -1)
    similarity_scores = cosine_similarity(
        query_vector, cf_matrix).flatten()
    similarity_scores[user_idx] = 0  # remove self-similarity
    top_k_indices = np.argsort(similarity_scores)[::-1][:k]
    # argsort always yields k indices, even when nobody is actually similar.
    # Keep only neighbours with positive similarity so that the "no similar
    # users" case is detectable and zero-weight users are never passed on.
    top_k_indices = top_k_indices[similarity_scores[top_k_indices] > 0]

    model_outputs = [
        get_ncf_recommendation(user_id, k),
        get_content_based_recommendation(user_id, k),
    ]
    # Collaborative filtering only contributes when similar users exist.
    if top_k_indices.size > 0:
        model_outputs.append(get_collaborative_filtering_recommendation(
            user_id, k, top_k_indices, similarity_scores))

    ranked_lists = []
    for model_recs in model_outputs:
        # reset_index returns a new frame, so adding `rank` never writes to
        # a slice of the model's own output; row order defines the rank.
        ranked = model_recs[["business_id"]].reset_index(drop=True)
        ranked["rank"] = ranked.index + 1
        ranked_lists.append(ranked)
    recommendations = pd.concat(ranked_lists)

    # Tidy up recommendations
    recommendations = (
        recommendations.groupby("business_id")["rank"]
        .mean()
        .reset_index()
        .sort_values("rank")
        .head(k)
    )
    return recommendations.merge(
        item_df[features], on='business_id', how='left')

### Get Recommendations

In [10]:
# Example: blended top-10 recommendations for a single user id. Left as the
# cell's last bare expression so the returned DataFrame is displayed.
get_recommendation('mh_-eMZ6K5RLWhZyISBhwA', 10)

Unnamed: 0,business_id,rank,name,description,categories,editorial_summary,RestaurantsAttire,Ambience,NoiseLevel,Music,...,BusinessAcceptsBitcoin,yelp_rating,number_of_photos,dist_highway,rural_urban_continuum_code_2023,adjusted_gross_income,unemployment_rate_2023,user_rating_count,google_rating,price_level
0,NQ01WqVX0tojNHKn-0sFww,1.0,Tir na nOg Irish Pub,irish pub nightlife american new bar irish pub...,Irish Pub Nightlife American (New) Bars Ir...,Draft beers plus Irish & American pub fare fo...,casual,casual,loud,live,...,,3.5,10.0,0.669666,1.0,3058841.0,4.2,1182.0,4.2,2.0
1,u50hTvPV_W_Hx625ytvLYw,1.0,Hatboro Pizza,salad food delivery service italian event plan...,Salad Food Delivery Services Italian Event ...,Quaint family-run pizzeria serves traditional ...,casual,casual,average,,...,,3.5,10.0,1.633556,1.0,877828.0,2.9,357.0,4.4,1.0
2,8n_BlTxfALO08FWdb6Tnbg,2.0,Pholosophy,restaurant vietnamese restaurant vietnamese ca...,Restaurants Vietnamese,Restaurants Vietnamese,,casual,average,,...,False,4.0,10.0,1.757572,1.0,1143777.0,2.8,186.0,4.5,1.0
3,YjLMWlHoBJHtYMLdFXfvVg,2.0,El Limon - Ambler,mexican restaurant salad sandwich familiar mex...,Mexican Restaurants Salad Sandwiches,Familiar Mexican grub including tostadas tor...,casual,casual,average,,...,,4.0,10.0,0.029601,1.0,2886349.0,2.8,1465.0,4.5,1.0
4,AtDtwv66pG52TehGR3f6tQ,3.0,Randazzo's Pizzeria,pizza restaurant pizza restaurant casual casua...,Pizza Restaurants,Pizza Restaurants,casual,casual,average,,...,,4.0,10.0,1.322643,1.0,713838.0,2.8,344.0,4.2,1.0
5,xdG_RW_QsAGJbXp46M_V8w,3.0,MOD Pizza,pizza restaurant fast food chain restaurant kn...,Pizza Restaurants Fast Food,Counter-serve industrial-chic chain restauran...,casual,casual,average,,...,False,4.0,10.0,2.099933,1.0,3836907.0,2.866667,599.0,4.3,1.0
6,HNREwZWJqeapl4-uRhaKNg,4.0,PrimoHoagies,sandwich restaurant italian sandwich chain kno...,Sandwiches Restaurants Italian,Philly-born counter-serve sandwich chain known...,casual,,average,,...,,3.5,10.0,1.633556,1.0,877828.0,2.9,300.0,4.4,1.0
7,ta7mh_wdbvbKUwb1RO3TZQ,4.0,Giuseppe's Pizza & Family Restaurant,pizza restaurant eatery dishing pizza pasta it...,Pizza Restaurants,Time-tested eatery dishing up pizza pasta & o...,casual,casual,average,,...,False,3.5,10.0,0.029601,1.0,2886349.0,2.8,1660.0,4.5,2.0
8,CATqJGCwqj2YBt2yp8BXHw,5.0,Narberth Pizza Italian Delite,italian restaurant pizza italian restaurant pi...,Italian Restaurants Pizza,Italian Restaurants Pizza,casual,casual,quiet,,...,,3.5,10.0,1.757572,1.0,1143777.0,2.8,139.0,4.4,1.0
9,xppn6ViiUkZT_K_OswcB7g,5.0,Tamarindos Restaurant,mexican restaurant bustling restaurant patio o...,Mexican Restaurants,Bustling restaurant with a patio offering a r...,casual,romantic intimate classy casual,average,,...,,3.5,10.0,1.576631,1.0,424876.0,2.8,601.0,4.5,2.0
