In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load data
reviews = pd.read_csv('/kaggle/input/attraction-data/DATA/subset_reviews (1).csv')
businesses = pd.read_csv('/kaggle/input/attraction-data/DATA/subset_places (1).csv')
users = pd.read_csv('/kaggle/input/attraction-data/DATA/subset_users (1).csv')


In [2]:
# Preprocessing
reviews = reviews[['user_id', 'business_id', 'stars', 'text']]
businesses = businesses[['business_id', 'name', 'categories', 'review_count']]
data = pd.merge(reviews, businesses, on='business_id').dropna()

# Initialize model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [4]:
reviews['user_id'].value_counts()

user_id
n-lBS02-3yvlY5Q91mmwDA    6
_BcWyKQL16ndpBdggh2kNA    5
7ktyPHE-NGnWxarOqjIQiQ    4
758g6NGLp9deCbvowz62Ww    4
zxeXnjqmlrAspfk17LSZCg    3
                         ..
C410dZNCFbEL7D4vMRF_Qw    1
szqJcANlagfO7s9GNwTwUw    1
xmbvOR4vRH1iPVrLOB628w    1
hn18xnU4C5wM_rlQ06FANA    1
fUnIZ6Z_hy68xjvWipXA7Q    1
Name: count, Length: 4514, dtype: int64

In [5]:
data

Unnamed: 0,user_id,business_id,stars,text,name,categories,review_count
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",Turning Point of North Wales,"Restaurants, Breakfast & Brunch, Food, Juice B...",169
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...,Body Cycle Spinning Studio,"Active Life, Cycling Classes, Trainers, Gyms, ...",144
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...,Kettle Restaurant,"Restaurants, Breakfast & Brunch",47
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo...",Zaika,"Halal, Pakistani, Restaurants, Indian",181
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...,Melt,"Sandwiches, Beer, Wine & Spirits, Bars, Food, ...",32
...,...,...,...,...,...,...,...
4725,FQe-tDY2ESxrvgIyMyKQRg,dz_aIFbATP2PLWQSOBnMfw,3,Nothing special. Came for brunch and got the b...,Maggie Mae's Sunrise Cafe,"Restaurants, Breakfast & Brunch",469
4726,AhqX56lEpj7MjW3UGF9zsQ,_aKr7POnacW_VizRKBpCiA,5,Love this place! Been going back once a week ...,Blues City Deli,"Delis, Bars, Restaurants, Nightlife, Pubs, Ame...",991
4727,pg0GI_LBpsX06a8eN7Ff5A,0qu0fNTOsSmuREYVIMPuIQ,4,I went here 5-7 years ago. I remember it bein...,Cold Spring Tavern,"American (Traditional), Restaurants, Bars, Nig...",1018
4728,lYAmgL_l7A3MPFYe1DYKrw,EpREWeEpmR8f1qLHzzF0AA,5,Just about to get tucked into a meatloaf that ...,Schlafly Bottleworks,"Local Flavor, Food, Brewpubs, Breweries, Resta...",615


In [6]:
# Function to generate embeddings in batches
def generate_embeddings_in_batches(texts, tokenizer, model, device, batch_size=32):
    dataloader = DataLoader(texts, batch_size=batch_size)
    embeddings = []

    for batch in dataloader:
        inputs = tokenizer(list(batch), return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeddings)

    return embeddings

# Generate review embeddings
data['review_embedding'] = generate_embeddings_in_batches(data['text'].tolist(), tokenizer, model, device)


In [7]:
# Calculate weighted user embeddings
def weighted_user_embeddings(data):
    def weighted_avg(group):
        weighted_sum = sum(star * review for star, review in zip(group['stars'], group['review_embedding']))
        total_weight = sum(group['stars'])
        return weighted_sum / total_weight

    user_embeddings = data.groupby('user_id').apply(weighted_avg)
    return dict(zip(user_embeddings.index, user_embeddings.tolist()))

user_embeddings = weighted_user_embeddings(data)

  user_embeddings = data.groupby('user_id').apply(weighted_avg)


In [8]:
# Calculate business embeddings
def business_embeddings(data):
    def avg_embedding(group):
        return sum(group['review_embedding']) / len(group['review_embedding'])

    business_embeddings = data.groupby('business_id').apply(avg_embedding)
    return dict(zip(business_embeddings.index, business_embeddings.tolist()))

business_embeddings = business_embeddings(data)

  business_embeddings = data.groupby('business_id').apply(avg_embedding)


In [9]:
# Recommendation function
def recommend_businesses(businesses, user_id, top_n=10):
    if user_id not in user_embeddings:
        return f"User ID {user_id} not found."

    user_vector = user_embeddings[user_id]
    scores = []

    for business_id, business_vector in business_embeddings.items():
        score = cosine_similarity([user_vector], [business_vector])[0][0]
        business_name = businesses.loc[businesses["business_id"] == business_id, "name"].values
        business_name = business_name[0] if len(business_name) > 0 else "Unknown"
        scores.append((business_id, business_name, score))

    return sorted(scores, key=lambda x: x[2], reverse=True)[:top_n]

# Example usage
recommendations = recommend_businesses(businesses, user_id='zxeXnjqmlrAspfk17LSZCg', top_n=5)
print(recommendations)

[('cg4JFJcCxRTTMmcg9O9KtA', 'Ava', 0.97715497), ('pym7c6ZFEtmoH16xN2ApBg', "Katie's Restaurant & Bar", 0.9771434), ('ompDR5sUDpoI6gnTldmneQ', 'The Praline Connection', 0.97632027), ('uE40984_YDgVvPeRpFcCaQ', 'The Fat Ham', 0.97417516), ('GBTPC53ZrG1ZBY3DT8Mbcw', 'Luke', 0.97339284)]


In [10]:
data.loc[data['user_id'] == "zxeXnjqmlrAspfk17LSZCg"]

Unnamed: 0,user_id,business_id,stars,text,name,categories,review_count,review_embedding
1637,zxeXnjqmlrAspfk17LSZCg,OINbC0rpDVJ5bfxt3LO9fw,5,A neighborhood cafe with GREAT soul food. We w...,Li'l Dizzy's Cafe,"Restaurants, Cajun/Creole",651,"[0.021170635, 0.07375943, 0.074098185, 0.10879..."
3019,zxeXnjqmlrAspfk17LSZCg,ompDR5sUDpoI6gnTldmneQ,3,Just OK fried southern basics. The fried chick...,The Praline Connection,"Seafood, Cajun/Creole, Candy Stores, Coffee & ...",770,"[0.20930617, 0.07774904, 0.053295825, 0.199161..."
3883,zxeXnjqmlrAspfk17LSZCg,GBTPC53ZrG1ZBY3DT8Mbcw,3,"Pretty good food, on the French bistro side of...",Luke,"German, Restaurants, Seafood, Cocktail Bars, F...",4554,"[-0.04325212, 0.074690826, 0.08059637, 0.19996..."


In [11]:
from sklearn.metrics import mean_squared_error, r2_score

def dcg_at_k(scores, k):
    scores = np.asarray(scores)[:k]
    return np.sum((2**scores - 1) / np.log2(np.arange(2, scores.size + 2)))

def ndcg_at_k(true_ratings, predicted_scores, k=10):
    ideal_scores = sorted(true_ratings, reverse=True)
    return dcg_at_k(predicted_scores, k) / dcg_at_k(ideal_scores, k) if dcg_at_k(ideal_scores, k) > 0 else 0

def mean_average_precision_at_k(true_ratings, predicted_scores, k=10):
    sorted_indices = np.argsort(predicted_scores)[::-1]
    relevant = np.array(true_ratings)[sorted_indices] > 0  # Assuming nonzero ratings are relevant
    precisions = [np.mean(relevant[:i + 1]) for i in range(len(relevant)) if relevant[i]]
    return np.mean(precisions) if precisions else 0

def mean_reciprocal_rank(true_ratings, predicted_scores):
    sorted_indices = np.argsort(predicted_scores)[::-1]
    relevant = np.array(true_ratings)[sorted_indices] > 0
    for i, rel in enumerate(relevant):
        if rel:
            return 1 / (i + 1)
    return 0

def evaluate_recommendations(data, user_id, recommendations, k=10):
    true_ratings = []
    predicted_scores = []

    for business_id, business_name, score in recommendations:
        true_rating = data[(data['user_id'] == user_id) & (data['business_id'] == business_id)]['stars'].values
        if len(true_rating) > 0:
            true_ratings.append(true_rating[0])
            predicted_scores.append(score)

    if true_ratings:
        rmse = mean_squared_error(true_ratings, predicted_scores, squared=False)
        r2 = r2_score(true_ratings, predicted_scores)
        ndcg = ndcg_at_k(true_ratings, predicted_scores, k)
        map_k = mean_average_precision_at_k(true_ratings, predicted_scores, k)
        mrr = mean_reciprocal_rank(true_ratings, predicted_scores)
        
        return rmse, r2, ndcg, map_k, mrr
    else:
        return None, None, None, None, None

eval_rmse, eval_r2, eval_ndcg, eval_map, eval_mrr = evaluate_recommendations(data, user_id='zxeXnjqmlrAspfk17LSZCg', recommendations=recommendations)
print(f"RMSE: {eval_rmse}, R2: {eval_r2}, NDCG: {eval_ndcg}, MAP: {eval_map}, MRR: {eval_mrr}")

RMSE: 2.02514397350074, R2: 0.0, NDCG: 0.13798543111484338, MAP: 1.0, MRR: 1.0
