## BERT + Clustering

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Load the dataframes
df_movie = pd.read_csv('/content/movie_df_comm.csv')
df_book = pd.read_csv('/content/book_df_comm.csv')

# Concatenate dataframes
df = pd.concat([df_book, df_movie], ignore_index=True)

# Drop duplicates and missing values
df = df.drop_duplicates(subset=['reviewerID', 'asin'])
df = df.dropna(subset=['reviewerID', 'asin', 'overall'])

# Generate unique user and item IDs
df['user_id'] = df['reviewerID'].astype('category').cat.codes
df['item_id'] = df['asin'].astype('category').cat.codes

# Split data into train and test sets
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save the dataframes for later use
train_df.to_pickle("train_df.pkl")
test_df.to_pickle("test_df.pkl")

# Initialize BERT model and tokenizer
import torch
from transformers import BertTokenizer, BertModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Function to generate BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten()

# Apply BERT embedding to item descriptions
train_df['item_embedding'] = train_df['description'].apply(get_bert_embedding)
train_df.to_pickle("train_df_with_embeddings.pkl")

# Generate user embeddings by averaging the embeddings of items they interacted with
user_embeddings = train_df.groupby('user_id')['item_embedding'].apply(lambda x: np.mean(np.vstack(x), axis=0)).reset_index()
user_embeddings = user_embeddings.rename(columns={'item_embedding': 'user_embedding'})

# Merge user embeddings back to the train_df
train_df = train_df.merge(user_embeddings, on='user_id')

In [None]:
from sklearn.cluster import KMeans

# Cluster users
num_user_clusters = 10  # You can tune this parameter
user_kmeans = KMeans(n_clusters=num_user_clusters, random_state=42)
user_clusters = user_kmeans.fit_predict(np.vstack(user_embeddings['user_embedding']))
user_embeddings['user_cluster'] = user_clusters

# Cluster items
num_item_clusters = 20  # You can tune this parameter
item_embeddings = np.vstack(train_df['item_embedding'].values)
item_kmeans = KMeans(n_clusters=num_item_clusters, random_state=42)
item_clusters = item_kmeans.fit_predict(item_embeddings)
train_df['item_cluster'] = item_clusters

In [None]:
def recommend_items(user_id, user_embeddings, train_df, top_n=10):
    # Check if the user exists in user_embeddings
    user_data = user_embeddings[user_embeddings['user_id'] == user_id]

    if user_data.empty:
        # If the user is not in the training set, recommend popular items
        popular_items = train_df['item_id'].value_counts().index[:top_n].tolist()
        return popular_items[:top_n]

    # Get the user's cluster
    user_cluster = user_data['user_cluster'].values[0]

    # Get items in the same cluster
    cluster_items = train_df[train_df['item_cluster'] == user_cluster]['item_id'].unique()

    # Get the items that the user has not interacted with
    user_interacted_items = train_df[train_df['user_id'] == user_id]['item_id'].unique()
    recommended_items = np.setdiff1d(cluster_items, user_interacted_items)

    # If there are not enough items in the cluster, recommend popular items
    if len(recommended_items) < top_n:
        popular_items = train_df['item_id'].value_counts().index[:top_n].tolist()
        recommended_items = np.concatenate([recommended_items, popular_items])

    return recommended_items[:top_n]

In [None]:
def evaluate_recommendations(test_df, user_embeddings, train_df, top_n=10):
    hit_rate = 0
    ndcg = 0
    precision = 0
    recall = 0
    total_users = 0  # Count only users present in user_embeddings

    for user_id in test_df['user_id'].unique():
        # Check if the user exists in user_embeddings
        if user_id not in user_embeddings['user_id'].values:
            continue  # Skip users not in the training set

        total_users += 1

        # Get ground truth items for the user
        ground_truth = test_df[test_df['user_id'] == user_id]['item_id'].values

        # Generate recommendations for the user
        recommended_items = recommend_items(user_id, user_embeddings, train_df, top_n)

        # Calculate Hit Rate
        if len(np.intersect1d(recommended_items, ground_truth)) > 0:
            hit_rate += 1

        # Calculate NDCG
        relevance = np.isin(recommended_items, ground_truth).astype(int)
        if np.sum(relevance) > 0:
            ndcg += ndcg_score([relevance], [np.ones_like(relevance)], k=top_n)

        # Calculate Precision and Recall
        true_positives = len(np.intersect1d(recommended_items, ground_truth))
        precision += true_positives / top_n
        recall += true_positives / len(ground_truth)

    # Avoid division by zero if no users are evaluated
    if total_users == 0:
        return 0, 0, 0, 0

    # Average metrics across all evaluated users
    hit_rate /= total_users
    ndcg /= total_users
    precision /= total_users
    recall /= total_users

    return hit_rate, ndcg, precision, recall

In [None]:
# Example usage
hit_rate, ndcg, precision, recall = evaluate_recommendations(test_df, user_embeddings, train_df, top_n=10)
print(f"Hit Rate: {hit_rate:.4f}")
print(f"NDCG: {ndcg:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Hit Rate: 0.0009
NDCG: 0.0004
Precision: 0.0001
Recall: 0.0009
