## Clustering + GNN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Load the two review exports (movies and books).
df_movie = pd.read_csv('/content/movie_df_comm.csv')
df_book = pd.read_csv('/content/book_df_comm.csv')

# Stack both domains into a single interaction table.
df = pd.concat([df_book, df_movie], ignore_index=True)

# Remove incomplete rows first, then keep one row per (user, item) pair.
# Doing it in this order means a pair whose first occurrence lacks a rating
# is represented by its first *complete* row instead of vanishing entirely.
df = df.dropna(subset=['reviewerID', 'asin', 'overall'])
df = df.drop_duplicates(subset=['reviewerID', 'asin'])

# Global integer IDs over the whole dataset (used only as stable keys for
# the train-based re-indexing below).
df['user_id'] = df['reviewerID'].astype('category').cat.codes
df['item_id'] = df['asin'].astype('category').cat.codes

# Split data into train and test sets. Copy both parts so the column
# assignments below write to independent frames rather than slices of `df`.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.copy()
test_df = test_df.copy()

# Re-index IDs so that training IDs are contiguous (0..n-1). The mapping is
# fitted on the training split ONLY and then applied to the test split, so a
# given integer denotes the same user/item in both frames. Encoding the two
# splits independently would assign unrelated codes and make every test-time
# lookup into the trained embeddings meaningless.
user_categories = train_df['user_id'].astype('category').cat.categories
item_categories = train_df['item_id'].astype('category').cat.categories

train_df['user_id'] = pd.Categorical(train_df['user_id'], categories=user_categories).codes.astype('int64')
train_df['item_id'] = pd.Categorical(train_df['item_id'], categories=item_categories).codes.astype('int64')
test_df['user_id'] = pd.Categorical(test_df['user_id'], categories=user_categories).codes.astype('int64')
test_df['item_id'] = pd.Categorical(test_df['item_id'], categories=item_categories).codes.astype('int64')

# Values absent from the training categories are encoded as -1. Such users
# and items have no embedding, so they cannot be evaluated; drop those rows.
test_df = test_df[(test_df['user_id'] >= 0) & (test_df['item_id'] >= 0)].copy()

# Number of distinct users/items seen in training (also the embedding table sizes).
num_users = train_df['user_id'].nunique()
num_items = train_df['item_id'].nunique()

# Dense user x item rating matrix built from the training split; pairs with
# no rating are filled with 0.
interaction_matrix = train_df.pivot(index='user_id', columns='item_id', values='overall').fillna(0)

# Group users and items into coarse clusters with k-means on their rating vectors.
num_clusters = 50  # Adjust based on your dataset size
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Users: every row of the interaction matrix is one user's rating vector.
user_clusters = kmeans.fit_predict(interaction_matrix)
user_cluster_mapping = dict(zip(interaction_matrix.index, user_clusters))
train_df['user_cluster'] = train_df['user_id'].map(user_cluster_mapping)

# Items: transpose so that every row is one item's rating vector. The same
# estimator object is refitted from scratch by fit_predict.
item_clusters = kmeans.fit_predict(interaction_matrix.T)
item_cluster_mapping = dict(zip(interaction_matrix.columns, item_clusters))
train_df['item_cluster'] = train_df['item_id'].map(item_cluster_mapping)

# Create edge index for the bipartite user-item graph.
# nonzero() yields row/column POSITIONS of the rated cells; item nodes are
# placed after the user nodes by offsetting their positions with num_users.
user_ids, item_ids = interaction_matrix.to_numpy().nonzero()
user_ids = torch.tensor(user_ids, dtype=torch.long)
item_ids = torch.tensor(item_ids + num_users, dtype=torch.long)  # Offset item IDs
edge_index = torch.stack([user_ids, item_ids], dim=0)

# Symmetric degree normalisation: weight(u, v) = 1 / sqrt(deg(u) * deg(v))
from torch_geometric.utils import to_undirected, degree

edge_index = to_undirected(edge_index)
row, col = edge_index
deg = degree(row, num_users + num_items, dtype=torch.float)
deg_inv_sqrt = deg.pow(-0.5)
# A node without any edge has degree 0 and 0 ** -0.5 == inf; zero those
# entries so they cannot leak inf/NaN into the edge weights.
deg_inv_sqrt[torch.isinf(deg_inv_sqrt)] = 0.0
norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

# Create PyTorch Geometric data object.
# NOTE: the weights are stored on `data`, but the model calls in this file
# pass only data.x and data.edge_index.
data = Data(edge_index=edge_index, edge_weight=norm)

# One-hot cluster membership per node. The label arrays are converted to
# int64 explicitly so they are valid index tensors on every PyTorch version.
user_cluster_index = torch.as_tensor(user_clusters, dtype=torch.long)
user_cluster_features = torch.zeros(num_users, num_clusters)
user_cluster_features[torch.arange(num_users), user_cluster_index] = 1

item_cluster_index = torch.as_tensor(item_clusters, dtype=torch.long)
item_cluster_features = torch.zeros(num_items, num_clusters)
item_cluster_features[torch.arange(num_items), item_cluster_index] = 1

# Node feature matrix: user rows first, then item rows (same order as the node IDs).
cluster_features = torch.cat([user_cluster_features, item_cluster_features], dim=0)
data.x = cluster_features  # Add cluster features to the graph

# Define the LightGNN model with cluster features
class LightGNN(nn.Module):
    """Single-layer GCN over learned ID embeddings enriched with cluster features.

    Every user and item owns a trainable embedding. A linear projection of the
    node's cluster feature vector is added to it, and the sum is passed through
    one ``GCNConv`` layer.

    Args:
        num_users: Number of user nodes (rows of the user embedding table).
        num_items: Number of item nodes (rows of the item embedding table).
        embedding_dim: Width of the embeddings and of the GCN output.
        num_clusters: Width of the per-node cluster feature vector.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, num_clusters=50):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # Maps cluster features into the same space as the ID embeddings.
        self.cluster_proj = nn.Linear(num_clusters, embedding_dim)
        self.gcn = GCNConv(embedding_dim, embedding_dim)

    def forward(self, x, edge_index):
        """Return ``(user_embeddings, item_embeddings)`` after graph convolution.

        Args:
            x: Cluster features with one row per node, user rows first and
                item rows after them.
            edge_index: Graph connectivity handed to the GCN layer.
        """
        # Stack the full embedding tables: users on top, items below.
        id_embeddings = torch.cat(
            (self.user_embedding.weight, self.item_embedding.weight), dim=0
        )

        # Inject cluster information, then propagate over the graph.
        fused = id_embeddings + self.cluster_proj(x)
        propagated = self.gcn(fused, edge_index)

        # Undo the stacking.
        n_users = self.user_embedding.num_embeddings
        return propagated[:n_users], propagated[n_users:]

# Initialize model and optimizer
model = LightGNN(num_users, num_items, embedding_dim=64, num_clusters=num_clusters)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Observed (user, item) pairs are the positives. They never change between
# epochs, so build them once, as int64 tensors: pandas picks the narrowest
# integer dtype for category codes, which is not always a valid index dtype.
pos_interactions = train_df[['user_id', 'item_id']].values
pos_users = torch.as_tensor(pos_interactions[:, 0], dtype=torch.long)
pos_items = torch.as_tensor(pos_interactions[:, 1], dtype=torch.long)

# Training loop (full-batch BPR)
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    user_embeddings, item_embeddings = model(data.x, data.edge_index)

    # One uniformly sampled negative item per positive pair.
    # NOTE: sampling is not filtered, so a "negative" may occasionally be an
    # item the user actually interacted with.
    neg_items = torch.randint(0, num_items, (pos_users.shape[0],))

    # Dot-product scores for positive and negative pairs
    user_vecs = user_embeddings[pos_users]
    pos_scores = torch.sum(user_vecs * item_embeddings[pos_items], dim=1)
    neg_scores = torch.sum(user_vecs * item_embeddings[neg_items], dim=1)

    # BPR loss. logsigmoid is the numerically stable form of log(sigmoid(x)):
    # the naive version evaluates log(0) = -inf once the sigmoid saturates.
    loss = -nn.functional.logsigmoid(pos_scores - neg_scores).mean()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss {loss.item()}")

# Extract the final user and item embeddings. No gradients are needed here,
# so run in eval mode without building an autograd graph.
model.eval()
with torch.no_grad():
    user_embeddings, item_embeddings = model(data.x, data.edge_index)
user_embeddings = user_embeddings.numpy()  # Shape: (num_users, embedding_dim)
item_embeddings = item_embeddings.numpy()  # Shape: (num_items, embedding_dim)

# Generate recommendations for each user
def recommend_items(user_id, user_embeddings, item_embeddings, top_n=10):
    """Return the indices of the ``top_n`` highest-scoring items for one user.

    Items are scored by the dot product between their embedding and the
    user's embedding.

    Args:
        user_id: Row index of the user in ``user_embeddings``.
        user_embeddings: Array of shape (num_users, embedding_dim).
        item_embeddings: Array of shape (num_items, embedding_dim).
        top_n: Maximum number of items to return.

    Returns:
        1-D integer array of item indices ordered best-first. It holds
        ``min(top_n, num_items)`` entries and is empty when ``top_n <= 0``.
    """
    # Slicing with [-0:] would return every item, so handle this explicitly.
    if top_n <= 0:
        return np.empty(0, dtype=np.intp)

    user_embedding = user_embeddings[user_id]  # Shape: (embedding_dim,)
    scores = np.dot(item_embeddings, user_embedding)  # Shape: (num_items,)

    # Sort by descending score; the stable sort breaks ties by item index so
    # the result is deterministic.
    ranked_items = np.argsort(-scores, kind='stable')
    return ranked_items[:top_n]

# Demo: show the default-length recommendation list for the user stored at row 0.
recommended_items = recommend_items(
    user_id=0,
    user_embeddings=user_embeddings,
    item_embeddings=item_embeddings,
)
print("Recommended items for user 0:", recommended_items)

Epoch 0: Loss 0.05831856653094292
Epoch 10: Loss 0.007918637245893478
Epoch 20: Loss 0.0026469137519598007
Epoch 30: Loss 0.005871028173714876
Epoch 40: Loss 0.00162618572358042
Epoch 50: Loss 0.0019342484883964062
Epoch 60: Loss 0.0019004239002242684
Epoch 70: Loss 0.0022806404158473015
Epoch 80: Loss 0.0017112812492996454
Epoch 90: Loss 0.0026080275420099497
Recommended items for user 0: [ 639 1058 3873 2041 2889 2965 2927 3238 2237 2491]


In [None]:
import numpy as np
from sklearn.metrics import ndcg_score

def evaluate_recommendations(test_df, user_embeddings, item_embeddings, top_n=10):
    """Score top-N recommendations against held-out interactions.

    For every user in ``test_df`` the recommended items are compared with the
    items that user interacted with in ``test_df``; the per-user metrics are
    then averaged.

    Args:
        test_df: DataFrame with ``user_id`` and ``item_id`` columns holding
            row indices into ``user_embeddings`` / ``item_embeddings``.
        user_embeddings: Array of shape (num_users, embedding_dim).
        item_embeddings: Array of shape (num_items, embedding_dim).
        top_n: Length of the recommendation list (the "N" of each metric).

    Returns:
        Tuple ``(hit_rate, ndcg, precision, recall)`` averaged over the
        evaluated users; all zeros when no user could be evaluated.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    num_known_users = len(user_embeddings)
    # Discount for rank r (1-based) is 1 / log2(r + 1).
    discounts = 1.0 / np.log2(np.arange(2, top_n + 2))

    hit_rate = 0.0
    ndcg = 0.0
    precision = 0.0
    recall = 0.0
    evaluated_users = 0

    # A single groupby pass replaces one full-table boolean filter per user.
    for user_id, user_items in test_df.groupby('user_id')['item_id']:
        # A user id without an embedding row cannot be scored.
        if not 0 <= user_id < num_known_users:
            continue

        ground_truth = np.unique(user_items.to_numpy())
        if ground_truth.size == 0:
            continue
        evaluated_users += 1

        recommended_items = np.asarray(
            recommend_items(user_id, user_embeddings, item_embeddings, top_n)
        )

        # NDCG depends on rank position, so order the list best-first here
        # rather than relying on the order in which it was returned.
        rec_scores = np.dot(item_embeddings[recommended_items], user_embeddings[user_id])
        ranked_items = recommended_items[np.argsort(-rec_scores, kind='stable')]

        relevance = np.isin(ranked_items, ground_truth).astype(float)
        true_positives = int(relevance.sum())

        if true_positives > 0:
            hit_rate += 1
            # DCG of the actual ranking divided by the DCG of an ideal list
            # that places every reachable relevant item at the top.
            dcg = float(np.dot(relevance, discounts[:relevance.size]))
            ideal_hits = min(ground_truth.size, top_n)
            ndcg += dcg / float(discounts[:ideal_hits].sum())

        precision += true_positives / top_n
        recall += true_positives / ground_truth.size

    # Avoid dividing by zero when test_df is empty or nothing was evaluable.
    if evaluated_users == 0:
        return 0.0, 0.0, 0.0, 0.0

    hit_rate /= evaluated_users
    ndcg /= evaluated_users
    precision /= evaluated_users
    recall /= evaluated_users

    return hit_rate, ndcg, precision, recall

# Evaluate the learned embeddings on the held-out split and report each metric.
hit_rate, ndcg, precision, recall = evaluate_recommendations(
    test_df=test_df,
    user_embeddings=user_embeddings,
    item_embeddings=item_embeddings,
    top_n=10,
)
print(f"Hit Rate: {hit_rate:.4f}")
print(f"NDCG: {ndcg:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Hit Rate: 0.0026
NDCG: 0.0012
Precision: 0.0003
Recall: 0.0017
