In [34]:
cd ..

/home/abdalrhman/Desktop/Graduation Project/AiStore


# 1. Setup & Imports

In [35]:
import os
import sys
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.utils import negative_sampling
from srcs.utils.logger import get_module_logger
from srcs.utils.settings import (
    CLEANED_REVIEWS_PATH_CSV,
    CLEANED_METADATA_PATH_CSV,
    FULL_GRAPH_PATH,
    TRAIN_GRAPH_PATH,
    VAL_GRAPH_PATH,
    TEST_GRAPH_PATH,
    IMAGES_DIR,
    GNN_MODEL_SAVE_PATH,
    PREDICTOR_MODEL_SAVE_PATH
)

# Configure paths
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir, os.pardir))
sys.path.append(PROJECT_ROOT)

# Initialize logger
logger = get_module_logger("graph_builder")

# 2. Data Loading with Metadata Filtering

In [36]:
import pandas as pd

# Load the full dataset
reviews_df = pd.read_csv(CLEANED_REVIEWS_PATH_CSV)
meta_df = pd.read_csv(CLEANED_METADATA_PATH_CSV)

# Randomly sample 50,000 reviews
reviews_df = reviews_df.sample(n=10_000, random_state=42)

# Filter metadata to only include items in sampled reviews
filtered_item_ids = reviews_df['parent_asin'].unique()
meta_df = meta_df[meta_df['parent_asin'].isin(filtered_item_ids)]

# Logging
logger.info(f"Sampled {len(reviews_df):,} reviews")
logger.info(f"Filtered metadata to {len(meta_df):,} items from {len(meta_df):,}")
logger.info(f"Unique items in sampled reviews: {len(filtered_item_ids):,}")


[2025-05-08 17:49:11] [INFO] graph_builder: Sampled 10,000 reviews
[2025-05-08 17:49:12] [INFO] graph_builder: Filtered metadata to 638 items from 638
[2025-05-08 17:49:12] [INFO] graph_builder: Unique items in sampled reviews: 8,354


In [37]:
reviews_df.head(5)

Unnamed: 0,user_id,parent_asin,rating,timestamp,year,month,day,hour,minute,recency,recency_weight
13162781,AFCO6LEANZBTDWKI4BH6BO7H4PIA,B0BKQWX8ZJ,5.0,2019-06-26 17:12:09.179,2019,6,26,17,12,2137,0.000468
2232846,AH3L645CVARFM3WRPSP3G26WOAEA,B005BH3QOY,4.0,2014-06-17 21:14:07.000,2014,6,17,21,14,3971,0.000252
8284619,AHIAQCSWKTDLBS4AV7TZMMHD5J2Q,B01M4NU4OM,5.0,2018-03-19 17:34:21.529,2018,3,19,17,34,2601,0.000384
6794797,AH7VRATJ52IOBIL3HQPYFKYLHWIQ,B0BYYJPGQB,5.0,2018-06-29 13:18:45.132,2018,6,29,13,18,2499,0.0004
5364485,AFUZ3QNYGXTLGGWTUTXD6PY4GLQA,B00AJFTHX2,5.0,2016-05-16 12:27:58.000,2016,5,16,12,27,3273,0.000305


In [38]:
meta_df.head(5)

Unnamed: 0,main_category,title,average_rating,rating_number,price,store,parent_asin,n_features,n_description_items,first_image,brand,color,date_first_available,primary_category,rating_bin
21,Computers,KHOMO - iPad 2 3 and 4 Generation Case - DUAL ...,4.5,2745,11.95,Khomo,B06XKRXLDR,5,1,https://m.media-amazon.com/images/I/31+mP+y8Uo...,Khomo,Black,2011-05-13,Electronics,Medium
35,All Electronics,"Charger for MacBook Pro 10FT, 96W USB C Charge...",4.5,2141,35.99,Ifeart,B07WZT643Q,5,0,https://m.media-amazon.com/images/I/21QlbdFXAG...,Ifeart,White,2019-09-23,Electronics,Medium
66,Home Audio & Theater,"C&E High Speed HDMI Cable with Ethernet Black,...",4.6,531,8.99,C&E,B07Q1JN792,5,16,https://m.media-amazon.com/images/I/41IfnleVoM...,C&E,1 Pack,2015-01-21,Electronics,High
73,Computers,Laptop Sleeve Elastic Neoprene Case Compatible...,4.6,8298,15.99,Hseok,B071YJFTV4,5,0,https://m.media-amazon.com/images/I/51wJ5C7w1+...,Hseok,Butterfly,2020-07-10,Electronics,High
83,Computers,ProCase 14-15.6 Inch Laptop Bag Messenger Shou...,4.6,1015,25.99,Procase,B07CRQDTKM,5,0,https://m.media-amazon.com/images/I/510HCN8zHb...,Procase,Grey,2016-09-22,Electronics,High


# 3. Bipartite Graph Construction

In [39]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder

# 1. Encode users and items into index form
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Assuming reviews_df is already loaded
reviews_df['user_idx'] = user_encoder.fit_transform(reviews_df['user_id'])
reviews_df['item_idx'] = item_encoder.fit_transform(reviews_df['parent_asin'])

# Save mappings (optional but helpful later)
user_id_map = dict(zip(reviews_df['user_id'], reviews_df['user_idx']))
item_id_map = dict(zip(reviews_df['parent_asin'], reviews_df['item_idx']))

# 2. Build edge_index efficiently by stacking user_idx and item_idx
edge_index = torch.tensor(
    np.column_stack([reviews_df['user_idx'].values, reviews_df['item_idx'].values]), 
    dtype=torch.long
)

# 3. Extract ratings as edge weights
ratings = torch.tensor(reviews_df['rating'].values, dtype=torch.float)

# 4. Build HeteroData
data = HeteroData()

# Assuming user_feats and item_feats are available or generated, you can replace these with actual features
data['user'].x = torch.randn(reviews_df['user_idx'].nunique(), 64)  # Random user features (64-dimensional)
data['item'].x = torch.randn(reviews_df['item_idx'].nunique(), 64)  # Random item features (64-dimensional)

# Adding edges with weights
data['user', 'rates', 'item'].edge_index = edge_index
data['user', 'rates', 'item'].edge_weight = ratings

# Optional: Reverse edges for item -> user (not mandatory but useful for reverse propagation if needed)
data['item', 'rev_rates', 'user'].edge_index = edge_index.flip(0)  # If you need the reverse edges
data['item', 'rev_rates', 'user'].edge_weight = ratings  # Same ratings can be used for the reverse edges

print(f"Graph: {data}")


Graph: HeteroData(
  user={ x=[9945, 64] },
  item={ x=[8354, 64] },
  (user, rates, item)={
    edge_index=[10000, 2],
    edge_weight=[10000],
  },
  (item, rev_rates, user)={
    edge_index=[10000, 2],
    edge_weight=[10000],
  }
)


# 4. Feature Engineering with Cold-Start Handling

In [40]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import LabelEncoder

# 1. Encode users and items into index form
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Assuming reviews_df is already loaded
reviews_df['user_idx'] = user_encoder.fit_transform(reviews_df['user_id'])
reviews_df['item_idx'] = item_encoder.fit_transform(reviews_df['parent_asin'])

# Save mappings (optional but helpful later)
user_id_map = dict(zip(reviews_df['user_id'], reviews_df['user_idx']))
item_id_map = dict(zip(reviews_df['parent_asin'], reviews_df['item_idx']))

# 2. Build edge_index
edge_index = torch.tensor([
    reviews_df['user_idx'].values,
    reviews_df['item_idx'].values
], dtype=torch.long)

# 3. Extract ratings as edge weights
ratings = torch.tensor(reviews_df['rating'].values, dtype=torch.float)

# 4. Build HeteroData
data = HeteroData()

# Assuming user_feats and item_feats are available or generated, you can replace these with actual features
data['user'].x = torch.randn(reviews_df['user_idx'].nunique(), 64)  # Random user features (64-dimensional)
data['item'].x = torch.randn(reviews_df['item_idx'].nunique(), 64)  # Random item features (64-dimensional)

# Adding edges with weights
data['user', 'rates', 'item'].edge_index = edge_index
data['user', 'rates', 'item'].edge_weight = ratings

# Optional: Reverse edges for item -> user (not mandatory but useful for reverse propagation if needed)
data['item', 'rev_rates', 'user'].edge_index = edge_index.flip(0)  # If you need the reverse edges
data['item', 'rev_rates', 'user'].edge_weight = ratings  # Same ratings can be used for the reverse edges

print(f"Graph: {data}")


Graph: HeteroData(
  user={ x=[9945, 64] },
  item={ x=[8354, 64] },
  (user, rates, item)={
    edge_index=[2, 10000],
    edge_weight=[10000],
  },
  (item, rev_rates, user)={
    edge_index=[2, 10000],
    edge_weight=[10000],
  }
)


In [41]:
import torch
from torch_geometric.transforms import RandomLinkSplit

# 1. Apply the split (automatically adds reverse edge)
transform = RandomLinkSplit(
    num_val=0.1,  # 10% of edges for validation
    num_test=0.1,  # 10% of edges for testing
    is_undirected=True,  # Treat edges as undirected for link prediction
    add_negative_train_samples=True,  # Automatically generate negative samples for training
    edge_types=('user', 'rates', 'item'),  # Specify the edge type (user -> item)
    rev_edge_types=('item', 'rev_rates', 'user')  # Reverse edge type for item -> user
)

# Split the data into train, validation, and test sets
train_data, val_data, test_data = transform(data)

torch.save(train_data, TRAIN_GRAPH_PATH)
torch.save(val_data, VAL_GRAPH_PATH)
torch.save(test_data, TEST_GRAPH_PATH)
torch.save(data, FULL_GRAPH_PATH)

logger.info(f"Saved full graph to {FULL_GRAPH_PATH}")

# Logging the splits
print(f"Graphs split and saved: train → {TRAIN_GRAPH_PATH}, val → {VAL_GRAPH_PATH}, test → {TEST_GRAPH_PATH}")


[2025-05-08 17:49:12] [INFO] graph_builder: Saved full graph to /home/abdalrhman/Desktop/Graduation Project/AiStore/Recommender_V1/data/processed/graph_splits/full_graph.pt


Graphs split and saved: train → /home/abdalrhman/Desktop/Graduation Project/AiStore/Recommender_V1/data/processed/graph_splits/graph_train.pt, val → /home/abdalrhman/Desktop/Graduation Project/AiStore/Recommender_V1/data/processed/graph_splits/graph_val.pt, test → /home/abdalrhman/Desktop/Graduation Project/AiStore/Recommender_V1/data/processed/graph_splits/graph_test.pt


In [42]:
print(train_data)   

HeteroData(
  user={ x=[9945, 64] },
  item={ x=[8354, 64] },
  (user, rates, item)={
    edge_index=[2, 8000],
    edge_weight=[8000],
    edge_label=[16000],
    edge_label_index=[2, 16000],
  },
  (item, rev_rates, user)={
    edge_index=[2, 8000],
    edge_weight=[8000],
  }
)


In [43]:
print(val_data)

HeteroData(
  user={ x=[9945, 64] },
  item={ x=[8354, 64] },
  (user, rates, item)={
    edge_index=[2, 8000],
    edge_weight=[8000],
    edge_label=[2000],
    edge_label_index=[2, 2000],
  },
  (item, rev_rates, user)={
    edge_index=[2, 8000],
    edge_weight=[8000],
  }
)


In [44]:
print(test_data)

HeteroData(
  user={ x=[9945, 64] },
  item={ x=[8354, 64] },
  (user, rates, item)={
    edge_index=[2, 9000],
    edge_weight=[9000],
    edge_label=[2000],
    edge_label_index=[2, 2000],
  },
  (item, rev_rates, user)={
    edge_index=[2, 9000],
    edge_weight=[9000],
  }
)


In [45]:
print(data)

HeteroData(
  user={ x=[9945, 64] },
  item={ x=[8354, 64] },
  (user, rates, item)={
    edge_index=[2, 10000],
    edge_weight=[10000],
  },
  (item, rev_rates, user)={
    edge_index=[2, 10000],
    edge_weight=[10000],
  }
)


In [46]:
import torch
from torch_geometric.data import HeteroData


# Load the full graph
# full_data = torch.load(FULL_GRAPH_PATH)

# Load train, validation, and test splits
train_data = torch.load(TRAIN_GRAPH_PATH, weights_only=False)
val_data = torch.load(VAL_GRAPH_PATH, weights_only=False)
test_data = torch.load(TEST_GRAPH_PATH, weights_only=False)

# Optional: Move data to device (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# full_data = full_data.to(device)
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

# 5. Graph Splitting with Bipartite Awareness

In [47]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import SAGEConv, HeteroConv, Linear
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomLinkSplit
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score, precision_score, recall_score
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ─── 1) Data Preparation ───────────────────────────────────────────────
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=False,
    edge_types=[('user', 'rates', 'item')],
    rev_edge_types=[('item', 'rev_rates', 'user')]
)

train_data, val_data, test_data = transform(data)
train_data, val_data, test_data = [d.to(device) for d in (train_data, val_data, test_data)]

num_items = train_data['item'].x.size(0)
edge_set = set(map(tuple, train_data['user','rates','item'].edge_index.cpu().t().tolist()))

# ─── 2) Fixed Heterogeneous GraphSAGE Model ────────────────────────────
class HeteroGraphSAGE(nn.Module):
    def __init__(self, hidden_dim=128):
        super().__init__()
        self.conv1 = HeteroConv({
            ('user', 'rates', 'item'): SAGEConv((-1, -1), hidden_dim),
            ('item', 'rev_rates', 'user'): SAGEConv((-1, -1), hidden_dim)
        }, aggr='mean')
        
        self.conv2 = HeteroConv({
            ('user', 'rates', 'item'): SAGEConv(hidden_dim, hidden_dim),
            ('item', 'rev_rates', 'user'): SAGEConv(hidden_dim, hidden_dim)
        }, aggr='mean')
        
        # BatchNorm for each node type
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        
        # Dropout (now applied per node type)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x_dict, edge_index_dict):
        # First conv layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: self.bn1(x) for key, x in x_dict.items()}
        x_dict = {key: F.leaky_relu(x) for key, x in x_dict.items()}
        x_dict = {key: self.dropout(x) for key, x in x_dict.items()}  # ✅ Fixed: Apply dropout per tensor

        # Second conv layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: self.bn2(x) for key, x in x_dict.items()}

        return x_dict

# ─── 3) Link Predictor with MLP ───────────────────────────────────────
class LinkPredictor(nn.Module):
    def __init__(self, hidden_dim=128):
        super().__init__()
        self.mlp = nn.Sequential(
            Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            Linear(hidden_dim, 1)
        )

    def forward(self, src, dst):
        return self.mlp(torch.cat([src, dst], dim=-1)).squeeze()

# ─── 4) Initialize Model, Optimizer, Scheduler ───────────────────────
hidden_dim = 128
gnn = HeteroGraphSAGE(hidden_dim).to(device)
predictor = LinkPredictor(hidden_dim).to(device)

optimizer = torch.optim.AdamW([
    {'params': gnn.parameters(), 'lr': 1e-3},
    {'params': predictor.parameters(), 'lr': 5e-4}
], weight_decay=1e-5)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=5, verbose=True
)



In [48]:
# ─── 5) Training with Hard Negative Sampling ─────────────────────────
def train_epoch(data, neg_ratio=5):
    gnn.train()
    predictor.train()
    optimizer.zero_grad()

    emb = gnn(data.x_dict, data.edge_index_dict)

    edge_index = data['user', 'rates', 'item'].edge_index
    src, dst = edge_index[0], edge_index[1]
    pos_pred = predictor(emb['user'][src], emb['item'][dst])

    # Hard negative sampling
    neg_src, neg_dst = [], []
    for u in src.unique().cpu().tolist():
        seen = dst[src == u]
        candidates = torch.randint(0, num_items, (len(seen)*neg_ratio,), device=seen.device)
        valid = ~torch.isin(candidates, seen)
        sampled = candidates[valid][:len(seen)]
        if len(sampled) > 0:
            neg_src.append(torch.tensor([u]*len(sampled), device=device))
            neg_dst.append(sampled)

    if not neg_src:
        return 0.0

    neg_src = torch.cat(neg_src)
    neg_dst = torch.cat(neg_dst)

    neg_pred = predictor(emb['user'][neg_src], emb['item'][neg_dst])

    loss = F.binary_cross_entropy_with_logits(
        torch.cat([pos_pred, neg_pred]),
        torch.cat([torch.ones_like(pos_pred), torch.zeros_like(neg_pred)])
    )

    loss.backward()
    optimizer.step()
    return loss.item()

# ─── 6) Evaluation with Dynamic Threshold ───────────────────────────
@torch.no_grad()
def evaluate(data):
    gnn.eval()
    predictor.eval()
    emb = gnn(data.x_dict, data.edge_index_dict)

    edge_index = data['user', 'rates', 'item'].edge_index
    src, dst = edge_index[0], edge_index[1]
    
    pos_pred = predictor(emb['user'][src], emb['item'][dst]).cpu().numpy()
    pos_labels = np.ones(len(pos_pred))

    neg_src, neg_dst = [], []
    for u in src.unique().cpu().tolist():
        seen = dst[src == u]
        candidates = torch.randint(0, num_items, (len(seen)*5,), device=seen.device)
        valid = ~torch.isin(candidates, seen)
        sampled = candidates[valid][:len(seen)]
        if len(sampled) > 0:
            neg_src.append(torch.tensor([u]*len(sampled), device=device))
            neg_dst.append(sampled)

    if not neg_src:
        return 0.0, 0.0, 0.0, 0.0, 0.0

    neg_pred = predictor(emb['user'][torch.cat(neg_src)], emb['item'][torch.cat(neg_dst)]).cpu().numpy()
    neg_labels = np.zeros(len(neg_pred))

    scores = np.concatenate([pos_pred, neg_pred])
    labels = np.concatenate([pos_labels, neg_labels])

    auc = roc_auc_score(labels, scores)
    prec, rec, threshs = precision_recall_curve(labels, scores)
    f1s = 2 * (prec * rec) / (prec + rec + 1e-8)
    best_idx = np.argmax(f1s)
    best_thresh = threshs[best_idx]

    preds = (scores >= best_thresh).astype(int)
    f1 = f1_score(labels, preds)
    prec_final = precision_score(labels, preds)
    rec_final = recall_score(labels, preds)

    return {
        'auc': auc,
        'f1': f1,
        'precision': prec_final,
        'recall': rec_final,
        'threshold': best_thresh
    }

In [49]:
# # ─── 7) Training Loop with Early Stopping ───────────────────────────
# best_f1 = 0.0
# patience = 0

# for epoch in range(1, 100):
#     loss = train_epoch(train_data)
#     val_metrics = evaluate(val_data)
#     scheduler.step(val_metrics['f1'])

#     print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | "
#           f"Val AUC: {val_metrics['auc']:.4f} | F1: {val_metrics['f1']:.4f} | "
#           f"Prec: {val_metrics['precision']:.4f} | Rec: {val_metrics['recall']:.4f}")

#     if val_metrics['f1'] > best_f1:
#         best_f1 = val_metrics['f1']
#         torch.save(gnn.state_dict(), 'best_gnn.pth')
#         torch.save(predictor.state_dict(), 'best_predictor.pth')
#         patience = 0
#     else:
#         patience += 1
#         if patience >= 20:
#             print("Early stopping triggered.")
#             break

# # ─── 8) Final Test Evaluation ───────────────────────────────────────
# gnn.load_state_dict(torch.load('best_gnn.pth'))
# predictor.load_state_dict(torch.load('best_predictor.pth'))
# test_metrics = evaluate(test_data)

# print("\nTest Metrics:")
# print(f"AUC: {test_metrics['auc']:.4f} | Precision: {test_metrics['precision']:.4f}")
# print(f"Recall: {test_metrics['recall']:.4f} | F1: {test_metrics['f1']:.4f}")
# print(f"Optimal Threshold: {test_metrics['threshold']:.4f}")

In [50]:
# # Save the best model after training
# torch.save(gnn.state_dict(), GNN_MODEL_SAVE_PATH)
# torch.save(predictor.state_dict(), PREDICTOR_MODEL_SAVE_PATH)

# print("Models saved successfully.")

In [51]:
# Recreate model architecture
hidden_dim = 128
gnn = HeteroGraphSAGE(hidden_dim)
predictor = LinkPredictor(hidden_dim)

# Load saved weights with map_location set to 'cpu'
gnn.load_state_dict(torch.load(GNN_MODEL_SAVE_PATH, map_location=torch.device('cpu')))
predictor.load_state_dict(torch.load(PREDICTOR_MODEL_SAVE_PATH, map_location=torch.device('cpu')))

# Move models to desired device (optional)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gnn = gnn.to(device)
predictor = predictor.to(device)

# Set models to evaluation mode for inference
gnn.eval()
predictor.eval()

LinkPredictor(
  (mlp): Sequential(
    (0): Linear(256, 128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(128, 1, bias=True)
  )
)

In [52]:
gnn.train()
predictor.train()

LinkPredictor(
  (mlp): Sequential(
    (0): Linear(256, 128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(128, 1, bias=True)
  )
)

# ✅ 1. Top-K Recommendations for a User

Recommend top k items for a given user based on predicted interaction scores.

In [53]:
@torch.no_grad()
def recommend(user_id, gnn_model, predictor_model, data, k=10):
    """
    Recommend top-k items for a given user based on learned embeddings.
    
    Args:
        user_id (int): ID of the user
        gnn_model: Trained heterogeneous GNN model (HeteroGraphSAGE)
        predictor_model: Link predictor (MLP)
        data: HeteroData object (val or test split)
        k (int): Number of recommendations to return
    
    Returns:
        list: Top-k item IDs recommended for the user
    """
    gnn_model.eval()
    predictor_model.eval()

    # Get embeddings for all nodes
    emb = gnn_model(data.x_dict, data.edge_index_dict)

    # Get user embedding
    user_emb = emb['user'][user_id].unsqueeze(0)  # shape: [1, hidden_dim]

    # Get all item embeddings
    item_embs = emb['item']  # shape: [num_items, hidden_dim]

    # Repeat user embedding to match item_embs size
    user_embs = user_emb.expand(item_embs.size(0), -1)

    # Predict scores for all items
    scores = predictor_model(user_embs, item_embs).cpu().numpy()

    # Exclude already interacted items
    interacted_items = data['user', 'rates', 'item'].edge_index[1][
        data['user', 'rates', 'item'].edge_index[0] == user_id
    ].cpu().numpy()

    # Mask out interacted items
    scores[interacted_items] = -np.inf

    # Get top-k item indices
    top_k_item_ids = np.argpartition(scores, -k)[-k:]
    top_k_item_ids = top_k_item_ids[np.argsort(-scores[top_k_item_ids])]  # Sort descending

    return top_k_item_ids.tolist()

In [54]:
user_id = 0
recommended_items = recommend(user_id, gnn, predictor, test_data, k=5)
print(f"Top-5 Recommended Items for User {user_id}: {recommended_items}")

Top-5 Recommended Items for User 0: [4980, 2416, 2674, 7208, 2938]


# 2. Item-to-Item Recommendations (Similar Items)

In [55]:
@torch.no_grad()
def similar_items(item_id, gnn_model, data, k=5):
    gnn_model.eval()
    emb = gnn_model(data.x_dict, data.edge_index_dict)['item']
    item_emb = emb[item_id].unsqueeze(0)
    
    # Compute cosine similarity
    sim = F.cosine_similarity(item_emb, emb).cpu().numpy()
    sim[item_id] = -np.inf  # Exclude self
    
    # Top-k most similar items
    top_k = np.argpartition(sim, -k)[-k:]
    return top_k[np.argsort(-sim[top_k])].tolist()

In [56]:
similar_items(42, gnn, test_data, k=5)

[4385, 2830, 6862, 3492, 1080]