In [65]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/thesis

In [66]:
# !pip install igraph
# !pip install scikit-network

In [67]:
# !pip install torchmetrics

In [68]:
#!conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 cpuonly -c pytorch

In [69]:
# import torch
# import os
# print("PyTorch has version {}".format(torch.__version__))

# srs_url = f"https://pytorch-geometric.com/whl/torch-{torch.__version__}.html"

In [70]:
# !pip install torch_scatter -f $srs_url
# !pip install torch_sparse -f $srs_url
# !pip install torch_cluster -f $srs_url
# !pip install torch_spline_conv -f $srs_url

In [71]:
# !pip install torch_geometric

In [72]:
# import torch
# import os
# print("PyTorch has version {}".format(torch.__version__))

# if 'IS_GRADESCOPE_ENV' not in os.environ:
#   !pip install torch==2.4.0

# # Install torch geometric
# if 'IS_GRADESCOPE_ENV' not in os.environ:
#   torch_version = str(torch.__version__)
#   scatter_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
#   sparse_src = f"https://pytorch-geometric.com/whl/torch-{torch_version}.html"
#   !pip install torch-scatter -f $scatter_src
#   !pip install torch-sparse -f $sparse_src
#   !pip install torch-geometric
#   !pip install ogb

In [1]:

import re
import os
import sys
import argparse
import yaml
from pprint import pprint
import time
from collections import Counter, defaultdict


from abc import ABC, abstractmethod
import scipy.sparse as sp
from typing import Literal
from dataclasses import dataclass
from typing import List

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import trange

#%env NX_CUGRAPH_AUTOCONFIG=True
import networkx as nx
from networkx.algorithms import bipartite, community
nx.config.warnings_to_ignore.add("cache")

import igraph as ig

import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.data import HeteroData
from torch_geometric.nn import GCNConv, SAGEConv, VGAE, to_hetero
from torch_geometric.utils import train_test_split_edges
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.sampler  import NegativeSampling

from sklearn.cluster import KMeans

from torch_scatter import scatter_mean

In [None]:
from src.data_loader import GraphLoader
from src.batch_loader import BatchLoader
from src.utils import *
from src.registry import GNN_LAYER_REGISTRY, GNN_MODEL_REGISTRY

In [5]:
import pathlib
dir_ = pathlib.Path().resolve()

# Parser
parser = argparse.ArgumentParser(description="Load parquet datasets")
parser.add_argument("--config", type=str, default=f"{dir_}/config.yaml", help="Path to YAML config file")
args, unknown = parser.parse_known_args()

with open(args.config, "r") as f:
    cfg = yaml.safe_load(f)

print("Configuration:")
pprint(cfg)
print("\n")

# Prepare the data
graph_loader = GraphLoader(cfg)
graph_data = graph_loader.load()

# Split the data into train/val/test sets
train_val_test_split = random_link_split(cfg)
train_data, val_data, test_data = train_val_test_split(graph_data)

# batch loader
batch_loader = BatchLoader(cfg)
train_loader = batch_loader.load(train_data, shuffle=True)

Configuration:
{'batch_method': 'binary_link_neighbors',
 'batch_size': 128,
 'book_features': [{'degree_log_transform': True,
                    'features': ['degree', 'coreness'],
                    'name': 'topo'},
                   {'dim': 128, 'name': 'random', 'xavier_init': True}],
 'books_filename': 'books_filtered.parquet',
 'coreness_k': 10,
 'descriptions_filename': 'descriptions_filtered.parquet',
 'dir': 'G:/My Drive/thesis/data_sample',
 'disjoint_train_ratio': 0.0,
 'dropout': 0.3,
 'embeddings_descriptions_filename': 'embeddings_descriptions_sbert_pt.parquet',
 'embeddings_reviews_filename': 'embeddings_reviews_sbert_pt.parquet',
 'epochs': 100,
 'gnn_layer_cls': 'gcn_conv',
 'hidden_channels': 128,
 'interactions_filename': 'interactions.parquet',
 'language_filter': True,
 'latent_dim': 64,
 'learning_rate': 0.001,
 'negative_sampling_method': 'random',
 'num_layers': 3,
 'num_neighbors': [10, 10],
 'review_coreness_k': 3,
 'reviews_filename': 'reviews_filtered.par

In [None]:
from torch_geometric.utils import negative_sampling


def batch_random_sample(self, batch_data, negative_sampling_ratio):
    "Randomly sample negative examples from the batch dataset"
    num_neg_samples = int(len(batch_data['user', 'interacts', 'item'].edge_label_index[0]) * negative_sampling_ratio)
    edge_label_index = batch_data['user', 'interacts', 'item'].edge_label_index
    negative_samples = negative_sampling(edge_label_index, num_neg_samples=num_neg_samples)
    return negative_samples, torch.zeros(num_neg_samples, dtype=torch.float32)

In [None]:
class VGAE(nn.Module):
    def __init__(self,
                 data,
                 cfg
                 ):
        """
        gnn_layer_cls: class of the GNN layer (e.g., GCNConv, SAGEConv)
        encoder_cls: class of the encoder (should accept gnn_layer_cls, in_channels, hidden_channels, latent_dim)
        decoder_cls: class of the decoder (should accept no arguments)
        in_channels: input feature dimension
        hidden_channels: hidden layer dimension
        latent_dim: latent space dimension
        encoder_kwargs: additional kwargs for encoder
        """
        super().__init__()
        gnn_layer_cls=GNN_LAYER_REGISTRY[cfg['gnn_layer_cls']]
        encoder=GNN_MODEL_REGISTRY[cfg['encoder']]
        decoder=GNN_MODEL_REGISTRY[cfg['decoder']]
        user_in_channels=data["user"].num_features
        item_in_channels=data["item"].num_features
        hidden_channels=cfg['hidden_channels']
        latent_dim=cfg['latent_dim']
        emb_linear_transform=cfg.get('emb_linear_transform', False)

        self.emb_linear_transform = cfg.get('emb_linear_transform', False)
        self.user_linear = nn.Linear(user_in_channels, hidden_channels)
        self.item_linear = nn.Linear(item_in_channels, hidden_channels)

        if user_in_channels != item_in_channels:
            print(f"Inconsistent input feature dimensions: user={user_in_channels}, item={item_in_channels}")
            print(f"Forcing linear transformation")
            self.emb_linear_transform = True

        elif user_in_channels == item_in_channels == hidden_channels:
            print(f"Consistent input feature dimensions: user={user_in_channels}, item={item_in_channels}, hidden={hidden_channels}")
            print(f"Disabling linear transformation")
            self.emb_linear_transform = False

        input_channels = hidden_channels if self.emb_linear_transform else user_in_channels

        self.encoder = encoder(gnn_layer_cls, input_channels, hidden_channels, latent_dim)
        self.decoder = decoder()

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, data):
        if self.emb_linear_transform:
            user_x = self.user_linear(data["user"].x)
            item_x = self.item_linear(data["item"].x)
        else:
            user_x = data["user"].x
            item_x = data["item"].x

        x = torch.cat([user_x, item_x], dim=0)
        edge_index = data['user', 'interacts', 'item'].edge_index

        mu, logvar = self.encoder(x, edge_index)
        z = self.reparameterize(mu, logvar)
        return z, mu, logvar

    def decode(self, z, edge_index):
        return self.decoder(z, edge_index)



class VGAEncoder(nn.Module):
    def __init__(self, GNNLayer, in_channels, hidden_channels, latent_dim):
        """
        in_channels: dimension of the node input features (after projecting items and using user embeddings)
        hidden_channels: hidden units for first GCN layer
        latent_dim: dimension of mu and logvar
        """
        super().__init__()
        self.conv1 = GNNLayer(in_channels, hidden_channels)
        self.conv_mu = GNNLayer(hidden_channels, latent_dim)
        self.conv_logvar = GNNLayer(hidden_channels, latent_dim)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.leaky_relu(x)
        mu = self.conv_mu(x, edge_index)
        logvar = self.conv_logvar(x, edge_index)
        return mu, logvar


class InnerProductDecoder(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, z, edge_index, sigmoid=True):
        edge_feat_user = z[edge_index[0]]
        edge_feat_item = z[edge_index[1]]
        preds = (edge_feat_user * edge_feat_item).sum(dim=-1)
        if sigmoid:
            preds = torch.sigmoid(preds)
        return preds
    
    def forward_all(self, z, sigmoid=True):
        "Use it cautiously, as it computes all pairwise interactions"
        A_pred = (z @ z.T).view(-1)
        if sigmoid:
            A_pred = torch.sigmoid(A_pred)
        return A_pred




In [14]:
val_data = val_data.to(device)
val_data

HeteroData(
  user={ x=[3150, 130] },
  item={ x=[3568, 130] },
  (user, interacts, item)={
    edge_index=[2, 129985],
    edge_label=[32494],
    edge_label_index=[2, 32494],
  },
  (item, rev_interacts, user)={ edge_index=[2, 129985] }
)

In [16]:
edge_label = val_data["user", "interacts", "item"].edge_label
mask = edge_label == 1
mask

tensor([ True,  True,  True,  ..., False, False, False])

In [26]:
edge_label_index = val_data["user", "interacts", "item"].edge_label_index
edge_label_index[:,mask]

tensor([[ 547, 1073,  340,  ..., 1693, 1231, 1682],
        [ 851,  264,  220,  ...,   35, 2060, 3232]])

In [8]:
def binary_recon_loss(z, pos_preds, neg_preds):

    pos_loss = -torch.log(pos_preds + 1e-15).mean()
    neg_loss = -torch.log(1 - neg_preds + 1e-15).mean()

    return pos_loss + neg_loss

def kl_loss(mu, logvar):
    kl_div = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return kl_div

In [33]:
x = 0.5555
print(f"{x:.2%}")

55.55%


In [9]:
def compute_auc(y_scores, y_true):
    y_true = y_true.long() 
    score = AUROC(task="binary")(y_scores, y_true).item()
    return score

def compute_average_precision(y_scores, y_true):
    y_true = y_true.long() 
    score = AveragePrecision(task="binary")(y_scores, y_true).item()
    return score

In [None]:


def batch_evaluate(loader, model):
    model.eval()
    total_loss = 0
    total_loss_recon = 0
    total_loss_kl = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            z, mu, logvar = model(batch)
            positive_index = batch["user", "interacts", "item"].edge_label_index
            positive_labels = torch.ones(positive_index.size(0), dtype=torch.float32)
            if sampling_strategy == "batch_random":
                negative_index, negative_labels = negative_sampler.batch_random_sample(batch)

            pos_preds = model.decode(z, positive_index)
            neg_preds = model.decode(z, negative_index)

            preds = torch.cat([pos_preds, neg_preds])
            edge_labels = torch.cat([positive_labels, negative_labels])
            all_labels.append(edge_labels)
            all_preds.append(preds)

            loss_recon = binary_recon_loss(z, preds, batch)
            loss_kl = kl_loss(mu, logvar)

            loss = loss_recon + loss_kl
            
            total_loss += loss.item()
            total_loss_recon += loss_recon.item()
            total_loss_kl += loss_kl.item()

    y_scores = torch.cat(all_preds)
    y_true = torch.cat(all_labels)
    auc = compute_auc(y_scores, y_true)
    avg_precision = compute_average_precision(y_scores, y_true)
    return { "total_loss": total_loss, 
             "total_loss_recon": total_loss_recon, 
             "total_loss_kl": total_loss_kl, 
             "auc": auc, 
             "average_precision": avg_precision }

@torch.no_grad()
def all_evaluate(data, model):
    model.eval()
    total_loss = 0
    total_loss_recon = 0
    total_loss_kl = 0
    all_preds = []
    all_labels = []

    data = data.to(device)
    z, mu, logvar = model(data)
    edge_labels = data["user", "interacts", "item"].edge_labels
    edge_label_index = data["user", "interacts", "item"].edge_label_index
    pos_mask = edge_labels == 1
    positive_index = edge_label_index[pos_mask]
    negative_index = edge_label_index[~pos_mask]

    pos_preds = model.decode(z, positive_index)
    neg_preds = model.decode(z, negative_index)

    preds = torch.cat([pos_preds, neg_preds])
    edge_labels = torch.cat([positive_labels, negative_labels])
    all_labels.append(edge_labels)
    all_preds.append(preds)

    loss_recon = binary_recon_loss(z, preds, data)
    loss_kl = kl_loss(mu, logvar)

    loss = loss_recon + loss_kl

    total_loss += loss.item()
    total_loss_recon += loss_recon.item()
    total_loss_kl += loss_kl.item()

    y_scores = torch.cat(all_preds)
    y_true = torch.cat(all_labels)
    auc = compute_auc(y_scores, y_true)
    avg_precision = compute_average_precision(y_scores, y_true)
    return { "total_loss": total_loss,
            "total_loss_recon": total_loss_recon,
            "total_loss_kl": total_loss_kl,
            "auc": auc,
            "average_precision": avg_precision }

In [None]:


gnn_layer_cls = GNN_LAYER_REGISTRY[cfg['gnn_layer_cls']]
user_in_channels = train_data["user"].num_features
item_in_channels = train_data["item"].num_features

## Hyperparameters
hidden_channels = cfg['hidden_channels']
latent_dim = cfg['latent_dim']

In [None]:

# Instantiate model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


model = VGAE(
    gnn_layer_cls=GNN_LAYER_REGISTRY[cfg['gnn_layer_cls']],
    encoder=VGAEncoder,
    decoder=InnerProductDecoder,
    user_in_channels=train_data["user"].num_features,
    item_in_channels=train_data["item"].num_features,
    hidden_channels=cfg['hidden_channels'],
    latent_dim=cfg['latent_dim'],
    emb_linear_transform=cfg.get('emb_linear_transform', False)
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)



In [None]:
# Training loop for your new VGAE version (from cell 17)


epochs = 3

history = {
    "train" : defaultdict(list),
    "val" : defaultdict(list),
}

best_model = None
best_loss = float('inf')

negative_sampler = NegativeSampler(cfg)
sampling_strategy = cfg['training'].get('sampling_strategy', 'random')

for epoch in trange(epochs, desc="Training", unit="Epochs"):
    total_loss = 0
    total_loss_recon = 0
    total_loss_kl = 0
    model.train()
    
    all_preds = []
    all_labels = []

    for batch in train_loader:

        # Zero gradients
        optimizer.zero_grad()
        batch = batch.to(device)
        z, mu, logvar = model(batch)


        positive_index = batch["user", "interacts", "item"].edge_label_index
        positive_labels = torch.ones(positive_index.size(0), dtype=torch.float32)
        if sampling_strategy == "random":
            negative_index, negative_labels = negative_sampler.random_sample(batch)

        pos_preds = model.decode(z, positive_index)
        neg_preds = model.decode(z, negative_index)

        preds = torch.cat([pos_preds, neg_preds])
        edge_labels = torch.cat([positive_labels, negative_labels])

        ## Store the prediction
        all_preds.append(preds)
        all_labels.append(edge_labels)

        ## Compute the loss
        loss_recon = binary_recon_loss(z, preds, batch)
        loss_kl = kl_loss(mu, logvar)
        loss = loss_recon + loss_kl

        # Store the loss
        total_loss += loss.item()
        all_preds.append(preds)
        all_labels.append(edge_labels)

        total_loss_recon += loss_recon.item()
        total_loss_kl += loss_kl.item()
        
        # Loss backward
        loss.backward()
        optimizer.step()

    ## Evaluation
    total_loss /= len(train_loader)
    total_loss_recon /= len(train_loader)
    total_loss_kl /= len(train_loader)
    history["train"]["total_loss"].append(total_loss)
    history["train"]["recon_loss"].append(total_loss_recon)
    history["train"]["kl_loss"].append(total_loss_kl)

    if epoch % 1 == 0:
        # TRAINING DATA
        y_scores, y_true = torch.cat(all_preds), torch.cat(all_labels)
        auc_score = compute_auc(y_scores, y_true)
        avg_precision = compute_average_precision(y_scores, y_true)
        history["train"]["auc"].append(auc_score)
        history["train"]["average_precision"].append(avg_precision)

        print(f"TRAINING DATA")
        print(f"Epoch {epoch}, Total Loss: {total_loss:.4f}, Rec. Loss: {total_loss_recon:.4f}, KL Loss: {total_loss_kl:.4f}")
        print(f"AUC: {auc_score:.4f}, Average Precision: {avg_precision:.4f}")

        # VALIDATION DATA
        val_metrics = evaluate(val_loader, model)
        history["val"]["total_loss"].append(val_metrics["total_loss"])
        history["val"]["recon_loss"].append(val_metrics["total_loss_recon"])
        history["val"]["kl_loss"].append(val_metrics["total_loss_kl"])
        history["val"]["auc"].append(val_metrics["auc"])
        history["val"]["average_precision"].append(val_metrics["average_precision"])

        print(f"VALIDATION DATA")
        print(f"Epoch {epoch}, Total Loss: {val_metrics['total_loss']:.4f}, Rec. Loss: {val_metrics['total_loss_recon']:.4f}, KL Loss: {val_metrics['total_loss_kl']:.4f}")
        print(f"AUC: {val_metrics['auc']:.4f}, Average Precision: {val_metrics['average_precision']:.4f}")



Training:   0%|          | 0/3 [00:00<?, ?Epochs/s]

Training:   0%|          | 0/3 [06:32<?, ?Epochs/s]

TRAINING DATA
Epoch 0, Total Loss: 1.6783, Rec. Loss: 1.4924, KL Loss: 0.1860
AUC: 0.5130, Average Precision: 0.5057





NameError: name 'val_loader' is not defined

In [None]:
# class NegativeSampling:
#     def __init__(self, num_samples):
#         self.num_samples = num_samples

#     def sample(self, pos_edge_index, num_nodes):
#         neg_src = torch.randint(0, num_nodes, (self.num_samples,))
#         neg_dst = torch.randint(0, num_nodes, (self.num_samples,))
#         neg_edge_index = torch.stack([neg_src, neg_dst], dim=0)
#         return neg_edge_index


In [None]:
class HybridBPRRatingAutoencoder(nn.Module):
    def __init__(self, num_features_u, num_features_v, hidden_dim, latent_dim, rating_range=(1, 5)):
        super().__init__()

        self.rating_min, self.rating_max = rating_range

        # Encoders (same as before)
        self.encoder_u = nn.Sequential(
            GCNConv(num_features_u, hidden_dim),
            nn.ReLU(),
            GCNConv(hidden_dim, latent_dim)
        )

        self.encoder_v = nn.Sequential(
            GCNConv(num_features_v, hidden_dim),
            nn.ReLU(),
            GCNConv(hidden_dim, latent_dim)
        )

        # Rating prediction head
        self.rating_decoder = nn.Sequential(
            nn.Linear(latent_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def encode(self, x_u, x_v, edge_index):
        z_u = self.encoder_u(x_u, edge_index)
        z_v = self.encoder_v(x_v, edge_index)
        return z_u, z_v

    def predict_score(self, z_u, z_v, user_ids, item_ids):
        """For BPR ranking"""
        user_emb = z_u[user_ids]
        item_emb = z_v[item_ids]
        return torch.sum(user_emb * item_emb, dim=1)

    def predict_rating(self, z_u, z_v, user_ids, item_ids):
        """For explicit rating prediction"""
        user_emb = z_u[user_ids]
        item_emb = z_v[item_ids]
        edge_embeddings = torch.cat([user_emb, item_emb], dim=1)

        raw_ratings = self.rating_decoder(edge_embeddings)
        ratings = raw_ratings * (self.rating_max - self.rating_min) + self.rating_min

        return ratings.squeeze()

    def forward(self, x_u, x_v, edge_index):
        z_u, z_v = self.encode(x_u, x_v, edge_index)
        return z_u, z_v

def hybrid_loss(model, z_u, z_v,
                # For explicit ratings
                rating_user_ids, rating_item_ids, true_ratings, rating_mask,
                # For BPR
                bpr_user_ids, bpr_pos_items, bpr_neg_items,
                rating_weight=1.0, bpr_weight=0.1):
    """
    Combines rating prediction loss and BPR loss
    """
    total_loss = 0

    # Rating loss (only on observed ratings)
    if rating_mask.sum() > 0:
        observed_idx = rating_mask.bool()
        pred_ratings = model.predict_rating(z_u, z_v,
                                           rating_user_ids[observed_idx],
                                           rating_item_ids[observed_idx])
        rating_loss = F.mse_loss(pred_ratings, true_ratings[observed_idx])
        total_loss += rating_weight * rating_loss

    # BPR loss for ranking
    if len(bpr_user_ids) > 0:
        bpr_loss_val = bpr_loss(model, z_u, z_v, bpr_user_ids, bpr_pos_items, bpr_neg_items)
        total_loss += bpr_weight * bpr_loss_val

    return total_loss