In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch_geometric.data import InMemoryDataset, HeteroData
from torch_geometric.nn import SAGEConv, to_hetero
from sklearn.model_selection import train_test_split

In [None]:
#Custom Dataset Class
class EventUserDataset(InMemoryDataset):
    """Heterogeneous user / event / category graph for attendance prediction.

    Raw CSV files (read from ``self.raw_dir``) and the columns that are used:

    * ``users.csv``         - ``user_id``
    * ``events.csv``        - ``event_id``, ``category_id``
    * ``categories.csv``    - ``category_id``
    * ``user-edges.csv``    - ``user1``, ``user2``, ``weight``
    * ``user-event.csv``    - ``user_id``, ``event_id``
    * ``user-category.csv`` - ``user_id``, ``category_id``, ``weight``

    The processed ``HeteroData`` object contains the node types ``user``,
    ``event`` and ``category`` (random placeholder features of width
    ``FEAT_DIM``), eight edge types (each relation plus its reverse) and two
    long tensors ``train_pairs`` / ``test_pairs`` whose rows are
    ``[user_idx, event_idx, label]`` with ``label`` 1 for an observed
    attendance and 0 for a sampled non-attendance.

    Only attendances that ended up in ``train_pairs`` are materialised as
    ``attends`` / ``attended_by`` edges. Held-out positives are deliberately
    kept out of the message-passing graph; otherwise the model could read the
    test labels straight from the graph structure.

    Args:
        root: Dataset root directory.
        transform: Optional transform applied on every access.
        pre_transform: Optional transform applied once before saving.
        seed: Seed for the random node features, the negative sampling and the
            train/test split. ``None`` disables seeding.
    """

    # Width of the random placeholder feature vector given to every node.
    FEAT_DIM = 16

    def __init__(self, root, transform=None, pre_transform=None, seed=42):
        # Must be assigned before super().__init__(), which may call process().
        self.seed = seed
        super().__init__(root, transform, pre_transform)
        # InMemoryDataset.load() copes with the restricted unpickling defaults
        # of recent torch releases and still reads legacy (data, slices) files.
        self.load(self.processed_paths[0], data_cls=HeteroData)

    @property
    def raw_file_names(self):
        return ['categories.csv', 'events.csv', 'users.csv', 'user-edges.csv', 'user-event.csv', 'user-category.csv']

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        # The raw CSVs have to be placed in ``raw_dir`` by hand; nothing to fetch.
        pass

    @staticmethod
    def _index_map(column):
        """Map every distinct raw id in ``column`` to a contiguous 0-based index."""
        return {raw_id: idx for idx, raw_id in enumerate(column.unique())}

    @staticmethod
    def _to_index(column, mapping, what):
        """Translate raw ids to node indices, failing loudly on unknown ids.

        ``Series.map`` yields NaN for ids missing from ``mapping``; casting
        those to integers would silently produce invalid node indices.
        """
        mapped = column.map(mapping)
        missing = mapped.isna()
        if missing.any():
            examples = column[missing].unique()[:5].tolist()
            raise ValueError(f'{what}: ids not found in the node table, e.g. {examples}')
        return mapped.to_numpy(dtype=np.int64)

    @staticmethod
    def _edge_index(src, dst):
        """Stack two index arrays into a ``[2, num_edges]`` long tensor."""
        return torch.from_numpy(np.stack([src, dst])).long()

    def process(self):
        """Build the heterogeneous graph from the raw CSVs and cache it on disk."""
        raw_dir = self.raw_dir
        categories = pd.read_csv(os.path.join(raw_dir, 'categories.csv'))
        events = pd.read_csv(os.path.join(raw_dir, 'events.csv'))
        users = pd.read_csv(os.path.join(raw_dir, 'users.csv'))
        user_edges = pd.read_csv(os.path.join(raw_dir, 'user-edges.csv'))
        user_event = pd.read_csv(os.path.join(raw_dir, 'user-event.csv'))
        user_category = pd.read_csv(os.path.join(raw_dir, 'user-category.csv'))

        seed = self.seed
        rng = np.random.default_rng(seed)
        torch_gen = None if seed is None else torch.Generator().manual_seed(int(seed))

        data = HeteroData()

        # Raw id -> contiguous node index, per node type.
        user_id_mapping = self._index_map(users['user_id'])
        event_id_mapping = self._index_map(events['event_id'])
        category_id_mapping = self._index_map(categories['category_id'])
        num_users = len(user_id_mapping)
        num_events = len(event_id_mapping)
        num_categories = len(category_id_mapping)

        # Placeholder features: no real attributes are available, so every
        # node gets a random vector of width FEAT_DIM.
        data['user'].x = torch.randn(num_users, self.FEAT_DIM, generator=torch_gen)
        data['event'].x = torch.randn(num_events, self.FEAT_DIM, generator=torch_gen)
        data['category'].x = torch.randn(num_categories, self.FEAT_DIM, generator=torch_gen)

        # (a) User-User, weighted, plus the reverse direction.
        src_uu = self._to_index(user_edges['user1'], user_id_mapping, 'user-edges.csv user1')
        dst_uu = self._to_index(user_edges['user2'], user_id_mapping, 'user-edges.csv user2')
        weight_uu = torch.as_tensor(user_edges['weight'].to_numpy(), dtype=torch.float).unsqueeze(1)
        edge_index_uu = self._edge_index(src_uu, dst_uu)
        data['user', 'follows', 'user'].edge_index = edge_index_uu
        data['user', 'follows', 'user'].edge_attr = weight_uu
        data['user', 'follows_rev', 'user'].edge_index = edge_index_uu.flip(0)
        data['user', 'follows_rev', 'user'].edge_attr = weight_uu

        # (c) User-Category (interests), weighted, plus the reverse direction.
        src_uc = self._to_index(user_category['user_id'], user_id_mapping, 'user-category.csv user_id')
        dst_uc = self._to_index(user_category['category_id'], category_id_mapping,
                                'user-category.csv category_id')
        weight_uc = torch.as_tensor(user_category['weight'].to_numpy(), dtype=torch.float).unsqueeze(1)
        edge_index_uc = self._edge_index(src_uc, dst_uc)
        data['user', 'interested_in', 'category'].edge_index = edge_index_uc
        data['user', 'interested_in', 'category'].edge_attr = weight_uc
        data['category', 'has_interest_from', 'user'].edge_index = edge_index_uc.flip(0)
        data['category', 'has_interest_from', 'user'].edge_attr = weight_uc

        # (d) Event-Category (each event row names its category), plus reverse.
        event_idx = self._to_index(events['event_id'], event_id_mapping, 'events.csv event_id')
        event_cat = self._to_index(events['category_id'], category_id_mapping, 'events.csv category_id')
        edge_index_ec = self._edge_index(event_idx, event_cat)
        data['event', 'belongs_to', 'category'].edge_index = edge_index_ec
        data['category', 'has_event', 'event'].edge_index = edge_index_ec.flip(0)

        # Labelled pairs. Positives are the distinct (user, event) attendances;
        # np.unique(axis=0) also sorts them by user, which the grouping below
        # relies on. De-duplicating prevents one attendance from landing in
        # both the train and the test split.
        ue_user = self._to_index(user_event['user_id'], user_id_mapping, 'user-event.csv user_id')
        ue_event = self._to_index(user_event['event_id'], event_id_mapping, 'user-event.csv event_id')
        positives = np.unique(np.stack([ue_user, ue_event], axis=1), axis=0)
        if len(positives) == 0:
            raise ValueError('user-event.csv contains no attendances; cannot build training pairs')

        # Negatives: for every user, sample as many non-attended events as the
        # user has attended ones (with replacement only if too few candidates).
        all_events = np.arange(num_events, dtype=np.int64)
        negative_chunks = []
        pos_users, first_row, pos_counts = np.unique(positives[:, 0], return_index=True, return_counts=True)
        for uid, start, count in zip(pos_users, first_row, pos_counts):
            attended = positives[start:start + count, 1]
            candidates = np.setdiff1d(all_events, attended, assume_unique=True)
            if candidates.size == 0:
                # The user attended every event: no negative can be drawn.
                continue
            sampled = rng.choice(candidates, size=count, replace=candidates.size < count)
            negative_chunks.append(
                np.column_stack([np.full(count, uid, dtype=np.int64), sampled,
                                 np.zeros(count, dtype=np.int64)])
            )

        positive_pairs = np.column_stack([positives, np.ones(len(positives), dtype=np.int64)])
        if negative_chunks:
            negative_pairs = np.concatenate(negative_chunks, axis=0)
        else:
            negative_pairs = np.empty((0, 3), dtype=np.int64)
        all_pairs = np.concatenate([positive_pairs, negative_pairs], axis=0)

        # train_test_split shuffles by itself, so no separate shuffle is needed.
        train_pairs, test_pairs = train_test_split(all_pairs, test_size=0.2, random_state=seed)

        # (b) User-Event message-passing edges come from TRAINING positives
        # only, so the held-out attendances are invisible to the GNN.
        train_pos = train_pairs[train_pairs[:, 2] == 1]
        edge_index_ue = self._edge_index(train_pos[:, 0], train_pos[:, 1])
        data['user', 'attends', 'event'].edge_index = edge_index_ue
        data['event', 'attended_by', 'user'].edge_index = edge_index_ue.flip(0)

        # Supervision pairs are stored as graph-level attributes.
        data['train_pairs'] = torch.as_tensor(train_pairs, dtype=torch.long)
        data['test_pairs'] = torch.as_tensor(test_pairs, dtype=torch.long)

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # Collate once and write the processed file.
        self.save([data], self.processed_paths[0])

In [None]:
# 2. Heterogeneous Model Development:
class BaseGNN(nn.Module):
    """Stack of SAGEConv layers, each followed by a ReLU.

    Args:
        in_channels: Feature width consumed by the first layer.
        hidden_channels: Output width of every layer.
        num_layers: Total number of layers; values below 1 still give one layer.
    """

    def __init__(self, in_channels, hidden_channels, num_layers=2):
        super().__init__()
        # Input width per layer: raw features first, hidden width afterwards.
        layer_inputs = [in_channels] + [hidden_channels] * (num_layers - 1)
        self.convs = nn.ModuleList(
            [SAGEConv(width, hidden_channels) for width in layer_inputs]
        )

    def forward(self, x, edge_index):
        """Apply every convolution in order, with ReLU after each (including the last)."""
        hidden = x
        for layer in self.convs:
            hidden = F.relu(layer(hidden, edge_index))
        return hidden

In [None]:
# PairClassifier: combines user & event embeddings for binary classification.
class PairClassifier(nn.Module):
    """Two-layer MLP that scores a (user, event) embedding pair.

    Args:
        in_channels: Width of each individual embedding.
        hidden_channels: Width of the hidden layer.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        # The first layer sees both embeddings side by side, hence 2 * in_channels.
        self.fc1 = nn.Linear(in_channels * 2, hidden_channels)
        self.fc2 = nn.Linear(hidden_channels, 1)

    def forward(self, user_emb, event_emb):
        """Return one raw (pre-sigmoid) logit per pair, shape ``[num_pairs, 1]``."""
        pair = torch.cat((user_emb, event_emb), dim=1)
        hidden = F.relu(self.fc1(pair))
        return self.fc2(hidden)

In [None]:
#Training and Evaluation
def train(model, classifier, data, optimizer, criterion):
    """Run one full-batch optimisation step and return the training loss.

    Args:
        model: Callable GNN invoked as ``model(data.x_dict, data.edge_index_dict)``;
            its result is indexed with ``'user'`` and ``'event'``.
        classifier: Module mapping ``(user_emb, event_emb)`` to one logit per pair.
        data: Graph object exposing ``x_dict``, ``edge_index_dict`` and
            ``data['train_pairs']`` (rows ``[user_idx, event_idx, label]``).
        optimizer: Optimizer over the parameters of ``model`` and ``classifier``.
        criterion: Loss taking ``(logits, labels)``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    classifier.train()
    optimizer.zero_grad()
    # Forward pass through the hetero GNN.
    out_dict = model(data.x_dict, data.edge_index_dict)

    # Each row of train_pairs is [user_idx, event_idx, label].
    train_pairs = data['train_pairs']
    user_indices = train_pairs[:, 0]
    event_indices = train_pairs[:, 1]
    labels = train_pairs[:, 2].float()

    user_emb = out_dict['user'][user_indices]
    event_emb = out_dict['event'][event_indices]
    # Drop only the trailing logit dimension. A bare squeeze() would also
    # collapse a single-pair batch to a 0-d tensor, which no longer matches
    # the shape of `labels` and makes the loss raise.
    logits = classifier(user_emb, event_emb).squeeze(-1)

    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
@torch.no_grad()
def test(model, classifier, data):
    model.eval()
    classifier.eval()
    out_dict = model(data.x_dict, data.edge_index_dict)

    test_pairs = data['test_pairs']
    user_indices = test_pairs[:, 0]
    event_indices = test_pairs[:, 1]
    labels = test_pairs[:, 2].float()

    user_emb = out_dict['user'][user_indices]
    event_emb = out_dict['event'][event_indices]
    logits = classifier(user_emb, event_emb).squeeze()
    preds = (torch.sigmoid(logits) > 0.5).float()
    accuracy = (preds == labels).sum().item() / labels.size(0)
    return accuracy

In [None]:
# Main function: load data, build model, train, and evaluate.
def main(root='data', epochs=50, lr=0.005, hidden_channels=32, eval_every=5, seed=42):
    """Load the dataset, build the hetero GNN and classifier, train and evaluate.

    Args:
        root: Dataset root passed to ``EventUserDataset``.
        epochs: Number of full-batch training steps.
        lr: Adam learning rate.
        hidden_channels: Width of the GNN embeddings.
        eval_every: Evaluate and print every this many epochs (the last epoch
            is always evaluated as well).
        seed: Seed for torch's global RNG (model initialisation); ``None``
            leaves the RNG untouched.

    Raises:
        ValueError: If the node types do not share one feature width.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # InMemoryDataset resolves its directories relative to `root`: with the
    # default root the raw CSV files are expected in "data/raw" and the
    # processed graph is cached in "data/processed".
    dataset = EventUserDataset(root=root)
    data = dataset[0]
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The base GNN uses one input width for every node type, so read that
    # width from the data instead of hard-coding it.
    feature_widths = {x.size(-1) for x in data.x_dict.values()}
    if len(feature_widths) != 1:
        raise ValueError(f'all node types must share one feature width, got {sorted(feature_widths)}')
    in_channels = feature_widths.pop()

    # Build the homogeneous base GNN, then lift it to the heterogeneous graph
    # using the graph metadata.
    base_gnn = BaseGNN(in_channels=in_channels, hidden_channels=hidden_channels, num_layers=2)
    model = to_hetero(base_gnn, metadata=data.metadata(), aggr='sum').to(device)

    # Classifier for user-event pair prediction; it consumes GNN embeddings.
    classifier = PairClassifier(in_channels=hidden_channels, hidden_channels=16).to(device)

    data = data.to(device)

    optimizer = torch.optim.Adam(list(model.parameters()) + list(classifier.parameters()), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        loss = train(model, classifier, data, optimizer, criterion)
        # Also report the final epoch so the fully trained model is evaluated.
        if epoch % eval_every == 0 or epoch == epochs - 1:
            acc = test(model, classifier, data)
            print(f'Epoch {epoch:03d} | Loss: {loss:.4f} | Test Accuracy: {acc:.4f}')

# Entry point: code executed directly (a notebook cell or `python file.py`)
# runs with __name__ == '__main__', so this starts training; importing the
# code as a module does not.
if __name__ == '__main__':
    main()

  self.data, self.slices = torch.load(self.processed_paths[0])


Epoch 000 | Loss: 0.7039 | Test Accuracy: 0.5198
Epoch 005 | Loss: 0.6724 | Test Accuracy: 0.5780
Epoch 010 | Loss: 0.6211 | Test Accuracy: 0.6639
Epoch 015 | Loss: 0.5484 | Test Accuracy: 0.7344
Epoch 020 | Loss: 0.4792 | Test Accuracy: 0.7859
Epoch 025 | Loss: 0.4316 | Test Accuracy: 0.8105
Epoch 030 | Loss: 0.4093 | Test Accuracy: 0.8232
Epoch 035 | Loss: 0.3848 | Test Accuracy: 0.8380
Epoch 040 | Loss: 0.3700 | Test Accuracy: 0.8458
Epoch 045 | Loss: 0.3545 | Test Accuracy: 0.8532
