In [4]:
from torch_geometric.datasets import UPFD
# different feature types can be selected: content(profile + spacy; dim: 310), profile(dim: 10), spacy(dim: 300)
# splits: train, test, val
# name: politifact, gossipcop
dataset = UPFD('data/upfd', name="politifact", feature='bert', split="train")

In [5]:
graph = dataset[0]
print(f"Graph at index 0: {graph}")
print(f"Node features shape: {graph.x.shape}")
print(f"Node labels shape: {graph.y.shape}")

print(f"Edge index shape: {graph.edge_index.shape}")
print(f"Edge index: {graph.edge_index}")

Graph at index 0: Data(x=[72, 768], edge_index=[2, 71], y=[1])
Node features shape: torch.Size([72, 768])
Node labels shape: torch.Size([1])
Edge index shape: torch.Size([2, 71])
Edge index: tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  8,  8, 16, 16, 16, 16, 16, 16,
         24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
         24, 24, 24, 24, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 60],
        [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
         37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
         55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]])


In [6]:
def is_directed(edge_index):
    edge_set = set(map(tuple, edge_index.t().tolist()))
    for u, v in edge_set:
        if (v, u) not in edge_set:
            return True  # missing reverse edge => directed
    return False  # all reverse edges found => undirected

print(f"Is graph directed? {is_directed(graph.edge_index)}")

Is graph directed? True


# transform the homogeneous graphs(UFPD) to Heterogeneous graphs

In [7]:
import torch
from torch_geometric.data import HeteroData, Data
from typing import List, Union

def convert_single_graph(homogeneous_graph: Data, source_node_idx: int = 0, add_source_self_loop: bool = False) -> HeteroData:
    """
    Convert a single homogeneous graph to a heterogeneous graph with two node types:
    - 'source': News source node
    - 'user': All other nodes
    
    And two edge types:
    - ('source', 'to', 'user'): Edges from source to users
    - ('user', 'to', 'user'): Edges between users
    - ('source', 'to', 'source'): Self-loop for source node (optional)
    
    Args:
        homogeneous_graph: A PyTorch Geometric Data object
        source_node_idx: Index of the source node in the graph, default is 0
        add_source_self_loop: Whether to add a self-loop to the source node, default is False
        
    Returns:
        A HeteroData object
    """
    hetero_graph = HeteroData()
    
    # Get total number of nodes
    num_nodes = homogeneous_graph.num_nodes
    
    # Extract features for source node
    source_features = homogeneous_graph.x[source_node_idx:source_node_idx+1]
    
    # Extract features for user nodes (all nodes except source)
    user_indices = torch.cat([
        torch.arange(0, source_node_idx), 
        torch.arange(source_node_idx + 1, num_nodes)
    ])
    user_features = homogeneous_graph.x[user_indices]
    
    # Add node features to the heterogeneous graph
    hetero_graph['source'].x = source_features
    hetero_graph['user'].x = user_features
    
    # Create a mapping from original node indices to new node indices
    node_mapping = {}
    node_mapping[source_node_idx] = ('source', 0)  # Source node maps to index 0 in 'source' type
    
    # Map all other nodes to 'user' type
    user_counter = 0
    for i in range(num_nodes):
        if i != source_node_idx:
            node_mapping[i] = ('user', user_counter)
            user_counter += 1
    
    # Process edges
    edge_index = homogeneous_graph.edge_index
    
    # Source-to-user edges and User-to-user edges
    source_to_user_edges = []
    user_to_user_edges = []
    
    for i in range(edge_index.shape[1]):
        src, dst = edge_index[0, i].item(), edge_index[1, i].item()
        
        src_type, src_idx = node_mapping[src]
        dst_type, dst_idx = node_mapping[dst]
        
        if src_type == 'source' and dst_type == 'user':
            # Source to user edge
            source_to_user_edges.append((src_idx, dst_idx))
        elif src_type == 'user' and dst_type == 'user':
            # User to user edge
            user_to_user_edges.append((src_idx, dst_idx))
        # We ignore user-to-source edges as mentioned in the requirements
    
    # Add edges to the heterogeneous graph
    if source_to_user_edges:
        src_indices, dst_indices = zip(*source_to_user_edges)
        hetero_graph['source', 'to', 'user'].edge_index = torch.tensor(
            [src_indices, dst_indices], dtype=torch.long
        )
    
    if user_to_user_edges:
        src_indices, dst_indices = zip(*user_to_user_edges)
        hetero_graph['user', 'to', 'user'].edge_index = torch.tensor(
            [src_indices, dst_indices], dtype=torch.long
        )
    
    # Add self-loop to source node if requested
    if add_source_self_loop:
        hetero_graph['source', 'to', 'source'].edge_index = torch.tensor(
            [[0], [0]], dtype=torch.long
        )
    
    # Copy graph-level targets if they exist
    if hasattr(homogeneous_graph, 'y'):
        hetero_graph['source'].y = homogeneous_graph.y
    
    return hetero_graph

def convert_to_heterogeneous(homogeneous_dataset, source_node_idx=0, add_source_self_loop=False):
    """
    Convert a homogeneous UPFD dataset to a heterogeneous dataset.
    
    Args:
        homogeneous_dataset: A PyTorch Geometric UPFD dataset
        source_node_idx: Index of the source node in each graph, default is 0
        add_source_self_loop: Whether to add a self-loop to the source node, default is False
        
    Returns:
        A list of HeteroData objects
    """
    # Simply apply convert_single_graph to each graph in the dataset
    hetero_dataset = [
        convert_single_graph(graph, source_node_idx, add_source_self_loop) 
        for graph in homogeneous_dataset
    ]
    
    return hetero_dataset
import torch
from torch_geometric.data import HeteroData, Data
from typing import List, Union

def convert_single_graph(homogeneous_graph: Data, source_node_idx: int = 0, add_source_self_loop: bool = False) -> HeteroData:
    """
    Convert a single homogeneous graph to a heterogeneous graph with two node types:
    - 'source': News source node
    - 'user': All other nodes
    
    And two edge types:
    - ('source', 'to', 'user'): Edges from source to users
    - ('user', 'to', 'user'): Edges between users
    - ('source', 'to', 'source'): Self-loop for source node (optional)
    
    Args:
        homogeneous_graph: A PyTorch Geometric Data object
        source_node_idx: Index of the source node in the graph, default is 0
        add_source_self_loop: Whether to add a self-loop to the source node, default is False
        
    Returns:
        A HeteroData object
    """
    hetero_graph = HeteroData()
    
    # Get total number of nodes
    num_nodes = homogeneous_graph.num_nodes
    
    # Extract features for source node
    source_features = homogeneous_graph.x[source_node_idx:source_node_idx+1]
    
    # Extract features for user nodes (all nodes except source)
    user_indices = torch.cat([
        torch.arange(0, source_node_idx), 
        torch.arange(source_node_idx + 1, num_nodes)
    ])
    user_features = homogeneous_graph.x[user_indices]
    
    # Add node features to the heterogeneous graph
    hetero_graph['source'].x = source_features
    hetero_graph['user'].x = user_features
    
    # Create a mapping from original node indices to new node indices
    node_mapping = {}
    node_mapping[source_node_idx] = ('source', 0)  # Source node maps to index 0 in 'source' type
    
    # Map all other nodes to 'user' type
    user_counter = 0
    for i in range(num_nodes):
        if i != source_node_idx:
            node_mapping[i] = ('user', user_counter)
            user_counter += 1
    
    # Process edges
    edge_index = homogeneous_graph.edge_index
    
    # Source-to-user edges and User-to-user edges
    source_to_user_edges = []
    user_to_user_edges = []
    
    for i in range(edge_index.shape[1]):
        src, dst = edge_index[0, i].item(), edge_index[1, i].item()
        
        src_type, src_idx = node_mapping[src]
        dst_type, dst_idx = node_mapping[dst]
        
        if src_type == 'source' and dst_type == 'user':
            # Source to user edge
            source_to_user_edges.append((src_idx, dst_idx))
        elif src_type == 'user' and dst_type == 'user':
            # User to user edge
            user_to_user_edges.append((src_idx, dst_idx))
        # We ignore user-to-source edges as mentioned in the requirements
    
    # Add edges to the heterogeneous graph
    if source_to_user_edges:
        src_indices, dst_indices = zip(*source_to_user_edges)
        hetero_graph['source', 'to', 'user'].edge_index = torch.tensor(
            [src_indices, dst_indices], dtype=torch.long
        )
    
    if user_to_user_edges:
        src_indices, dst_indices = zip(*user_to_user_edges)
        hetero_graph['user', 'to', 'user'].edge_index = torch.tensor(
            [src_indices, dst_indices], dtype=torch.long
        )
    
    # Add self-loop to source node if requested
    if add_source_self_loop:
        hetero_graph['source', 'to', 'source'].edge_index = torch.tensor(
            [[0], [0]], dtype=torch.long
        )
    
    # Copy graph-level targets if they exist
    if hasattr(homogeneous_graph, 'y'):
        hetero_graph['source'].y = homogeneous_graph.y
    
    return hetero_graph

def convert_to_heterogeneous(homogeneous_dataset, source_node_idx=0, add_source_self_loop=False):
    """
    Convert a homogeneous UPFD dataset to a heterogeneous dataset.
    
    Args:
        homogeneous_dataset: A PyTorch Geometric UPFD dataset
        source_node_idx: Index of the source node in each graph, default is 0
        add_source_self_loop: Whether to add a self-loop to the source node, default is False
        
    Returns:
        A list of HeteroData objects
    """
    # Simply apply convert_single_graph to each graph in the dataset
    hetero_dataset = [
        convert_single_graph(graph, source_node_idx, add_source_self_loop) 
        for graph in homogeneous_dataset
    ]
    
    return hetero_dataset

In [9]:
from torch_geometric.datasets import UPFD

# Load the homogeneous dataset
dataset = UPFD('data/upfd', name="politifact", feature='bert', split="train")

# Convert to heterogeneous graph
hetero_graph = convert_single_graph(dataset[0])

# Print the heterogeneous graph structure
print(hetero_graph)
print("Node types:", hetero_graph.node_types)
print("Edge types:", hetero_graph.edge_types)
print("Source features shape:", hetero_graph['source'].x.shape)
print("User features shape:", hetero_graph['user'].x.shape)
print("node feature dict", hetero_graph.x_dict)
if ('source', 'to', 'user') in hetero_graph.edge_types:
    print("Source-to-user edges:", hetero_graph['source', 'to', 'user'].edge_index.shape)
if ('user', 'to', 'user') in hetero_graph.edge_types:
    print("User-to-user edges:", hetero_graph['user', 'to', 'user'].edge_index.shape)

HeteroData(
  source={
    x=[1, 768],
    y=[1],
  },
  user={ x=[71, 768] },
  (source, to, user)={ edge_index=[2, 27] },
  (user, to, user)={ edge_index=[2, 44] }
)
Node types: ['source', 'user']
Edge types: [('source', 'to', 'user'), ('user', 'to', 'user')]
Source features shape: torch.Size([1, 768])
User features shape: torch.Size([71, 768])
node feature dict {'source': tensor([[ 4.9833e-01,  1.4148e-01,  3.5605e-01, -4.5109e-01, -3.0611e-01,
          2.5061e-02,  2.8002e-01, -5.6461e-02, -2.2193e-01, -3.1098e-01,
         -3.6787e-01, -5.0690e-02,  9.9056e-02, -2.7402e-01, -2.9179e-01,
         -1.3563e-01,  1.1535e-01,  3.4642e-02, -2.8102e-01,  4.0781e-02,
         -2.3397e-01, -1.9931e-01,  3.7839e-01, -9.2718e-02,  1.5422e-01,
         -1.0526e-01,  7.9626e-02,  2.6651e-02,  3.3124e-01,  2.2782e-01,
         -4.6196e-01, -9.3723e-02, -2.3510e-01, -1.8293e-02, -5.5263e-01,
         -2.1882e-01,  1.3237e-01,  2.8989e-01, -7.3877e-02, -3.4337e-01,
          2.2737e-01,  1.5777e

# model definition

In [2]:
import os.path as osp
from typing import Dict, List, Union

import torch
import torch.nn.functional as F
from torch import nn

import torch_geometric
import torch_geometric.transforms as T
from torch_geometric.datasets import IMDB
from torch_geometric.nn import HANConv

class HAN(nn.Module):
    def __init__(self, in_channels: Union[int, Dict[str, int]],
                 out_channels: int, hidden_channels=128, heads=8, metadata=None, dropout=0.6):
        super().__init__()
        self.han_conv = HANConv(in_channels, hidden_channels, heads=heads,
                                dropout=dropout, metadata=metadata)
        self.lin = nn.Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x = self.han_conv(x_dict, edge_index_dict)
        x = self.lin(x)
        return x

class HANForGraphClassification(nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels=128, heads=8, metadata=None,
                 dropout=0.6, num_classes=2):
        super().__init__()
        self.han_conv = HANConv(in_channels, hidden_channels, heads=heads,
                               dropout=dropout, metadata=metadata)
        
        # Linear layer will be initialized during forward pass once we know the input dimension
        self.lin = None
        self.out_channels = out_channels
        self.classifier = nn.Linear(out_channels, num_classes)
    
    def forward(self, x_dict, edge_index_dict):
        # Get node embeddings from HANConv
        node_embeddings_dict = self.han_conv(x_dict, edge_index_dict)
        
        # Average pooling for each node type
        pooled_embeddings = []
        
        for node_type, embeddings in node_embeddings_dict.items():
            if embeddings is not None:
                # Average pooling for nodes of the same type
                pooled = torch.mean(embeddings, dim=0)
                pooled_embeddings.append(pooled)
        
        if not pooled_embeddings:
            raise ValueError("No node embeddings were produced by the model")
        
        # Concatenate all pooled embeddings from different node types
        x = torch.cat(pooled_embeddings)
        
        # Initialize the linear layer if not done yet
        if self.lin is None:
            lin_input_dim = x.size(0)
            self.lin = nn.Linear(lin_input_dim, self.out_channels).to(x.device)
        
        # Apply linear layer and classifier
        x = self.lin(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.classifier(x)
        
        return x

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
dummy_graph = convert_single_graph(dataset[0], add_source_self_loop=True)
print(dummy_graph)
print(dummy_graph.x_dict)
print(dummy_graph.edge_index_dict)


HeteroData(
  source={
    x=[1, 768],
    y=[1],
  },
  user={ x=[71, 768] },
  (source, to, user)={ edge_index=[2, 27] },
  (user, to, user)={ edge_index=[2, 44] },
  (source, to, source)={ edge_index=[2, 1] }
)
{'source': tensor([[ 4.9833e-01,  1.4148e-01,  3.5605e-01, -4.5109e-01, -3.0611e-01,
          2.5061e-02,  2.8002e-01, -5.6461e-02, -2.2193e-01, -3.1098e-01,
         -3.6787e-01, -5.0690e-02,  9.9056e-02, -2.7402e-01, -2.9179e-01,
         -1.3563e-01,  1.1535e-01,  3.4642e-02, -2.8102e-01,  4.0781e-02,
         -2.3397e-01, -1.9931e-01,  3.7839e-01, -9.2718e-02,  1.5422e-01,
         -1.0526e-01,  7.9626e-02,  2.6651e-02,  3.3124e-01,  2.2782e-01,
         -4.6196e-01, -9.3723e-02, -2.3510e-01, -1.8293e-02, -5.5263e-01,
         -2.1882e-01,  1.3237e-01,  2.8989e-01, -7.3877e-02, -3.4337e-01,
          2.2737e-01,  1.5777e-01, -3.2284e-01, -1.1594e-01, -1.7053e-01,
         -3.4415e-01,  4.1308e-01,  5.5251e-01,  4.9701e-02,  2.2024e-01,
          6.8173e-01, -1.8886e-01, 

# training

In [11]:
han_model = HANForGraphClassification(
    in_channels=dummy_graph.num_node_features,
    out_channels=128,
    hidden_channels=128,
    heads=8,
    metadata=dummy_graph.metadata(),
    num_classes=2
)

han_model = han_model.to(device)
dummy_graph = dummy_graph.to(device)

print(han_model)
han_model.eval()
out = han_model(dummy_graph.x_dict, dummy_graph.edge_index_dict)
print(out)
print(dummy_graph['source'].y)

HANForGraphClassification(
  (han_conv): HANConv(128, heads=8)
  (classifier): Linear(in_features=128, out_features=2, bias=True)
)
tensor([0.0107, 0.0673], device='cuda:0', grad_fn=<ViewBackward0>)
tensor([0], device='cuda:0')


In [12]:
out = han_model(dummy_graph.x_dict, dummy_graph.edge_index_dict)
out = out.unsqueeze(dim=0)
print(out.shape)
print(dummy_graph['source'].y)

criterion = torch.nn.CrossEntropyLoss()
loss = criterion(out, dummy_graph['source'].y)
print(loss)

torch.Size([1, 2])
tensor([0], device='cuda:0')
tensor(0.7219, device='cuda:0', grad_fn=<NllLossBackward0>)


In [13]:
from torch_geometric.loader import DataLoader as HeteroDataLoader

# Load the original datasets
train_dataset_orig = UPFD('data/upfd', name='politifact', feature='bert', split='train')
val_dataset_orig = UPFD('data/upfd', name='politifact', feature='bert', split='val')
test_dataset_orig = UPFD('data/upfd', name='politifact', feature='bert', split='test')

# Convert to heterogeneous datasets
train_dataset = convert_to_heterogeneous(train_dataset_orig, add_source_self_loop=True)
val_dataset = convert_to_heterogeneous(val_dataset_orig, add_source_self_loop=True)
test_dataset = convert_to_heterogeneous(test_dataset_orig, add_source_self_loop=True)

# Create data loaders
train_loader = HeteroDataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = HeteroDataLoader(val_dataset, batch_size=1, shuffle=False)
test_loader = HeteroDataLoader(test_dataset, batch_size=1, shuffle=False)

In [20]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(han_model.parameters(), lr=0.005)
num_epochs = 10
accumulation_steps = 4  # Number of batches to accumulate gradients over

for epoch in range(num_epochs):
    han_model.train()
    han_model.to(device)
    total_loss = 0
    
    # Training loop
    for i, batch in enumerate(train_loader):
        batch = batch.to(device)
        
        out = han_model(batch.x_dict, batch.edge_index_dict)
        out = out.unsqueeze(dim=0)
        loss = criterion(out, batch['source'].y)
        
        # Normalize loss to account for accumulation
        loss = loss / accumulation_steps
        loss.backward()
        total_loss += loss.item()
        
        # Update weights after accumulation_steps
        if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
            optimizer.step()
            optimizer.zero_grad()
            
    print(f"Epoch {epoch}, Avg Loss: {total_loss / len(train_loader)}")
    
    # Validation
    han_model.eval()
    correct = 0
    for batch in val_loader:
        batch = batch.to(device)
        with torch.no_grad():
            out = han_model(batch.x_dict, batch.edge_index_dict)
            out = out.unsqueeze(dim=0)
            
            pred = out.argmax(dim=1)
            correct += (pred == batch['source'].y).sum().item()
    val_acc = correct / len(val_loader.dataset)
    print(f"Validation Accuracy: {val_acc:.4f}")
    
# Test
han_model.eval()
correct = 0
for batch in test_loader:
    batch = batch.to(device)
    with torch.no_grad():
        out = han_model(batch.x_dict, batch.edge_index_dict)
        out = out.unsqueeze(dim=0)
        
        pred = out.argmax(dim=1)
        correct += (pred == batch['source'].y).sum().item()
test_acc = correct / len(test_loader.dataset)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 0, Avg Loss: 0.08084781177420679
Validation Accuracy: 0.6452
Epoch 1, Avg Loss: 0.0494933859090438
Validation Accuracy: 0.7097
Epoch 2, Avg Loss: 0.030985307638719723
Validation Accuracy: 0.6774
Epoch 3, Avg Loss: 0.02896637809786548
Validation Accuracy: 0.6452
Epoch 4, Avg Loss: 0.026702903495948133
Validation Accuracy: 0.7097
Epoch 5, Avg Loss: 0.0234403375196913
Validation Accuracy: 0.6774
Epoch 6, Avg Loss: 0.021567187815370654
Validation Accuracy: 0.7097
Epoch 7, Avg Loss: 0.0212852090640582
Validation Accuracy: 0.7097
Epoch 8, Avg Loss: 0.014492097865876564
Validation Accuracy: 0.7097
Epoch 9, Avg Loss: 0.018071358524210813
Validation Accuracy: 0.7097
Test Accuracy: 0.7964
