In [1]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Load encoded data from CSV files
encoded_drugbank_id_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_drugbank_id.csv')
encoded_name_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_name.csv')
encoded_state_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_state.csv')
encoded_groups_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_groups.csv')
encoded_categories_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_categories.csv')
encoded_atc_codes_df = pd.read_csv('Heterogeneous KG\encoders_small\encoded_atc_codes.csv')
encoded_targets_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_targets.csv')
encoded_interactions_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_interactions.csv')
encoded_molecular_formula_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_molecular_formula.csv')
encoded_doping_df = pd.read_csv('Heterogeneous KG\encoders_small/encoded_doping.csv')

# Convert DataFrames to tensors
encoded_drugbank_id_tensor_1 = torch.tensor(encoded_drugbank_id_df.values, dtype=torch.float32)
encoded_name_tensor_1 = torch.tensor(encoded_name_df.values, dtype=torch.float32)
encoded_state_tensor_1 = torch.tensor(encoded_state_df.values, dtype=torch.float32)
encoded_groups_tensor_1 = torch.tensor(encoded_groups_df.values, dtype=torch.float32)
encoded_categories_tensor_1 = torch.tensor(encoded_categories_df.values, dtype=torch.float32)
encoded_atc_codes_tensor_1 = torch.tensor(encoded_atc_codes_df.values, dtype=torch.float32)
encoded_targets_tensor_1 = torch.tensor(encoded_targets_df.values, dtype=torch.float32)
encoded_interactions_tensor_1 = torch.tensor(encoded_interactions_df.values, dtype=torch.float32)
encoded_molecular_formula_tensor_1 = torch.tensor(encoded_molecular_formula_df.values, dtype=torch.float32)
encoded_doping_tensor_1 = torch.tensor(encoded_doping_df.values, dtype=torch.float32)

# Initialize HeteroData
data_small = HeteroData()

# Add Drug node features
data_small['drug'].x = torch.cat([
    encoded_drugbank_id_tensor_1,
    encoded_name_tensor_1,
    encoded_state_tensor_1,
    encoded_groups_tensor_1,
    encoded_molecular_formula_tensor_1
], dim=1)

# Add Drug Category nodes (one-hot encoding)
data_small['drug_category'].x = torch.eye(len(encoded_categories_df.columns), dtype=torch.float32)

# Add ATC Code nodes (one-hot encoding)
data_small['atc_code'].x = torch.eye(len(encoded_atc_codes_df.columns), dtype=torch.float32)

# Add Target nodes (one-hot encoding)
data_small['target'].x = torch.eye(len(encoded_targets_df.columns), dtype=torch.float32)

# Add Doping nodes (one-hot encoding)
data_small['doping'].x = torch.eye(len(encoded_doping_df['Doping'].unique()), dtype=torch.float32)

# Create edge lists for drug-to-category relationships
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_categories_df.iterrows():
    for category_idx in range(len(row)):
        if row[category_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(category_idx)
data_small['drug', 'isInCategory', 'drug_category'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-ATC code relationships
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_atc_codes_df.iterrows():
    for atc_code_idx in range(len(row)):
        if row[atc_code_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(atc_code_idx)
data_small['drug', 'isClassifiedAs', 'atc_code'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-target relationships
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_targets_df.iterrows():
    for target_idx in range(len(row)):
        if row[target_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(target_idx)
data_small['drug', 'targets', 'target'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-doping relationships
source_nodes = []
target_nodes = []
for drug_idx, doping in enumerate(encoded_doping_df['Doping']):
    source_nodes.append(drug_idx)
    target_nodes.append(doping)
data_small['drug', 'isDoping', 'doping'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-drug interactions
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_interactions_df.iterrows():
    for target_idx in range(len(row)):
        if row[target_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(target_idx)
data_small['drug', 'interactsWith', 'drug'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

print(data_small)




HeteroData(
  drug={ x=[338, 467] },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 8347] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 2229] },
  (drug, targets, target)={ edge_index=[2, 338] },
  (drug, isDoping, doping)={ edge_index=[2, 338] },
  (drug, interactsWith, drug)={ edge_index=[2, 41415] }
)


In [40]:
from torch_geometric.transforms import ToUndirected
from torch_geometric.data import HeteroData
from sklearn.model_selection import KFold
import numpy as np

# Assume data is your input HeteroData
data = ToUndirected()(data_small)

def generate_k_folds(data, edge_type, k=5):
    edge_index = data[edge_type].edge_index
    num_edges = edge_index.size(1)
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    
    return kf.split(range(num_edges))

def split_edges(data, edge_type, train_indices, test_indices, val_split=0.1):
    edge_index = data[edge_type].edge_index
    
    # Select train and test edges
    train_edge_index = edge_index[:, train_indices]
    test_edge_index = edge_index[:, test_indices]
    
    # Split train indices further into train and validation
    val_size = int(len(train_indices) * val_split)
    train_indices, val_indices = train_indices[:-val_size], train_indices[-val_size:]
    
    val_edge_index = edge_index[:, val_indices]
    train_edge_index = edge_index[:, train_indices]
    
    return train_edge_index, val_edge_index, test_edge_index

# Perform k-fold cross-validation
k = 5  # or 10 for 10-fold cross-validation
edge_types = [
    ('drug', 'isInCategory', 'drug_category'),
    ('drug', 'isClassifiedAs', 'atc_code'),
    ('drug', 'targets', 'target'),
    ('drug', 'isDoping', 'doping'),
    ('drug', 'interactsWith', 'drug')
]
kf_splits = {edge_type: list(generate_k_folds(data, edge_type, k)) for edge_type in edge_types}

for fold in range(k):
    train_data = HeteroData()
    val_data = HeteroData()
    test_data = HeteroData()
    
    for edge_type in edge_types:
        train_indices, test_indices = kf_splits[edge_type][fold]
        train_edge_index, val_edge_index, test_edge_index = split_edges(data, edge_type, train_indices, test_indices)
        
        train_data[edge_type].edge_index = train_edge_index
        val_data[edge_type].edge_index = val_edge_index
        test_data[edge_type].edge_index = test_edge_index
        
        if 'x' in data[edge_type[0]]:
            train_data[edge_type[0]].x = data[edge_type[0]].x
            val_data[edge_type[0]].x = data[edge_type[0]].x
            test_data[edge_type[0]].x = data[edge_type[0]].x
            
        if 'x' in data[edge_type[2]]:
            train_data[edge_type[2]].x = data[edge_type[2]].x
            val_data[edge_type[2]].x = data[edge_type[2]].x
            test_data[edge_type[2]].x = data[edge_type[2]].x

    print(f"Fold {fold+1}")
    print("Train Data:", train_data)
    print("Validation Data:", val_data)
    print("Test Data:", test_data)
    
    # Here, you can train your model using train_data, validate using val_data, and test using test_data


Fold 1
Train Data: HeteroData(
  drug={ x=[338, 467] },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 6010] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 1605] },
  (drug, targets, target)={ edge_index=[2, 243] },
  (drug, isDoping, doping)={ edge_index=[2, 243] },
  (drug, interactsWith, drug)={ edge_index=[2, 46112] }
)
Validation Data: HeteroData(
  drug={ x=[338, 467] },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 667] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 178] },
  (drug, targets, target)={ edge_index=[2, 27] },
  (drug, isDoping, doping)={ edge_index=[2, 27] },
  (drug, interactsWith, drug)={ edge_index=[2, 5123] }
)
Test Data: HeteroData(
  drug={ x=[338, 467] },
  drug_category={ x=[1095, 1095] },
  atc_cod

In [41]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomNodeSplit

# Load encoded data from CSV files and convert to tensors (omitted for brevity)
# Initialize HeteroData and add nodes and edges (omitted for brevity)

# Add doping labels to the drug nodes
data['drug'].y = torch.tensor(encoded_doping_df['Doping'].values, dtype=torch.long)

# Perform a node-level random split
transform = RandomNodeSplit(split='random', num_splits=1)
data = transform(data)

# Verify masks
print(data['drug'].train_mask.sum(), data['drug'].val_mask.sum(), data['drug'].test_mask.sum())


tensor(40) tensor(298) tensor(0)


In [42]:
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

# Assuming you have a tensor 'y' with labels for each node
num_nodes = data['drug'].x.size(0)

# Create train/val/test masks
train_mask, test_mask = train_test_split(range(num_nodes), test_size=0.2, random_state=42)
train_mask, val_mask = train_test_split(train_mask, test_size=0.1, random_state=42)

# Initialize masks
data['drug'].train_mask = torch.zeros(num_nodes, dtype=torch.bool)
data['drug'].val_mask = torch.zeros(num_nodes, dtype=torch.bool)
data['drug'].test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# Assign masks
data['drug'].train_mask[train_mask] = True
data['drug'].val_mask[val_mask] = True
data['drug'].test_mask[test_mask] = True


In [19]:
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GATConv, Linear, SAGEConv, GraphConv

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'targets', 'target'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'isDoping', 'doping'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'interactsWith', 'drug'): GraphConv((-1, -1), hidden_channels),
        }, aggr='sum')
        # Adjust the linear layer to match the output dimension of GATConv
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Instantiate the model
model = HeteroGNN(hidden_channels=64, out_channels=2)


In [22]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GraphConv, Linear

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GraphConv(-1, hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): GraphConv(-1, hidden_channels),
            ('drug', 'targets', 'target'): GraphConv(-1, hidden_channels),
            ('drug', 'isDoping', 'doping'): GraphConv(-1, hidden_channels),
            ('drug', 'interactsWith', 'drug'): GraphConv(-1, hidden_channels),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Instantiate the model
model = HeteroGNN(hidden_channels=64, out_channels=2)


In [23]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1


In [24]:
# To accumulate metrics across epochs
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train()
    train_acc, train_precision, train_recall, train_f1 = test(data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(data['drug'].val_mask)
    test_acc, test_precision, test_recall, test_f1 = test(data['drug'].test_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Epoch: 001, Loss: 123.5039, Train Acc: 0.3868
Epoch: 002, Loss: 123.5039, Train Acc: 0.3868
Epoch: 003, Loss: 123.5039, Train Acc: 0.3868
Epoch: 004, Loss: 123.5039, Train Acc: 0.3868
Epoch: 005, Loss: 123.5039, Train Acc: 0.3868
Epoch: 006, Loss: 123.5039, Train Acc: 0.3868
Epoch: 007, Loss: 123.5039, Train Acc: 0.3868
Epoch: 008, Loss: 123.5039, Train Acc: 0.3868
Epoch: 009, Loss: 123.5039, Train Acc: 0.3868
Epoch: 010, Loss: 123.5039, Train Acc: 0.3868
Epoch: 011, Loss: 123.5039, Train Acc: 0.3868
Epoch: 012, Loss: 123.5039, Train Acc: 0.3868
Epoch: 013, Loss: 123.5039, Train Acc: 0.3868
Epoch: 014, Loss: 123.5039, Train Acc: 0.3868
Epoch: 015, Loss: 123.5039, Train Acc: 0.3868
Epoch: 016, Loss: 123.5039, Train Acc: 0.3868
Epoch: 017, Loss: 123.5039, Train Acc: 0.3868
Epoch: 018, Loss: 123.5039, Train Acc: 0.3868
Epoch: 019, Loss: 123.5039, Train Acc: 0.3868
Epoch: 020, Loss: 123.5039, Train Acc: 0.3868
Epoch: 021, Loss: 123.5039, Train Acc: 0.3868
Epoch: 022, Loss: 123.5039, Train 

In [39]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GraphConv, Linear
from sklearn.metrics import precision_score, recall_score, f1_score

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GraphConv(-1, hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): GraphConv(-1, hidden_channels),
            ('drug', 'targets', 'target'): GraphConv(-1, hidden_channels),
            ('drug', 'isDoping', 'doping'): GraphConv(-1, hidden_channels),
            ('drug', 'interactsWith', 'drug'): GraphConv(-1, hidden_channels),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Instantiate the model
model = HeteroGNN(hidden_channels=64, out_channels=2)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# Training loop
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 201):
    loss = train(data)
    train_acc, train_precision, train_recall, train_f1 = test(data, data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(data, data['drug'].val_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(data, data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Epoch: 001, Loss: 4.7782, Train Acc: 0.3868
Epoch: 002, Loss: 1928.8292, Train Acc: 0.3868
Epoch: 003, Loss: 973.8771, Train Acc: 0.3868
Epoch: 004, Loss: 32.6837, Train Acc: 0.6132
Epoch: 005, Loss: 317.9927, Train Acc: 0.6132
Epoch: 006, Loss: 434.1130, Train Acc: 0.6132
Epoch: 007, Loss: 449.3937, Train Acc: 0.6132
Epoch: 008, Loss: 404.1190, Train Acc: 0.6132
Epoch: 009, Loss: 319.5454, Train Acc: 0.6132
Epoch: 010, Loss: 210.3478, Train Acc: 0.6173
Epoch: 011, Loss: 84.9438, Train Acc: 0.3868
Epoch: 012, Loss: 87.9344, Train Acc: 0.3868
Epoch: 013, Loss: 155.4108, Train Acc: 0.3868
Epoch: 014, Loss: 135.0769, Train Acc: 0.3868
Epoch: 015, Loss: 67.4123, Train Acc: 0.6132
Epoch: 016, Loss: 19.9105, Train Acc: 0.6173
Epoch: 017, Loss: 53.8865, Train Acc: 0.6173
Epoch: 018, Loss: 67.6360, Train Acc: 0.6173
Epoch: 019, Loss: 65.1847, Train Acc: 0.6173
Epoch: 020, Loss: 49.8084, Train Acc: 0.6132
Epoch: 021, Loss: 24.3386, Train Acc: 0.3868
Epoch: 022, Loss: 13.9826, Train Acc: 0.3868


In [43]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear
from sklearn.metrics import precision_score, recall_score, f1_score

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): SAGEConv(-1, hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): SAGEConv(-1, hidden_channels),
            ('drug', 'targets', 'target'): SAGEConv(-1, hidden_channels),
            ('drug', 'isDoping', 'doping'): SAGEConv(-1, hidden_channels),
            ('drug', 'interactsWith', 'drug'): SAGEConv(-1, hidden_channels),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Instantiate the model
model = HeteroGNN(hidden_channels=64, out_channels=2)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# Training loop
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train(data)
    train_acc, train_precision, train_recall, train_f1 = test(data, data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(data, data['drug'].val_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(data, data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Epoch: 001, Loss: 5.0682, Train Acc: 0.3868
Epoch: 002, Loss: 11.5280, Train Acc: 0.3868
Epoch: 003, Loss: 6.8058, Train Acc: 0.5761
Epoch: 004, Loss: 0.8202, Train Acc: 0.6255
Epoch: 005, Loss: 2.8077, Train Acc: 0.6255
Epoch: 006, Loss: 2.9713, Train Acc: 0.6255
Epoch: 007, Loss: 2.2653, Train Acc: 0.6337
Epoch: 008, Loss: 1.0935, Train Acc: 0.4280
Epoch: 009, Loss: 0.8134, Train Acc: 0.3827
Epoch: 010, Loss: 1.5713, Train Acc: 0.3909
Epoch: 011, Loss: 1.5116, Train Acc: 0.4362
Epoch: 012, Loss: 0.9085, Train Acc: 0.6420
Epoch: 013, Loss: 0.6152, Train Acc: 0.6337
Epoch: 014, Loss: 0.8766, Train Acc: 0.6337
Epoch: 015, Loss: 1.0654, Train Acc: 0.6337
Epoch: 016, Loss: 1.0398, Train Acc: 0.6337
Epoch: 017, Loss: 0.8618, Train Acc: 0.6379
Epoch: 018, Loss: 0.6623, Train Acc: 0.6667
Epoch: 019, Loss: 0.6067, Train Acc: 0.5432
Epoch: 020, Loss: 0.7071, Train Acc: 0.4774
Epoch: 021, Loss: 0.8043, Train Acc: 0.4815
Epoch: 022, Loss: 0.8044, Train Acc: 0.5021
Epoch: 023, Loss: 0.7243, Train

In [35]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GATConv, Linear
from sklearn.metrics import precision_score, recall_score, f1_score

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_heads=1):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True,add_self_loops = False),
            ('drug', 'isClassifiedAs', 'atc_code'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops = False),
            ('drug', 'targets', 'target'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops = False),
            ('drug', 'isDoping', 'doping'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops = False),
            ('drug', 'interactsWith', 'drug'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops = False),
        }, aggr='sum')
        
        # If concat=True in GATConv, the output channels are multiplied by the number of heads
        gat_output_channels = hidden_channels * num_heads
        
        self.lin = Linear(gat_output_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Instantiate the model
model = HeteroGNN(hidden_channels=64, out_channels=2, num_heads=8)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# Training loop
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train(data)
    train_acc, train_precision, train_recall, train_f1 = test(data, data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(data, data['drug'].val_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(data, data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Epoch: 001, Loss: 7.1550, Train Acc: 0.3868
Epoch: 002, Loss: 201.6221, Train Acc: 0.3868
Epoch: 003, Loss: 82.9820, Train Acc: 0.4156
Epoch: 004, Loss: 2.4089, Train Acc: 0.6173
Epoch: 005, Loss: 35.3436, Train Acc: 0.6173
Epoch: 006, Loss: 36.5569, Train Acc: 0.6173
Epoch: 007, Loss: 29.8772, Train Acc: 0.6091
Epoch: 008, Loss: 19.5826, Train Acc: 0.6296
Epoch: 009, Loss: 7.8115, Train Acc: 0.4074
Epoch: 010, Loss: 6.9310, Train Acc: 0.3827
Epoch: 011, Loss: 7.9197, Train Acc: 0.4074
Epoch: 012, Loss: 4.8197, Train Acc: 0.5967
Epoch: 013, Loss: 2.9594, Train Acc: 0.6214
Epoch: 014, Loss: 3.7908, Train Acc: 0.6173
Epoch: 015, Loss: 3.2624, Train Acc: 0.5885
Epoch: 016, Loss: 1.9963, Train Acc: 0.4444
Epoch: 017, Loss: 1.8125, Train Acc: 0.4280
Epoch: 018, Loss: 2.1992, Train Acc: 0.4650
Epoch: 019, Loss: 1.3626, Train Acc: 0.6173
Epoch: 020, Loss: 1.3274, Train Acc: 0.6091
Epoch: 021, Loss: 1.5787, Train Acc: 0.6173
Epoch: 022, Loss: 1.2790, Train Acc: 0.5926
Epoch: 023, Loss: 0.8653,

In [46]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GATConv, Linear
from sklearn.metrics import precision_score, recall_score, f1_score
from torch_geometric.transforms import ToUndirected
from torch_geometric.data import HeteroData
from sklearn.model_selection import train_test_split

# Assume data is your input HeteroData
data = ToUndirected()(data_small)

# Create labels for the 'drug' nodes based on their connections to 'doping'
doping_labels = torch.zeros(data['drug'].x.size(0), dtype=torch.long)

# Assuming encoded_doping_df has the doping information
# 0 means not doping, 1 means is doping
for drug_idx, doping in enumerate(encoded_doping_df['Doping']):
    if doping == 1:
        doping_labels[drug_idx] = 1

data['drug'].y = doping_labels

def split_edges(data, edge_type, test_size=0.2, val_size=0.1):
    edge_index = data[edge_type].edge_index.numpy()
    num_edges = edge_index.shape[1]
    
    # Split the edges into training and testing sets
    train_edges, test_edges = train_test_split(range(num_edges), test_size=test_size, random_state=42)
    
    # Further split the training edges into training and validation sets
    train_edges, val_edges = train_test_split(train_edges, test_size=val_size, random_state=42)
    
    # Create edge indices
    train_edge_index = torch.tensor(edge_index[:, train_edges], dtype=torch.long)
    val_edge_index = torch.tensor(edge_index[:, val_edges], dtype=torch.long)
    test_edge_index = torch.tensor(edge_index[:, test_edges], dtype=torch.long)
    
    return train_edge_index, val_edge_index, test_edge_index

# Define edge types
edge_types = [
    ('drug', 'isInCategory', 'drug_category'),
    ('drug', 'isClassifiedAs', 'atc_code'),
    ('drug', 'targets', 'target'),
    ('drug', 'isDoping', 'doping'),
    ('drug', 'interactsWith', 'drug')
]

# Initialize the train, validation, and test data
train_data = HeteroData()
val_data = HeteroData()
test_data = HeteroData()

for edge_type in edge_types:
    train_edge_index, val_edge_index, test_edge_index = split_edges(data, edge_type)
    
    train_data[edge_type].edge_index = train_edge_index
    val_data[edge_type].edge_index = val_edge_index
    test_data[edge_type].edge_index = test_edge_index
    
    if 'x' in data[edge_type[0]]:
        train_data[edge_type[0]].x = data[edge_type[0]].x
        val_data[edge_type[0]].x = data[edge_type[0]].x
        test_data[edge_type[0]].x = data[edge_type[0]].x
        
    if 'x' in data[edge_type[2]]:
        train_data[edge_type[2]].x = data[edge_type[2]].x
        val_data[edge_type[2]].x = data[edge_type[2]].x
        test_data[edge_type[2]].x = data[edge_type[2]].x

# Set the node features and labels for the 'drug' nodes in the train, validation, and test data
num_nodes = data['drug'].x.size(0)
train_data['drug'].x = data['drug'].x
train_data['drug'].y = data['drug'].y
val_data['drug'].x = data['drug'].x
val_data['drug'].y = data['drug'].y
test_data['drug'].x = data['drug'].x
test_data['drug'].y = data['drug'].y

# Create train, validation, and test masks for nodes
def create_node_masks(data, num_nodes, train_ratio=0.8, val_ratio=0.1):
    train_mask, test_mask = train_test_split(range(num_nodes), test_size=1-train_ratio, random_state=42)
    train_mask, val_mask = train_test_split(train_mask, test_size=val_ratio, random_state=42)

    mask_dict = {
        'train_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'val_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'test_mask': torch.zeros(num_nodes, dtype=torch.bool)
    }
    
    mask_dict['train_mask'][train_mask] = True
    mask_dict['val_mask'][val_mask] = True
    mask_dict['test_mask'][test_mask] = True
    
    return mask_dict

node_masks = create_node_masks(data, num_nodes)

train_data['drug'].train_mask = node_masks['train_mask']
train_data['drug'].val_mask = node_masks['val_mask']
train_data['drug'].test_mask = node_masks['test_mask']

val_data['drug'].train_mask = node_masks['train_mask']
val_data['drug'].val_mask = node_masks['val_mask']
val_data['drug'].test_mask = node_masks['test_mask']

test_data['drug'].train_mask = node_masks['train_mask']
test_data['drug'].val_mask = node_masks['val_mask']
test_data['drug'].test_mask = node_masks['test_mask']

# Print data to verify the splits
print("Train Data:", train_data)
print("Validation Data:", val_data)
print("Test Data:", test_data)

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_heads=1):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'isClassifiedAs', 'atc_code'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'targets', 'target'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'isDoping', 'doping'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'interactsWith', 'drug'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
        }, aggr='sum')
        
        # If concat=True in GATConv, the output channels are multiplied by the number of heads
        gat_output_channels = hidden_channels * num_heads
        
        self.lin = Linear(gat_output_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Define the model, optimizer, and loss function
model = HeteroGNN(hidden_channels=64, out_channels=2, num_heads=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training and testing functions
def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# Training loop
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train(train_data)
    train_acc, train_precision, train_recall, train_f1 = test(train_data, train_data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(val_data, val_data['drug'].val_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(test_data, test_data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Train Data: HeteroData(
  drug={
    x=[338, 467],
    y=[338],
    train_mask=[338],
    val_mask=[338],
    test_mask=[338],
  },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 6009] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 1604] },
  (drug, targets, target)={ edge_index=[2, 243] },
  (drug, isDoping, doping)={ edge_index=[2, 243] },
  (drug, interactsWith, drug)={ edge_index=[2, 46111] }
)
Validation Data: HeteroData(
  drug={
    x=[338, 467],
    y=[338],
    train_mask=[338],
    val_mask=[338],
    test_mask=[338],
  },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 668] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 179] },
  (drug, targets, target)={ edge_index=[2, 27] },
  (drug, isDoping, doping)={ edge_index=[2

In [47]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GraphConv, Linear
from sklearn.metrics import precision_score, recall_score, f1_score
from torch_geometric.transforms import ToUndirected
from torch_geometric.data import HeteroData
from sklearn.model_selection import train_test_split

# Assume data is your input HeteroData
data = ToUndirected()(data_small)

# Create labels for the 'drug' nodes based on their connections to 'doping'
doping_labels = torch.zeros(data['drug'].x.size(0), dtype=torch.long)

# Assuming encoded_doping_df has the doping information
# 0 means not doping, 1 means is doping
for drug_idx, doping in enumerate(encoded_doping_df['Doping']):
    if doping == 1:
        doping_labels[drug_idx] = 1

data['drug'].y = doping_labels

def split_edges(data, edge_type, test_size=0.2, val_size=0.1):
    edge_index = data[edge_type].edge_index.numpy()
    num_edges = edge_index.shape[1]
    
    # Split the edges into training and testing sets
    train_edges, test_edges = train_test_split(range(num_edges), test_size=test_size, random_state=42)
    
    # Further split the training edges into training and validation sets
    train_edges, val_edges = train_test_split(train_edges, test_size=val_size, random_state=42)
    
    # Create edge indices
    train_edge_index = torch.tensor(edge_index[:, train_edges], dtype=torch.long)
    val_edge_index = torch.tensor(edge_index[:, val_edges], dtype=torch.long)
    test_edge_index = torch.tensor(edge_index[:, test_edges], dtype=torch.long)
    
    return train_edge_index, val_edge_index, test_edge_index

# Define edge types
edge_types = [
    ('drug', 'isInCategory', 'drug_category'),
    ('drug', 'isClassifiedAs', 'atc_code'),
    ('drug', 'targets', 'target'),
    ('drug', 'isDoping', 'doping'),
    ('drug', 'interactsWith', 'drug')
]

# Initialize the train, validation, and test data
train_data = HeteroData()
val_data = HeteroData()
test_data = HeteroData()

for edge_type in edge_types:
    train_edge_index, val_edge_index, test_edge_index = split_edges(data, edge_type)
    
    train_data[edge_type].edge_index = train_edge_index
    val_data[edge_type].edge_index = val_edge_index
    test_data[edge_type].edge_index = test_edge_index
    
    if 'x' in data[edge_type[0]]:
        train_data[edge_type[0]].x = data[edge_type[0]].x
        val_data[edge_type[0]].x = data[edge_type[0]].x
        test_data[edge_type[0]].x = data[edge_type[0]].x
        
    if 'x' in data[edge_type[2]]:
        train_data[edge_type[2]].x = data[edge_type[2]].x
        val_data[edge_type[2]].x = data[edge_type[2]].x
        test_data[edge_type[2]].x = data[edge_type[2]].x

# Set the node features and labels for the 'drug' nodes in the train, validation, and test data
num_nodes = data['drug'].x.size(0)
train_data['drug'].x = data['drug'].x
train_data['drug'].y = data['drug'].y
val_data['drug'].x = data['drug'].x
val_data['drug'].y = data['drug'].y
test_data['drug'].x = data['drug'].x
test_data['drug'].y = data['drug'].y

# Create train, validation, and test masks for nodes
def create_node_masks(data, num_nodes, train_ratio=0.8, val_ratio=0.1):
    train_mask, test_mask = train_test_split(range(num_nodes), test_size=1-train_ratio, random_state=42)
    train_mask, val_mask = train_test_split(train_mask, test_size=val_ratio, random_state=42)

    mask_dict = {
        'train_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'val_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'test_mask': torch.zeros(num_nodes, dtype=torch.bool)
    }
    
    mask_dict['train_mask'][train_mask] = True
    mask_dict['val_mask'][val_mask] = True
    mask_dict['test_mask'][test_mask] = True
    
    return mask_dict

node_masks = create_node_masks(data, num_nodes)

train_data['drug'].train_mask = node_masks['train_mask']
train_data['drug'].val_mask = node_masks['val_mask']
train_data['drug'].test_mask = node_masks['test_mask']

val_data['drug'].train_mask = node_masks['train_mask']
val_data['drug'].val_mask = node_masks['val_mask']
val_data['drug'].test_mask = node_masks['test_mask']

test_data['drug'].train_mask = node_masks['train_mask']
test_data['drug'].val_mask = node_masks['val_mask']
test_data['drug'].test_mask = node_masks['test_mask']

# Print data to verify the splits
print("Train Data:", train_data)
print("Validation Data:", val_data)
print("Test Data:", test_data)

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GraphConv(-1, hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): GraphConv(-1, hidden_channels),
            ('drug', 'targets', 'target'): GraphConv(-1, hidden_channels),
            ('drug', 'isDoping', 'doping'): GraphConv(-1, hidden_channels),
            ('drug', 'interactsWith', 'drug'): GraphConv(-1, hidden_channels),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Define the model, optimizer, and loss function
model = HeteroGNN(hidden_channels=64, out_channels=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training and testing functions
def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# Training loop
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train(train_data)
    train_acc, train_precision, train_recall, train_f1 = test(train_data, train_data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(val_data, val_data['drug'].val_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(test_data, test_data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Train Data: HeteroData(
  drug={
    x=[338, 467],
    y=[338],
    train_mask=[338],
    val_mask=[338],
    test_mask=[338],
  },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 6009] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 1604] },
  (drug, targets, target)={ edge_index=[2, 243] },
  (drug, isDoping, doping)={ edge_index=[2, 243] },
  (drug, interactsWith, drug)={ edge_index=[2, 46111] }
)
Validation Data: HeteroData(
  drug={
    x=[338, 467],
    y=[338],
    train_mask=[338],
    val_mask=[338],
    test_mask=[338],
  },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 668] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 179] },
  (drug, targets, target)={ edge_index=[2, 27] },
  (drug, isDoping, doping)={ edge_index=[2

In [48]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear
from sklearn.metrics import precision_score, recall_score, f1_score
from torch_geometric.transforms import ToUndirected
from torch_geometric.data import HeteroData
from sklearn.model_selection import train_test_split

# Assume data is your input HeteroData
data = ToUndirected()(data_small)

# Create labels for the 'drug' nodes based on their connections to 'doping'
doping_labels = torch.zeros(data['drug'].x.size(0), dtype=torch.long)

# Assuming encoded_doping_df has the doping information
# 0 means not doping, 1 means is doping
for drug_idx, doping in enumerate(encoded_doping_df['Doping']):
    if doping == 1:
        doping_labels[drug_idx] = 1

data['drug'].y = doping_labels

def split_edges(data, edge_type, test_size=0.2, val_size=0.1):
    edge_index = data[edge_type].edge_index.numpy()
    num_edges = edge_index.shape[1]
    
    # Split the edges into training and testing sets
    train_edges, test_edges = train_test_split(range(num_edges), test_size=test_size, random_state=42)
    
    # Further split the training edges into training and validation sets
    train_edges, val_edges = train_test_split(train_edges, test_size=val_size, random_state=42)
    
    # Create edge indices
    train_edge_index = torch.tensor(edge_index[:, train_edges], dtype=torch.long)
    val_edge_index = torch.tensor(edge_index[:, val_edges], dtype=torch.long)
    test_edge_index = torch.tensor(edge_index[:, test_edges], dtype=torch.long)
    
    return train_edge_index, val_edge_index, test_edge_index

# Define edge types
edge_types = [
    ('drug', 'isInCategory', 'drug_category'),
    ('drug', 'isClassifiedAs', 'atc_code'),
    ('drug', 'targets', 'target'),
    ('drug', 'isDoping', 'doping'),
    ('drug', 'interactsWith', 'drug')
]

# Initialize the train, validation, and test data
train_data = HeteroData()
val_data = HeteroData()
test_data = HeteroData()

for edge_type in edge_types:
    train_edge_index, val_edge_index, test_edge_index = split_edges(data, edge_type)
    
    train_data[edge_type].edge_index = train_edge_index
    val_data[edge_type].edge_index = val_edge_index
    test_data[edge_type].edge_index = test_edge_index
    
    if 'x' in data[edge_type[0]]:
        train_data[edge_type[0]].x = data[edge_type[0]].x
        val_data[edge_type[0]].x = data[edge_type[0]].x
        test_data[edge_type[0]].x = data[edge_type[0]].x
        
    if 'x' in data[edge_type[2]]:
        train_data[edge_type[2]].x = data[edge_type[2]].x
        val_data[edge_type[2]].x = data[edge_type[2]].x
        test_data[edge_type[2]].x = data[edge_type[2]].x

# Set the node features and labels for the 'drug' nodes in the train, validation, and test data
num_nodes = data['drug'].x.size(0)
train_data['drug'].x = data['drug'].x
train_data['drug'].y = data['drug'].y
val_data['drug'].x = data['drug'].x
val_data['drug'].y = data['drug'].y
test_data['drug'].x = data['drug'].x
test_data['drug'].y = data['drug'].y

# Create train, validation, and test masks for nodes
def create_node_masks(data, num_nodes, train_ratio=0.8, val_ratio=0.1):
    train_mask, test_mask = train_test_split(range(num_nodes), test_size=1-train_ratio, random_state=42)
    train_mask, val_mask = train_test_split(train_mask, test_size=val_ratio, random_state=42)

    mask_dict = {
        'train_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'val_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'test_mask': torch.zeros(num_nodes, dtype=torch.bool)
    }
    
    mask_dict['train_mask'][train_mask] = True
    mask_dict['val_mask'][val_mask] = True
    mask_dict['test_mask'][test_mask] = True
    
    return mask_dict

node_masks = create_node_masks(data, num_nodes)

train_data['drug'].train_mask = node_masks['train_mask']
train_data['drug'].val_mask = node_masks['val_mask']
train_data['drug'].test_mask = node_masks['test_mask']

val_data['drug'].train_mask = node_masks['train_mask']
val_data['drug'].val_mask = node_masks['val_mask']
val_data['drug'].test_mask = node_masks['test_mask']

test_data['drug'].train_mask = node_masks['train_mask']
test_data['drug'].val_mask = node_masks['val_mask']
test_data['drug'].test_mask = node_masks['test_mask']

# Print data to verify the splits
print("Train Data:", train_data)
print("Validation Data:", val_data)
print("Test Data:", test_data)

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): SAGEConv(-1, hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): SAGEConv(-1, hidden_channels),
            ('drug', 'targets', 'target'): SAGEConv(-1, hidden_channels),
            ('drug', 'isDoping', 'doping'): SAGEConv(-1, hidden_channels),
            ('drug', 'interactsWith', 'drug'): SAGEConv(-1, hidden_channels),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Define the model, optimizer, and loss function
model = HeteroGNN(hidden_channels=64, out_channels=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training and testing functions
def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# Training loop
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train(train_data)
    train_acc, train_precision, train_recall, train_f1 = test(train_data, train_data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(val_data, val_data['drug'].val_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(test_data, test_data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Train Data: HeteroData(
  drug={
    x=[338, 467],
    y=[338],
    train_mask=[338],
    val_mask=[338],
    test_mask=[338],
  },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 6009] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 1604] },
  (drug, targets, target)={ edge_index=[2, 243] },
  (drug, isDoping, doping)={ edge_index=[2, 243] },
  (drug, interactsWith, drug)={ edge_index=[2, 46111] }
)
Validation Data: HeteroData(
  drug={
    x=[338, 467],
    y=[338],
    train_mask=[338],
    val_mask=[338],
    test_mask=[338],
  },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 668] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 179] },
  (drug, targets, target)={ edge_index=[2, 27] },
  (drug, isDoping, doping)={ edge_index=[2

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]
        
        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        
        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# To accumulate metrics across epochs
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train()
    train_acc, train_precision, train_recall, train_f1 = test(data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(data['drug'].val_mask)
    test_acc = test(data['drug'].test_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


ZeroDivisionError: division by zero