In [1]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Load encoded data from CSV files
encoded_drugbank_id_df = pd.read_csv('encoders_small/encoded_drugbank_id.csv')
encoded_name_df = pd.read_csv('encoders_small/encoded_name.csv')
encoded_state_df = pd.read_csv('encoders_small/encoded_state.csv')
encoded_groups_df = pd.read_csv('encoders_small/encoded_groups.csv')
encoded_categories_df = pd.read_csv('encoders_small/encoded_categories.csv')
encoded_atc_codes_df = pd.read_csv('encoders_small/encoded_atc_codes.csv')
encoded_targets_df = pd.read_csv('encoders_small/encoded_targets.csv')
encoded_interactions_df = pd.read_csv('encoders_small/encoded_interactions.csv')
encoded_molecular_formula_df = pd.read_csv('encoders_small/encoded_molecular_formula.csv')
encoded_doping_df = pd.read_csv('encoders_small/encoded_doping.csv')

# Convert DataFrames to tensors
encoded_drugbank_id_tensor_1 = torch.tensor(encoded_drugbank_id_df.values, dtype=torch.float32)
encoded_name_tensor_1 = torch.tensor(encoded_name_df.values, dtype=torch.float32)
encoded_state_tensor_1 = torch.tensor(encoded_state_df.values, dtype=torch.float32)
encoded_groups_tensor_1 = torch.tensor(encoded_groups_df.values, dtype=torch.float32)
encoded_categories_tensor_1 = torch.tensor(encoded_categories_df.values, dtype=torch.float32)
encoded_atc_codes_tensor_1 = torch.tensor(encoded_atc_codes_df.values, dtype=torch.float32)
encoded_targets_tensor_1 = torch.tensor(encoded_targets_df.values, dtype=torch.float32)
encoded_interactions_tensor_1 = torch.tensor(encoded_interactions_df.values, dtype=torch.float32)
encoded_molecular_formula_tensor_1 = torch.tensor(encoded_molecular_formula_df.values, dtype=torch.float32)
encoded_doping_tensor_1 = torch.tensor(encoded_doping_df.values, dtype=torch.float32)

# Initialize HeteroData
data_small = HeteroData()

# Add Drug node features
data_small['drug'].x = torch.cat([
    encoded_drugbank_id_tensor_1,
    encoded_name_tensor_1,
    encoded_state_tensor_1,
    encoded_groups_tensor_1,
    encoded_molecular_formula_tensor_1
], dim=1)

# Add Drug Category nodes (one-hot encoding)
data_small['drug_category'].x = torch.eye(len(encoded_categories_df.columns), dtype=torch.float32)

# Add ATC Code nodes (one-hot encoding)
data_small['atc_code'].x = torch.eye(len(encoded_atc_codes_df.columns), dtype=torch.float32)

# Add Target nodes (one-hot encoding)
data_small['target'].x = torch.eye(len(encoded_targets_df.columns), dtype=torch.float32)

# Add Doping nodes (one-hot encoding)
data_small['doping'].x = torch.eye(len(encoded_doping_df['Doping'].unique()), dtype=torch.float32)

# Create edge lists for drug-to-category relationships
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_categories_df.iterrows():
    for category_idx in range(len(row)):
        if row[category_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(category_idx)
data_small['drug', 'isInCategory', 'drug_category'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-ATC code relationships
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_atc_codes_df.iterrows():
    for atc_code_idx in range(len(row)):
        if row[atc_code_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(atc_code_idx)
data_small['drug', 'isClassifiedAs', 'atc_code'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-target relationships
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_targets_df.iterrows():
    for target_idx in range(len(row)):
        if row[target_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(target_idx)
data_small['drug', 'targets', 'target'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-doping relationships
source_nodes = []
target_nodes = []
for drug_idx, doping in enumerate(encoded_doping_df['Doping']):
    source_nodes.append(drug_idx)
    target_nodes.append(doping)
data_small['drug', 'isDoping', 'doping'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Create edge lists for drug-to-drug interactions
source_nodes = []
target_nodes = []
for drug_idx, row in encoded_interactions_df.iterrows():
    for target_idx in range(len(row)):
        if row[target_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(target_idx)
data_small['drug', 'interactsWith', 'drug'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

print(data_small)




HeteroData(
  drug={ x=[338, 467] },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 8347] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 2229] },
  (drug, targets, target)={ edge_index=[2, 338] },
  (drug, isDoping, doping)={ edge_index=[2, 338] },
  (drug, interactsWith, drug)={ edge_index=[2, 41415] }
)


In [2]:
encoded_drugbank_id_df.size

338

In [9]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomNodeSplit

# Load encoded data from CSV files and convert to tensors (omitted for brevity)
# Initialize HeteroData and add nodes and edges (omitted for brevity)

# Add doping labels to the drug nodes
data_small['drug'].y = torch.tensor(encoded_doping_df['Doping'].values, dtype=torch.long)

# Perform a node-level random split
transform = RandomNodeSplit(split='random', num_splits=1)
data_small = transform(data_small)

# Verify masks
print(data_small['drug'].train_mask.sum(), data_small['drug'].val_mask.sum(), data_small['drug'].test_mask.sum())


tensor(40) tensor(298) tensor(0)


In [13]:
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear

class HeteroGNN1(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN1, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'targets', 'target'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'isDoping', 'doping'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'interactsWith', 'drug'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

model1 = HeteroGNN1(hidden_channels=64, out_channels=2)


In [14]:
# Define optimizer and loss function
optimizer = torch.optim.Adam(model1.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model1.train()
    optimizer.zero_grad()
    out = model1(data_small.x_dict, data_small.edge_index_dict)
    loss = criterion(out[data_small['drug'].train_mask], data_small['drug'].y[data_small['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(mask):
    model1.eval()
    with torch.no_grad():
        out = model1(data_small.x_dict, data_small.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data_small['drug'].y[mask]
        return int(correct.sum()) / int(mask.sum())

for epoch in range(1, 101):
    loss = train()
    train_acc = test(data_small['drug'].train_mask)
    val_acc = test(data_small['drug'].val_mask)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

test_acc = test(data_small['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}')


Epoch: 001, Loss: 0.9811, Train Acc: 0.5500, Val Acc: 0.3826
Epoch: 002, Loss: 2.6032, Train Acc: 0.5000, Val Acc: 0.6342
Epoch: 003, Loss: 7.2715, Train Acc: 0.5000, Val Acc: 0.6477
Epoch: 004, Loss: 4.7531, Train Acc: 0.5000, Val Acc: 0.4396
Epoch: 005, Loss: 1.3754, Train Acc: 0.5000, Val Acc: 0.3658
Epoch: 006, Loss: 3.3175, Train Acc: 0.5250, Val Acc: 0.3859
Epoch: 007, Loss: 2.0312, Train Acc: 0.6000, Val Acc: 0.4799
Epoch: 008, Loss: 1.0818, Train Acc: 0.6500, Val Acc: 0.6007
Epoch: 009, Loss: 1.6187, Train Acc: 0.6250, Val Acc: 0.6275
Epoch: 010, Loss: 1.7635, Train Acc: 0.6750, Val Acc: 0.5805
Epoch: 011, Loss: 1.3000, Train Acc: 0.6500, Val Acc: 0.4966
Epoch: 012, Loss: 0.8333, Train Acc: 0.5250, Val Acc: 0.3993
Epoch: 013, Loss: 0.8998, Train Acc: 0.5750, Val Acc: 0.3893
Epoch: 014, Loss: 1.1544, Train Acc: 0.6000, Val Acc: 0.3926
Epoch: 015, Loss: 1.0189, Train Acc: 0.6500, Val Acc: 0.3960
Epoch: 016, Loss: 0.7029, Train Acc: 0.7000, Val Acc: 0.5268
Epoch: 017, Loss: 0.6086

ZeroDivisionError: division by zero

In [17]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomNodeSplit
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear

# Load encoded data from CSV files and convert to tensors (omitted for brevity)
# Initialize HeteroData and add nodes and edges (omitted for brevity)

# Add doping labels to the drug nodes
data_small['drug'].y = torch.tensor(encoded_doping_df['Doping'].values, dtype=torch.long)

# Perform a node-level random split with ensured allocation
transform = RandomNodeSplit(split='random', num_train_per_class=20, num_val=200, num_test=100)
data_small = transform(data_small)

# Verify masks
print('Train mask sum:', data_small['drug'].train_mask.sum().item())
print('Val mask sum:', data_small['drug'].val_mask.sum().item())
print('Test mask sum:', data_small['drug'].test_mask.sum().item())

class HeteroGNN1(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super(HeteroGNN1, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'targets', 'target'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'isDoping', 'doping'): SAGEConv((-1, -1), hidden_channels),
            ('drug', 'interactsWith', 'drug'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

model1 = HeteroGNN1(hidden_channels=64, out_channels=2)
# Define optimizer and loss function
optimizer = torch.optim.Adam(model1.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model1.train()
    optimizer.zero_grad()
    out = model1(data_small.x_dict, data_small.edge_index_dict)
    loss = criterion(out[data_small['drug'].train_mask], data_small['drug'].y[data_small['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(mask):
    model1.eval()
    with torch.no_grad():
        out = model1(data_small.x_dict, data_small.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data_small['drug'].y[mask]
        if int(mask.sum()) == 0:  # Check to avoid division by zero
            return float('nan')
        return int(correct.sum()) / int(mask.sum())

for epoch in range(1, 101):
    loss = train()
    train_acc = test(data_small['drug'].train_mask)
    val_acc = test(data_small['drug'].val_mask)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

test_acc = test(data_small['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}')


Train mask sum: 40
Val mask sum: 200
Test mask sum: 98
Epoch: 001, Loss: 1.7704, Train Acc: 0.5000, Val Acc: 0.3700
Epoch: 002, Loss: 15.8405, Train Acc: 0.5000, Val Acc: 0.3700
Epoch: 003, Loss: 10.4824, Train Acc: 0.5000, Val Acc: 0.3700
Epoch: 004, Loss: 3.0849, Train Acc: 0.5000, Val Acc: 0.6300
Epoch: 005, Loss: 3.5898, Train Acc: 0.5000, Val Acc: 0.6300
Epoch: 006, Loss: 4.7413, Train Acc: 0.5000, Val Acc: 0.6300
Epoch: 007, Loss: 3.8613, Train Acc: 0.5000, Val Acc: 0.6300
Epoch: 008, Loss: 2.0971, Train Acc: 0.6000, Val Acc: 0.5250
Epoch: 009, Loss: 0.6750, Train Acc: 0.5250, Val Acc: 0.3700
Epoch: 010, Loss: 1.1810, Train Acc: 0.5250, Val Acc: 0.3750
Epoch: 011, Loss: 1.6749, Train Acc: 0.5250, Val Acc: 0.3750
Epoch: 012, Loss: 1.7238, Train Acc: 0.5250, Val Acc: 0.3850
Epoch: 013, Loss: 1.4627, Train Acc: 0.5250, Val Acc: 0.3650
Epoch: 014, Loss: 1.0560, Train Acc: 0.5750, Val Acc: 0.3750
Epoch: 015, Loss: 0.7090, Train Acc: 0.7000, Val Acc: 0.5000
Epoch: 016, Loss: 0.6318, Tr

In [12]:
from torch_geometric.transforms import ToUndirected, RandomLinkSplit
# Make the graph undirected and remove reverse edge labels
data = ToUndirected()(data_small)

# Perform a link-level split into training, validation, and test edges
transform = RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('drug', 'isInCategory', 'drug_category'),
                ('drug', 'isClassifiedAs', 'atc_code'),
                ('drug', 'targets', 'target'),
                ('drug', 'isDoping', 'doping'),
                ('drug', 'interactsWith', 'drug')],
)
train_data, val_data, test_data = transform(data)

print(train_data)
print(val_data)
print(test_data)

HeteroData(
  drug={
    x=[338, 467],
    y=[338],
    train_mask=[338],
    val_mask=[338],
    test_mask=[338],
  },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={
    edge_index=[2, 7096],
    edge_label=[7096],
    edge_label_index=[2, 7096],
  },
  (drug, isClassifiedAs, atc_code)={
    edge_index=[2, 1896],
    edge_label=[1896],
    edge_label_index=[2, 1896],
  },
  (drug, targets, target)={
    edge_index=[2, 289],
    edge_label=[289],
    edge_label_index=[2, 289],
  },
  (drug, isDoping, doping)={
    edge_index=[2, 289],
    edge_label=[289],
    edge_label_index=[2, 289],
  },
  (drug, interactsWith, drug)={
    edge_index=[2, 54438],
    edge_label=[54438],
    edge_label_index=[2, 54438],
  },
  (drug_category, rev_isInCategory, drug)={ edge_index=[2, 8347] },
  (atc_code, rev_isClassifiedAs, drug)={ edge_index=[2, 2229] },
  (target, rev_targets, drug)={ edge_in

In [16]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('hetero.csv')

# Extract the set of valid DrugBank IDs
valid_drugbank_ids = set(df['DrugBank ID'])

# Function to clean the values in a column
def clean_values(column):
    def clean_cell(cell):
        if pd.isna(cell):
            return cell
        ids = cell.split('; ')
        valid_ids = [drug_id for drug_id in ids if drug_id in valid_drugbank_ids]
        return '; '.join(valid_ids)
    
    return column.apply(clean_cell)

# Clean the 'Interactions' and 'Similar Structure' columns
df['Interactions'] = clean_values(df['Interactions'])
df['Similar Structure'] = clean_values(df['Similar Structure'])

# Save the cleaned DataFrame back to CSV
df.to_csv('cleaned_hetero.csv', index=False)
