In [1]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Load encoded data from CSV files
encoded_drugbank_id_df = pd.read_csv('Heterogeneous KG/encoders/encoded_drugbank_id.csv')
encoded_name_df = pd.read_csv('Heterogeneous KG/encoders/encoded_name.csv')
encoded_state_df = pd.read_csv('Heterogeneous KG/encoders/encoded_state.csv')
encoded_groups_df = pd.read_csv('Heterogeneous KG/encoders/encoded_groups.csv')
encoded_categories_df = pd.read_csv('Heterogeneous KG/encoders/encoded_categories.csv')
encoded_atc_codes_df = pd.read_csv('Heterogeneous KG/encoders/encoded_atc_codes.csv')
encoded_targets_df = pd.read_csv('Heterogeneous KG/encoders/encoded_targets.csv')
encoded_interactions_df = pd.read_csv('Heterogeneous KG/encoders/encoded_interactions.csv')
encoded_molecular_formula_df = pd.read_csv('Heterogeneous KG/encoders/encoded_molecular_formula.csv')
encoded_doping_df = pd.read_csv('Heterogeneous KG/encoders/encoded_doping.csv')

# Convert DataFrames to tensors
encoded_drugbank_id_tensor = torch.tensor(encoded_drugbank_id_df.values, dtype=torch.float32)
encoded_name_tensor = torch.tensor(encoded_name_df.values, dtype=torch.float32)
encoded_state_tensor = torch.tensor(encoded_state_df.values, dtype=torch.float32)
encoded_groups_tensor = torch.tensor(encoded_groups_df.values, dtype=torch.float32)
encoded_categories_tensor = torch.tensor(encoded_categories_df.values, dtype=torch.float32)
encoded_atc_codes_tensor = torch.tensor(encoded_atc_codes_df.values, dtype=torch.float32)
encoded_targets_tensor = torch.tensor(encoded_targets_df.values, dtype=torch.float32)
encoded_interactions_tensor = torch.tensor(encoded_interactions_df.values, dtype=torch.float32)
encoded_molecular_formula_tensor = torch.tensor(encoded_molecular_formula_df.values, dtype=torch.float32)
encoded_doping_tensor = torch.tensor(encoded_doping_df.values, dtype=torch.float32)

# Initialize HeteroData
data = HeteroData()

# Add Drug node features
data['drug'].x = torch.cat([
    encoded_drugbank_id_tensor,
    encoded_name_tensor,
    encoded_state_tensor,
    encoded_groups_tensor,
    encoded_molecular_formula_tensor
], dim=1)

data['drug_category'].x = torch.eye(len(encoded_categories_df.columns), dtype=torch.float32)
data['atc_code'].x = torch.eye(len(encoded_atc_codes_df.columns), dtype=torch.float32)
data['target'].x = torch.eye(len(encoded_targets_df.columns), dtype=torch.float32)
data['doping'].x = torch.eye(len(encoded_doping_df['Doping'].unique()), dtype=torch.float32)

source_nodes = []
target_nodes = []
for drug_idx, row in encoded_categories_df.iterrows():
    for category_idx in range(len(row)):
        if row[category_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(category_idx)
data['drug', 'isInCategory', 'drug_category'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

source_nodes = []
target_nodes = []
for drug_idx, row in encoded_atc_codes_df.iterrows():
    for atc_code_idx in range(len(row)):
        if row[atc_code_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(atc_code_idx)
data['drug', 'isClassifiedAs', 'atc_code'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

source_nodes = []
target_nodes = []
for drug_idx, row in encoded_targets_df.iterrows():
    for target_idx in range(len(row)):
        if row[target_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(target_idx)
data['drug', 'targets', 'target'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

source_nodes = []
target_nodes = []
for drug_idx, doping in enumerate(encoded_doping_df['Doping']):
    source_nodes.append(drug_idx)
    target_nodes.append(doping)
data['drug', 'isDoping', 'doping'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

source_nodes = []
target_nodes = []
for drug_idx, row in encoded_interactions_df.iterrows():
    for target_idx in range(len(row)):
        if row[target_idx] == 1:
            source_nodes.append(drug_idx)
            target_nodes.append(target_idx)
data['drug', 'interactsWith', 'drug'].edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

print(data)




HeteroData(
  drug={ x=[11773, 467] },
  drug_category={ x=[3520, 3520] },
  atc_code={ x=[4742, 4742] },
  target={ x=[3577, 3577] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 83012] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 23341] },
  (drug, targets, target)={ edge_index=[2, 11773] },
  (drug, isDoping, doping)={ edge_index=[2, 11773] },
  (drug, interactsWith, drug)={ edge_index=[2, 2499843] }
)


In [38]:
def analyze_hetero_data(data):
    total_nodes = 0
    total_edges = 0
    total_features = 0
    unique_nodes = set()
    num_node_types = len(data.node_types)
    num_edge_types = len(data.edge_types)
    
    for node_type in data.node_types:
        num_nodes = data[node_type].num_nodes
        total_nodes += num_nodes
        total_features += data[node_type].x.size(1) if 'x' in data[node_type] else 0
        unique_nodes.update(range(num_nodes))
    
    for edge_type in data.edge_types:
        total_edges += data[edge_type].edge_index.size(1)
    
    average_nodes = total_nodes / num_node_types if num_node_types > 0 else 0
    average_edges = total_edges / num_edge_types if num_edge_types > 0 else 0
    
    return {
        'total_nodes': total_nodes,
        'total_edges': total_edges,
        'total_features': total_features,
        'unique_nodes': len(unique_nodes),
        'average_nodes': average_nodes,
        'average_edges': average_edges,
    }

# Analyze the HeteroData
analysis_result = analyze_hetero_data(data)
print(analysis_result)

{'total_nodes': 23614, 'total_edges': 5064233, 'total_features': 12308, 'unique_nodes': 11773, 'average_nodes': 4722.8, 'average_edges': 562692.5555555555}


In [3]:
from torch_geometric.transforms import ToUndirected, RandomLinkSplit
# Make the graph undirected and remove reverse edge labels
data = ToUndirected()(data)

# Perform a link-level split into training, validation, and test edges
transform = RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('drug', 'isInCategory', 'drug_category'),
                ('drug', 'isClassifiedAs', 'atc_code'),
                ('drug', 'targets', 'target'),
                ('drug', 'isDoping', 'doping'),
                ('drug', 'interactsWith', 'drug')],
)
train_data, val_data, test_data = transform(data)

print(train_data)
print(val_data)
print(test_data)

HeteroData(
  drug={ x=[11773, 467] },
  drug_category={ x=[3520, 3520] },
  atc_code={ x=[4742, 4742] },
  target={ x=[3577, 3577] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={
    edge_index=[2, 70561],
    edge_label=[70561],
    edge_label_index=[2, 70561],
  },
  (drug, isClassifiedAs, atc_code)={
    edge_index=[2, 19840],
    edge_label=[19840],
    edge_label_index=[2, 19840],
  },
  (drug, targets, target)={
    edge_index=[2, 10008],
    edge_label=[10008],
    edge_label_index=[2, 10008],
  },
  (drug, isDoping, doping)={
    edge_index=[2, 10008],
    edge_label=[10008],
    edge_label_index=[2, 10008],
  },
  (drug, interactsWith, drug)={
    edge_index=[2, 4083771],
    edge_label=[4083771],
    edge_label_index=[2, 4083771],
  },
  (drug_category, rev_isInCategory, drug)={ edge_index=[2, 83012] },
  (atc_code, rev_isClassifiedAs, drug)={ edge_index=[2, 23341] },
  (target, rev_targets, drug)={ edge_index=[2, 11773] },
  (doping, rev_isDoping, drug)={ 

In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GATConv, Linear
from sklearn.metrics import precision_score, recall_score, f1_score
from torch_geometric.transforms import ToUndirected
from torch_geometric.data import HeteroData
from sklearn.model_selection import train_test_split

data = ToUndirected()(data)

doping_labels = torch.zeros(data['drug'].x.size(0), dtype=torch.long)

# 0 means not doping, 1 means is doping
for drug_idx, doping in enumerate(encoded_doping_df['Doping']):
    if doping == 1:
        doping_labels[drug_idx] = 1

data['drug'].y = doping_labels

def split_edges(data, edge_type, test_size=0.2, val_size=0.1):
    edge_index = data[edge_type].edge_index.numpy()
    num_edges = edge_index.shape[1]
    
    train_edges, test_edges = train_test_split(range(num_edges), test_size=test_size, random_state=42)
  
    train_edges, val_edges = train_test_split(train_edges, test_size=val_size, random_state=42)
    
    # Create edge indices
    train_edge_index = torch.tensor(edge_index[:, train_edges], dtype=torch.long)
    val_edge_index = torch.tensor(edge_index[:, val_edges], dtype=torch.long)
    test_edge_index = torch.tensor(edge_index[:, test_edges], dtype=torch.long)
    
    return train_edge_index, val_edge_index, test_edge_index

# Define edge types
edge_types = [
    ('drug', 'isInCategory', 'drug_category'),
    ('drug', 'isClassifiedAs', 'atc_code'),
    ('drug', 'targets', 'target'),
    ('drug', 'isDoping', 'doping'),
    ('drug', 'interactsWith', 'drug')
]

# Initialize the train, validation, and test data
train_data = HeteroData()
val_data = HeteroData()
test_data = HeteroData()

for edge_type in edge_types:
    train_edge_index, val_edge_index, test_edge_index = split_edges(data, edge_type)
    
    train_data[edge_type].edge_index = train_edge_index
    val_data[edge_type].edge_index = val_edge_index
    test_data[edge_type].edge_index = test_edge_index
    
    if 'x' in data[edge_type[0]]:
        train_data[edge_type[0]].x = data[edge_type[0]].x
        val_data[edge_type[0]].x = data[edge_type[0]].x
        test_data[edge_type[0]].x = data[edge_type[0]].x
        
    if 'x' in data[edge_type[2]]:
        train_data[edge_type[2]].x = data[edge_type[2]].x
        val_data[edge_type[2]].x = data[edge_type[2]].x
        test_data[edge_type[2]].x = data[edge_type[2]].x

# Set the node features and labels for the 'drug' nodes in the train, validation, and test data
num_nodes = data['drug'].x.size(0)
train_data['drug'].x = data['drug'].x
train_data['drug'].y = data['drug'].y
val_data['drug'].x = data['drug'].x
val_data['drug'].y = data['drug'].y
test_data['drug'].x = data['drug'].x
test_data['drug'].y = data['drug'].y

# Create train, validation, and test masks for nodes
def create_node_masks(data, num_nodes, train_ratio=0.8, val_ratio=0.1):
    train_mask, test_mask = train_test_split(range(num_nodes), test_size=1-train_ratio, random_state=42)
    train_mask, val_mask = train_test_split(train_mask, test_size=val_ratio, random_state=42)

    mask_dict = {
        'train_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'val_mask': torch.zeros(num_nodes, dtype=torch.bool),
        'test_mask': torch.zeros(num_nodes, dtype=torch.bool)
    }
    
    mask_dict['train_mask'][train_mask] = True
    mask_dict['val_mask'][val_mask] = True
    mask_dict['test_mask'][test_mask] = True
    
    return mask_dict

node_masks = create_node_masks(data, num_nodes)

train_data['drug'].train_mask = node_masks['train_mask']
train_data['drug'].val_mask = node_masks['val_mask']
train_data['drug'].test_mask = node_masks['test_mask']

val_data['drug'].train_mask = node_masks['train_mask']
val_data['drug'].val_mask = node_masks['val_mask']
val_data['drug'].test_mask = node_masks['test_mask']

test_data['drug'].train_mask = node_masks['train_mask']
test_data['drug'].val_mask = node_masks['val_mask']
test_data['drug'].test_mask = node_masks['test_mask']

print("Train Data:", train_data)
print("Validation Data:", val_data)
print("Test Data:", test_data)

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_heads=8):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'isClassifiedAs', 'atc_code'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'targets', 'target'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'isDoping', 'doping'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
            ('drug', 'interactsWith', 'drug'): GATConv((-1, -1), hidden_channels, heads=num_heads, concat=True, add_self_loops=False),
        }, aggr='sum')
        
        # If concat=True in GATConv, the output channels are multiplied by the number of heads
        gat_output_channels = hidden_channels * num_heads
        
        self.lin = Linear(gat_output_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Define the model, optimizer, and loss function
model = HeteroGNN(hidden_channels=64, out_channels=2, num_heads=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training and testing functions
def train(data):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]

        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)

        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# Training loop
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train(train_data)
    train_acc, train_precision, train_recall, train_f1 = test(train_data, train_data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(val_data, val_data['drug'].val_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(test_data, test_data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


Train Data: HeteroData(
  drug={
    x=[11773, 467],
    y=[11773],
    train_mask=[11773],
    val_mask=[11773],
    test_mask=[11773],
  },
  drug_category={ x=[3520, 3520] },
  atc_code={ x=[4742, 4742] },
  target={ x=[3577, 3577] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 59768] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 16804] },
  (drug, targets, target)={ edge_index=[2, 8476] },
  (drug, isDoping, doping)={ edge_index=[2, 8476] },
  (drug, interactsWith, drug)={ edge_index=[2, 3459193] }
)
Validation Data: HeteroData(
  drug={
    x=[11773, 467],
    y=[11773],
    train_mask=[11773],
    val_mask=[11773],
    test_mask=[11773],
  },
  drug_category={ x=[3520, 3520] },
  atc_code={ x=[4742, 4742] },
  target={ x=[3577, 3577] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 6641] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 1868] },
  (drug, targets, target)={ edge_index=[2, 942] },
  (dr

In [2]:
from torch_geometric.transforms import ToUndirected
from torch_geometric.data import HeteroData
from sklearn.model_selection import KFold
import numpy as np

data = ToUndirected()(data)

def generate_k_folds(data, edge_type, k=5):
    edge_index = data[edge_type].edge_index
    num_edges = edge_index.size(1)
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    
    return kf.split(range(num_edges))

def split_edges(data, edge_type, train_indices, test_indices, val_split=0.1):
    edge_index = data[edge_type].edge_index

    train_edge_index = edge_index[:, train_indices]
    test_edge_index = edge_index[:, test_indices]
    
    val_size = int(len(train_indices) * val_split)
    train_indices, val_indices = train_indices[:-val_size], train_indices[-val_size:]
    
    val_edge_index = edge_index[:, val_indices]
    train_edge_index = edge_index[:, train_indices]
    
    return train_edge_index, val_edge_index, test_edge_index

# Perform k-fold cross-validation
k = 5  
edge_types = [
    ('drug', 'isInCategory', 'drug_category'),
    ('drug', 'isClassifiedAs', 'atc_code'),
    ('drug', 'targets', 'target'),
    ('drug', 'isDoping', 'doping'),
    ('drug', 'interactsWith', 'drug')
]
kf_splits = {edge_type: list(generate_k_folds(data, edge_type, k)) for edge_type in edge_types}

for fold in range(k):
    train_data = HeteroData()
    val_data = HeteroData()
    test_data = HeteroData()
    
    for edge_type in edge_types:
        train_indices, test_indices = kf_splits[edge_type][fold]
        train_edge_index, val_edge_index, test_edge_index = split_edges(data, edge_type, train_indices, test_indices)
        
        train_data[edge_type].edge_index = train_edge_index
        val_data[edge_type].edge_index = val_edge_index
        test_data[edge_type].edge_index = test_edge_index
        
        if 'x' in data[edge_type[0]]:
            train_data[edge_type[0]].x = data[edge_type[0]].x
            val_data[edge_type[0]].x = data[edge_type[0]].x
            test_data[edge_type[0]].x = data[edge_type[0]].x
            
        if 'x' in data[edge_type[2]]:
            train_data[edge_type[2]].x = data[edge_type[2]].x
            val_data[edge_type[2]].x = data[edge_type[2]].x
            test_data[edge_type[2]].x = data[edge_type[2]].x

    print(f"Fold {fold+1}")
    print("Train Data:", train_data)
    print("Validation Data:", val_data)
    print("Test Data:", test_data)

Fold 1
Train Data: HeteroData(
  drug={ x=[11773, 467] },
  drug_category={ x=[3520, 3520] },
  atc_code={ x=[4742, 4742] },
  target={ x=[3577, 3577] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 59769] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 16805] },
  (drug, targets, target)={ edge_index=[2, 8477] },
  (drug, isDoping, doping)={ edge_index=[2, 8477] },
  (drug, interactsWith, drug)={ edge_index=[2, 3459194] }
)
Validation Data: HeteroData(
  drug={ x=[11773, 467] },
  drug_category={ x=[3520, 3520] },
  atc_code={ x=[4742, 4742] },
  target={ x=[3577, 3577] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 6640] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 1867] },
  (drug, targets, target)={ edge_index=[2, 941] },
  (drug, isDoping, doping)={ edge_index=[2, 941] },
  (drug, interactsWith, drug)={ edge_index=[2, 384354] }
)
Test Data: HeteroData(
  drug={ x=[11773, 467] },
  drug_category={ x

In [3]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomNodeSplit

# Add doping labels to the drug nodes
data['drug'].y = torch.tensor(encoded_doping_df['Doping'].values, dtype=torch.long)

# Perform a node-level random split
transform = RandomNodeSplit(split='random', num_splits=1)
data = transform(data)

# Verify masks
print(data['drug'].train_mask.sum(), data['drug'].val_mask.sum(), data['drug'].test_mask.sum())


tensor(40) tensor(500) tensor(1000)


In [None]:
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, SAGEConv, Linear,GraphConv,GATConv

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels,heads=1):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GATConv((-1, -1), hidden_channels,heads=heads),
            ('drug', 'isClassifiedAs', 'atc_code'): GATConv((-1, -1), hidden_channels,heads=heads),
            ('drug', 'targets', 'target'): GATConv((-1, -1), hidden_channels,heads=heads),
            ('drug', 'isDoping', 'doping'): GATConv((-1, -1), hidden_channels,heads=heads),
            ('drug', 'interactsWith', 'drug'): GATConv((-1, -1), hidden_channels,heads=heads),
        }, aggr='sum')
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

model = HeteroGNN(hidden_channels=64, out_channels=2,heads=8)


In [5]:
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GATConv, Linear, GraphConv

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, heads=1):
        super(HeteroGNN, self).__init__()
        self.conv1 = HeteroConv({
            ('drug', 'isInCategory', 'drug_category'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'isClassifiedAs', 'atc_code'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'targets', 'target'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'isDoping', 'doping'): GraphConv((-1, -1), hidden_channels),
            ('drug', 'interactsWith', 'drug'): GraphConv((-1, -1), hidden_channels),
        }, aggr='sum')
        # Adjust the linear layer to match the output dimension of GATConv
        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        return self.lin(x_dict['drug'])

# Instantiate the model
model = HeteroGNN(hidden_channels=64, out_channels=2)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]
        
        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        
        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

# To accumulate metrics across epochs
train_precisions = []
train_recalls = []
train_f1s = []

val_precisions = []
val_recalls = []
val_f1s = []

for epoch in range(1, 101):
    loss = train()
    train_acc, train_precision, train_recall, train_f1 = test(data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(data['drug'].val_mask)
    test_acc = test(data['drug'].test_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Calculate the average metrics over all epochs
avg_train_precision = sum(train_precisions) / len(train_precisions)
avg_train_recall = sum(train_recalls) / len(train_recalls)
avg_train_f1 = sum(train_f1s) / len(train_f1s)

avg_val_precision = sum(val_precisions) / len(val_precisions)
avg_val_recall = sum(val_recalls) / len(val_recalls)
avg_val_f1 = sum(val_f1s) / len(val_f1s)

print(f'Average Train Precision: {avg_train_precision:.4f}, Average Train Recall: {avg_train_recall:.4f}, Average Train F1: {avg_train_f1:.4f}')
print(f'Average Val Precision: {avg_val_precision:.4f}, Average Val Recall: {avg_val_recall:.4f}, Average Val F1: {avg_val_f1:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]
        
        precision = precision_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        recall = recall_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        f1 = f1_score(data['drug'].y[mask].cpu(), pred[mask].cpu(), average='macro', zero_division=0)
        
        return int(correct.sum()) / int(mask.sum()), precision, recall, f1

for epoch in range(1, 101):
    loss = train()
    train_acc, train_precision, train_recall, train_f1 = test(data['drug'].train_mask)
    val_acc, val_precision, val_recall, val_f1 = test(data['drug'].val_mask)
    test_acc = test(data['drug'].test_mask)
    
    train_precisions.append(train_precision)
    train_recalls.append(train_recall)
    train_f1s.append(train_f1)
    
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)
    val_f1s.append(val_f1)
    
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

# Test the model
test_acc, test_precision, test_recall, test_f1 = test(data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1: {test_f1:.4f}')


In [7]:
# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = criterion(out[data['drug'].train_mask], data['drug'].y[data['drug'].train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=1)
        correct = pred[mask] == data['drug'].y[mask]
        return int(correct.sum()) / int(mask.sum())

for epoch in range(1, 101):
    loss = train()
    train_acc = test(data['drug'].train_mask)
    val_acc = test(data['drug'].val_mask)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

test_acc = test(data['drug'].test_mask)
print(f'Test Acc: {test_acc:.4f}')


Epoch: 001, Loss: 1981.4121, Train Acc: 0.5000, Val Acc: 0.9480
Epoch: 002, Loss: 33485.1211, Train Acc: 0.5000, Val Acc: 0.9480
Epoch: 003, Loss: 6529.0840, Train Acc: 0.7500, Val Acc: 0.4560
Epoch: 004, Loss: 7750.7002, Train Acc: 0.7500, Val Acc: 0.4420
Epoch: 005, Loss: 12394.8359, Train Acc: 0.7500, Val Acc: 0.4400
Epoch: 006, Loss: 10845.3486, Train Acc: 0.6500, Val Acc: 0.3040
Epoch: 007, Loss: 6662.1475, Train Acc: 0.5000, Val Acc: 0.9480
Epoch: 008, Loss: 1436.5637, Train Acc: 0.5000, Val Acc: 0.0520
Epoch: 009, Loss: 4220.1299, Train Acc: 0.5000, Val Acc: 0.0520
Epoch: 010, Loss: 6378.3892, Train Acc: 0.5000, Val Acc: 0.0540
Epoch: 011, Loss: 6597.0132, Train Acc: 0.7500, Val Acc: 0.4520
Epoch: 012, Loss: 5225.8999, Train Acc: 0.7500, Val Acc: 0.4620
Epoch: 013, Loss: 2482.1963, Train Acc: 0.5000, Val Acc: 0.9480
Epoch: 014, Loss: 5575.3018, Train Acc: 0.7250, Val Acc: 0.5360
Epoch: 015, Loss: 349.6091, Train Acc: 0.7500, Val Acc: 0.5180
Epoch: 016, Loss: 734.9860, Train Acc:

In [9]:
import networkx as nx
from torch_geometric.utils import to_networkx

def to_heterogeneous_graph(data):
    g = nx.Graph()
    
    # Add nodes with labels
    for node_type, node_data in data.x_dict.items():
        for i, features in enumerate(node_data):
            g.add_node((node_type, i), label=node_type)
    
    # Add edges with labels
    for (src, rel, dst), edge_index in data.edge_index_dict.items():
        edges = edge_index.t().tolist()
        for u, v in edges:
            g.add_edge((src, u), (dst, v), label=rel)
    
    return g
import matplotlib.pyplot as plt

def draw_heterogeneous_graph(g, edge_mask=None, draw_edge_labels=False):
    g = g.copy().to_undirected()
    
    # Prepare node labels
    node_labels = {(node_type, idx): f"{node_type[:2]}_{idx}" for node_type, idx in g.nodes()}
    
    # Generate positions for the nodes
    pos = nx.spring_layout(g)
    
    # Set edge colors and widths
    if edge_mask is None:
        edge_color = 'black'
        widths = None
    else:
        edge_color = [edge_mask.get((u, v), 0) for u, v in g.edges()]
        widths = [x * 10 for x in edge_color]
  
    nx.draw(g, pos=pos, labels=node_labels, width=widths,
            edge_color=edge_color, edge_cmap=plt.cm.Blues,
            node_color='azure')
   
    if draw_edge_labels and edge_mask is not None:
        edge_labels = {k: ('%.2f' % v) for k, v in edge_mask.items()}
        nx.draw_networkx_edge_labels(g, pos, edge_labels=edge_labels, font_color='red')
    
    # Show the plot
    plt.show()


In [None]:
import random
import torch
import numpy as np
from captum.attr import Saliency, IntegratedGradients
from collections import defaultdict
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def model_forward(edge_mask, data):
    x_dict, edge_index_dict = data.x_dict, data.edge_index_dict
    for key in edge_index_dict.keys():
        edge_index_dict[key] = edge_index_dict[key].to(device)
    batch = {key: torch.zeros(x.size(0), dtype=torch.long).to(device) for key, x in x_dict.items()}
    out = model(x_dict, edge_index_dict)
    return out['drug']

def explain(method, data, target=0):
    input_mask = torch.ones(data['drug', 'interactsWith', 'drug'].edge_index.shape[1], device=device, requires_grad=True)
    if method == 'ig':
        ig = IntegratedGradients(model_forward)
        mask = ig.attribute(input_mask, target=target, additional_forward_args=(data,))
    elif method == 'saliency':
        saliency = Saliency(model_forward)
        mask = saliency.attribute(input_mask, target=target, additional_forward_args=(data,))
    else:
        raise ValueError('Unknown explanation method')

    edge_mask = np.abs(mask.cpu().detach().numpy())
    if edge_mask.max() > 0:
        edge_mask = edge_mask / edge_mask.max()
    return edge_mask

def aggregate_edge_directions(edge_mask, data):
    edge_mask_dict = defaultdict(float)
    for val, u, v in list(zip(edge_mask, *data['drug', 'interactsWith', 'drug'].edge_index)):
        u, v = u.item(), v.item()
        if u > v:
            u, v = v, u
        edge_mask_dict[(u, v)] += val
    return edge_mask_dict

import random

data_sample = random.choice([data for data in test_data if data['drug'].y.item() == 1])  # select a random test sample
g = to_heterogeneous_graph(data_sample)

for title, method in [('Integrated Gradients', 'ig'), ('Saliency', 'saliency')]:
    edge_mask = explain(method, data_sample, target=1)
    edge_mask_dict = aggregate_edge_directions(edge_mask, data_sample)
    plt.figure(figsize=(10, 5))
    plt.title(title)
    draw_heterogeneous_graph(g, edge_mask_dict)



In [24]:
import pandas as pd

# Load the dataset
df33 = pd.read_csv('kg_smiles_updated.csv')
print(len(df33))

# Initialize dictionaries to store the DrugBank IDs and corresponding Doping labels for each column with missing data
missing_data = {
    'Categories': [],
    'ATC Codes': [],
    'Targets': [],
    'Interactions': []
}
debug_counts = {
    'Categories': 0,
    'ATC Codes': 0,
    'Targets': 0,
    'Interactions': 0
}

# Iterate through each row and check for NaN or empty values in the specified columns
for index, row in df33.iterrows():
    drug_id = row['DrugBank ID']
    doping_label = row['Doping']
    
    if pd.isna(row['Categories']) or row['Categories'].strip() == '':
        missing_data['Categories'].append((drug_id, doping_label))
        debug_counts['Categories'] += 1
    if pd.isna(row['ATC Codes']) or row['ATC Codes'].strip() == '':
        missing_data['ATC Codes'].append((drug_id, doping_label))
        debug_counts['ATC Codes'] += 1
    if pd.isna(row['Targets']) or row['Targets'].strip() == '':
        missing_data['Targets'].append((drug_id, doping_label))
        debug_counts['Targets'] += 1
    if pd.isna(row['Interactions']) or row['Interactions'].strip() == '':
        missing_data['Interactions'].append((drug_id, doping_label))
        debug_counts['Interactions'] += 1


# Display debug information
for column, count in debug_counts.items():
    print(f"Found {count} missing entries in {column}")

# Display the results
for column, missing_entries in missing_data.items():
    print(f"Missing data in {column}:")
    for drug_id, doping_label in missing_entries:
        print(f"DrugBank ID: {drug_id}, Doping: {doping_label}")
    print()


11773
Found 4660 missing entries in Categories
Found 8845 missing entries in ATC Codes
Found 4639 missing entries in Targets
Found 8119 missing entries in Interactions
Missing data in Categories:
DrugBank ID: DB00192, Doping: 0
DrugBank ID: DB01322, Doping: 0
DrugBank ID: DB01346, Doping: 0
DrugBank ID: DB01439, Doping: 1
DrugBank ID: DB01443, Doping: 1
DrugBank ID: DB01444, Doping: 0
DrugBank ID: DB01451, Doping: 1
DrugBank ID: DB01455, Doping: 1
DrugBank ID: DB01458, Doping: 1
DrugBank ID: DB01464, Doping: 0
DrugBank ID: DB01468, Doping: 0
DrugBank ID: DB01469, Doping: 1
DrugBank ID: DB01470, Doping: 1
DrugBank ID: DB01473, Doping: 0
DrugBank ID: DB01474, Doping: 1
DrugBank ID: DB01475, Doping: 0
DrugBank ID: DB01477, Doping: 0
DrugBank ID: DB01479, Doping: 1
DrugBank ID: DB01494, Doping: 0
DrugBank ID: DB01498, Doping: 0
DrugBank ID: DB01499, Doping: 0
DrugBank ID: DB01500, Doping: 1
DrugBank ID: DB01502, Doping: 0
DrugBank ID: DB01503, Doping: 1
DrugBank ID: DB01505, Doping: 0
Drug

In [21]:
# Filter the DataFrame to include only rows where all specified columns have non-NaN and non-empty values
merge_df = pd.read_csv('merge.csv')

complete_data_df = df33.dropna(subset=['Categories', 'ATC Codes', 'Targets', 'Interactions'])
complete_data_df = complete_data_df[(complete_data_df['Categories'].str.strip() != '') & 
                                    (complete_data_df['ATC Codes'].str.strip() != '') & 
                                    (complete_data_df['Targets'].str.strip() != '') & 
                                    (complete_data_df['Interactions'].str.strip() != '')]
merged_df = pd.merge(complete_data_df, merge_df[['DrugBank ID', 'Similar Structure']], on='DrugBank ID', how='inner')

# Filter the merged DataFrame to include only rows where 'Similarities' is non-NaN and non-empty
filtered_merged_df = merged_df.dropna(subset=['Similar Structure'])
filtered_merged_df = filtered_merged_df[filtered_merged_df['Similar Structure'].str.strip() != '']

# Extract the DrugBank IDs and their corresponding Doping labels
filtered_drugbank_ids = filtered_merged_df['DrugBank ID'].tolist()
filtered_doping_labels = filtered_merged_df['Doping'].tolist()

# Count the number of 0 and 1 labels in the Doping column
doping_counts = filtered_merged_df['Doping'].value_counts()

# Display the results
for drug_id, doping_label in zip(filtered_drugbank_ids, filtered_doping_labels):
    print(f"DrugBank ID: {drug_id}, Doping: {doping_label}")

# Print the total count of entries with all information present
print(f"\nTotal number of entries with all information present: {len(filtered_drugbank_ids)}")
# Print the counts of 0 and 1 labels in the Doping column
print(f"\nNumber of entries labeled as 0 under Doping: {doping_counts.get(0, 0)}")
print(f"Number of entries labeled as 1 under Doping: {doping_counts.get(1, 0)}")

DrugBank ID: DB00006, Doping: 0
DrugBank ID: DB00007, Doping: 1
DrugBank ID: DB00014, Doping: 1
DrugBank ID: DB00035, Doping: 1
DrugBank ID: DB00080, Doping: 0
DrugBank ID: DB00091, Doping: 0
DrugBank ID: DB00115, Doping: 0
DrugBank ID: DB00118, Doping: 0
DrugBank ID: DB00126, Doping: 0
DrugBank ID: DB00130, Doping: 0
DrugBank ID: DB00134, Doping: 0
DrugBank ID: DB00136, Doping: 0
DrugBank ID: DB00140, Doping: 0
DrugBank ID: DB00142, Doping: 0
DrugBank ID: DB00145, Doping: 0
DrugBank ID: DB00146, Doping: 0
DrugBank ID: DB00150, Doping: 0
DrugBank ID: DB00152, Doping: 0
DrugBank ID: DB00153, Doping: 0
DrugBank ID: DB00158, Doping: 0
DrugBank ID: DB00162, Doping: 0
DrugBank ID: DB00165, Doping: 0
DrugBank ID: DB00166, Doping: 0
DrugBank ID: DB00169, Doping: 0
DrugBank ID: DB00170, Doping: 0
DrugBank ID: DB00175, Doping: 0
DrugBank ID: DB00176, Doping: 0
DrugBank ID: DB00177, Doping: 0
DrugBank ID: DB00178, Doping: 0
DrugBank ID: DB00179, Doping: 0
DrugBank ID: DB00180, Doping: 1
DrugBank

In [26]:
import pandas as pd

# Load the datasets
df33 = pd.read_csv('kg_smiles_updated.csv')
merge_df = pd.read_csv('merge.csv')

similar = pd.read_csv('data_filtered\\raw\\all_one.csv')

# Filter the DataFrame to include only rows where all specified columns have non-NaN and non-empty values
complete_data_df = df33.dropna(subset=['Categories', 'ATC Codes', 'Targets', 'Interactions'])
complete_data_df = complete_data_df[(complete_data_df['Categories'].str.strip() != '') & 
                                    (complete_data_df['ATC Codes'].str.strip() != '') & 
                                    (complete_data_df['Targets'].str.strip() != '') & 
                                    (complete_data_df['Interactions'].str.strip() != '')]

# Merge complete_data_df with merge_df on 'DrugBank ID' to get the 'Similar Structure' column
merged_df = pd.merge(complete_data_df, merge_df[['DrugBank ID', 'Similar Structure']], on='DrugBank ID', how='inner')


# Filter the merged DataFrame to include only rows where 'Similar Structure' is non-NaN and non-empty
filtered_merged_df = merged_df.dropna(subset=['Similar Structure'])
filtered_merged_df = filtered_merged_df[filtered_merged_df['Similar Structure'].str.strip() != '']

# Extract the DrugBank IDs and their corresponding Doping labels
filtered_drugbank_ids = filtered_merged_df['DrugBank ID'].tolist()
filtered_doping_labels = filtered_merged_df['Doping'].tolist()
similar_merge = pd.merge(filtered_merged_df,similar['DrugBank ID'], on='DrugBank ID', how='inner')
filtered_drugbank_ids_2 = similar_merge['DrugBank ID'].tolist()
filtered_doping_labels_2 = similar_merge['Doping'].tolist()
# Count the number of 0 and 1 labels in the Doping column
doping_counts = filtered_merged_df['Doping'].value_counts()
doping_counts_2 = similar_merge['Doping'].value_counts()


# Display the results
for drug_id, doping_label in zip(filtered_drugbank_ids, filtered_doping_labels):
    print(f"DrugBank ID: {drug_id}, Doping: {doping_label}")

for drug_id, doping_label in zip(filtered_drugbank_ids_2, filtered_doping_labels_2):
    print(f"DrugBank ID: {drug_id}, Doping: {doping_label}")

# Print the total count of entries with all information present
print(f"\nTotal number of entries with all information present: {len(filtered_drugbank_ids)}")
print(f"\nTotal number of entries with all information present: {len(filtered_drugbank_ids_2)}")

# Print the counts of 0 and 1 labels in the Doping column
print(f"Number of entries labeled as 0 under Doping: {doping_counts.get(0, 0)}")
print(f"Number of entries labeled as 1 under Doping: {doping_counts.get(1, 0)}")

print(f"Number of entries labeled as 0 under Doping: {doping_counts_2.get(0, 0)}")
print(f"Number of entries labeled as 1 under Doping: {doping_counts_2.get(1, 0)}")


DrugBank ID: DB00006, Doping: 0
DrugBank ID: DB00007, Doping: 1
DrugBank ID: DB00014, Doping: 1
DrugBank ID: DB00035, Doping: 1
DrugBank ID: DB00080, Doping: 0
DrugBank ID: DB00091, Doping: 0
DrugBank ID: DB00115, Doping: 0
DrugBank ID: DB00118, Doping: 0
DrugBank ID: DB00126, Doping: 0
DrugBank ID: DB00130, Doping: 0
DrugBank ID: DB00134, Doping: 0
DrugBank ID: DB00136, Doping: 0
DrugBank ID: DB00140, Doping: 0
DrugBank ID: DB00142, Doping: 0
DrugBank ID: DB00145, Doping: 0
DrugBank ID: DB00146, Doping: 0
DrugBank ID: DB00150, Doping: 0
DrugBank ID: DB00152, Doping: 0
DrugBank ID: DB00153, Doping: 0
DrugBank ID: DB00158, Doping: 0
DrugBank ID: DB00162, Doping: 0
DrugBank ID: DB00165, Doping: 0
DrugBank ID: DB00166, Doping: 0
DrugBank ID: DB00169, Doping: 0
DrugBank ID: DB00170, Doping: 0
DrugBank ID: DB00175, Doping: 0
DrugBank ID: DB00176, Doping: 0
DrugBank ID: DB00177, Doping: 0
DrugBank ID: DB00178, Doping: 0
DrugBank ID: DB00179, Doping: 0
DrugBank ID: DB00180, Doping: 1
DrugBank

In [28]:
# Load the all_one.csv file
all_one_df = pd.read_csv('kg_smiles_updated.csv')

# Filter the DataFrame to include only rows that match the DrugBank IDs from the list
hetero_df = all_one_df[all_one_df['DrugBank ID'].isin(filtered_drugbank_ids_2)]

# Save the filtered DataFrame to hetero.csv
hetero_df.to_csv('hetero.csv', index=False)

merge_df = pd.read_csv('merge.csv')

# Merge hetero_df with merge_df on 'DrugBank ID' to add the 'Similar Structure' column
final_hetero_df = pd.merge(hetero_df, merge_df[['DrugBank ID', 'Similar Structure']], on='DrugBank ID', how='left')

# Save the final merged DataFrame to hetero.csv
final_hetero_df.to_csv('hetero.csv', index=False)

print("Data has been saved to hetero.csv")


Data has been saved to hetero.csv


In [30]:
final_hetero_df.shape

(338, 13)

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('hetero.csv')

# Extract the set of valid DrugBank IDs
valid_drugbank_ids = set(df['DrugBank ID'])

# Function to clean the values in a column
def clean_values(column):
    def clean_cell(cell):
        if pd.isna(cell):
            return cell
        ids = cell.split('; ')
        valid_ids = [drug_id for drug_id in ids if drug_id in valid_drugbank_ids]
        return '; '.join(valid_ids)
    
    return column.apply(clean_cell)

# Clean the 'Interactions' and 'Similar Structure' columns
df['Interactions'] = clean_values(df['Interactions'])
df['Similar Structure'] = clean_values(df['Similar Structure'])

# Save the cleaned DataFrame back to CSV
df.to_csv('cleaned_hetero.csv', index=False)
