In [1]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Load the encoded feature tables from CSV files.
# The directory is written with forward slashes, which pandas accepts on Windows,
# macOS, and Linux alike. The previous backslash-separated literals only resolved
# on Windows, and only because "\e" happens not to be a recognised escape
# sequence (Python warns about such sequences).
ENCODER_DIR = 'Heterogeneous KG/encoders_small'

encoded_drugbank_id_df = pd.read_csv(f'{ENCODER_DIR}/encoded_drugbank_id.csv')
encoded_name_df = pd.read_csv(f'{ENCODER_DIR}/encoded_name.csv')
encoded_state_df = pd.read_csv(f'{ENCODER_DIR}/encoded_state.csv')
encoded_groups_df = pd.read_csv(f'{ENCODER_DIR}/encoded_groups.csv')
encoded_categories_df = pd.read_csv(f'{ENCODER_DIR}/encoded_categories.csv')
encoded_atc_codes_df = pd.read_csv(f'{ENCODER_DIR}/encoded_atc_codes.csv')
encoded_targets_df = pd.read_csv(f'{ENCODER_DIR}/encoded_targets.csv')
encoded_interactions_df = pd.read_csv(f'{ENCODER_DIR}/encoded_interactions.csv')
encoded_molecular_formula_df = pd.read_csv(f'{ENCODER_DIR}/encoded_molecular_formula.csv')
encoded_doping_df = pd.read_csv(f'{ENCODER_DIR}/encoded_doping.csv')

# Turn every encoded DataFrame into a float32 tensor of its cell values
def _as_float_tensor(frame):
    """Return the values of ``frame`` copied into a new ``torch.float32`` tensor."""
    return torch.tensor(frame.values, dtype=torch.float32)


encoded_drugbank_id_tensor_1 = _as_float_tensor(encoded_drugbank_id_df)
encoded_name_tensor_1 = _as_float_tensor(encoded_name_df)
encoded_state_tensor_1 = _as_float_tensor(encoded_state_df)
encoded_groups_tensor_1 = _as_float_tensor(encoded_groups_df)
encoded_categories_tensor_1 = _as_float_tensor(encoded_categories_df)
encoded_atc_codes_tensor_1 = _as_float_tensor(encoded_atc_codes_df)
encoded_targets_tensor_1 = _as_float_tensor(encoded_targets_df)
encoded_interactions_tensor_1 = _as_float_tensor(encoded_interactions_df)
encoded_molecular_formula_tensor_1 = _as_float_tensor(encoded_molecular_formula_df)
encoded_doping_tensor_1 = _as_float_tensor(encoded_doping_df)

# Create the empty heterogeneous graph container
data_small = HeteroData()

# Drug nodes: the per-drug encodings are joined column-wise (dim=1) into a
# single feature matrix with one row per drug
drug_feature_parts = (
    encoded_drugbank_id_tensor_1,
    encoded_name_tensor_1,
    encoded_state_tensor_1,
    encoded_groups_tensor_1,
    encoded_molecular_formula_tensor_1,
)
data_small['drug'].x = torch.cat(drug_feature_parts, dim=1)

# Remaining node types get identity-matrix (one-hot) features.
# Categories, ATC codes and targets have one node per column of their encoded
# frame; doping has one node per distinct value in the 'Doping' column.
one_hot_node_counts = {
    'drug_category': len(encoded_categories_df.columns),
    'atc_code': len(encoded_atc_codes_df.columns),
    'target': len(encoded_targets_df.columns),
    'doping': len(encoded_doping_df['Doping'].unique()),
}
for node_type, node_count in one_hot_node_counts.items():
    data_small[node_type].x = torch.eye(node_count, dtype=torch.float32)

def _indicator_edge_index(indicator_df):
    """Build a ``[2, num_edges]`` edge index from an indicator DataFrame.

    Every cell equal to 1 becomes one edge from the cell's row position
    (source node) to the cell's column position (destination node).

    Positions are used rather than index/column labels, so the result does not
    depend on how the frame is labelled and avoids positional integer lookups
    on label-indexed rows, which pandas has deprecated.

    Args:
        indicator_df: DataFrame whose cells are compared against 1.

    Returns:
        A ``torch.long`` tensor of shape ``[2, num_edges]``; edges are ordered
        row by row, and by column within a row (the same order a nested
        row/column loop produces). An input without any 1s yields shape ``[2, 0]``.
    """
    mask = torch.from_numpy(indicator_df.to_numpy() == 1)
    # nonzero() returns [num_edges, 2] in row-major order; transpose to [2, num_edges].
    return mask.nonzero(as_tuple=False).t().contiguous()


# Drug -> category edges: one edge per 1 in the category indicator table
data_small['drug', 'isInCategory', 'drug_category'].edge_index = _indicator_edge_index(encoded_categories_df)

# Drug -> ATC code edges
data_small['drug', 'isClassifiedAs', 'atc_code'].edge_index = _indicator_edge_index(encoded_atc_codes_df)

# Drug -> target edges
data_small['drug', 'targets', 'target'].edge_index = _indicator_edge_index(encoded_targets_df)

# Drug -> doping edges: every drug gets exactly one edge whose destination is
# the raw value stored in the 'Doping' column.
# NOTE(review): this assumes the 'Doping' values are already valid node indices
# (0 .. number of doping nodes - 1) — confirm against the encoder.
doping_targets = torch.tensor(encoded_doping_df['Doping'].tolist(), dtype=torch.long)
doping_sources = torch.arange(doping_targets.numel(), dtype=torch.long)
data_small['drug', 'isDoping', 'doping'].edge_index = torch.stack([doping_sources, doping_targets])

# Drug -> drug interaction edges.
# NOTE(review): column positions are used as destination drug indices, which
# presumes the interaction columns are ordered like the drug rows — confirm.
data_small['drug', 'interactsWith', 'drug'].edge_index = _indicator_edge_index(encoded_interactions_df)

print(data_small)




HeteroData(
  drug={ x=[338, 467] },
  drug_category={ x=[1095, 1095] },
  atc_code={ x=[974, 974] },
  target={ x=[249, 249] },
  doping={ x=[2, 2] },
  (drug, isInCategory, drug_category)={ edge_index=[2, 8347] },
  (drug, isClassifiedAs, atc_code)={ edge_index=[2, 2229] },
  (drug, targets, target)={ edge_index=[2, 338] },
  (drug, isDoping, doping)={ edge_index=[2, 338] },
  (drug, interactsWith, drug)={ edge_index=[2, 41415] }
)


In [2]:
import torch

# Edge index of the drug -> category relation: shape [2, number_of_edges],
# where the second row holds the destination category index of each edge
edge_index = data_small['drug', 'isInCategory', 'drug_category'].edge_index

# Destination (category) index of every edge
category_indices = edge_index[1]

# Count how many edges point at each category.
# minlength pins the result to one entry per category node; without it bincount
# stops at the largest index that actually occurs, so trailing categories with
# no edges would be missing from the counts.
num_categories = data_small['drug_category'].x.size(0)
category_edge_counts = torch.bincount(category_indices, minlength=num_categories)

# Print the number of edges for each category
print(category_edge_counts)


tensor([7, 7, 2,  ..., 3, 1, 1])


In [7]:
import torch
import pandas as pd

# Edge index of the drug -> category relation: shape [2, number_of_edges]
edge_index = data_small['drug', 'isInCategory', 'drug_category'].edge_index

# Destination (category) index of every edge
category_indices = edge_index[1]

# Category names come from the column headers of the category indicator table;
# a category's index is its column position
category_names = list(encoded_categories_df.columns)

# Count edges per category. minlength guarantees one count per category name
# even when the highest-index categories have no edges; otherwise the count
# vector would be shorter than category_names and the DataFrame below could
# not be built.
category_edge_counts = torch.bincount(category_indices, minlength=len(category_names))

# Create a DataFrame for clearer visualization
df = pd.DataFrame({
    'Category_ID': range(len(category_edge_counts)),
    'Number_of_Drugs': category_edge_counts.numpy(),  # Convert tensor to numpy array for DataFrame
    'Category_Name': category_names
})

print(df)
# NOTE(review): the output file is written without a ".csv" extension; the name
# is left unchanged here because other code may read it under this exact name.
df.to_csv("drug_categories_small", index = False)




      Category_ID  Number_of_Drugs                          Category_Name
0               0                7              11-Hydroxycorticosteroids
1               1                7              17-Hydroxycorticosteroids
2               2                2                        17-Ketosteroids
3               3                1    2-Amino-1-Phenylethanol Derivatives
4               4                3         3-Oxoandrosten (4) Derivatives
...           ...              ...                                    ...
1090         1090                1             Vitamin B12 and Folic Acid
1091         1091                1                               Vitamins
1092         1092                3           Wakefulness-Promoting Agents
1093         1093                1  gamma-Aminobutyric Acid-ergic Agonist
1094         1094                1                    meta-Aminobenzoates

[1095 rows x 3 columns]
