In [1]:
import os
import re

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sentence_transformers import SentenceTransformer
import torch

class SequenceEncoder:
    """Embed text with a SentenceTransformer model and return CPU tensors."""

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        """Load the sentence-embedding model.

        Args:
            model_name: Model name handed to ``SentenceTransformer``.
            device: Device handed to ``SentenceTransformer`` and later to
                ``encode``; ``None`` is passed through unchanged.
        """
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, texts):
        """Encode ``texts`` and return the embeddings moved to the CPU.

        Args:
            texts: A string, a list of strings, or an array-like of strings
                (for example a pandas Series or a NumPy array).

        Returns:
            Whatever ``self.model.encode(..., convert_to_tensor=True)``
            produces, after ``.cpu()``.
        """
        # Array-likes (pandas Series, NumPy arrays) are flattened to a plain
        # list in positional order. A Series with a non-default index would
        # otherwise be looked up by label if the model indexes its input by
        # position. Strings and lists have no ``tolist`` and pass unchanged.
        if hasattr(texts, 'tolist'):
            texts = texts.tolist()
        embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            convert_to_tensor=True,
            device=self.device,
        )
        return embeddings.cpu()

class MolecularFormulaEncoder:
    """Turn a molecular formula string into a fixed-length element-count vector."""

    # Element symbols in output order: position i of the vector counts ELEMENTS[i].
    ELEMENTS = (
        'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na',
        'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Ti', 'V', 'Cr',
        'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se',
        'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru',
        'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs',
        'Ba', 'La', 'Ce', 'Nd', 'Sm', 'Gd', 'Ho', 'Lu', 'Hf', 'Ta',
        'W', 'Re', 'Os', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ra', 'Ac', 'Bk',
    )
    # Symbol -> position lookup, built once instead of scanning the list per call.
    _ELEMENT_INDEX = {symbol: position for position, symbol in enumerate(ELEMENTS)}
    # One token = an uppercase letter, an optional lowercase letter, then an
    # optional run of ASCII digits. Matching the two-letter form greedily keeps
    # 'Cl' from also being read as 'C'.
    _TOKEN = re.compile(r'([A-Z][a-z]?)([0-9]*)')

    @staticmethod
    def formula_to_vector(formula):
        """Count the atoms of each known element in ``formula``.

        Args:
            formula: Formula text such as ``"C6H12O6"``. Anything that is not
                a ``str`` (for example ``None`` or a NaN from pandas) is
                treated as an empty formula.

        Returns:
            A float ``numpy`` array with one entry per symbol in ``ELEMENTS``.
            Each entry is the total count of that element; a symbol with no
            trailing digits counts as 1 and repeated occurrences are summed.
            Symbols not listed in ``ELEMENTS`` and characters that do not form
            a symbol token (brackets, dots, charges) are ignored.
        """
        vector = np.zeros(len(MolecularFormulaEncoder.ELEMENTS))
        if not isinstance(formula, str):
            return vector
        for token in MolecularFormulaEncoder._TOKEN.finditer(formula):
            symbol, digits = token.groups()
            position = MolecularFormulaEncoder._ELEMENT_INDEX.get(symbol)
            if position is None:
                continue
            # Accumulate so that e.g. 'CH3COOH' yields C=2 rather than the
            # count of the last occurrence only.
            vector[position] += int(digits) if digits else 1
        return vector

# Build the encoders: a label encoder for categorical columns and a sentence
# encoder that runs on the GPU when one is available.
label_encoder = LabelEncoder()
sequence_encoder = SequenceEncoder(device='cuda' if torch.cuda.is_available() else 'cpu')

df = pd.read_csv("cleaned_hetero.csv")

# Missing values in the text columns become empty strings, and every value is
# coerced to str so the later string splitting never sees NaN.
for _text_column in ('State', 'Groups', 'Categories', 'ATC Codes', 'Targets', 'Interactions'):
    df[_text_column] = df[_text_column].fillna('').astype(str)

# Interactions are stored as one '; '-separated string per row; keep them as lists.
df['Interactions'] = df['Interactions'].map(lambda joined: joined.split('; '))

# Encode columns
# NOTE: the same LabelEncoder is fitted twice, so after the second call its
# classes_ describe 'State' only; the DrugBank ID codes are kept in the array.
encoded_drugbank_id = label_encoder.fit_transform(df['DrugBank ID'])
encoded_name = sequence_encoder(df['Name'])
encoded_state = label_encoder.fit_transform(df['State'])


def _non_empty_labels(labels):
    """Return the labels of one row with empty strings removed.

    ``''.split('; ')`` yields ``['']``, so a row without any label would
    otherwise contribute a phantom empty-string class to the binarizer.
    """
    return [label for label in labels if label]


# One MultiLabelBinarizer per multi-label column, so each keeps its own classes_
mlb_groups = MultiLabelBinarizer()
mlb_categories = MultiLabelBinarizer()
mlb_atc_codes = MultiLabelBinarizer()
mlb_targets = MultiLabelBinarizer()
mlb_interactions = MultiLabelBinarizer()

encoded_groups = mlb_groups.fit_transform(df['Groups'].str.split('; ').apply(_non_empty_labels))
encoded_categories = mlb_categories.fit_transform(df['Categories'].str.split('; ').apply(_non_empty_labels))
encoded_atc_codes = mlb_atc_codes.fit_transform(df['ATC Codes'].str.split('; ').apply(_non_empty_labels))
encoded_targets = mlb_targets.fit_transform(df['Targets'].str.split('; ').apply(_non_empty_labels))
# 'Interactions' already holds lists at this point, so it only needs filtering.
encoded_interactions = mlb_interactions.fit_transform(df['Interactions'].apply(_non_empty_labels))

# Encoding Molecular Formula: one element-count vector per row
encoded_molecular_formula = np.array([MolecularFormulaEncoder.formula_to_vector(formula) for formula in df['Molecular Formula']])

# Encoding Doping: the column is used as-is
encoded_doping = df['Doping'].values

# Convert encoded data to DataFrames
encoded_name_df = pd.DataFrame(encoded_name.numpy())
encoded_drugbank_id_df = pd.DataFrame(encoded_drugbank_id, columns=['DrugBank ID'])
encoded_state_df = pd.DataFrame(encoded_state, columns=['State'])
encoded_groups_df = pd.DataFrame(encoded_groups, columns=mlb_groups.classes_)
encoded_categories_df = pd.DataFrame(encoded_categories, columns=mlb_categories.classes_)
encoded_atc_codes_df = pd.DataFrame(encoded_atc_codes, columns=mlb_atc_codes.classes_)
encoded_targets_df = pd.DataFrame(encoded_targets, columns=mlb_targets.classes_)
encoded_interactions_df = pd.DataFrame(encoded_interactions, columns=mlb_interactions.classes_)
encoded_molecular_formula_df = pd.DataFrame(encoded_molecular_formula)
encoded_doping_df = pd.DataFrame(encoded_doping, columns=['Doping'])

# Save DataFrames to CSV
output_dir = 'encoders_small'
# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing anything into it.
os.makedirs(output_dir, exist_ok=True)

# File stem -> frame; each frame is written to '<output_dir>/<stem>.csv'
# without the index column.
frames_to_save = {
    'encoded_name': encoded_name_df,
    'encoded_drugbank_id': encoded_drugbank_id_df,
    'encoded_state': encoded_state_df,
    'encoded_groups': encoded_groups_df,
    'encoded_categories': encoded_categories_df,
    'encoded_atc_codes': encoded_atc_codes_df,
    'encoded_targets': encoded_targets_df,
    'encoded_interactions': encoded_interactions_df,
    'encoded_molecular_formula': encoded_molecular_formula_df,
    'encoded_doping': encoded_doping_df,
}
for file_stem, frame in frames_to_save.items():
    frame.to_csv(f'{output_dir}/{file_stem}.csv', index=False)

print("Data saved to CSV files.")


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Data saved to CSV files.


In [None]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Read each encoded table back from disk and immediately build its float32
# tensor, one table at a time.
encoded_drugbank_id_df = pd.read_csv('encoders_small/encoded_drugbank_id.csv')
encoded_drugbank_id_tensor = torch.tensor(encoded_drugbank_id_df.to_numpy(), dtype=torch.float32)

encoded_name_df = pd.read_csv('encoders_small/encoded_name.csv')
encoded_name_tensor = torch.tensor(encoded_name_df.to_numpy(), dtype=torch.float32)

encoded_state_df = pd.read_csv('encoders_small/encoded_state.csv')
encoded_state_tensor = torch.tensor(encoded_state_df.to_numpy(), dtype=torch.float32)

encoded_groups_df = pd.read_csv('encoders_small/encoded_groups.csv')
encoded_groups_tensor = torch.tensor(encoded_groups_df.to_numpy(), dtype=torch.float32)

encoded_categories_df = pd.read_csv('encoders_small/encoded_categories.csv')
encoded_categories_tensor = torch.tensor(encoded_categories_df.to_numpy(), dtype=torch.float32)

encoded_atc_codes_df = pd.read_csv('encoders_small/encoded_atc_codes.csv')
encoded_atc_codes_tensor = torch.tensor(encoded_atc_codes_df.to_numpy(), dtype=torch.float32)

encoded_targets_df = pd.read_csv('encoders_small/encoded_targets.csv')
encoded_targets_tensor = torch.tensor(encoded_targets_df.to_numpy(), dtype=torch.float32)

encoded_interactions_df = pd.read_csv('encoders_small/encoded_interactions.csv')
encoded_interactions_tensor = torch.tensor(encoded_interactions_df.to_numpy(), dtype=torch.float32)

encoded_molecular_formula_df = pd.read_csv('encoders_small/encoded_molecular_formula.csv')
encoded_molecular_formula_tensor = torch.tensor(encoded_molecular_formula_df.to_numpy(), dtype=torch.float32)

encoded_doping_df = pd.read_csv('encoders_small/encoded_doping.csv')
encoded_doping_tensor = torch.tensor(encoded_doping_df.to_numpy(), dtype=torch.float32)

# Start from an empty heterogeneous graph.
data_small = HeteroData()

# Drug nodes: each row concatenates the ID code, name embedding, state code,
# group indicators and molecular-formula vector along the feature axis.
data_small['drug'].x = torch.cat(
    (
        encoded_drugbank_id_tensor,
        encoded_name_tensor,
        encoded_state_tensor,
        encoded_groups_tensor,
        encoded_molecular_formula_tensor,
    ),
    dim=1,
)

# Category, ATC-code and target nodes: one node per column of the matching
# indicator table, with an identity matrix as one-hot features.
data_small['drug_category'].x = torch.eye(encoded_categories_df.shape[1], dtype=torch.float32)
data_small['atc_code'].x = torch.eye(encoded_atc_codes_df.shape[1], dtype=torch.float32)
data_small['target'].x = torch.eye(encoded_targets_df.shape[1], dtype=torch.float32)

# Doping nodes: one node per distinct value in the 'Doping' column.
data_small['doping'].x = torch.eye(
    len(encoded_doping_df['Doping'].unique()),
    dtype=torch.float32,
)

def _indicator_edge_index(indicator_df):
    """Build a ``[2, num_edges]`` long tensor of edges from a 0/1 table.

    A cell equal to 1 at row ``i``, column ``j`` becomes the edge ``(i, j)``.
    Edges are emitted in row-major order, the same order a nested
    row-then-column loop would produce. Rows are identified by position,
    which equals the index label for frames read with the default index.

    This replaces per-row ``row[int]`` lookups, which rely on deprecated
    positional fallback when the columns are string-labelled, and avoids a
    Python-level loop over every cell.
    """
    hits = torch.as_tensor(indicator_df.to_numpy() == 1)
    # nonzero() gives one (row, col) pair per hit; transpose to [2, num_edges].
    return hits.nonzero().t().contiguous()


# Drug -> category edges
data_small['drug', 'isInCategory', 'drug_category'].edge_index = _indicator_edge_index(encoded_categories_df)

# Drug -> ATC code edges
data_small['drug', 'isClassifiedAs', 'atc_code'].edge_index = _indicator_edge_index(encoded_atc_codes_df)

# Drug -> target edges
data_small['drug', 'targets', 'target'].edge_index = _indicator_edge_index(encoded_targets_df)

# Drug -> doping edges: every drug gets exactly one edge, whose destination
# is the raw value of its 'Doping' cell.
# NOTE(review): this assumes the 'Doping' values are already valid node
# indices (0 .. number of distinct values - 1) -- confirm against the data.
doping_targets = encoded_doping_df['Doping'].tolist()
data_small['drug', 'isDoping', 'doping'].edge_index = torch.tensor(
    [list(range(len(doping_targets))), doping_targets],
    dtype=torch.long,
)

# Drug -> drug interaction edges
# NOTE(review): the destination index is the column position in the
# interactions table, not a looked-up drug row. This is only correct if the
# columns line up one-to-one with the drug rows -- TODO confirm.
data_small['drug', 'interactsWith', 'drug'].edge_index = _indicator_edge_index(encoded_interactions_df)

print(data_small)
