In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Read the DrugBank XML export from the working directory and keep its root.
tree = ET.parse('full database.xml')
root = tree.getroot()

# The document uses a default namespace; bind it to the "db" prefix so the
# find/findall paths below can address elements.
ns = {'db': 'http://www.drugbank.ca'}

# Three parallel columns, filled with one entry per <drug> element.
drugbank_ids = []
names = []
interactions_list = []

for drug in root.findall('db:drug', ns):
    drugbank_id = drug.find('db:drugbank-id[@primary="true"]', ns)
    name = drug.find('db:name', ns)
    interactions = drug.find('db:drug-interactions', ns)

    # A missing element is recorded as None rather than skipped, so the three
    # columns always stay the same length.
    drugbank_ids.append(None if drugbank_id is None else drugbank_id.text)
    names.append(None if name is None else name.text)

    # Collect the drugbank-id of every listed interaction partner.
    interaction_texts = []
    if interactions is not None:
        partner_nodes = (
            interaction.find('db:drugbank-id', ns)
            for interaction in interactions.findall('db:drug-interaction', ns)
        )
        interaction_texts = [node.text for node in partner_nodes if node is not None]

    # Joining an empty list yields '', which is the "no interactions" marker.
    interactions_list.append('; '.join(interaction_texts))

# Assemble the columns into a DataFrame.
data = dict([
    ('DrugBank ID', drugbank_ids),
    ('Name', names),
    ('Interactions', interactions_list),
])
df = pd.DataFrame(data)

# Persist the table without the pandas index column.
df.to_csv('drugbank_data.csv', index=False)


In [2]:
import pandas as pd

# Load the existing kg_smiles DataFrame.
# The path is written with forward slashes: the previous literal mixed an
# escaped backslash with the invalid escape sequence "\k" (a SyntaxWarning on
# recent Python versions) and only resolved to the intended file on Windows.
# Forward slashes are accepted on Windows, Linux and macOS alike.
kg_smiles = pd.read_csv('data/raw/kg_smiles.csv')

# Load the table of interaction partners written by the XML extraction step.
drugbank_data = pd.read_csv('drugbank_data.csv')

# Left-join on 'DrugBank ID' so every kg_smiles row is kept; only the
# 'Interactions' column is brought over from drugbank_data.
merged_data = pd.merge(
    kg_smiles,
    drugbank_data[['DrugBank ID', 'Interactions']],
    on='DrugBank ID',
    how='left',
)

# Rows without a match (or with an empty interaction list, which read_csv
# loads as NaN) get an empty string instead of NaN.
merged_data['Interactions'] = merged_data['Interactions'].fillna('')

# Save the updated DataFrame without the pandas index column.
merged_data.to_csv('kg_smiles_updated.csv', index=False)



In [1]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

In [2]:
def load_node_csv(path, index_col, encoders=None, **kwargs):
    """Read a node table from CSV and build features plus an index mapping.

    Args:
        path: File path or buffer forwarded to ``pd.read_csv``.
        index_col: Column used as the DataFrame index (the node identifier).
        encoders: Optional mapping ``column name -> callable``. Each callable
            receives the corresponding column and must return a tensor; the
            results are concatenated along the last dimension.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        A tuple ``(x, mapping)`` where ``x`` is the concatenated feature
        tensor (``None`` when ``encoders`` is ``None``) and ``mapping`` maps
        each unique index value to a consecutive integer, in order of first
        appearance.
    """
    frame = pd.read_csv(path, index_col=index_col, **kwargs)

    # Consecutive integer id for every distinct index value.
    unique_ids = frame.index.unique()
    mapping = dict(zip(unique_ids, range(len(unique_ids))))

    if encoders is None:
        return None, mapping

    # Encode each requested column, then stack the pieces side by side.
    encoded_columns = []
    for column, encode in encoders.items():
        encoded_columns.append(encode(frame[column]))
    return torch.cat(encoded_columns, dim=-1), mapping

In [3]:
class SequenceEncoder:
    """Callable wrapper that embeds a text column with a SentenceTransformer.

    Args:
        model_name: Name of the sentence-transformers model to load.
        device: Device passed both to the model constructor and to every
            ``encode`` call (``None`` is forwarded as-is).
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, df):
        """Embed the values of ``df`` and return the result on the CPU.

        ``df`` must expose a ``.values`` attribute (e.g. a pandas Series);
        those values are what gets encoded.
        """
        # Gradients are not needed for inference.
        with torch.no_grad():
            embeddings = self.model.encode(
                df.values,
                show_progress_bar=True,
                convert_to_tensor=True,
                device=self.device,
            )
        return embeddings.cpu()

In [4]:
class MolecularFormulaEncoder:
    """Encode a molecular formula string as a fixed-length vector of element counts."""

    # Element symbols tracked by the encoder. The position of a symbol in this
    # tuple is its index in the vector returned by ``formula_to_vector``.
    ELEMENTS = (
        'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na',
        'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Ti', 'V', 'Cr',
        'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se',
        'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru',
        'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs',
        'Ba', 'La', 'Ce', 'Nd', 'Sm', 'Gd', 'Ho', 'Lu', 'Hf', 'Ta',
        'W', 'Re', 'Os', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ra', 'Ac', 'Bk',
    )
    # Symbol -> vector position lookup.
    _ELEMENT_INDEX = dict(zip(ELEMENTS, range(len(ELEMENTS))))

    @staticmethod
    def formula_to_vector(formula):
        """Count the atoms of each tracked element in ``formula``.

        The formula is tokenised into ``<Symbol><count>`` pairs, where a symbol
        is an uppercase letter optionally followed by one lowercase letter and
        a missing count means 1. This fixes three problems of the previous
        character-by-character scan: two-letter symbols were never matched,
        single-letter symbols matched inside two-letter ones (``C`` in ``Cl``),
        and counts were built on top of an initial 1 (``C6`` became 16).

        Args:
            formula: Formula string such as ``"C6H12O6"``. Non-string values
                (for example NaN from a missing CSV cell) are treated as an
                empty formula.

        Returns:
            A float ``numpy`` array with one entry per symbol in ``ELEMENTS``.
            Repeated symbols are summed; symbols not in ``ELEMENTS`` are
            ignored. Parenthesised groups are not expanded: a multiplier
            after ``)`` is ignored.
        """
        import re  # local import keeps this class usable without touching the import cell

        vector = np.zeros(len(MolecularFormulaEncoder.ELEMENTS))
        if not isinstance(formula, str):
            return vector

        for symbol, digits in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
            position = MolecularFormulaEncoder._ELEMENT_INDEX.get(symbol)
            if position is not None:
                vector[position] += int(digits) if digits else 1
        return vector

# Initialize encoders
sequence_encoder = SequenceEncoder(device='cuda' if torch.cuda.is_available() else 'cpu')
label_encoder = LabelEncoder()
multi_label_binarizer = MultiLabelBinarizer()

df = pd.read_csv("kg_smiles_updated.csv")

# Replace missing values with '' and force string dtype so every text column
# can be split and encoded uniformly.
for column in ['State', 'Groups', 'Categories', 'ATC Codes', 'Targets', 'Interactions']:
    df[column] = df[column].fillna('').astype(str)

# Interactions is stored back on the frame as a list of partner IDs per row.
# NOTE: ''.split('; ') returns [''], so rows without a value contribute an
# empty-string label to the binarizers below.
df['Interactions'] = df['Interactions'].apply(lambda x: x.split('; '))

# Encode columns ('Description' is not encoded in this cell).
encoded_name = sequence_encoder(df['Name'])
encoded_state = label_encoder.fit_transform(df['State'])

# fit_transform refits the shared binarizer on every call, so after this
# section its classes_ describe only the last column encoded (Interactions).
encoded_groups = multi_label_binarizer.fit_transform(df['Groups'].str.split('; '))
encoded_categories = multi_label_binarizer.fit_transform(df['Categories'].str.split('; '))
encoded_atc_codes = multi_label_binarizer.fit_transform(df['ATC Codes'].str.split('; '))
encoded_targets = multi_label_binarizer.fit_transform(df['Targets'].str.split('; '))
encoded_interactions = multi_label_binarizer.fit_transform(df['Interactions'])

# Encoding Molecular Formula: one element-count vector per row.
encoded_molecular_formula = np.array([
    MolecularFormulaEncoder.formula_to_vector(formula)
    for formula in df['Molecular Formula']
])

# Encoding Doping: the column is used as-is.
encoded_doping = df['Doping'].values

print("Encoded Name:", encoded_name)
print("Encoded State:", encoded_state)
print("Encoded Groups:", encoded_groups)
print("Encoded Categories:", encoded_categories)
print("Encoded ATC Codes:", encoded_atc_codes)
print("Encoded Targets:", encoded_targets)
print("Encoded Molecular Formula:", encoded_molecular_formula)
print("Encoded Doping:", encoded_doping)
print("Encoded Interactions:", encoded_interactions)

# Writes the cleaned input frame (with Interactions as lists), not the
# encoded arrays computed above.
df.to_csv('encoders.csv',index=False)


Batches:   0%|          | 0/368 [00:00<?, ?it/s]

Encoded Name: tensor([[ 0.0221,  0.0028, -0.0143,  ...,  0.0315,  0.0228, -0.0055],
        [-0.0701,  0.0300, -0.0395,  ..., -0.0313,  0.0318,  0.1018],
        [-0.0144,  0.0111, -0.0841,  ..., -0.0193,  0.0504,  0.0011],
        ...,
        [ 0.0118, -0.0102, -0.0170,  ..., -0.0978,  0.0508,  0.0646],
        [-0.0443,  0.0019, -0.0986,  ..., -0.0279,  0.0060,  0.0594],
        [-0.0159,  0.0464, -0.0685,  ...,  0.0277,  0.0194,  0.0729]])
Encoded State: [3 3 3 ... 0 0 3]
Encoded Groups: [[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]
Encoded Categories: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Encoded ATC Codes: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]
Encoded Targets: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sentence_transformers import SentenceTransformer
import torch

class SequenceEncoder:
    """Callable wrapper that embeds a collection of texts with a SentenceTransformer.

    Args:
        model_name: Name of the sentence-transformers model to load.
        device: Device passed both to the model constructor and to every
            ``encode`` call (``None`` is forwarded as-is).
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, texts):
        """Embed ``texts`` and return the embeddings as a CPU tensor.

        Args:
            texts: A single string, a list of strings, or any array-like with
                a ``tolist`` method (pandas Series, numpy array).

        Returns:
            The tensor produced by ``model.encode``, moved to the CPU.
        """
        # Hand the model a plain list. Integer indexing on a pandas Series is
        # label-based, so a Series whose index is not 0..n-1 would break any
        # positional access the model performs on its input.
        if isinstance(texts, str):
            sentences = texts
        elif hasattr(texts, 'tolist'):
            sentences = texts.tolist()
        else:
            sentences = list(texts)

        embeddings = self.model.encode(sentences, show_progress_bar=True,
                                       convert_to_tensor=True, device=self.device)
        return embeddings.cpu()

class MolecularFormulaEncoder:
    """Encode a molecular formula string as a fixed-length vector of element counts."""

    # Element symbols tracked by the encoder. The position of a symbol in this
    # tuple is its index in the vector returned by ``formula_to_vector``.
    ELEMENTS = (
        'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na',
        'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Ti', 'V', 'Cr',
        'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se',
        'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru',
        'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs',
        'Ba', 'La', 'Ce', 'Nd', 'Sm', 'Gd', 'Ho', 'Lu', 'Hf', 'Ta',
        'W', 'Re', 'Os', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ra', 'Ac', 'Bk',
    )
    # Symbol -> vector position lookup.
    _ELEMENT_INDEX = dict(zip(ELEMENTS, range(len(ELEMENTS))))

    @staticmethod
    def formula_to_vector(formula):
        """Count the atoms of each tracked element in ``formula``.

        The formula is tokenised into ``<Symbol><count>`` pairs, where a symbol
        is an uppercase letter optionally followed by one lowercase letter and
        a missing count means 1. Compared with the previous substring scan
        this fixes: counts being built on top of an initial 1 (``C6`` became
        16), single-letter symbols matching the first letter of two-letter
        ones (``C`` inside ``Cl``), and a later occurrence of a symbol
        overwriting an earlier one instead of adding to it.

        Args:
            formula: Formula string such as ``"C6H12O6"``. Non-string values
                (for example NaN from a missing CSV cell) are treated as an
                empty formula.

        Returns:
            A float ``numpy`` array with one entry per symbol in ``ELEMENTS``.
            Repeated symbols are summed; symbols not in ``ELEMENTS`` are
            ignored. Parenthesised groups are not expanded: a multiplier
            after ``)`` is ignored.
        """
        import re  # local import keeps this class usable without touching the import cell

        vector = np.zeros(len(MolecularFormulaEncoder.ELEMENTS))
        if not isinstance(formula, str):
            return vector

        for symbol, digits in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
            position = MolecularFormulaEncoder._ELEMENT_INDEX.get(symbol)
            if position is not None:
                vector[position] += int(digits) if digits else 1
        return vector

import os  # needed only to create the output directory below

# Initialize encoders
sequence_encoder = SequenceEncoder(device='cuda' if torch.cuda.is_available() else 'cpu')
label_encoder = LabelEncoder()
# Separate encoder for the IDs so its fitted classes_ are not discarded when
# label_encoder is fitted on 'State' afterwards.
drugbank_id_encoder = LabelEncoder()

df = pd.read_csv("kg_smiles_updated.csv")

# Replace missing values with '' and force string dtype so every text column
# can be split and encoded uniformly.
for column in ['State', 'Groups', 'Categories', 'ATC Codes', 'Targets', 'Interactions']:
    df[column] = df[column].fillna('').astype(str)

# NOTE: ''.split('; ') returns [''], so rows without a value contribute an
# empty-string label (and therefore an empty-named column) to the binarizers.
df['Interactions'] = df['Interactions'].apply(lambda x: x.split('; '))

# Encode columns
encoded_drugbank_id = drugbank_id_encoder.fit_transform(df['DrugBank ID'])
encoded_name = sequence_encoder(df['Name'])
encoded_state = label_encoder.fit_transform(df['State'])

# One MultiLabelBinarizer per multi-label column, so each keeps its own classes_
mlb_groups = MultiLabelBinarizer()
mlb_categories = MultiLabelBinarizer()
mlb_atc_codes = MultiLabelBinarizer()
mlb_targets = MultiLabelBinarizer()
mlb_interactions = MultiLabelBinarizer()

encoded_groups = mlb_groups.fit_transform(df['Groups'].str.split('; '))
encoded_categories = mlb_categories.fit_transform(df['Categories'].str.split('; '))
encoded_atc_codes = mlb_atc_codes.fit_transform(df['ATC Codes'].str.split('; '))
encoded_targets = mlb_targets.fit_transform(df['Targets'].str.split('; '))
encoded_interactions = mlb_interactions.fit_transform(df['Interactions'])

# Encoding Molecular Formula: one element-count vector per row.
encoded_molecular_formula = np.array([
    MolecularFormulaEncoder.formula_to_vector(formula)
    for formula in df['Molecular Formula']
])

# Encoding Doping: the column is used as-is.
encoded_doping = df['Doping'].values

# Convert encoded data to DataFrames
encoded_name_df = pd.DataFrame(encoded_name.numpy())
encoded_drugbank_id_df = pd.DataFrame(encoded_drugbank_id, columns=['DrugBank ID'])
encoded_state_df = pd.DataFrame(encoded_state, columns=['State'])
encoded_groups_df = pd.DataFrame(encoded_groups, columns=mlb_groups.classes_)
encoded_categories_df = pd.DataFrame(encoded_categories, columns=mlb_categories.classes_)
encoded_atc_codes_df = pd.DataFrame(encoded_atc_codes, columns=mlb_atc_codes.classes_)
encoded_targets_df = pd.DataFrame(encoded_targets, columns=mlb_targets.classes_)
encoded_interactions_df = pd.DataFrame(encoded_interactions, columns=mlb_interactions.classes_)
encoded_molecular_formula_df = pd.DataFrame(encoded_molecular_formula)
encoded_doping_df = pd.DataFrame(encoded_doping, columns=['Doping'])

# Save DataFrames to CSV. The target directory is created first because
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('encoders', exist_ok=True)
csv_outputs = [
    ('encoders/encoded_name.csv', encoded_name_df),
    ('encoders/encoded_drugbank_id.csv', encoded_drugbank_id_df),
    ('encoders/encoded_state.csv', encoded_state_df),
    ('encoders/encoded_groups.csv', encoded_groups_df),
    ('encoders/encoded_categories.csv', encoded_categories_df),
    ('encoders/encoded_atc_codes.csv', encoded_atc_codes_df),
    ('encoders/encoded_targets.csv', encoded_targets_df),
    ('encoders/encoded_interactions.csv', encoded_interactions_df),
    ('encoders/encoded_molecular_formula.csv', encoded_molecular_formula_df),
    ('encoders/encoded_doping.csv', encoded_doping_df),
]
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path, index=False)

print("Data saved to CSV files.")


Batches:   0%|          | 0/368 [00:00<?, ?it/s]

Data saved to CSV files.
