In [None]:
import pandas as pd
import numpy as np
from scipy.io import savemat

# Location of the input triples file (tab-separated, read without a header row)
file_path = r'./data/DrugBank/drkg.tsv'

# Load the dataset; with header=None the columns are addressed as 0, 1 and 2
df = pd.read_csv(file_path, sep='\t', header=None)

# Rows whose relation label (column 1) starts with 'DRUGBANK'.
# The mask is computed once and reused by both filters below.
is_drugbank = df[1].str.startswith('DRUGBANK')

# DrugBank rows whose relation label ends with 'Compound:Disease' (drug-disease relationships)
df_drug_disease = df[df[1].str.endswith('Compound:Disease') & is_drugbank]

# All DrugBank rows (general drug-related relationships)
df_drug_related = df[is_drugbank]

# Collect every entity prefixed with 'Compound' from the first and third columns
col1_filtered = df_drug_related[0][df_drug_related[0].str.startswith('Compound')]
col3_filtered = df_drug_related[2][df_drug_related[2].str.startswith('Compound')]
union_set = set(col1_filtered).union(col3_filtered)

# Unique diseases, in order of first appearance
diseases = df_drug_disease[2].unique()

# Sort the drugs so the column order is reproducible: iterating a set of
# strings can yield a different order on every interpreter run because of
# hash randomization, which would shuffle the saved matrices between runs.
drugs = sorted(union_set)

# Build the binary matrix with diseases as rows and drugs as columns.
# Each drug-disease pair is mapped to its (row, column) position up front;
# get_indexer returns -1 for labels that are absent from the axis, so those
# pairs are skipped exactly like the membership test in a per-row loop would.
disease_axis = pd.Index(diseases)
drug_axis = pd.Index(drugs)
disease_pos = disease_axis.get_indexer(pd.Index(df_drug_disease[2]))
drug_pos = drug_axis.get_indexer(pd.Index(df_drug_disease[0]))
known = (disease_pos >= 0) & (drug_pos >= 0)

# One vectorized assignment replaces row-by-row iterrows()/.at updates
associations = np.zeros((len(disease_axis), len(drug_axis)), dtype=np.int64)
associations[disease_pos[known], drug_pos[known]] = 1

binary_matrix = pd.DataFrame(associations, index=disease_axis, columns=drug_axis)

# Extract drug-drug interaction relationships
df_drug_interaction = df[df[1].str.startswith('DRUGBANK::ddi')]

drugs = binary_matrix.columns  # Ensure the drug order is consistent with the binary matrix

# Map both endpoints of every interaction to their position in `drugs`.
# get_indexer returns -1 for labels that are not in the index, so rows with
# an unknown endpoint are dropped by the mask below.
source_pos = drugs.get_indexer(pd.Index(df_drug_interaction[0]))
target_pos = drugs.get_indexer(pd.Index(df_drug_interaction[2]))
known = (source_pos >= 0) & (target_pos >= 0)

# Populate the drug-drug matrix with a single vectorized assignment instead
# of iterating over the rows in Python.
adjacency = np.zeros((len(drugs), len(drugs)), dtype=np.int64)
adjacency[source_pos[known], target_pos[known]] = 1
# NOTE: if the interaction matrix should be symmetric, additionally set
# adjacency[target_pos[known], source_pos[known]] = 1

interaction_matrix = pd.DataFrame(adjacency, index=drugs, columns=drugs)

# Axis labels of the binary matrix, kept in matrix order
Wrname = list(binary_matrix.columns)  # drug names (columns)
Wdname = list(binary_matrix.index)  # disease names (rows)

# Raw NumPy arrays behind the two DataFrames
drug = interaction_matrix.to_numpy()  # drug-by-drug interaction matrix
didr = binary_matrix.to_numpy()  # disease-by-drug binary matrix

def compute_drug_similarity(interaction_matrix):
    """
    Compute the drug similarity matrix based on shared interactions.

    Every non-zero entry of the input is treated as an interaction. The score
    for drugs i and j is ``2 * shared(i, j) / (degree(i) + degree(j))``, where
    ``shared`` counts the interaction partners the two rows have in common and
    ``degree`` is the number of interactions in a row. Pairs in which neither
    drug has any interaction get a score of 0 rather than NaN.

    :param interaction_matrix: A square binary (0/1) matrix (NumPy array or
        array-like) representing drug-drug interactions. It is not modified.
    :return: A square float matrix containing similarity scores between drugs,
        with zeros on the diagonal.
    """
    # Binarise on a copy so the caller's data is left untouched
    adjacency = np.asarray(interaction_matrix).astype(bool).astype(int)

    # shared[i, j] = number of partners that rows i and j have in common
    shared = adjacency @ adjacency.T

    # degree_sum[i, j] = degree(i) + degree(j)
    degree = adjacency.sum(axis=1, keepdims=True)
    degree_sum = degree + degree.T

    # Divide only where the denominator is positive. Two drugs without any
    # interactions would otherwise produce 0/0 = NaN off the diagonal; those
    # entries keep the 0 they were initialised with.
    similarity_matrix = np.zeros(shared.shape, dtype=float)
    np.divide(2 * shared, degree_sum, out=similarity_matrix, where=degree_sum > 0)

    # Self-similarity is not reported
    np.fill_diagonal(similarity_matrix, 0)
    return similarity_matrix

# Turn the raw interaction matrix into pairwise drug similarity scores
drug = compute_drug_similarity(drug)

# Gather everything that goes into the .mat file under its variable name
mat_contents = {
    'didr': didr,      # disease-by-drug binary matrix
    'drug': drug,      # drug similarity matrix
    'Wdname': Wdname,  # disease names
    'Wrname': Wrname,  # drug names
}

# Write the processed data to disk
savemat('DrugBank.mat', mat_contents)

print("DrugBank.mat file has been successfully saved!")