In [1]:
import pandas as pd
import numpy as np
from scipy.io import savemat

In [2]:
# ============================== Step 1: Construct the Drug-Indication Binary Matrix ==============================

# Read the tab-separated drug-indication relationship table
file_path = r'./data/MSI/6_drug_indication_df.tsv'
df = pd.read_csv(file_path, sep='\t')

# Count how often each (indication, drug) pair occurs:
# one row per indication, one column per drug, 0 where the pair never appears
pair_counts = pd.pivot_table(df, index='indication', columns='drug', aggfunc='size', fill_value=0)

# Collapse the counts to presence/absence: 'size' counts are never negative,
# so capping them at 1 turns every nonzero entry into 1 and leaves zeros alone
binary_matrix = pair_counts.clip(upper=1)

binary_matrix

drug,5-hydroxytryptophan,ACT-132577,DB00001,DB00002,DB00004,DB00005,DB00006,DB00007,DB00008,DB00009,...,testosterone-enanthate,testosterone-undecanoate,thiostrepton,thiram,trelagliptin,trepibutone,ufenamate,vinburnine,xipamide,zotarolimus
indication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0000737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001122,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3463824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3495559,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3544321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C4048328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# ============================== Step 2: Compute Drug-Drug and Disease-Disease Similarity Matrices ==============================

def compute_similarity_matrix(file_path, entity_col='node_1', protein_col='node_2'):
    """
    Compute a pairwise similarity matrix between entities (drugs or diseases)
    based on the proteins they share.

    Each entity is represented by the set of distinct proteins it is linked to
    in the file. The similarity of two entities A and B is the Sorensen-Dice
    coefficient ``2 * |A & B| / (|A| + |B|)`` (not the Jaccard index), so the
    diagonal is always 1.0 and entities with no protein in common score 0.0.

    :param file_path: Path to the TSV file containing entity-protein relationships.
    :param entity_col: Name of the column holding the entity identifiers
                       (defaults to ``'node_1'``).
    :param protein_col: Name of the column holding the protein identifiers
                        (defaults to ``'node_2'``).
    :return: A symmetric similarity matrix as a Pandas DataFrame whose index and
             columns are the entity identifiers, in the (sorted) order produced
             by ``groupby``.
    :raises KeyError: If ``entity_col`` or ``protein_col`` is not a column of the file.
    """
    # Load the dataset
    df = pd.read_csv(file_path, sep='\t')

    # Fail early with a message that names the file and the missing column(s)
    missing = [col for col in (entity_col, protein_col) if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path!r} is missing required column(s): {missing}")

    # Map every entity to the set of proteins it is associated with
    entity_to_proteins = df.groupby(entity_col)[protein_col].apply(set).to_dict()

    # Fix the entity order once and precompute what the pairwise loop needs,
    # so no dict lookup or len() call is repeated inside the O(n^2) loop
    entities = list(entity_to_proteins)
    protein_sets = [entity_to_proteins[entity] for entity in entities]
    set_sizes = [len(proteins) for proteins in protein_sets]
    num_entities = len(entities)

    # Initialize an empty similarity matrix
    similarity_matrix = np.zeros((num_entities, num_entities))

    # Only the upper triangle (including the diagonal) is computed; each value
    # is mirrored into the lower triangle because the measure is symmetric
    for i in range(num_entities):
        proteins_i = protein_sets[i]
        size_i = set_sizes[i]
        for j in range(i, num_entities):
            # Number of proteins shared by the two entities
            intersection_size = len(proteins_i & protein_sets[j])

            # Sorensen-Dice coefficient; the denominator is never zero because
            # every group produced by groupby contains at least one row
            similarity = (2 * intersection_size) / (size_i + set_sizes[j])

            similarity_matrix[i, j] = similarity_matrix[j, i] = similarity

    # Convert similarity matrix to a Pandas DataFrame for label-based access
    return pd.DataFrame(similarity_matrix, index=entities, columns=entities)

# Drug-drug similarity, derived from the drug -> protein links
drug_protein_path = r'./data/MSI/1_drug_to_protein.tsv'
drug_similarity = compute_similarity_matrix(drug_protein_path)

# Disease-disease similarity, derived from the indication -> protein links
indication_protein_path = r'./data/MSI/2_indication_to_protein.tsv'
disease_similarity = compute_similarity_matrix(indication_protein_path)

drug_similarity

Unnamed: 0,5-hydroxytryptophan,ACT-132577,DB00001,DB00002,DB00004,DB00005,DB00006,DB00007,DB00008,DB00009,...,testosterone-enanthate,testosterone-undecanoate,thiostrepton,thiram,trelagliptin,trepibutone,ufenamate,vinburnine,xipamide,zotarolimus
5-hydroxytryptophan,1.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACT-132577,0.0,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00001,0.0,0.0,1.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00002,0.0,0.0,0.0,1.0,0.0,0.846154,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00004,0.0,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
trepibutone,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
ufenamate,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
vinburnine,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
xipamide,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# Display the disease-disease similarity matrix computed in Step 2
disease_similarity

Unnamed: 0,C0000737,C0000744,C0000786,C0000809,C0001122,C0001126,C0001206,C0001403,C0001418,C0001430,...,C2930898,C2931618,C2936781,C2937421,C2973725,C3463824,C3495559,C3544321,C4048328,C4083212
C0000737,1.000000,0.0,0.009524,0.0,0.0,0.018519,0.000000,0.0,0.048309,0.046512,...,0.0,0.0,0.0,0.000000,0.029197,0.045113,0.018018,0.0,0.0,0.0
C0000744,0.000000,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
C0000786,0.009524,0.0,1.000000,0.0,0.0,0.000000,0.015504,0.0,0.027149,0.013986,...,0.0,0.0,0.0,0.017241,0.052980,0.013605,0.008475,0.0,0.0,0.0
C0000809,0.000000,0.0,0.000000,1.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
C0001122,0.000000,0.0,0.000000,0.0,1.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3463824,0.045113,0.0,0.013605,0.0,0.0,0.000000,0.038462,0.0,0.027778,0.000000,...,0.0,0.0,0.0,0.000000,0.027027,1.000000,0.012579,0.0,0.0,0.0
C3495559,0.018018,0.0,0.008475,0.0,0.0,0.000000,0.014184,0.0,0.042918,0.012903,...,0.0,0.0,0.0,0.000000,0.000000,0.012579,1.000000,0.0,0.0,0.0
C3544321,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0
C4048328,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0


In [6]:
# ============================== Step 3: Reorder Matrices to Match Drug and Disease Order ==============================

# Drugs: the columns of the binary matrix define the target order.
# Label-based selection with .loc raises KeyError if any drug in the binary
# matrix has no row/column in the similarity matrix.
drug_order = binary_matrix.columns
drug_similarity_matrix = drug_similarity.loc[drug_order, drug_order]

# Diseases: the rows (index) of the binary matrix define the target order;
# the same KeyError behaviour applies to indications missing from the similarity matrix.
disease_order = binary_matrix.index
disease_similarity_matrix = disease_similarity.loc[disease_order, disease_order]

drug_similarity_matrix

drug,5-hydroxytryptophan,ACT-132577,DB00001,DB00002,DB00004,DB00005,DB00006,DB00007,DB00008,DB00009,...,testosterone-enanthate,testosterone-undecanoate,thiostrepton,thiram,trelagliptin,trepibutone,ufenamate,vinburnine,xipamide,zotarolimus
drug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5-hydroxytryptophan,1.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ACT-132577,0.0,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00001,0.0,0.0,1.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00002,0.0,0.0,0.0,1.0,0.0,0.846154,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB00004,0.0,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
trepibutone,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
ufenamate,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
vinburnine,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
xipamide,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Display the disease-disease similarity matrix after reordering to the binary matrix's indication order
disease_similarity_matrix

indication,C0000737,C0000744,C0000786,C0000809,C0001122,C0001126,C0001206,C0001403,C0001418,C0001430,...,C2930898,C2931618,C2936781,C2937421,C2973725,C3463824,C3495559,C3544321,C4048328,C4083212
indication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0000737,1.000000,0.0,0.009524,0.0,0.0,0.018519,0.000000,0.0,0.048309,0.046512,...,0.0,0.0,0.0,0.000000,0.029197,0.045113,0.018018,0.0,0.0,0.0
C0000744,0.000000,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
C0000786,0.009524,0.0,1.000000,0.0,0.0,0.000000,0.015504,0.0,0.027149,0.013986,...,0.0,0.0,0.0,0.017241,0.052980,0.013605,0.008475,0.0,0.0,0.0
C0000809,0.000000,0.0,0.000000,1.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
C0001122,0.000000,0.0,0.000000,0.0,1.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3463824,0.045113,0.0,0.013605,0.0,0.0,0.000000,0.038462,0.0,0.027778,0.000000,...,0.0,0.0,0.0,0.000000,0.027027,1.000000,0.012579,0.0,0.0,0.0
C3495559,0.018018,0.0,0.008475,0.0,0.0,0.000000,0.014184,0.0,0.042918,0.012903,...,0.0,0.0,0.0,0.000000,0.000000,0.012579,1.000000,0.0,0.0,0.0
C3544321,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0
C4048328,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0


In [8]:
# ============================== Step 4: Save Data to .mat File ==============================

# Pull the raw arrays and label lists out of the DataFrames first
association_array = binary_matrix.to_numpy()             # rows = indications, columns = drugs
disease_array = disease_similarity_matrix.to_numpy()     # disease-disease similarity
drug_array = drug_similarity_matrix.to_numpy()           # drug-drug similarity
disease_names = binary_matrix.index.to_numpy().tolist()  # row labels of the binary matrix
drug_names = binary_matrix.columns.to_numpy().tolist()   # column labels of the binary matrix

# NOTE(review): savemat appears to store a plain list of str as a fixed-width
# char array, so shorter names may come back space-padded when the file is
# loaded - confirm that downstream readers strip them (an object-dtype array
# would be written as a MATLAB cell array instead).
data_to_save = dict([
    ('didr', association_array),
    ('disease', disease_array),
    ('drug', drug_array),
    ('Wdname', disease_names),
    ('Wrname', drug_names),
])

# Write everything into a single MATLAB-format file in the working directory
savemat('MSI.mat', data_to_save)

print("Data successfully saved as MSI.mat")

Data successfully saved as MSI.mat
