<a href="https://colab.research.google.com/github/EdWangLoDaSc/DDI_Graph/blob/main/Dataset%20Sources/DDI_Dataset/data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import networkx as nx

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# BioSNAP Processing


In [3]:
# Specify the column names
column_names = ['drug1', 'drug2']

# Load the TSV file into a pandas DataFrame with the specified column names
snap = pd.read_csv('/content/drive/MyDrive/DDI_Dataset/BioSNAP/ChCh-Miner_durgbank-chem-chem.tsv', sep='\t', header=None, names=column_names)

drugbank_name = pd.read_csv('/content/drive/MyDrive/DDI_Dataset/BioSNAP/drugbank vocabulary.csv')

# Print the first few rows of the DataFrame to verify the column titles
print(snap.head())
print(drugbank_name.head())

     drug1    drug2
0  DB00862  DB00966
1  DB00575  DB00806
2  DB01242  DB08893
3  DB01151  DB08883
4  DB01235  DB01275
  DrugBank ID     Accession Numbers          Common name          CAS  \
0     DB00001  BTD00024 | BIOD00024            Lepirudin  138068-37-8   
1     DB00002  BTD00071 | BIOD00071            Cetuximab  205923-56-4   
2     DB00003  BTD00001 | BIOD00001         Dornase alfa  143831-71-4   
3     DB00004  BTD00084 | BIOD00084  Denileukin diftitox  173146-27-5   
4     DB00005  BTD00052 | BIOD00052           Etanercept  185243-69-0   

         UNII                                           Synonyms  \
0  Y43GF64R34  [Leu1, Thr2]-63-desulfohirudin | Desulfatohiru...   
1  PQX0D8J21J                Cetuximab | Cétuximab | Cetuximabum   
2  953A26OA1Y  Deoxyribonuclease (human clone 18-1 protein mo...   
3  25E79B5CTM  Denileukin | Denileukin diftitox | Interleukin...   
4  OP401G7OJC  Etanercept | etanercept-szzs | etanercept-ykro...   

  Standard InChI Key  
0        

In [39]:
# Merge snap with drugbank_name to match drug1 IDs with their common names
merged1 = pd.merge(snap, drugbank_name, left_on='drug1', right_on='DrugBank ID')
merged1 = merged1.rename(columns={'Common name': 'drug1_name'})

# Drop unnecessary columns from the first merge
merged1 = merged1[['drug1', 'drug2', 'drug1_name']]

# Merge the result with drugbank_name again to match drug2 IDs with their common names
merged_final = pd.merge(merged1, drugbank_name, left_on='drug2', right_on='DrugBank ID')
merged_final = merged_final.rename(columns={'Common name': 'drug2_name'})

# Drop unnecessary columns from the final DataFrame
merged_final = merged_final[['drug1', 'drug1_name', 'drug2', 'drug2_name']]
merged_final[['nodeA','nodeB']] = merged_final[['drug1', 'drug2']]

# Print the first few rows of the final DataFrame to verify
print(merged_final.head())


output_path = '/content/drive/MyDrive/DDI_Dataset/BioSNAP/processed_drug_pairs_biosnap.csv'

merged_final.to_csv(output_path, index=False)


     drug1    drug1_name    drug2            drug2_name    nodeA    nodeB
0  DB00862    Vardenafil  DB00883  Isosorbide dinitrate  DB00862  DB00883
1  DB00673    Aprepitant  DB00883  Isosorbide dinitrate  DB00673  DB00883
2  DB00177     Valsartan  DB00883  Isosorbide dinitrate  DB00177  DB00883
3  DB00834  Mifepristone  DB00883  Isosorbide dinitrate  DB00834  DB00883
4  DB00820     Tadalafil  DB00883  Isosorbide dinitrate  DB00820  DB00883


## Property

In [5]:
import networkx as nx

G = nx.Graph()

# Add edges from the DataFrame. This also adds the nodes implicitly
G.add_edges_from(snap.values)

# Calculate properties
N = G.number_of_nodes()  # Number of nodes
E = G.number_of_edges()  # Number of edges
sparsity = 2 * E / (N * (N - 1))  # Sparsity formula for undirected graphs

# Maximum degree
max_degree = max(dict(G.degree()).values())

# Minimum degree
min_degree = min(dict(G.degree()).values())

betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)  # Increase max_iter if necessary
clustering_coefficient = nx.average_clustering(G)
modularity = nx.algorithms.community.modularity(G, nx.algorithms.community.greedy_modularity_communities(G))

# Print basic properties
print(f"Number of nodes: {N}")
print(f"Number of edges: {E}")
print(f"Sparsity: {sparsity:.4f}")
print(f"Maximum Degree: {max_degree}")
print(f"Minimum Degree: {min_degree}")

# Print advanced metrics
print("Average Betweenness Centrality:", sum(betweenness_centrality.values()) / N)
print("Average Closeness Centrality:", sum(closeness_centrality.values()) / N)
print("Average Eigenvector Centrality:", sum(eigenvector_centrality.values()) / N)
print(f"Average Clustering Coefficient: {clustering_coefficient:.4f}")
print(f"Modularity: {modularity:.4f}")


Number of nodes: 1397
Number of edges: 16384
Sparsity: 0.0168
Maximum Degree: 142
Minimum Degree: 1
Average Betweenness Centrality: 0.001329528096814869
Average Closeness Centrality: 0.3483634978469241
Average Eigenvector Centrality: 0.018303386590821567
Average Clustering Coefficient: 0.1028
Modularity: 0.3348


# TWOSIDES

In [10]:
import csv

# Input file path
input_file_path = '/content/drive/MyDrive/DDI_Dataset/TWOSIDES/Names.txt'
# Output CSV file path
output_file_path = '/content/drive/MyDrive/DDI_Dataset/TWOSIDES/Names.csv'

# Open the original text file and the new CSV file
with open(input_file_path, 'r') as infile, open(output_file_path, 'w', newline='') as csvfile:
    # Create a CSV writer object
    csvwriter = csv.writer(csvfile)

    # Write the header row
    csvwriter.writerow(['code', 'name'])

    # Read each line from the text file
    for line in infile:
        # Strip leading/trailing whitespace and split by tab to get the code and name
        code, name = line.strip().split('\t')
        # Write the code and name to the CSV file
        csvwriter.writerow([code, name])



# Creating DataFrame
df = pd.DataFrame(cid_name)

# Clean the 'code' and 'name' columns by removing single quotes and backslashes
df['code'] = df['code'].str.replace("'", "")
df['name'] = df['name'].str.replace("'", "").str.replace("\\", "")

# Output CSV file path
output_file_path = '/content/drive/MyDrive/DDI_Dataset/TWOSIDES/Names.csv'

# Save the cleaned DataFrame to a CSV file
df.to_csv(output_file_path, index=False)

print("Cleaned data has been saved to CSV.")


Cleaned data has been saved to CSV.


  df['name'] = df['name'].str.replace("'", "").str.replace("\\", "")


In [11]:
twosides = pd.read_csv('/content/drive/MyDrive/DDI_Dataset/TWOSIDES/drug_drug_matrix.csv')


cid_name = pd.read_csv('/content/drive/MyDrive/DDI_Dataset/TWOSIDES/Names.csv')


print(twosides.head(2))
print(cid_name)


     Unnamed: 0  CID000002244  CID000004609  CID000003696  CID000003446  \
0  CID000002244             0             1             1             1   
1  CID000004609             1             0             0             1   

   CID000004058  CID000004440  CID000004679  CID000003419  CID000002656  ...  \
0             1             0             1             1             1  ...   
1             0             0             1             0             1  ...   

   CID000005538  CID000000158  CID000003350  CID000002756  CID000002713  \
0             1             0             1             1             1   
1             0             0             1             1             1   

   CID000012620  CID000004917  CID000068844  CID000005514  CID000000119  
0             0             1             1             1             1  
1             0             1             0             0             0  

[2 rows x 549 columns]
              code                       name
0     CID000000

In [38]:
# Initialize an empty list to hold the drug-drug pairs
drug_pairs = []

# Iterate over the DataFrame to find pairs with a link (value of 1)
for i, row in twosides.iterrows():
    drug1 = row['Unnamed: 0']
    for drug2 in twosides.columns[1:]:  # Skip the first column which is 'Unnamed: 0'
        if row[drug2] == 1:
            # Append the pair and the link indicator to the list
            drug_pairs.append([drug1, drug2, 1])

# Convert the list to a DataFrame
drug_pairs_df = pd.DataFrame(drug_pairs, columns=['drug1', 'drug2', 'link'])


# Sort the drug identifiers within each row
drug_pairs_df[['drug1', 'drug2']] = np.sort(drug_pairs_df[['drug1', 'drug2']], axis=1)

# Drop duplicates
drug_pairs_df = drug_pairs_df.drop_duplicates(['drug1', 'drug2']).reset_index(drop=True)
drug_code_name_mapping = cid_name.drop_duplicates(subset='code').set_index('code')['name'].to_dict()

# Apply the mapping to replace drug_codes with drug_names in drug_pairs_df
drug_pairs_df['drug1_name'] = drug_pairs_df['drug1'].map(drug_code_name_mapping)
drug_pairs_df['drug2_name'] = drug_pairs_df['drug2'].map(drug_code_name_mapping)

drug_pairs_df[['nodeA','nodeB']] = drug_pairs_df[['drug1', 'drug2']]

# Verify the mapping by displaying the first few rows
print(drug_pairs_df.tail(20))
output_path = '/content/drive/MyDrive/DDI_Dataset/TWOSIDES/processed_drug_pairs.csv'
drug_pairs_df.to_csv(output_path, index=False)


G = nx.Graph()

# Iterate over DataFrame rows to add each drug interaction as an edge in the graph
for index, row in drug_pairs_df.iterrows():
    G.add_edge(row['drug1'], row['drug2'])

# Now, calculate the properties based on this graph
N = G.number_of_nodes()  # Number of nodes
E = G.number_of_edges()  # Number of edges
sparsity = 2 * E / (N * (N - 1)) if N > 1 else 0  # Sparsity formula, with check to avoid division by zero

# Maximum degree
max_degree = max(dict(G.degree()).values()) if N > 0 else 0

# Minimum degree
min_degree = min(dict(G.degree()).values()) if N > 0 else 0

betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)  # Increase max_iter if necessary
clustering_coefficient = nx.average_clustering(G)
modularity = nx.algorithms.community.modularity(G, nx.algorithms.community.greedy_modularity_communities(G))

# Print basic properties
print(f"Number of nodes: {N}")
print(f"Number of edges: {E}")
print(f"Sparsity: {sparsity:.4f}")
print(f"Maximum Degree: {max_degree}")
print(f"Minimum Degree: {min_degree}")

# Print advanced metrics
print("Average Betweenness Centrality:", sum(betweenness_centrality.values()) / N)
print("Average Closeness Centrality:", sum(closeness_centrality.values()) / N)
print("Average Eigenvector Centrality:", sum(eigenvector_centrality.values()) / N)
print(f"Average Clustering Coefficient: {clustering_coefficient:.4f}")
print(f"Modularity: {modularity:.4f}")




              drug1         drug2  link            drug1_name  \
48564  CID000000119  CID000005039     1                  GABA   
48565  CID000000596  CID000093860     1  cytosine arabinoside   
48566  CID000000596  CID000005538     1  cytosine arabinoside   
48567  CID000000596  CID000003350     1  cytosine arabinoside   
48568  CID000000596  CID000002713     1  cytosine arabinoside   
48569  CID000000596  CID000004917     1  cytosine arabinoside   
48570  CID000003350  CID000093860     1           finasteride   
48571  CID000004917  CID000093860     1      prochlorperazine   
48572  CID000003350  CID000005538     1           finasteride   
48573  CID000002756  CID000005538     1            cimetidine   
48574  CID000004917  CID000005538     1      prochlorperazine   
48575  CID000005514  CID000005538     1            topiramate   
48576  CID000002713  CID000003350     1         chlorhexidine   
48577  CID000003350  CID000004917     1           finasteride   
48578  CID000003350  CID0

# DDInter

In [37]:
ddinter = pd.read_csv('/content/drive/MyDrive/DDI_Dataset/DDInter/ddinter_downloads_code_R.csv')
ddinter[['nodeA','nodeB']] = ddinter[['DDInterID_A','DDInterID_B']]
ddinter.head()

output_path = '/content/drive/MyDrive/DDI_Dataset/DDInter/processed_drug_pairs_DDInter.csv'

ddinter.to_csv(output_path, index=False)

In [28]:
import networkx as nx
import pandas as pd

# Example of loading the DataFrame
# ddinter = pd.read_csv('path_to_your_ddinter.csv')

G = nx.Graph()

# Initialize a set to keep track of added edges to avoid duplicates
added_edges = set()

for index, row in ddinter.iterrows():
    edge = (row['DDInterID_A'], row['DDInterID_B'])

    # Check if either tuple or its reverse hasn't been added yet to ensure undirected uniqueness
    if edge not in added_edges and (edge[1], edge[0]) not in added_edges:
        G.add_edge(*edge)
        added_edges.add(edge)

# Calculate the properties
N = G.number_of_nodes()  # Number of nodes
E = G.number_of_edges()  # Number of edges
sparsity = E / (N * (N - 1) / 2) if N > 1 else 0

# Degree calculations
degrees = dict(G.degree())
max_degree = max(degrees.values())
min_degree = min(degrees.values())

# Centrality measures
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)  # Adjust max_iter if necessary
clustering_coefficient = nx.average_clustering(G)

# Modularity
communities = list(nx.algorithms.community.greedy_modularity_communities(G))
modularity = nx.algorithms.community.modularity(G, communities)

# Print basic properties
print(f"Number of nodes: {N}")
print(f"Number of edges: {E}")
print(f"Sparsity: {sparsity:.4f}")
print(f"Maximum Degree: {max_degree}")
print(f"Minimum Degree: {min_degree}")

# Print advanced metrics
print("Average Betweenness Centrality:", sum(betweenness_centrality.values()) / N)
print("Average Closeness Centrality:", sum(closeness_centrality.values()) / N)
print("Average Eigenvector Centrality:", sum(eigenvector_centrality.values()) / N)
print(f"Average Clustering Coefficient: {clustering_coefficient:.4f}")
print(f"Modularity: {modularity:.4f}")





Number of nodes: 1185
Number of edges: 4527
Sparsity: 0.0065
Maximum Degree: 460
Minimum Degree: 1
Average Betweenness Centrality: 0.0015385719134501901
Average Closeness Centrality: 0.36137263786722407
Average Eigenvector Centrality: 0.018647247181594113
Average Clustering Coefficient: 0.3259
Modularity: 0.4335


# Analyze the overlap of different datasets' link pairs

In [None]:
# Filter out rows where either drug1_name or drug2_name is NaN before creating pairs
merged_final = merged_final.dropna(subset=['drug1_name', 'drug2_name'])
drug_pairs_df = drug_pairs_df.dropna(subset=['drug1_name', 'drug2_name'])
ddinter = ddinter.dropna(subset=['Drug_A', 'Drug_B'])

# Then create unique pair identifiers as before
merged_final['pair'] = merged_final.apply(lambda x: '-'.join(sorted([x['drug1_name'], x['drug2_name']])), axis=1)
drug_pairs_df['pair'] = drug_pairs_df.apply(lambda x: '-'.join(sorted([x['drug1_name'], x['drug2_name']])), axis=1)
ddinter['pair'] = ddinter.apply(lambda x: '-'.join(sorted([x['Drug_A'], x['Drug_B']])), axis=1)
# Example for one DataFrame, apply the same logic for others
if 'drug1_name' in merged_final.columns and 'drug2_name' in merged_final.columns:
    # Ensure there are no NaN values in the columns of interest
    merged_final.dropna(subset=['drug1_name', 'drug2_name'], inplace=True)

    # Safely apply the function to create 'pair' column
    try:
        merged_final['pair'] = merged_final.apply(lambda x: '-'.join(sorted([str(x['drug1_name']), str(x['drug2_name'])])), axis=1)
    except Exception as e:
        print(f"An error occurred: {e}")

# Do similar checks and operations for drug_pairs_df and ddinter

# Find overlaps
overlap_1_2 = set(merged_final['pair']).intersection(set(drug_pairs_df['pair']))
overlap_1_3 = set(merged_final['pair']).intersection(set(ddinter['pair']))
overlap_2_3 = set(drug_pairs_df['pair']).intersection(set(ddinter['pair']))

# Print overlap information
print(f"Overlap between merged_final and drug_pairs_df: {len(overlap_1_2)} pairs")
print(f"Overlap between merged_final and ddinter: {len(overlap_1_3)} pairs")
print(f"Overlap between drug_pairs_df and ddinter: {len(overlap_2_3)} pairs")

# For a comprehensive overlap across all three datasets
overlap_all = set(merged_final['pair']).intersection(drug_pairs_df['pair'], ddinter['pair'])
print(f"Overlap across all three datasets: {len(overlap_all)} pairs")



ValueError: Wrong number of items passed 5, placement implies 1