Step 1: Import Python Libraries into the Jupyter Notebook.

Note: Install the required Python packages (RDKit, pandas, and scikit-learn) before executing this notebook.

In [1]:
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pandas as pd
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Step 2: Load the CSV file that contains the drug molecules, each represented by its DB_ID and its SMILES string. DB_ID is the DrugBank ID, which uniquely identifies a specific drug structure in the DrugBank database.

In [2]:
# Read the drug table. A forward slash is used as the path separator because it
# works on every platform, whereas a backslash only works on Windows and "\D"
# is an invalid escape sequence inside a normal string literal.
fixed_df=pd.read_csv("1_1_1_Input/DB_ID_SMILES.csv")
# DrugBank identifiers, one per drug, in file order.
fixed_db_id=fixed_df.loc[:,"DB_ID"].values
# SMILES strings, aligned row-for-row with fixed_db_id.
fixed_SMI=fixed_df.loc[:,"SMILES_Representation"].values

Step 3: Generate the 1,710 x 1,710 Tanimoto similarity matrix from Morgan fingerprints (radius 2) computed with the RDKit library, and store it as a pandas DataFrame called "Tanimoto_Correlation_Matrix_1710_x_1710".

In [4]:
# Parse every SMILES string into an RDKit molecule. Chem.MolFromSmiles returns
# None (rather than raising) when a SMILES string cannot be parsed.
list_fixed_SMI=list(fixed_SMI)
fixed_mol=[Chem.MolFromSmiles(smiles) for smiles in list_fixed_SMI]

# Fail fast on unparsable structures. Silently skipping them would shift every
# later fingerprint by one position, so the matrix rows would no longer line up
# with fixed_db_id (and the DataFrame construction below would fail with an
# unhelpful shape error).
unparsable=[
    (index, fixed_db_id[index], list_fixed_SMI[index])
    for index, mol in enumerate(fixed_mol)
    if mol is None
]
if unparsable:
    raise ValueError(
        "RDKit could not parse the SMILES of %d drug(s) "
        "(row index, DB_ID, SMILES): %s" % (len(unparsable), unparsable)
    )

DDI_mol=fixed_mol

# Morgan fingerprint with radius 2 for every molecule, in input order.
DDI_fps=[AllChem.GetMorganFingerprint(mol, 2) for mol in DDI_mol]

fixed_fps=DDI_fps

# One row per drug: its Tanimoto similarity to every drug (itself included).
# BulkTanimotoSimilarity compares one fingerprint against a whole list in a
# single call, avoiding a Python-level inner loop. Values are kept as floats so
# the matrix is numeric rather than a table of strings.
Tanimoto_List=[
    DataStructs.BulkTanimotoSimilarity(fingerprint, fixed_fps)
    for fingerprint in DDI_fps
]

# Similarity columns are labelled with the DB_IDs; a leading "Correlation Matrix"
# column carries the DB_ID of each row.
df_Tanimoto=pd.DataFrame(data=Tanimoto_List, columns=fixed_db_id)
Complementary_TC_df=pd.DataFrame(fixed_db_id, columns=['Correlation Matrix'])

Tanimoto_Correlation_Matrix_1710_x_1710 = pd.concat([Complementary_TC_df, df_Tanimoto], axis=1)

Step 4: Reduce Tanimoto_Correlation_Matrix_1710_x_1710 to SSP_1710x50 using principal component analysis (PCA), keeping the first 50 principal components.

In [5]:
def pca_n_factor(standardized_x, threshold):
    """Run a full PCA and count the components whose eigenvalue reaches ``threshold``.

    Parameters
    ----------
    standardized_x : array-like
        Data matrix handed unchanged to ``PCA().fit_transform``. The name
        suggests it is expected to be standardized already; this function does
        not check or enforce that.
    threshold : float
        Eigenvalue cut-off. Counting stops at the first eigenvalue that is
        strictly below this value.

    Returns
    -------
    tuple
        ``(n_factors, eigenvalue_pca, principalComponents, PCA_component, loadings)``

        * ``n_factors`` -- position of the first eigenvalue below ``threshold``.
          scikit-learn reports eigenvalues in decreasing order, so this is the
          number of components whose eigenvalue is at least ``threshold``. It
          equals the total number of components when none is below the
          threshold, and 0 when there are no components at all.
        * ``eigenvalue_pca`` -- ``pca.explained_variance_``.
        * ``principalComponents`` -- the scores returned by ``fit_transform``.
        * ``PCA_component`` -- ``pca.explained_variance_ratio_``.
        * ``loadings`` -- ``pca.components_`` (the principal axes).
    """
    # Factor extraction: fit a PCA that keeps every component.
    pca=PCA()
    principalComponents=pca.fit_transform(standardized_x)
    eigenvalue_pca=pca.explained_variance_
    # First position whose eigenvalue drops below the threshold; fall back to
    # the full component count so "nothing below the threshold" is not
    # confused with "only the last component is below the threshold".
    n_factors=next(
        (
            position
            for position, eigenvalue in enumerate(eigenvalue_pca)
            if float(eigenvalue)<threshold
        ),
        len(eigenvalue_pca),
    )
    PCA_component=pca.explained_variance_ratio_
    loadings=pca.components_
    return n_factors, eigenvalue_pca, principalComponents, PCA_component, loadings

# Row labels: the values of the leading "Correlation Matrix" column.
DB_ID=list(Tanimoto_Correlation_Matrix_1710_x_1710.loc[:,"Correlation Matrix"].values)
# Feature columns: every column after the leading identifier column.
fixed_comparison_features=list(Tanimoto_Correlation_Matrix_1710_x_1710.columns.values)[1:]
# Cast explicitly to float: the similarity values may be stored as strings
# (object dtype), and the scaler and PCA need a numeric matrix.
x=Tanimoto_Correlation_Matrix_1710_x_1710.loc[:, fixed_comparison_features].values.astype(float)
# Standardize each feature column before PCA.
standardized_x=StandardScaler().fit_transform(x)
df_standardized=pd.DataFrame(data=standardized_x, columns=fixed_comparison_features)

# The first return value (the factor count for threshold 0) is not needed here,
# so only the remaining four outputs are unpacked.
full_eigenvalue_pca, full_principal_components, variance_explained_per_PC, loadings=pca_n_factor(standardized_x,0)[1:]
# Number of principal components actually produced, i.e. the number of columns
# of the score matrix. The row count of x is only the same number when the
# input matrix is square.
n_pca=full_principal_components.shape[1]

######
# Label the score columns PC-1, PC-2, ... using the number of columns the PCA
# actually produced, so the labels always match the score matrix.
columns=[
    'PC-'+str(component_index+1)
    for component_index in range(full_principal_components.shape[1])
]
principalDf=pd.DataFrame(data=full_principal_components, columns=columns)
######

# Put the DB_ID column in front of the principal-component scores.
DB_ID_Df=pd.DataFrame(DB_ID, columns=["DB_ID"])
Concatenated_TC_df = pd.concat([DB_ID_Df, principalDf], axis=1)

# Keep the DB_ID column plus the first N_SSP_COMPONENTS principal components.
N_SSP_COMPONENTS=50
SSP_1710x50=Concatenated_TC_df.iloc[:, :1+N_SSP_COMPONENTS]
# A forward slash is used as the path separator because it works on every
# platform, whereas a backslash only works on Windows and "\D" is an invalid
# escape sequence inside a normal string literal.
SSP_1710x50.to_csv("1_1_2_Output/DDI_MLP_SSP.csv", index=False)