In [1]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 33.1/33.1 MB 42.4 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import pandas as pd

In [3]:
# Statin name -> SMILES string for each compound being compared.
# The strings carry no stereochemistry markers, so the comparison below is
# based on connectivity only.
smiles = dict(
    Simvastatin='CCC(C)(C)C(=O)OC1CC(C=C2C1C(C(C=C2)C)CCC3CC(CC(=O)O3)O)C',
    Pravastatin='CCC(C)C(=O)OC1CC(C=C2C1C(C(C=C2)C)CCC(CC(CC(=O)O)O)O)O',
    Atorvastatin='CC(C)C1=C(C(=C(N1CCC(CC(CC(=O)O)O)O)C2=CC=C(C=C2)F)C3=CC=CC=C3)C(=O)NC4=CC=CC=C4',
    Fluvastatin='CC(C)N1C2=CC=CC=C2C(=C1C=CC(CC(CC(=O)O)O)O)C3=CC=C(C=C3)F',
)

In [4]:
# Parse each SMILES string into an RDKit molecule object, keyed by statin name.
molecules = {name: Chem.MolFromSmiles(smi) for name, smi in smiles.items()}

# Chem.MolFromSmiles signals a parse failure by returning None instead of
# raising, so check here and fail with a clear message rather than letting a
# None reach the fingerprinting step.
unparsed = [name for name, mol in molecules.items() if mol is None]
if unparsed:
    raise ValueError(f"RDKit could not parse SMILES for: {', '.join(unparsed)}")

In [5]:
# Build one RDKit fingerprint generator (default settings) and reuse it for
# every molecule so all fingerprints are directly comparable.
fpgen = AllChem.GetRDKitFPGenerator()

# Statin name -> fingerprint of the corresponding molecule.
fingerprints = {
    statin: fpgen.GetFingerprint(molecules[statin])
    for statin in molecules
}

In [7]:
# Pairwise Tanimoto similarity between all statin fingerprints.
#
# The matrix is built in one step as a float DataFrame. Creating an empty
# DataFrame and filling it cell by cell through .loc leaves every column with
# dtype `object`, which breaks or slows later numeric work (means, heatmaps,
# clustering).
statin_names = list(fingerprints.keys())
fp_list = [fingerprints[name] for name in statin_names]

# BulkTanimotoSimilarity compares one fingerprint against a whole list and
# returns the scores in list order, so each call yields one matrix row.
# Tanimoto similarity is symmetric, so the result is a symmetric matrix.
similarity_rows = [
    DataStructs.BulkTanimotoSimilarity(fp, fp_list) for fp in fp_list
]

similarity_matrix = pd.DataFrame(
    similarity_rows,
    index=statin_names,
    columns=statin_names,
    dtype=float,
)

# Leave the frame as the cell's last expression for rich notebook display.
similarity_matrix

             Simvastatin Pravastatin Atorvastatin Fluvastatin
Simvastatin          1.0    0.871988     0.300432    0.292937
Pravastatin     0.871988         1.0     0.303571    0.304478
Atorvastatin    0.300432    0.303571          1.0     0.52184
Fluvastatin     0.292937    0.304478      0.52184         1.0
