0.0 Installs & Imports

In [17]:
pip install rdkit numpy

Note: you may need to restart the kernel to use updated packages.


In [18]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit import DataStructs

1.0 Sample Drugs

In [None]:
drugs = [
    "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
    "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",  # Ibuprofen
    "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # Caffeine
    "CC1=C(C(=CC=C1)NC(=O)C2=CC=C(C=C2)Cl)OC",  # Atenolol
]

names = ["Aspirin", "Ibuprofen", "Caffeine", "Atenolol"]

print("Drug Similarity Analysis")
print("=" * 40)

Drug Similarity Analysis


2.0 Convert SMILES to molecules

In [None]:
print("\n1. LOADING DRUG MOLECULES...")
molecules = []

for i, smiles in enumerate(drug_smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        molecules.append(mol)
        print(f"   ✓ {drug_names[i]}")
    else:
        print(f"   ✗ ERROR: Could not load {drug_names[i]}")

✓ Loaded: Aspirin
✓ Loaded: Ibuprofen
✓ Loaded: Caffeine
✓ Loaded: Atenolol


3.0 Calculate fingerprints and similarity matrix

In [None]:
print("\n2. CALCULATING MOLECULAR FINGERPRINTS...")
print("   Using Morgan fingerprints (radius=2, 1024 bits)...")

fingerprints = []
for mol in molecules:
    # CORRECTED: Using AllChem instead of Chem.rdmolops
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    fingerprints.append(fp)
    
print("\n3. CALCULATING TANIMOTO SIMILARITIES...")
n_drugs = len(molecules)
similarity_matrix = np.zeros((n_drugs, n_drugs))

for i in range(n_drugs):
    for j in range(i, n_drugs):
        if i == j:
            similarity_matrix[i, j] = 1.0  # Same drug
        else:
            similarity = DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity  # Symmetric


AttributeError: module 'rdkit.Chem.rdmolops' has no attribute 'GetAtomPairFingerprintAsBitVect'

4.0 Show Similarities

In [None]:
print("\n4. SIMILARITY MATRIX:")
print("   " + "    ".join([name[:8] for name in drug_names]))

for i in range(n_drugs):
    print(f"{drug_names[i][:8]:10}", end="")
    for j in range(n_drugs):
        print(f"{similarity_matrix[i, j]:.3f}  ", end="")
    print()

5.0 Find similar pairs

In [None]:
print(f"\n5. SIMILAR DRUG PAIRS (Threshold ≥ 0.3):")
print("   " + "-" * 40)

threshold = 0.3
similar_pairs = []

for i in range(n_drugs):
    for j in range(i + 1, n_drugs):
        similarity = similarity_matrix[i, j]
        if similarity >= threshold:
            similar_pairs.append((drug_names[i], drug_names[j], similarity))

if similar_pairs:
    for drug1, drug2, sim in similar_pairs:
        print(f"   • {drug1} ↔ {drug2}: {sim:.3f}")
else:
    print("   No similar pairs found above threshold.")

6.0 Similarity Statistics

In [None]:
print("\n6. SIMILARITY STATISTICS:")
print("   " + "-" * 40)

# Collect all pairwise similarities
all_similarities = []
for i in range(n_drugs):
    for j in range(i + 1, n_drugs):
        all_similarities.append(similarity_matrix[i, j])

if all_similarities:
    print(f"   Total drug pairs: {len(all_similarities)}")
    print(f"   Average similarity: {np.mean(all_similarities):.3f}")
    print(f"   Minimum similarity: {np.min(all_similarities):.3f}")
    print(f"   Maximum similarity: {np.max(all_similarities):.3f}")
    print(f"   Pairs with similarity ≥ 0.3: {sum(1 for s in all_similarities if s >= 0.3)}")
    print(f"   Pairs with similarity ≥ 0.5: {sum(1 for s in all_similarities if s >= 0.5)}")
    print(f"   Pairs with similarity ≥ 0.7: {sum(1 for s in all_similarities if s >= 0.7)}")

7.0 REMOVE SIMILAR COMPOUNDS FROM DATASET

In [None]:
print("\n7. FILTERING SIMILAR COMPOUNDS:")
print("   " + "-" * 40)

keep = [True] * n_drugs
removed_drugs = []

for i in range(n_drugs):
    if not keep[i]:
        continue
    for j in range(i + 1, n_drugs):
        if keep[j] and similarity_matrix[i, j] >= threshold:
            keep[j] = False
            removed_drugs.append(drug_names[j])

remaining_drugs = [drug_names[i] for i in range(n_drugs) if keep[i]]

print(f"   Original dataset: {n_drugs} drugs")
print(f"   After filtering: {len(remaining_drugs)} drugs")
print(f"   Removed: {len(removed_drugs)} drugs")
if removed_drugs:
    print(f"   Removed drugs: {', '.join(removed_drugs)}")

8.0 VISUALIZE MOLECULES

In [None]:
print("\n8. MOLECULE VISUALIZATION:")
print("   " + "-" * 40)

try:
    # Display all molecules
    img = Draw.MolsToGridImage(
        molecules,
        molsPerRow=3,
        subImgSize=(300, 300),
        legends=drug_names
    )
    print("   ✓ Molecular structures displayed above")
    
    # Display similar pairs
    if similar_pairs:
        print("\n   Similar drug pairs:")
        for drug1, drug2, sim in similar_pairs[:3]:  # Show max 3 pairs
            idx1 = drug_names.index(drug1)
            idx2 = drug_names.index(drug2)
            img_pair = Draw.MolsToGridImage(
                [molecules[idx1], molecules[idx2]],
                molsPerRow=2,
                subImgSize=(400, 400),
                legends=[f"{drug1} (Sim: {sim:.3f})", f"{drug2} (Sim: {sim:.3f})"]
            )
            print(f"   ✓ {drug1} ↔ {drug2} displayed above")
            
except Exception as e:
    print(f"   Note: Visualization requires Jupyter/Notebook environment")
    print(f"   Error: {e}")

9.0 EXPORT RESULTS

In [None]:
print("\n9. EXPORTING RESULTS:")
print("   " + "-" * 40)

# Create results table
results = []
for i in range(n_drugs):
    for j in range(i + 1, n_drugs):
        results.append({
            "Drug_1": drug_names[i],
            "Drug_2": drug_names[j],
            "Similarity": similarity_matrix[i, j],
            "Above_Threshold_0.3": similarity_matrix[i, j] >= 0.3
        })

# Show top 5 most similar pairs
print("\n   Top 5 most similar drug pairs:")
print("   " + "-" * 40)

sorted_results = sorted(results, key=lambda x: x["Similarity"], reverse=True)
for i, pair in enumerate(sorted_results[:5]):
    print(f"   {i+1}. {pair['Drug_1']} ↔ {pair['Drug_2']}: {pair['Similarity']:.3f}")

# Save to file
try:
    import pandas as pd
    df = pd.DataFrame(results)
    df.to_csv("drug_similarity_results.csv", index=False)
    print(f"\n   ✓ Results saved to 'drug_similarity_results.csv'")
except:
    print("\n   Note: Install pandas for CSV export: pip install pandas")