In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools

In [None]:
# Load the raw dataset and keep only the drug name and its SMILES string.
# The explicit .copy() makes df an independent frame: the cells below add
# columns to df, and doing that on a slice of df_original triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is being modified.
df_original = pd.read_csv("Data/Raw/Original Dataset.csv")
df = df_original[['Name', 'SMILES']].copy()

In [None]:
# Enabling Pandas to render images in the dataframe.
# This is a global switch on RDKit's PandasTools (note the "AllDataFrames" in
# its name), so it only needs to run once per kernel session.
PandasTools.RenderImagesInAllDataFrames(images=True)

In [None]:
# Generating molecules in the dataframe by converting SMILES strings.
# assign() returns a new frame, so the column is never written into a slice.
df = df.assign(Molecule=df['SMILES'].apply(Chem.MolFromSmiles))

# Chem.MolFromSmiles returns None for a SMILES string it cannot parse, and the
# descriptor functions applied afterwards fail on None. Report and drop those
# rows here so the rest of the notebook runs on valid molecules only.
invalid_smiles = df['Molecule'].isna()
if invalid_smiles.any():
    print(
        f"Dropping {int(invalid_smiles.sum())} row(s) with unparsable SMILES: "
        f"{df.loc[invalid_smiles, 'Name'].tolist()}"
    )
    df = df.loc[~invalid_smiles].reset_index(drop=True)

In [None]:
# Generating all required descriptors.
# Each entry maps the output column name to the RDKit function that computes
# that descriptor from a molecule; columns are added in the order listed.
descriptor_functions = {
    'Mol Wt': Descriptors.MolWt,                      # molecular weight
    'LogP': Descriptors.MolLogP,                      # logP estimate
    'TPSA': Descriptors.TPSA,                         # topological polar surface area
    'HBD': Descriptors.NumHDonors,                    # hydrogen-bond donors
    'HBA': Descriptors.NumHAcceptors,                 # hydrogen-bond acceptors
    # NOTE(review): this counts heavy atoms only; confirm whether the Ghose
    # atom-count range is meant to include hydrogens as well.
    'AtomCount': Descriptors.HeavyAtomCount,
    'RotatableBonds': Descriptors.NumRotatableBonds,  # rotatable bonds
    'MR': Descriptors.MolMR,                          # molar refractivity
}

for column_name, compute_descriptor in descriptor_functions.items():
    df[column_name] = df['Molecule'].apply(compute_descriptor)

In [None]:
# Filtering conditions.
# `filters` maps a rule-set name to {descriptor label: predicate}. Every
# predicate reads descriptor columns by name and returns True when the
# condition is satisfied.
#
# The predicates use element-wise operators only (no chained comparisons such
# as `a <= v <= b`, which raise on a Series), so each one accepts either a
# single row (as passed by df.apply(..., axis=1)) or the whole dataframe for a
# vectorised evaluation. Results for single rows are unchanged.
#
# The descriptor labels are also used as tick labels in the failure chart, so
# they are kept exactly as they were.
def _within(values, low, high):
    """Return whether `values` lies in the closed interval [low, high].

    Works for plain scalars and, element-wise, for pandas Series / NumPy arrays.
    """
    return (values >= low) & (values <= high)


filters = {
    'Lipinski' : {
        "MW": lambda x : x["Mol Wt"] <= 500,
        "LogP" : lambda x : x["LogP"] <= 5,
        "HBA" : lambda x : x["HBA"] <= 10,
        "HBD" : lambda x : x["HBD"] <= 5
    },
    'Ghose' : {
        'MW' : lambda x : _within(x["Mol Wt"], 160, 480),
        "LogP" : lambda x : _within(x["LogP"], -0.4, 5.6),
        "MR" : lambda x : _within(x["MR"], 40, 130),
        "atomCount" : lambda x : _within(x["AtomCount"], 20, 70)
    },
    'Veber' : {
        'RB' : lambda x : x['RotatableBonds'] <= 10,
        'TPSA' : lambda x : x['TPSA'] <= 140
    },
    # NOTE(review): the Egan limits are often quoted as LogP <= 5.88 and
    # TPSA <= 131.6; confirm that the rounded values below are intended.
    'Egan' : {
        "LogP" : lambda x : _within(x['LogP'], -1, 5),
        'TPSA' : lambda x : x['TPSA'] <= 131
    }
}

In [None]:
# Applying filtering conditions in the dataframe
# and generating pass/fail for each filter and each molecule.
# A molecule passes a filter only when every condition of that filter holds.
# The loop variable is called `filter_name` (not `filter`) so the Python
# builtin is not shadowed for the rest of the notebook.
for filter_name, conditions in filters.items():
    # One boolean column per descriptor condition, one row per molecule.
    passed = pd.DataFrame({
        descriptor: df.apply(condition, axis=1)
        for descriptor, condition in conditions.items()
    })
    # all(axis=1) is already a boolean Series aligned with df's index.
    df[filter_name] = passed.all(axis=1)

In [None]:
# Special logic for Lipinski, since up to 1 violation is acceptable:
# evaluate every Lipinski condition per molecule, count the violated ones,
# and mark the molecule as passing when at most one condition is violated.
lipinski_pass = pd.DataFrame({
    descriptor: df.apply(condition, axis=1)
    for descriptor, condition in filters['Lipinski'].items()
})
lipinski_fail = ~lipinski_pass
lipinski_fail_count = lipinski_fail.sum(axis=1)  # violations per molecule
df["Lipinski"] = lipinski_fail_count.le(1)

In [None]:
# Plotting the pass/fail molecules for each filter:
# one panel per filter, each with a "Fail" bar and a "Pass" bar.

filter_names = ['Lipinski', 'Ghose', 'Veber', 'Egan']
titles = [f'{name} Filter Pass Frequency' for name in filter_names]
outcome_labels = ['Fail', 'Pass']
outcome_colours = ["#ff6f61", "#7be0ad"]

fig, ax = plt.subplots(2, 2, figsize=(12, 9))
ax = ax.flatten()

fig.suptitle('Filter Pass Frequency Comparison', fontsize=20)

for i, filt in enumerate(filter_names):
    # Always keep both outcomes, in Fail/Pass order. Without the reindex a
    # filter that every molecule passes (or fails) yields a single count, which
    # makes set_xticks raise (two labels for one tick) and gives the lone bar
    # the "Fail" colour regardless of what it represents.
    counts = df[filt].value_counts().reindex([False, True], fill_value=0)
    positions = np.arange(len(counts))
    bars = ax[i].bar(positions, counts.values, color=outcome_colours, edgecolor='black')
    ax[i].set_title(titles[i])
    ax[i].set_xlabel('Score')
    ax[i].set_ylabel('Pass Frequency')
    ax[i].set_xticks(positions, outcome_labels)
    ax[i].set_yticks(np.arange(0, len(df), 100))
    ax[i].bar_label(bars)
fig.tight_layout()

In [None]:
# Logic for plotting how many molecules fail each individual descriptor
# condition of the Lipinski and Ghose filters.

def _descriptor_failures(frame, conditions):
    """Return a boolean frame that is True where a molecule violates a condition.

    The result has one column per descriptor label in `conditions` and one row
    per row of `frame`.
    """
    return pd.DataFrame({
        descriptor: ~frame.apply(condition, axis=1)
        for descriptor, condition in conditions.items()
    })


lipinski_fail = _descriptor_failures(df, filters['Lipinski'])
lipinski_fail_count = lipinski_fail.sum()  # failing molecules per descriptor

ghose_fail = _descriptor_failures(df, filters['Ghose'])
ghose_fail_count = ghose_fail.sum()  # failing molecules per descriptor


# Plotting the graph
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    ("For Lipinski Filter", lipinski_fail_count),
    ("For Ghose Filter", ghose_fail_count),
]
bars = []
for axis, (panel_title, fail_count) in zip(ax, panels):
    bar_container = axis.barh(fail_count.index, fail_count.values, color='#C44E52', edgecolor='black')
    bars.append(bar_container)
    axis.set_title(panel_title)
    # The bars are horizontal: descriptors sit on the y-axis and the failure
    # counts run along the x-axis, so the labels must be assigned accordingly.
    axis.set_xlabel("Fail Frequency of Descriptors")
    axis.set_ylabel("Molecular Descriptors")
    axis.bar_label(bar_container, padding=2)
    axis.margins(x=0.1)  # leave room for the count labels at the bar ends

fig.suptitle("Descriptor-wise failure chart", fontsize=20)
fig.tight_layout()

In [None]:
# Calculating the number of filters passed by each drug.
# The filter columns are boolean, so a row-wise sum counts the passed filters
# directly (vectorised; no per-row Python lambda needed).
score_columns = ["Lipinski", "Ghose", "Veber", "Egan"]
df["PassScore"] = df[score_columns].sum(axis=1)

# Plotting the pass score vs no of drugs passed
fig, ax = plt.subplots(figsize=(6, 5))
colours = ['#5A9CB5','#caffbf', '#FAAC68','#FA6868','#FACE68']  # one colour per score 0..4
counts = df["PassScore"].value_counts().sort_index()
percentage = counts/len(df)*100
# Pick each bar's colour from its score rather than its position, so a score
# keeps the same colour even when some other score does not occur in the data.
bar_colours = [colours[int(score)] for score in counts.index]
bars = ax.bar(counts.index, counts, color=bar_colours, edgecolor="black", label=[f"{p:.2f}%" for p in percentage])
ax.legend(loc='upper left')
ax.set_xlabel("Number of filters passed")
ax.set_ylabel("Pass Frequency")
ax.bar_label(bars)
fig.suptitle("Distribution of Drugs by Number of Filters Passed", fontsize=15)
fig.tight_layout()

In [None]:
# Save the processed dataset without the molecule-object column.
# The column is dropped from a temporary copy rather than in place, so df is
# left untouched and this cell (or any earlier one) can be re-run without a
# KeyError on the missing "Molecule" column.
output_path = Path('Data/Processed/Final Processed Dataset.csv')
# Create the output folder if it does not exist yet; to_csv does not do this.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.drop(columns=["Molecule"]).to_csv(output_path, index=False)