In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale, StandardScaler
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors
from scipy.stats import zscore

In [None]:
# Read the table of tested molecules and preview its first rows.
# The path is relative, so it resolves against the kernel's working directory.
tested_csv = "data/tested_molecules.csv"
tested = pd.read_csv(tested_csv)
tested.head()

In [None]:
# Parse every SMILES string of the dataset into an RDKit molecule object.
smiles = tested['SMILES']
mols = [Chem.MolFromSmiles(smi) for smi in smiles]

# MolFromSmiles reports a parse failure by returning None rather than raising.
# Fail here, naming the offending strings, instead of deep inside a later
# descriptor or fingerprint call. Entries are not dropped so that `mols`
# stays aligned with `smiles`, which is used as the DataFrame index below.
unparsed_smiles = [smi for smi, mol in zip(smiles, mols) if mol is None]
assert not unparsed_smiles, (
    f"RDKit could not parse {len(unparsed_smiles)} SMILES, e.g. {unparsed_smiles[:3]}"
)

# Preview the first eight structures in a grid with four molecules per row.
Draw.MolsToGridImage(mols[0:8], molsPerRow=4, subImgSize=(400, 200))

In [None]:
# Show the summary text RDKit reports for the single descriptor `fr_ketone`.
ketone_calc = MoleculeDescriptors.MolecularDescriptorCalculator(['fr_ketone'])
ketone_calc.GetDescriptorSummaries()

In [None]:
# 1. Dataset preparation with RDKit, e.g., calculation of various types of molecular descriptors.

In [None]:
# 2D descriptors: evaluate every descriptor listed by RDKit for every molecule.
desc_list = [entry[0] for entry in Descriptors._descList]  # first item of each entry is used as the descriptor name
calc = MoleculeDescriptors.MolecularDescriptorCalculator(desc_list)
# One row of descriptor values per molecule
# (the original author noted 1116 rows by 210 cols for this dataset — not verified here).
rdkit_desc = [calc.CalcDescriptors(mol) for mol in mols]

# Assemble the descriptor table: rows are labelled by SMILES, columns by descriptor name.
desc_names = calc.GetDescriptorNames()
df_desc_2d = pd.DataFrame(data=rdkit_desc, columns=desc_names, index=smiles)
df_desc_2d.head()

In [None]:
# Binary Morgan (ECFP-style) fingerprints
# ECFP names carry the diameter (2 * radius): radius 2 -> ECFP4, radius 3 -> ECFP6.
# With the value below these are ECFP4 fingerprints; set radius = 3 for ECFP6.
radius = 2  # 2 for similarity exploration, 3 for ML
nBits = 1024  # 2048 is default, 1024 is also fine

# Calculate one binary Morgan fingerprint per molecule (ECFP4 for radius = 2):
fingerprints = [AllChem.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=nBits) for m in mols]

# Create fingerprint dataframe where each column represents a bit.
# Column labels are 1-based: Bit_1 holds position 0 of the bit vector.
fprint_cols = [f'Bit_{i}' for i in range(1, nBits + 1)]
fprint_bits = [list(x) for x in fingerprints]
df_fprint = pd.DataFrame(fprint_bits, index=smiles, columns=fprint_cols)
df_fprint.head()

In [None]:
# MACCS keys: one structural-key bit vector per molecule

maccs_vectors = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
maccs_keys = np.array(maccs_vectors)
n_maccs_bits = len(maccs_keys[0])
col_name = [f'feature_{bit}' for bit in range(1, n_maccs_bits + 1)]
# Build the MACCS table: rows are labelled by SMILES, one column per vector position.
# NOTE(review): labels are 1-based, so feature_1 is vector position 0 — confirm
# this offset against RDKit's MACCS key numbering before interpreting columns.
df_maccs = pd.DataFrame(maccs_keys, columns=col_name, index=smiles)

df_maccs.head()

In [None]:
# 2. Exploratory data analysis, outlier analysis, and descriptor selection.

In [None]:
def find_corr_cols(df, threshold, absolute=False):
    """Find pairs of columns whose correlation is at least ``threshold``.

    The correlation matrix is taken from ``df.corr()`` and only its strict
    lower triangle is scanned, so every unordered pair is reported once and
    a column is never paired with itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are compared.
    threshold : float
        Minimum correlation (inclusive, ``>=``) for a pair to be reported.
    absolute : bool, default False
        If True, compare ``abs(correlation)`` against ``threshold`` so that
        strongly negative correlations are reported too. The default keeps
        the original behaviour (positive correlations only).

    Returns
    -------
    set of tuple
        ``(col_a, col_b)`` pairs of column names, where ``col_a`` comes later
        in ``df.corr().columns`` than ``col_b``.
    """
    corr_ma = df.corr()
    names = corr_ma.columns
    values = corr_ma.to_numpy()
    if absolute:
        values = np.abs(values)

    # Vectorised comparison instead of one .iloc lookup per matrix cell.
    # NaN correlations (e.g. from constant columns) compare False and are skipped.
    # k=-1 keeps only entries strictly below the diagonal (row index > column index).
    hits = np.tril(values >= threshold, k=-1)
    rows, cols = np.nonzero(hits)

    # The name inequality only matters for frames with duplicated column labels.
    return {
        (names[i], names[j])
        for i, j in zip(rows, cols)
        if names[i] != names[j]
    }

def make_corr_dict(col_corr):
    """Count how often each column name occurs in a collection of column pairs.

    Parameters
    ----------
    col_corr : iterable of tuple
        Pairs of column names, e.g. the set returned by ``find_corr_cols``.

    Returns
    -------
    dict
        Maps each column name to the number of pairs it appears in. Keys are
        ordered by descending count; ties are broken by the name's string
        form so the order does not depend on set iteration order and is
        reproducible between runs.
    """
    counts = {}
    for pair in col_corr:
        for name in pair:
            counts[name] = counts.get(name, 0) + 1

    # Negated count gives descending order; str() keeps the tie-break usable
    # even when column labels are not all of one comparable type.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], str(item[0])))
    return dict(ranked)


In [None]:
# Rank the columns of each feature table by how many highly correlated pairs they occur in

CORR_THRESHOLD = 0.95  # minimum correlation for a column pair to count as highly correlated

# 2D descriptors
pairs_desc_2d = find_corr_cols(df_desc_2d, CORR_THRESHOLD)
d_corr_desc_2d = make_corr_dict(pairs_desc_2d)
print(list(d_corr_desc_2d)[:5])  # the five columns involved in the most pairs

# Binary fingerprints
pairs_fprint = find_corr_cols(df_fprint, CORR_THRESHOLD)
d_corr_fprint = make_corr_dict(pairs_fprint)
print(list(d_corr_fprint)[:5])

# MACCS keys
pairs_maccs = find_corr_cols(df_maccs, CORR_THRESHOLD)
d_corr_maccs = make_corr_dict(pairs_maccs)
print(list(d_corr_maccs)[:5])

In [None]:
# Outlier analysis

# Column-wise z-scores of the 2D descriptors (zscore is imported in the first cell).
# scipy documents zscore as returning an array, so the DataFrame is rebuilt
# explicitly instead of relying on the pandas type surviving the call; the
# .iloc and .head calls below need a DataFrame.
z_scores = pd.DataFrame(
    np.asarray(zscore(df_desc_2d)),
    index=df_desc_2d.index,
    columns=df_desc_2d.columns,
)

# Per-descriptor medians, kept under a name; the original computed them and
# discarded the result.
z_score_medians = z_scores.median(axis=0)

# Mean z-score of the first descriptor column.
print(z_scores.mean(axis=0).iloc[0])
z_scores.head(100)

In [None]:
# 3. Identification of structure-activity relationships, e.g., using ML concepts or insights from EDA.