In [None]:
# Shell escape: run the init.sh script from the current working directory with bash.
# NOTE(review): presumably this prepares the environment the later cells need
# (they import rdkit) — confirm against init.sh, which is not part of this notebook.
!bash ./init.sh

In [2]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

def smiles_to_descriptors(smiles, names=rdMolDescriptors.Properties.GetAvailableProperties()):
    """Compute RDKit molecular descriptors for a collection of SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.
    names : iterable of str, optional
        Descriptor names to compute. The default is every property reported by
        ``rdMolDescriptors.Properties.GetAvailableProperties()``; note that the
        default is evaluated once, when the function is defined.

    Returns
    -------
    pandas.DataFrame
        One row per input (in input order) and one column per descriptor name.
        Rows for SMILES that RDKit cannot parse are filled with NaN.
    """
    names = list(names)
    calculator = rdMolDescriptors.Properties(names)

    rows = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # Emit an explicit full-width NaN row. An empty row only works when
            # pandas can pad it against a valid one; if every input is invalid
            # the frame has zero data columns and the constructor raises.
            rows.append(np.full(len(names), np.nan))
        else:
            rows.append(np.array(calculator.ComputeProperties(mol)))

    return pd.DataFrame(rows, columns=names)

# Demo: descriptor table for six sample SMILES strings. As the last expression
# of the cell, the returned DataFrame is rendered as the cell output.
smiles_to_descriptors(
    smiles=[
        'CCO',
        'CNC(C)Cc1ccccc1',
        'CN1[C@H]2CC[C@@H]1[C@@H](C(OC)=O)[C@@H](OC(C3=CC=CC=C3)=O)C2',
        'CC(OC1=C(O[C@@H]2[C@]34CCN(C)[C@@H]([C@@H]4C=C[C@@H]2OC(C)=O)C5)C3=C5C=C1)=O',
        'O=C(CC)N(C1CCN(CC1)CCc2ccccc2)c3ccccc3',
        'CCN(CC)C(=O)[C@H]1CN([C@@H]2Cc3c[nH]c4c3c(ccc4)C2=C1)C',
    ],
)

Unnamed: 0,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,46.041865,46.069,1.0,1.0,0.0,1.0,1.0,3.0,9.0,1.0,...,2.15432,1.023335,0.0,0.0,0.0,-0.04,2.96,1.96,1.96,1.933867
1,149.120449,149.237,1.0,1.0,3.0,1.0,1.0,11.0,26.0,1.0,...,7.171208,4.038511,1.877512,1.877512,1.069709,-0.82,8.278232,4.140518,2.724337,3.116016
2,303.147058,303.358,5.0,0.0,3.0,0.0,5.0,22.0,43.0,5.0,...,12.89768,7.673077,4.955461,4.955461,3.766875,-1.88,15.032579,6.084897,2.60872,4.157804
3,369.157623,369.417,6.0,0.0,2.0,0.0,6.0,27.0,50.0,6.0,...,15.728578,9.487845,6.670833,6.670833,5.737369,-2.34,16.806225,5.58014,2.161613,3.473374
4,336.220164,336.479,3.0,0.0,6.0,0.0,2.0,25.0,53.0,3.0,...,15.103276,9.410297,5.087289,5.087289,3.650415,-2.13,17.685289,8.816461,4.698265,6.236866
5,323.199762,323.44,4.0,1.0,3.0,1.0,2.0,24.0,49.0,4.0,...,14.672554,8.936816,5.660535,5.660535,4.375139,-2.07,15.457261,5.77854,2.203566,3.721683


In [3]:
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

def smiles_to_fingerprints(smiles, n_bits=2048, radius=3):
    """Encode SMILES strings as Morgan fingerprint bit vectors.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule. Any iterable is accepted, including
        generators.
    n_bits : int, default 2048
        Length of each fingerprint, i.e. the number of ``fp_*`` columns.
    radius : int, default 3
        Radius passed to the Morgan fingerprint.

    Returns
    -------
    pandas.DataFrame
        One row per input (in input order) with float columns ``fp_0`` ...
        ``fp_{n_bits - 1}`` holding 0.0 or 1.0. Rows for SMILES that RDKit
        cannot parse are filled with NaN, matching ``smiles_to_descriptors``,
        so the two feature tables stay row-aligned.
    """
    # Materialise first: the row count is needed up front and a generator has no len().
    smiles = list(smiles)
    X = np.zeros((len(smiles), n_bits))
    for idx, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # Unparsable SMILES: mark the whole row as missing instead of
            # handing None to the fingerprint call, which would raise.
            X[idx] = np.nan
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        # Convert through a standalone buffer, then copy it into the matrix row.
        np_fp = np.zeros(len(fp))
        DataStructs.ConvertToNumpyArray(fp, np_fp)
        X[idx] = np_fp

    return pd.DataFrame(X, columns=[f'fp_{i}' for i in range(n_bits)])

# Demo: fingerprint table for six sample SMILES strings with the default
# n_bits and radius. As the last expression of the cell, the returned
# DataFrame is rendered as the cell output.
smiles_to_fingerprints(
    smiles=[
        'CCO',
        'CNC(C)Cc1ccccc1',
        'CN1[C@H]2CC[C@@H]1[C@@H](C(OC)=O)[C@@H](OC(C3=CC=CC=C3)=O)C2',
        'CC(OC1=C(O[C@@H]2[C@]34CCN(C)[C@@H]([C@@H]4C=C[C@@H]2OC(C)=O)C5)C3=C5C=C1)=O',
        'O=C(CC)N(C1CCN(CC1)CCc2ccccc2)c3ccccc3',
        'CCN(CC)C(=O)[C@H]1CN([C@@H]2Cc3c[nH]c4c3c(ccc4)C2=C1)C',
    ],
)

Unnamed: 0,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,fp_7,fp_8,fp_9,...,fp_2038,fp_2039,fp_2040,fp_2041,fp_2042,fp_2043,fp_2044,fp_2045,fp_2046,fp_2047
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
