In [1]:
from rdkit import Chem
# from rdkit.Chem import Draw, Descriptors
from rdkit.Chem import Descriptors
import pandas as pd
import json

In [2]:
# Load the peptide-monomer table; the preview printed below shows its
# Molecule, SMILES and Description columns.
df = pd.read_csv(filepath_or_buffer="./tables/SMILES_peptides_monomer.txt")
print(df.head(n=5))

  Molecule                         SMILES    Description
0        G                     C(C(=O)O)N        Glycine
1        A               C[C@@H](C(=O)O)N        Alanine
2        L          CC(C)C[C@@H](C(=O)O)N        Leucine
3        M            CSCC[C@@H](C(=O)O)N     Methionine
4        F  C1=CC=C(C=C1)C[C@@H](C(=O)O)N  Phenylalanine


In [3]:
# Read the descriptor names to keep: the first column of the JSON table,
# converted to a plain Python list.
descriptors_to_keep = (
    pd.read_json("unique_descriptors.json")
    .iloc[:, 0]
    .tolist()
)

In [5]:
# Smoke test on a single molecule (the phenylalanine SMILES from the monomer table):
# every name in descriptors_to_keep must be present among the computed descriptors,
# otherwise the lookup below raises KeyError.
SMILES = "C1=CC=C(C=C1)C[C@@H](C(=O)O)N"
mol = Chem.MolFromSmiles(SMILES)

# Descriptor keys are lower-cased so they can be matched against descriptors_to_keep.
raw_descriptors = Descriptors.CalcMolDescriptors(mol, missingVal=-9999, silent=True)
all_descriptors = {name.lower(): value for name, value in raw_descriptors.items()}
desired_descriptors = [all_descriptors[name] for name in descriptors_to_keep]

# Number of descriptors that were selected.
len(desired_descriptors)



126

In [5]:
# Parse every SMILES string into an RDKit Mol object and store it in the 'mol' column.
df.loc[:, 'mol'] = df.loc[:, 'SMILES'].map(Chem.MolFromSmiles)

# Chem.MolFromSmiles returns None (it does not raise) when a string cannot be parsed.
# The descriptor calculation that follows is called with missingVal=-9999 and
# silent=True, so a None molecule would most likely surface only as placeholder
# values. Fail fast here and report the offending SMILES instead.
unparsed_smiles = df.loc[df['mol'].isna(), 'SMILES']
if not unparsed_smiles.empty:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES string(s): "
        f"{unparsed_smiles.tolist()}"
    )

In [6]:
%%capture
df.loc[:, 'descriptors'] = df.loc[:, 'mol'].apply(lambda x: Descriptors.CalcMolDescriptors(x,
                                                  missingVal=-9999, silent=True),)

In [7]:
# Turn the per-row descriptor dicts into one column per descriptor, re-using
# df's index so the column-wise concat below aligns row for row.
expanded_df = pd.json_normalize(df['descriptors']).set_axis(df.index, axis=0)

# Replace the dict column with the expanded descriptor columns.
df = pd.concat(
    [df.drop(columns=['descriptors']), expanded_df],
    axis=1,
)

In [8]:
# Lower-case every column name so the columns can be selected with the
# lower-case names held in descriptors_to_keep.
df.columns = df.columns.map(str.lower)

In [9]:
# Preview the first rows of the selected descriptor columns only.
df.loc[:, descriptors_to_keep].head(n=5)

Unnamed: 0,peoe_vsa2,ringcount,slogp_vsa10,kappa3,fr_aryl_methyl,fr_nh2,estate_vsa10,fr_allylic_oxid,slogp_vsa3,numheteroatoms,...,estate_vsa1,chi4v,chi3v,chi1n,chi2n,numaliphaticheterocycles,fr_aniline,numaromaticrings,kappa2,maxabsestateindex
0,4.794537,0,0.0,3.43,0,1,4.794537,0,4.794537,3,...,5.969305,0.0,0.17462,1.189533,0.597863,0,0,0,1.721545,9.243056
1,4.794537,0,0.0,1.721545,0,1,4.794537,0,4.794537,3,...,12.011146,0.0,0.389528,1.62709,1.126913,0,0,0,1.767634,9.574074
2,4.794537,0,0.0,3.968358,0,1,4.794537,0,4.794537,3,...,12.011146,0.608685,1.042536,3.020937,2.57428,0,0,0,3.454517,10.109769
3,4.794537,0,0.0,4.294139,0,1,4.794537,0,4.794537,4,...,12.011146,0.85793,1.594127,2.654911,1.726205,0,0,0,4.639273,10.070883
4,4.794537,1,0.0,2.358651,0,1,4.794537,0,11.215359,3,...,12.011146,0.955337,1.615638,3.722225,2.634453,0,0,1,3.875201,10.378642


In [10]:
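# NOTE(review): commented-out reference copy. This list appears to mirror the descriptor
# names loaded from "unique_descriptors.json" into `descriptors_to_keep`; confirm before relying on it.
# descriptors_to_keep = ["peoe_vsa2", "ringcount", "slogp_vsa10", "kappa3", "fr_aryl_methyl", 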
#                  "fr_nh2", "estate_vsa10", "fr_allylic_oxid", "slogp_vsa3", "numheteroatoms", 
#                  "nhohcount", "numhdonors", "peoe_vsa14", "numhacceptors", "fr_quatn", 
#                  "bcut2d_logplow", "minabsestateindex", "peoe_vsa6", "bcut2d_chglo", 
#                  "hallkieralpha", "smr_vsa1", "fr_morpholine", "smr_vsa7", "chi3n", 
#                  "minabspartialcharge", "numsaturatedheterocycles", "fpdensitymorgan3", 
#                  "estate_vsa2", "fr_methoxy", "kappa1", "fr_guanido", "nocount", "estate_vsa8", 
#                  "minpartialcharge", "bcut2d_mrlow", "fr_unbrch_alkane", "chi0v", "slogp_vsa5", 
#                  "estate_vsa7", "vsa_estate5", "estate_vsa11", "qed", "vsa_estate1", "smr_vsa6", 
#                  "numaromaticcarbocycles", "maxabspartialcharge", "minestateindex", "estate_vsa9", 
#                  "estate_vsa3", "molmr", "chi0", "estate_vsa6", "fpdensitymorgan2", "smr_vsa4", 
#                  "bertzct", "maxpartialcharge", "vsa_estate2", "vsa_estate8", "bcut2d_logphi", 
#                  "numvalenceelectrons", "estate_vsa4", "estate_vsa5", "heavyatommolwt", 
#                  "fr_benzene", "peoe_vsa9", "vsa_estate4", "slogp_vsa4", "fr_ether", "slogp_vsa2", 
#                  "fr_c_o_nocoo", "bcut2d_mwlow", "fr_nh1", "smr_vsa10", "smr_vsa5", 
#                  "heavyatomcount", "fr_nh0", "vsa_estate7", "chi2v", "fpdensitymorgan1", 
#                  "bcut2d_mwhi", "smr_vsa3", "slogp_vsa6", "peoe_vsa12", "peoe_vsa7", "tpsa", 
#                  "numaromaticheterocycles", "fractioncsp3", "molwt", "fr_ar_n", "vsa_estate3", 
#                  "numsaturatedrings", "fr_para_hydroxylation", "peoe_vsa10", "maxestateindex", 
#                  "balabanj", "numrotatablebonds", "mollogp", "slogp_vsa1", "fr_halogen", 
#                  "peoe_vsa1", "fr_amide", "peoe_vsa8", "bcut2d_chghi", "chi1v", "fr_c_o", "chi0n", 
#                  "labuteasa", "fr_imidazole", "numaliphaticrings", "chi4n", "chi1", "vsa_estate6", 
#                  "peoe_vsa3", "exactmolwt", "bcut2d_mrhi", "vsa_estate9", "estate_vsa1", "chi4v", 
#                  "chi3v", "chi1n", "chi2n", "numaliphaticheterocycles", "fr_aniline", 
#                  "numaromaticrings", "kappa2", "maxabsestateindex"]