# Applicability of Ro5 and beyond rules for chemical diversity in patents
This notebook looks into the rule-of-file (Ro5) properties of the SureChEMBL compounds.

# Import Modules

In [1]:
import json
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# Add path constants

In [2]:
DATA_DIR = '../data'

# Load data file

In [3]:
surechembl_df = pd.read_parquet(f'{DATA_DIR}/surechembl_dump.pq')
surechembl_df.head(2)

Unnamed: 0,SureChEMBL_ID,SMILES,InChIKey,PATENT_ID,PUBLICATION_DATE,Field
0,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2842582-A2,2015-03-04,Description
1,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2838373-A2,2015-02-25,Description


In [4]:
surechembl_df['year'] = surechembl_df['PUBLICATION_DATE'].progress_apply(lambda x: x.split('-')[0])

  0%|          | 0/133512452 [00:00<?, ?it/s]

100%|██████████| 133512452/133512452 [06:42<00:00, 331425.80it/s]


# Filtering unique compounds per year

In [6]:
surechem_df = surechembl_df.drop_duplicates(subset=["InChIKey","year"], keep='first')
surechem_df.reset_index(drop=True, inplace=True)
len(surechem_df)

21857225

In [7]:
year_df = surechem_df[['SMILES', 'year']]
year_df.drop_duplicates(subset=['SMILES'], keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.drop_duplicates(subset=['SMILES'], keep='first', inplace=True)


# Load properties data dump

In [8]:
properties_dict = json.load(open(f'{DATA_DIR}/properties.json'))

In [9]:
properties_df = pd.DataFrame(properties_dict)
properties_df = properties_df.T
properties_df.reset_index(inplace=True)
properties_df.rename(columns={'index': 'SMILES'}, inplace=True)
properties_df.head(2)

Unnamed: 0,SMILES,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.077264,0.6279,3.0,2.0,3.0,57.61,0.777778,1.0
1,O=C(O)\C=C/C(=O)O.CCOC(=O)[C@H](CCC1=CC=CC=C1)...,492.210781,1.3164,7.0,4.0,11.0,170.54,0.458333,1.0


In [10]:
len(properties_df)

10695979

# Combining the properties data with the SureChEMBL compounds

In [11]:
property_annotated_smiles_df = pd.merge(properties_df, year_df, how="left", on=["SMILES"])
property_annotated_smiles_df.head(2)

Unnamed: 0,SMILES,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral,year
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.077264,0.6279,3.0,2.0,3.0,57.61,0.777778,1.0,2015
1,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.077264,0.6279,3.0,2.0,3.0,57.61,0.777778,1.0,2016


In [12]:
len(property_annotated_smiles_df)

21857225

# Removal of salts from the compound list

In [13]:
compounds_with_salt = set()

for smile in tqdm(property_annotated_smiles_df['SMILES'].unique()):
    if '.' in smile:
        compounds_with_salt.add(smile)

len(compounds_with_salt)

100%|██████████| 10695979/10695979 [00:11<00:00, 970408.35it/s]


328882

In [34]:
final_compound_df = property_annotated_smiles_df[~property_annotated_smiles_df['SMILES'].isin(compounds_with_salt)]
final_compound_df.shape

MemoryError: Unable to allocate 1.25 GiB for an array with shape (8, 20981136) and data type object

# Rounding all values to 2 decimal points

In [None]:
pd.options.display.float_format = '{:.2f}'.format

# Grouping dataframe based on years

In [20]:
mask = final_compound_df['year'].between('2015', '2017', inclusive='both')
group_1 = final_compound_df[mask]
group_1.drop(columns=['year', 'SMILES'], inplace=True)
group_1.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_1.drop(columns=['year', 'SMILES'], inplace=True)


(8542449, 8)

In [25]:
group_1.astype(float).describe().loc['50%']

mw          402.16
logp          3.60
n_hba         5.00
n_hbd         1.00
rot_bonds     5.00
tpsa         73.36
fsp3          0.36
n_chiral      1.00
Name: 50%, dtype: float64

In [26]:
mask = final_compound_df['year'].between('2018', '2020', inclusive='both')
group_2 = final_compound_df[mask]
group_2.drop(columns=['year', 'SMILES'], inplace=True)
group_2.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_2.drop(columns=['year', 'SMILES'], inplace=True)


(7778712, 8)

In [27]:
group_2.astype(float).describe().loc['50%']

mw          410.20
logp          3.64
n_hba         5.00
n_hbd         1.00
rot_bonds     5.00
tpsa         73.75
fsp3          0.36
n_chiral      1.00
Name: 50%, dtype: float64

In [28]:
mask = final_compound_df['year'].between('2021', '2023', inclusive='both')
group_3 = final_compound_df[mask]
group_3.drop(columns=['year', 'SMILES'], inplace=True)
group_3.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_3.drop(columns=['year', 'SMILES'], inplace=True)


(4659975, 8)

In [29]:
group_3.astype(float).describe().loc['50%']

mw          416.27
logp          3.64
n_hba         5.00
n_hbd         1.00
rot_bonds     5.00
tpsa         73.47
fsp3          0.38
n_chiral      1.00
Name: 50%, dtype: float64

# Other visualizations

In [30]:
final_compound_df.head(2)

Unnamed: 0,SMILES,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral,year
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.08,0.63,3.0,2.0,3.0,57.61,0.78,1.0,2015
1,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.08,0.63,3.0,2.0,3.0,57.61,0.78,1.0,2016


In [31]:
final_compound_df.shape

(20981136, 10)

Unnamed: 0,SMILES,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral,year
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.08,0.63,3.00,2.00,3.00,57.61,0.78,1.00,2015
24,CCOC(=O)[C@H](CCC1=CC=CC=C1)N[C@@H](C)C(=O)N1C...,376.20,1.60,5.00,2.00,9.00,95.94,0.55,1.00,2015
32,CNCC(=O)N[C@@H](CCCNC(N)=N)C(=O)N[C@@H](C(C)C)...,911.50,-2.30,12.00,13.00,25.00,355.05,0.57,1.00,2015
40,CC1=CC=C(C=C1)S(O)(=O)=O,172.02,1.24,2.00,1.00,1.00,54.37,0.14,1.00,2015
48,CCCCC1=NC(Cl)=C(CO)N1CC1=CC=C(C=C1)C1=C(C=CC=C...,422.16,4.27,6.00,2.00,8.00,92.51,0.27,1.00,2015
...,...,...,...,...,...,...,...,...,...,...
21857220,CCNC(=O)NC1(C)CC[C@H](CCN2CCN(CC2)C2=CC=CC3=C2...,428.26,4.68,4.00,2.00,6.00,47.61,0.62,1.00,2022
21857221,C1=CC=C(C=C1)C1=NC(=NC(=N1)C1=CC=CC=C1)C1=CC=C...,802.28,13.76,6.00,0.00,6.00,53.46,0.00,1.00,2022
21857222,FCC(=O)C1=CC=C(Br)C=N1,216.95,2.00,2.00,0.00,2.00,29.96,0.14,1.00,2022
21857223,NC1=CN=C(COC(=O)C2=CC=CC=C2)N=C1,229.09,1.42,5.00,1.00,3.00,78.10,0.08,1.00,2022
