# Physicochemical diversity of compounds found in patents
This notebook calculates the distribution between the physicochemical properties of the SureChEMBL compounds per year.

# Import Modules

In [1]:
import json
import pandas as pd
from tqdm import tqdm


tqdm.pandas()

# Add path constants

In [2]:
DATA_DIR = '../data'

# Load data file

In [3]:
surechembl_df = pd.read_parquet(f'{DATA_DIR}/surechembl_dump.pq')
surechembl_df.head(2)

Unnamed: 0,SureChEMBL_ID,SMILES,InChIKey,PATENT_ID,PUBLICATION_DATE,Field
0,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2842582-A2,2015-03-04,Description
1,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2838373-A2,2015-02-25,Description


In [4]:
surechembl_df['year'] = surechembl_df['PUBLICATION_DATE'].progress_apply(lambda x: x.split('-')[0])

100%|██████████| 133512452/133512452 [03:11<00:00, 697353.73it/s]


# Filtering unique compounds per year

In [5]:
surechem_df = surechembl_df.drop_duplicates(subset=["InChIKey","year"], keep='first')
surechem_df.reset_index(drop=True, inplace=True)
len(surechem_df)

21857225

In [6]:
year_df = surechem_df[['SMILES', 'year']].copy()
year_df.drop_duplicates(subset=['SMILES'], keep='first', inplace=True)

# Load properties data dump

In [7]:
properties_dict = json.load(open(f'{DATA_DIR}/properties.json'))

In [8]:
properties_df = pd.DataFrame(properties_dict)
properties_df = properties_df.T
properties_df.reset_index(inplace=True)
properties_df.rename(columns={'index': 'SMILES'}, inplace=True)
properties_df.head(2)

Unnamed: 0,SMILES,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral,n_ring,n_heteroatoms,so_atoms
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.077264,0.6279,3,2,3,57.61,0.777778,1,1,5,False
1,O=C(O)\C=C/C(=O)O.CCOC(=O)[C@H](CCC1=CC=CC=C1)...,492.210781,1.3164,7,4,11,170.54,0.458333,1,2,11,False


In [9]:
len(properties_df)

10695979

In [10]:
properties_df['so_atoms'].value_counts()

False    10667532
True         5469
Name: so_atoms, dtype: int64

# Combining the properties data with the SureChEMBL compounds

In [11]:
property_annotated_smiles_df = pd.merge(properties_df, year_df, how="left", on=["SMILES"])
property_annotated_smiles_df.head(2)

Unnamed: 0,SMILES,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral,n_ring,n_heteroatoms,so_atoms,year
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,217.077264,0.6279,3,2,3,57.61,0.777778,1,1,5,False,2015
1,O=C(O)\C=C/C(=O)O.CCOC(=O)[C@H](CCC1=CC=CC=C1)...,492.210781,1.3164,7,4,11,170.54,0.458333,1,2,11,False,2015


In [12]:
len(property_annotated_smiles_df)

10695979

# Removal of salts from the compound list

In [13]:
compounds_with_salt = set()

for smile in tqdm(property_annotated_smiles_df['SMILES'].unique()):
    if '.' in smile:
        compounds_with_salt.add(smile)

len(compounds_with_salt)

100%|██████████| 10695979/10695979 [00:03<00:00, 3320569.39it/s]


328882

In [14]:
final_compound_df = property_annotated_smiles_df[~property_annotated_smiles_df['SMILES'].isin(compounds_with_salt)]
final_compound_df.shape

(10367097, 13)

# Statistical distribution of molecular properties each year

In [15]:
pd.options.display.float_format = '{:.2f}'.format

In [18]:
t_median = []

for year in tqdm(final_compound_df['year'].unique()):
    k = final_compound_df[final_compound_df['year'] == year].drop(columns=['SMILES', 'year'])
    m = k.median(axis=0).to_dict()
    m['year'] = year
    t_median.append(m)

df_median = pd.DataFrame(t_median)
df_median

100%|██████████| 8/8 [00:34<00:00,  4.31s/it]


Unnamed: 0,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral,n_ring,n_heteroatoms,so_atoms,year
0,398.07,3.56,5.0,1.0,5.0,73.58,0.36,1.0,3.0,7.0,0.0,2015
1,419.22,3.76,5.0,1.0,5.0,75.71,0.35,1.0,4.0,7.0,0.0,2016
2,422.22,3.8,5.0,1.0,5.0,72.19,0.38,1.0,4.0,7.0,0.0,2017
3,424.23,3.89,5.0,1.0,5.0,72.7,0.36,1.0,4.0,7.0,0.0,2018
4,430.18,3.82,5.0,1.0,5.0,74.77,0.38,1.0,4.0,7.0,0.0,2019
5,439.2,3.88,5.0,1.0,5.0,75.01,0.37,1.0,4.0,7.0,0.0,2020
6,442.2,3.9,5.0,1.0,5.0,74.95,0.38,1.0,4.0,7.0,0.0,2021
7,461.98,4.13,5.0,1.0,5.0,72.19,0.38,1.0,4.0,7.0,0.0,2022


In [19]:
t_mean = []

for year in tqdm(final_compound_df['year'].unique()):
    k = final_compound_df[final_compound_df['year'] == year].drop(columns=['SMILES', 'year'])
    m = k.mean(axis=0).to_dict()
    m['year'] = year
    t_mean.append(m)

df_mean = pd.DataFrame(t_mean)
df_mean

100%|██████████| 8/8 [00:26<00:00,  3.32s/it]


Unnamed: 0,mw,logp,n_hba,n_hbd,rot_bonds,tpsa,fsp3,n_chiral,n_ring,n_heteroatoms,so_atoms,year
0,412.54,3.98,5.13,1.49,5.95,79.47,0.39,4.4,3.52,7.34,0.0,2015
1,442.26,4.35,5.42,1.55,6.33,83.19,0.38,5.05,3.9,7.73,0.0,2016
2,453.7,4.6,5.36,1.62,6.55,81.65,0.4,7.01,4.05,7.53,0.0,2017
3,454.57,4.67,5.38,1.53,6.54,80.68,0.39,6.57,4.07,7.6,0.0,2018
4,464.69,4.74,5.52,1.56,6.5,82.36,0.4,6.52,4.26,7.75,0.0,2019
5,475.6,4.97,5.59,1.54,6.54,83.17,0.39,6.47,4.47,7.83,0.0,2020
6,477.83,4.95,5.6,1.57,6.53,82.99,0.39,5.78,4.51,7.82,0.0,2021
7,498.71,5.43,5.71,1.5,6.71,81.81,0.39,6.37,4.94,7.83,0.0,2022
