# Physicochemical diversity of compounds found in patents
This notebook calculates the distribution between the physicochemical properties of the SureChEMBL compounds per year.

# Import Modules

In [1]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# Add path constants

In [2]:
PROCESSED_DIR = "../data/processed"
MAPPING_DIR = "../data/mappings"

# Load data file

In [3]:
property_df = pd.read_csv(
    f"{MAPPING_DIR}/property_data.tsv", sep="\t", low_memory=False
)

year_df = pd.read_parquet(f"{PROCESSED_DIR}/patent_data_desalted.pq")

In [4]:
property_combined_df = pd.merge(
    property_df, year_df, how="left", left_on="SMILES", right_on="cleaned_smiles"
)
property_combined_df.drop(columns=["cleaned_smiles", "n_chiral"], inplace=True)
property_combined_df.rename(
    columns={"SMILES_x": "desalted_SMILES", "SMILES_y": "SMILES"}, inplace=True
)
property_combined_df.head(2)

Unnamed: 0,desalted_SMILES,mw,logp,n_hba,n_hbd,tpsa,rot_bonds,fsp3,n_ring,n_heteroatoms,SMILES,year
0,C1=C2C(c3ccc(-c4cccc5ccccc45)cc3)=NC(n3c4ccccc...,604.226312,9.5483,5,0,43.81,3,0.047619,10,5,C1OC=C2C1N=C(N=C2C1=CC=C(C=C1)C1=CC=CC2=C1C=CC...,2015
1,C=CCOC(=O)[C@@H]1C[C@]2(C(C)Cc3ccc(OC)c(C=O)c3...,400.152203,2.6315,7,0,88.13,8,0.409091,3,7,COC1=C(C=O)C=C(CC(C)[C@]23C[C@@H](C(=O)OCC=C)C...,2017


# Statistical distribution of molecular properties each year

In [5]:
pd.options.display.float_format = "{:.2f}".format

In [6]:
t_median = []

for year in tqdm(property_combined_df["year"].unique()):
    k = property_combined_df[property_combined_df["year"] == year].drop(
        columns=["desalted_SMILES", "SMILES", "year"]
    )
    m = k.median(axis=0).to_dict()
    m["year"] = year
    t_median.append(m)

df_median = pd.DataFrame(t_median)
df_median.sort_values(by=["year"], inplace=True)
df_median

100%|██████████| 8/8 [00:05<00:00,  1.59it/s]


Unnamed: 0,mw,logp,n_hba,n_hbd,tpsa,rot_bonds,fsp3,n_ring,n_heteroatoms,year
0,394.03,3.52,5.0,1.0,72.94,5.0,0.36,3.0,7.0,2015
5,416.09,3.73,5.0,1.0,75.27,5.0,0.36,4.0,7.0,2016
1,420.15,3.77,5.0,1.0,71.93,5.0,0.38,4.0,7.0,2017
7,421.27,3.85,5.0,1.0,72.47,5.0,0.36,4.0,7.0,2018
3,427.17,3.78,5.0,1.0,74.68,5.0,0.38,4.0,7.0,2019
6,436.26,3.85,5.0,1.0,74.76,5.0,0.37,4.0,7.0,2020
4,440.11,3.87,5.0,1.0,74.69,5.0,0.38,4.0,7.0,2021
2,461.24,4.13,5.0,1.0,72.19,5.0,0.38,4.0,7.0,2022


In [7]:
t_mean = []

for year in tqdm(property_combined_df["year"].unique()):
    k = property_combined_df[property_combined_df["year"] == year].drop(
        columns=["desalted_SMILES", "SMILES", "year"]
    )
    m = k.mean(axis=0).to_dict()
    m["year"] = year
    t_mean.append(m)

df_mean = pd.DataFrame(t_mean)
df_mean.sort_values(by=["year"], inplace=True)
df_mean

100%|██████████| 8/8 [00:03<00:00,  2.47it/s]


Unnamed: 0,mw,logp,n_hba,n_hbd,tpsa,rot_bonds,fsp3,n_ring,n_heteroatoms,year
0,407.84,3.92,5.07,1.49,78.9,5.92,0.39,3.46,7.26,2015
5,437.6,4.29,5.36,1.55,82.64,6.31,0.38,3.84,7.66,2016
1,450.68,4.56,5.33,1.62,81.4,6.52,0.41,4.01,7.49,2017
7,451.07,4.61,5.35,1.53,80.43,6.52,0.39,4.02,7.56,2018
3,460.8,4.67,5.49,1.57,82.21,6.48,0.4,4.21,7.71,2019
6,471.94,4.9,5.56,1.55,82.94,6.52,0.39,4.42,7.8,2020
4,474.9,4.9,5.58,1.57,82.77,6.51,0.39,4.47,7.78,2021
2,498.41,5.43,5.7,1.5,81.79,6.71,0.39,4.94,7.83,2022
