# Overview on the murcko scaffolds found in patents
This notebook shows the distribution of murcko scaffolds in patent docuemnts.

# Import Module

In [1]:
import pandas as pd
from tqdm import tqdm
import pubchempy

from rdkit.Chem import MolFromSmiles, Draw

tqdm.pandas()

# Add path constants

In [2]:
DATA_DIR = "../data/raw"
FIGURE_DIR = "../data/figures"
MAPPINGS_DIR = "../data/mappings"

# Load data

In [3]:
scaffold_df = pd.read_parquet(f"{MAPPINGS_DIR}/unique_scaffold_mapped_patents.pq.gzip")
scaffold_df.head(2)

Unnamed: 0,SureChEMBL_ID,SMILES,InChIKey,PATENT_ID,PUBLICATION_DATE,Field,year,scaffold,scaffold_wt
0,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2842582-A2,2015-03-04,Description,2015,CCC(C)C(C)C1CCCC1C(C)C,217.29
1757,SCHEMBL9,O=C(O)\C=C/C(=O)O.CCOC(=O)[C@H](CCC1=CC=CC=C1)...,OYFJQPXVCSSHAI-QFPUQLAESA-N,EP-2838373-A2,2015-02-25,Description,2015,CC(C)CCC(C)C.CCCC(C)C(CCC1CCCCC1)CC(C)C(C)C1CC...,492.525


In [4]:
scaffold_df.shape

(5163046, 9)

In [5]:
scaffold_df.drop_duplicates(subset=["scaffold", "PATENT_ID"], inplace=True)
scaffold_df.shape

(5163046, 9)

In [6]:
top_scaffolds = scaffold_df["scaffold"].value_counts().reset_index()
top_scaffolds["percent"] = (
    top_scaffolds["scaffold"] / top_scaffolds["scaffold"].sum()
) * 100
top_scaffolds.sort_values("percent", ascending=False, inplace=True)
top_scaffolds = top_scaffolds.head(10)

In [7]:
top_scaffolds

Unnamed: 0,index,scaffold,percent
0,CCC(C)C(C)C1CCCC1C(C)C,1,1.9e-05
3442027,CCC(C)(C)CCC(C)(C)CC,1,1.9e-05
3442034,CC1CC(C)C2CCCCC12,1,1.9e-05
3442033,CC(C1CCCCC1)C(C)C1CCCCC1,1,1.9e-05
3442032,CCCC(C)(C1CCCCC1)C1CCCCC1,1,1.9e-05
3442031,CC1CCCCC1,1,1.9e-05
3442030,CC1CC(C)C2CC3CCCCC3CC12,1,1.9e-05
3442029,CC(C)C1CCCCC1CC1C(C)CCC(C)C1C,1,1.9e-05
3442028,CC1CCCCC1(C)CCC1CCCCC1,1,1.9e-05
3442026,CCC(C)CCCCC(C)CC,1,1.9e-05


# Annotating structures with scaffolds

In [8]:
mols = [MolFromSmiles(smile) for smile in top_scaffolds["index"]]

In [9]:
for smile in top_scaffolds["index"]:
    cmp = pubchempy.get_compounds(smile, namespace="smiles")
    print(cmp[0].molecular_weight)
    print(cmp[0].synonyms, "/n")

None
None /n
170.33
['3,3,6,6-Tetramethyloctane', '62199-46-6', 'Octane, 3,3,6,6-tetramethyl-', 'DTXSID20211232'] /n
152.28
[] /n
222.41
["Cyclohexane, 1,1'-(1,2-dimethyl-1,2-ethanediyl)bis-", '2,3-dicyclohexylbutane', 'Butane, 2,3-dicyclohexyl-', '74663-71-1', 'DTXSID50880733', 'CKYRHDXTXMYCMX-UHFFFAOYSA-N', '(A+/-)-2,3-dicyclohexyl-butane', '(2-Cyclohexyl-1-methylpropyl)cyclohexane #', "Cyclohexane, 1,1'-(1,2-dimethyl-1,2-ethanediyl)bis-, (R*,R*)-(.+/-.)-"] /n
236.4
[] /n
98.19
['METHYLCYCLOHEXANE', '108-87-2', 'Cyclohexane, methyl-', 'Hexahydrotoluene', 'Cyclohexylmethane', 'Toluene hexahydride', 'Hexahydroxytoluene', 'Sextone B', 'Toluene, hexahydro-', 'methyl cyclohexane', 'Metylocykloheksan', 'methyl-cyclohexane', 'HSDB 98', 'NSC 9391', 'UNII-H5WXT3SV31', 'EINECS 203-624-3', 'H5WXT3SV31', 'DTXSID0047749', 'AI3-18132', 'NSC-9391', 'DTXCID4027733', 'CHEBI:165745', 'EC 203-624-3', 'METHYLCYCLOHEXANE-D11 (RING-D11)', 'Metylocykloheksan [Polish]', 'METHYLCYCLOHEXANE (USP-RS)', 'METHYL

In [10]:
for idx, mol in enumerate(mols):
    Draw.MolToFile(mol, f"{FIGURE_DIR}/scaffold_{idx}.png", size=(600, 600), dpi=400)