# Overview of the Murcko scaffolds found in patents
This notebook shows the distribution of Murcko scaffolds in patent documents.

# Import modules

In [1]:
import pandas as pd
from tqdm import tqdm
import pubchempy

from rdkit.Chem import MolFromSmiles, Draw

# Enable tqdm's pandas integration (adds the progress_* methods such as progress_apply).
tqdm.pandas()

# Add path constants

In [2]:
# Relative locations of the project data folders used by this notebook.
DATA_DIR = '../data/raw'  # not referenced by the cells visible in this notebook
FIGURE_DIR = '../data/figures'  # scaffold PNG renderings are written here
MAPPINGS_DIR = '../data/mappings'  # the scaffold-to-patent parquet file is read from here

# Load data

In [3]:
# Read the scaffold-to-patent mapping table and preview its first two rows.
mapping_file = f'{MAPPINGS_DIR}/scaffold_mapped_patents.pq'
scaffold_df = pd.read_parquet(mapping_file)
scaffold_df.head(2)

Unnamed: 0,scaffold,year,PATENT_ID
0,CCC(C)C(C)C1CCCC1C(C)C,2015,EP-2842582-A2
1,CCC(C)C(C)C1CCCC1C(C)C,2015,EP-2838373-A2


In [4]:
# (rows, columns) of the scaffold-to-patent mapping as loaded.
scaffold_df.shape

(133010775, 3)

In [5]:
# Keep a single row per (scaffold, PATENT_ID) pair so that each patent
# contributes at most once to a scaffold's count.
scaffold_df = scaffold_df.drop_duplicates(subset=['scaffold', 'PATENT_ID'])

In [6]:
# (rows, columns) after removing repeated (scaffold, PATENT_ID) pairs.
scaffold_df.shape

(85281799, 3)

In [7]:
# Count how many rows each scaffold appears in and keep the ten most frequent.
#
# The resulting frame has three columns:
#   'index'    - the scaffold SMILES string
#   'scaffold' - the number of rows in scaffold_df carrying that scaffold
#   'percent'  - that count as a percentage of all counted rows
#
# The column names are set explicitly because the default naming of
# `value_counts().reset_index()` changed in pandas 2.0 (to '<name>'/'count');
# later cells read the SMILES from the 'index' column.
scaffold_counts = scaffold_df['scaffold'].value_counts()
top_scaffolds = scaffold_counts.rename_axis('index').reset_index(name='scaffold')
top_scaffolds['percent'] = (top_scaffolds['scaffold'] / top_scaffolds['scaffold'].sum()) * 100
top_scaffolds = top_scaffolds.sort_values('percent', ascending=False).head(10)

In [8]:
# Display the ten most frequent scaffolds with their counts and percentage share.
top_scaffolds

Unnamed: 0,index,scaffold,percent
0,CCC(C)C(C)C(C)C(C)CC,439038,0.514809
1,CC(C)C1CCCCC1,355532,0.416891
2,CCC1CCCCC1,294393,0.3452
3,CCCCCC(C)C,214865,0.251947
4,CC(C)CCCCC(C)C(C)C,209402,0.245541
5,C.CCCCCCCCCCCCCCCCCC(C)C.CCCCCCCCCCCCCCCCCC(C)C,202378,0.237305
6,CC(C)C1CCCCC1C,196842,0.230814
7,CC1CCC(C(C)(C)C)CC1,194865,0.228495
8,CCCCCCCCCCCCCCCCCC(C)C.[CH2],193933,0.227403
9,CCC(C)C1CC(C)C(C)C1C,189755,0.222504


# Annotating structures with scaffolds

In [9]:
# Parse each of the top-scaffold SMILES strings into an RDKit molecule object.
mols = list(map(MolFromSmiles, top_scaffolds['index']))

In [10]:
# Query PubChem for each top scaffold and print the molecular weight and the
# known synonyms of the first matching compound record.
for smile in top_scaffolds['index']:
    cmp = pubchempy.get_compounds(smile, namespace='smiles')
    if not cmp:
        # No record came back for this SMILES; report it and move on rather
        # than failing with an IndexError on cmp[0].
        print(f'No PubChem record found for {smile}', '\n')
        continue
    compound = cmp[0]
    print(compound.molecular_weight)
    # '\n' (newline escape) adds a blank line between scaffolds; the previous
    # '/n' was printed literally.
    print(compound.synonyms, '\n')

170.33
['Octane, 3,4,5,6-tetramethyl-', '3,4,5,6-Tetramethyloctane', '62185-21-1', '3,4,5,6-Tetramethyloctane, c', '3,4,5,6-Tetramethyloctane #', '3,4,5,6-Tetramethyloctane, a', '3,4,5,6-Tetramethyloctane, b', '3,4,5,6-Tetramethyloctane, d', '3,4,5,6-Tetramethyloctane, e', '3,4,5,6-Tetramethyloctane, f', 'DTXSID20335689', 'NADJQGPTQSFIHB-UHFFFAOYSA-N'] /n
126.24
['ISOPROPYLCYCLOHEXANE', '696-29-7', 'Hexahydrocumene', 'propan-2-ylcyclohexane', 'Normanthane', 'Cyclohexane, (1-methylethyl)-', 'Cyclohexane, isopropyl-', '(1-Methylethyl)cyclohexane', 'Isopropyl-cyclohexane', '2-Cyclohexylpropane', 'Iso-propylcyclohexane', '1-methylethyl-cyclohexane', 'EINECS 211-792-4', 'NSC 73963', 'NSC-73963', 'EC 211-792-4', '(methylethyl)cyclohexane', 'isopropyl cyclohexane', 'IPX (CHRIS Code)', 'Isopropylcyclohexane, 99%', '(1-methylethyl)-Cyclohexane', 'Ciclohexano, (1-metiletil)-', 'WLN: L6TJ AY1&1', '5S52JAD8P7', 'Cyclohexane, isopropyl- (8CI)', 'DTXSID2061012', '(1-Methylethyl)cyclohexane, 9CI', 'C

In [11]:
# Write one PNG per top scaffold; the file name carries the scaffold's
# position in the `mols` list.
for idx, mol in enumerate(mols):
    figure_path = f'{FIGURE_DIR}/scaffold_{idx}.png'
    Draw.MolToFile(mol, figure_path, size=(600, 600), dpi=400)