# Overview of the Murcko scaffolds found in patents
This notebook shows the distribution of Murcko scaffolds in patent documents.

# Import modules

In [1]:
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
import pubchempy

from rdkit import RDLogger
from rdkit.Chem import MolFromSmiles, Draw
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles

# Register tqdm's progress-bar methods (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no cell visible in this notebook calls a `progress_*` method —
# confirm this call is still needed.
tqdm.pandas()

In [2]:
# Silence RDKit log output for every logger matching "rdApp.*".
RDLogger.DisableLog("rdApp.*")

# Add path constants

In [3]:
# All paths are relative to the notebook's location; both directories share one root.
_DATA_ROOT = "../data"
PROCESSED_DIR = f"{_DATA_ROOT}/processed"  # input parquet files
FIGURE_DIR = f"{_DATA_ROOT}/figures"  # output images

# Load data

In [4]:
# Read only the two columns this analysis needs from the scaffold table.
scaffold_file = f"{PROCESSED_DIR}/surechembl_generic_mucko.pq"
scaffold_columns = ["PATENT_ID", "scaffold"]
murcko_scaffold_df = pd.read_parquet(scaffold_file, columns=scaffold_columns)
murcko_scaffold_df.head(2)

Unnamed: 0,PATENT_ID,scaffold
0,EP-2842582-A2,C1CCNC1
1,EP-2838373-A2,C1CCNC1


In [5]:
# Sanity check: (rows, columns) of the loaded frame.
murcko_scaffold_df.shape

(133512452, 2)

# Get top 10 generic Murcko scaffolds

In [6]:
# Map every non-empty scaffold SMILES to the set of patent IDs it occurs in.
# Using a set means repeated (patent, scaffold) rows are counted once.
scaffold_patent_dict = defaultdict(set)

for row in tqdm(murcko_scaffold_df.values):
    patent, scaffold_smiles = row
    # Rows with a missing (None) or empty scaffold are ignored.
    if scaffold_smiles is not None and scaffold_smiles != "":
        scaffold_patent_dict[scaffold_smiles].add(patent)

100%|██████████| 133512452/133512452 [01:20<00:00, 1664175.74it/s]


In [7]:
# Collapse each scaffold's patent set to its size (number of distinct patents).
# NOTE(review): this rebinds `scaffold_patent_dict` from sets to ints, so running
# this cell twice calls len() on an int and fails — re-run the previous cell first.
scaffold_patent_dict = dict(
    (smiles, len(patent_ids)) for smiles, patent_ids in scaffold_patent_dict.items()
)
# NOTE(review): `total` sums the per-scaffold patent counts, so a patent that
# contains several scaffolds is counted once per scaffold. It is the number of
# distinct (scaffold, patent) pairs rather than the number of unique patents —
# confirm the "Total patents" label below is what is intended.
total = sum(scaffold_patent_dict.values())
f"Scaffold count: {len(scaffold_patent_dict)}, Total patents: {total}"

'Scaffold count: 2663538, Total patents: 40455241'

In [8]:
# One row per scaffold with its patent count, most frequent scaffold first.
scaffold_count_columns = ["scaffold", "count"]
total_scaffold_df = pd.DataFrame(
    scaffold_patent_dict.items(), columns=scaffold_count_columns
)
total_scaffold_df = total_scaffold_df.sort_values(by="count", ascending=False)
total_scaffold_df.head(2)

Unnamed: 0,scaffold,count
3,c1ccccc1,1060538
15,C1CCCCC1,362369


In [9]:
# Share of each scaffold relative to `total`, expressed as a percentage.
total_scaffold_df["percent"] = total_scaffold_df["count"].div(total).mul(100)

In [10]:
# Order scaffolds from most to least frequent by percentage.
# Rebinding the sorted result (instead of `inplace=True`) keeps the cell a plain
# assignment and avoids mutating a frame that earlier cells already displayed.
total_scaffold_df = total_scaffold_df.sort_values("percent", ascending=False)

In [11]:
# Keep the ten highest-ranked scaffolds (the frame is already sorted descending).
top_scaffolds = total_scaffold_df.iloc[:10]
top_scaffolds

Unnamed: 0,scaffold,count,percent
3,c1ccccc1,1060538,2.62151
15,C1CCCCC1,362369,0.895728
14,c1ccncc1,312132,0.771549
251,C1CCOCC1,298296,0.737348
27,c1ccc2ccccc2c1,267318,0.660775
53,c1ccc(Cc2ccccc2)cc1,253562,0.626772
19,c1c[nH]cn1,251592,0.621902
42,c1ccc2[nH]ccc2c1,247430,0.611614
97,C1CCOC1,238540,0.589639
44,c1ncc2ncn(C3CCCO3)c2n1,190491,0.470869


# Annotating scaffolds with names and structure images

In [12]:
# Parse each top scaffold SMILES into an RDKit molecule, preserving rank order.
mols = list(map(MolFromSmiles, top_scaffolds["scaffold"]))

In [13]:
# Look up a name for each top scaffold on PubChem (one lookup per scaffold).
for scaffold_smiles in top_scaffolds["scaffold"]:
    compounds = pubchempy.get_compounds(scaffold_smiles, namespace="smiles")
    # PubChem may return no match, or a record without synonyms. Fall back to
    # the SMILES itself instead of aborting the loop with an IndexError.
    synonyms = compounds[0].synonyms if compounds else []
    print(synonyms[0] if synonyms else scaffold_smiles)

benzene
CYCLOHEXANE
PYRIDINE
TETRAHYDROPYRAN
naphthalene
Diphenylmethane
imidazole
indole
TETRAHYDROFURAN
SCHEMBL2388906


In [14]:
# Write one PNG per top scaffold into FIGURE_DIR. Files are numbered by rank,
# starting at 1, so `enumerate` starts there instead of incrementing by hand.
for rank, mol in enumerate(mols, start=1):
    Draw.MolToFile(
        mol, f"{FIGURE_DIR}/murcko_generic_scaffold_{rank}.png", size=(600, 600), dpi=400
    )