# Chemical-patent distribution plot

In [1]:
import os
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

tqdm.pandas()

# Add path constants

In [2]:
FIG_DIR = "../data/figures"
DATA_DIR = "../data/raw"
os.makedirs(FIG_DIR, exist_ok=True)

# Load data file

In [3]:
df = pd.read_parquet(f"{DATA_DIR}/surechembl_dump.pq")
df.head(2)

Unnamed: 0,SureChEMBL_ID,SMILES,InChIKey,PATENT_ID,PUBLICATION_DATE,Field
0,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2842582-A2,2015-03-04,Description
1,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2838373-A2,2015-02-25,Description


# Create chemical-patent counters

In [4]:
chemical_counter = defaultdict(set)

for chem_idx, patent_id in tqdm(df[["InChIKey", "PATENT_ID"]].values):
    chemical_counter[chem_idx].add(patent_id)

len(chemical_counter)

100%|██████████| 133512452/133512452 [04:37<00:00, 480945.14it/s]


10686744

In [5]:
schembl_chemical_counter = defaultdict(set)

for chem_idx, patent_id in tqdm(df[["SureChEMBL_ID", "PATENT_ID"]].values):
    schembl_chemical_counter[chem_idx].add(patent_id)

len(schembl_chemical_counter)

100%|██████████| 133512452/133512452 [04:38<00:00, 479438.72it/s]


10718652

In [6]:
chem_pat_counts = {
    "< 5": 0,
    "10 - 50": 0,
    "50 - 100": 0,
    "100 - 500": 0,
    "500 - 1000": 0,
    "> 1000": 0,
}

for chem_idx, patent_ids in tqdm(chemical_counter.items()):
    if len(patent_ids) > 1000:
        chem_pat_counts["> 1000"] += 1
    elif len(patent_ids) > 500:
        chem_pat_counts["500 - 1000"] += 1
    elif len(patent_ids) > 100:
        chem_pat_counts["100 - 500"] += 1
    elif len(patent_ids) > 50:
        chem_pat_counts["50 - 100"] += 1
    elif len(patent_ids) > 10:
        chem_pat_counts["10 - 50"] += 1
    else:
        chem_pat_counts["< 5"] += 1

100%|██████████| 10686744/10686744 [00:18<00:00, 582967.99it/s]


In [8]:
count_df = pd.DataFrame.from_dict(chem_pat_counts, orient="index", columns=["count"])
count_df.reset_index(inplace=True)
count_df.rename(columns={"index": "patents"}, inplace=True)
count_df

Unnamed: 0,patents,count
0,< 5,10148500
1,10 - 50,445227
2,50 - 100,37247
3,100 - 500,37157
4,500 - 1000,7000
5,> 1000,11613


In [9]:
count_df["count"].sum()

10686744

# Get information on patent countries

In [10]:
filtered_df = df.drop_duplicates(subset=["InChIKey", "PATENT_ID"])

In [11]:
pd.options.mode.chained_assignment = None  # default='warn'
filtered_df["country"] = filtered_df["PATENT_ID"].progress_apply(
    lambda x: x.split("-")[0]
)

100%|██████████| 115511729/115511729 [03:17<00:00, 583689.51it/s]


In [12]:
total = sum(filtered_df["country"].value_counts())

for country, count in filtered_df["country"].value_counts().items():
    print(f"{country}: {count / total * 100:.2f}%")

US: 57.29%
EP: 26.58%
WO: 16.12%
JP: 0.00%


In [13]:
filtered_df["country_pattype"] = filtered_df["PATENT_ID"].progress_apply(
    lambda x: f"{x.split('-')[0]}-{x.split('-')[-1]}"
)

100%|██████████| 115511729/115511729 [04:04<00:00, 472739.34it/s]


In [14]:
filtered_df["country_pattype_2"] = filtered_df["country_pattype"].progress_apply(
    lambda x: x[:-1]
)

100%|██████████| 115511729/115511729 [02:49<00:00, 681567.03it/s]


In [15]:
total = sum(filtered_df["country_pattype_2"].value_counts())

for country, count in filtered_df["country_pattype_2"].value_counts().items():
    print(f"{country}: {count / total * 100:.2f}%")

US-A: 33.59%
US-B: 23.64%
WO-A: 16.12%
EP-A: 15.59%
EP-B: 10.99%
US-E: 0.06%
US-P: 0.00%
JP-: 0.00%
JP-A: 0.00%
JP-B: 0.00%
US-S: 0.00%
