# Import Modules

In [1]:
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import json

In [2]:
from rdkit.Chem import Descriptors, Draw, PandasTools, MolFromSmiles

# Load Data File

In [3]:
# Load the scaffold records from disk.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's default locale encoding.
with open('data/scaffold_smiles.json', 'r', encoding='utf-8') as f:
    scaffold_dict = json.load(f)

In [4]:
# Build the working frame from the loaded JSON and preview its first two rows.
scaffold_df = pd.DataFrame(data=scaffold_dict)
scaffold_df.iloc[:2]

Unnamed: 0,scaffold,year,patent_ID
0,C1CCNC1,2015,EP-2842582-A2
1,O=C(CNCCCc1ccccc1)N1CCCC1,2015,EP-2838373-A2


In [12]:
# Number of rows in the full scaffold table.
scaffold_df.shape[0]

21812332

In [5]:
# Keep only the scaffolds that RDKit can parse and whose molecular weight
# reaches the cut-off; count everything that is dropped and why.
#
#   skipped_smiles  rows whose SMILES could not be turned into a molecule
#   skipped_mol_wt  rows that parsed but fall below the weight cut-off
#   liste           kept records: {'smiles', 'patent_id', 'mol_wt'}
min_mol_wt = 300  # minimum molecular weight for a scaffold to be kept

skipped_smiles = 0
skipped_mol_wt = 0
liste = []

# Each row unpacks as (scaffold SMILES, year, patent ID); the year is not needed here.
for smiles, _year, patent_id in tqdm(scaffold_df.values):
    # Only the RDKit calls are guarded. `except Exception` (rather than a bare
    # `except:`) lets KeyboardInterrupt/SystemExit through, so this long-running
    # loop can still be interrupted from the notebook.
    try:
        molecule = MolFromSmiles(smiles)
        if molecule is None:
            # The parser signalled failure by returning None.
            skipped_smiles += 1
            continue

        mol_wt = Descriptors.MolWt(molecule)
    except Exception:
        skipped_smiles += 1
        continue

    if mol_wt >= min_mol_wt:
        liste.append({'smiles': smiles, 'patent_id': patent_id, 'mol_wt': mol_wt})
    else:
        skipped_mol_wt += 1

  2%|▏         | 334008/21812332 [00:41<43:50, 8165.02it/s][13:56:54] Can't kekulize mol.  Unkekulized atoms: 2 6 7


  5%|▍         | 1009103/21812332 [02:10<41:25, 8371.16it/s][13:58:23] Can't kekulize mol.  Unkekulized atoms: 2 6 7
  5%|▌         | 1133292/21812332 [02:27<50:14, 6858.86it/s][13:58:40] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14
  6%|▌         | 1316767/21812332 [02:51<51:02, 6692.07it/s][13:59:04] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14
 12%|█▏        | 2624129/21812332 [06:13<45:56, 6959.83it/s]  [14:02:26] Conflicting single bond directions around double bond at index 32.
[14:02:26]   BondStereo set to STEREONONE and single bond directions set to NONE.
 13%|█▎        | 2885062/21812332 [06:57<53:07, 5938.21it/s]  [14:03:10] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 29 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 48 49 50 51 52 54 55 56 57 58 59 60 61 62 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 10

 23%|██▎       | 4961126/21812332 [12:46<43:21, 6477.35it/s]  [14:08:59] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
 24%|██▍       | 5241633/21812332 [13:20<32:43, 8440.06it/s][14:09:32] Can't kekulize mol.  Unkekulized atoms: 2 6 7


 27%|██▋       | 5797159/21812332 [14:26<32:45, 8148.18it/s][14:10:38] Can't kekulize mol.  Unkekulized atoms: 2 6 7
 27%|██▋       | 5862353/21812332 [14:34<29:23, 9045.75it/s][14:10:47] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14


 27%|██▋       | 5974087/21812332 [14:47<26:31, 9954.05it/s][14:11:00] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14
 29%|██▉       | 6433802/21812332 [15:49<35:37, 7194.79it/s][14:12:02] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 47 48 49 50 51 52 53 54 55 56 57 58 59 60 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 38%|███▊      | 8277883/21812332 [20:47<31:47, 7096.31it/s]  [14:17:00] Conflicting single bond directions around double bond at index 21.
[14:17:00]   BondStereo set to STEREONONE and single bond directions set to NONE.
 38%|███▊      | 8299682/21812332 [20:52<37:26, 6016.32it/s][14:17:04] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 

 50%|█████     | 10961979/21812332 [28:39<30:22, 5955.12it/s][14:24:52] Conflicting single bond directions around double bond at index 8.
[14:24:52]   BondStereo set to STEREONONE and single bond directions set to NONE.
 51%|█████     | 11017763/21812332 [28:50<28:51, 6234.23it/s]  [14:25:03] Conflicting single bond directions around double bond at index 4.
[14:25:03]   BondStereo set to STEREONONE and single bond directions set to NONE.
 51%|█████     | 11031130/21812332 [28:52<26:05, 6886.35it/s][14:25:05] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
 53%|█████▎    | 1145331

 54%|█████▍    | 11872117/21812332 [30:35<17:36, 9409.64it/s][14:26:48] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14
 55%|█████▍    | 11913713/21812332 [30:39<15:57, 10341.15it/s][14:26:52] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14
 55%|█████▌    | 11997815/21812332 [30:48<15:01, 10887.73it/s][14:27:01] Can't kekulize mol.  Unkekulized atoms: 2 3 4
 55%|█████▌    | 12003644/21812332 [30:49<16:06, 10148.82it/s][14:27:02] Can't kekulize mol.  Unkekulized atoms: 2 3 4


 65%|██████▌   | 14195886/21812332 [36:55<23:06, 5494.96it/s][14:33:08] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
 67%|██████▋   | 14720290/21812332 [38:30<19:05, 6193.48it/s][14:34:43] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 11 12 13 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 41 42 43 44 45 47 48 49 50 51 52 53 54 55 56 57 68 69 70 71 72 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 14

 75%|███████▌  | 16369939/21812332 [43:27<09:08, 9928.60it/s][14:39:40] Can't kekulize mol.  Unkekulized atoms: 2 6 7


 77%|███████▋  | 16896903/21812332 [44:22<08:16, 9895.03it/s][14:40:35] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14
 78%|███████▊  | 16928115/21812332 [44:25<08:43, 9332.78it/s] [14:40:38] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 9 10 11 12 13 14
 78%|███████▊  | 16977227/21812332 [44:30<07:36, 10603.15it/s][14:40:43] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[14:40:46] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[14:40:46] Can't kekulize mol.  Unkekulized atoms: 2 3 4
[14:40:46] Can't kekulize mol.  Unkekulized atoms: 2 3 4


 98%|█████████▊| 21372690/21812332 [57:41<01:44, 4187.85it/s][14:53:54] Conflicting single bond directions around double bond at index 10.
[14:53:54]   BondStereo set to STEREONONE and single bond directions set to NONE.
 98%|█████████▊| 21416101/21812332 [57:51<01:17, 5100.10it/s][14:54:04] Conflicting single bond directions around double bond at index 4.
[14:54:04]   BondStereo set to STEREONONE and single bond directions set to NONE.
 99%|█████████▉| 21574180/21812332 [58:21<00:45, 5190.71it/s][14:54:34] Conflicting single bond directions around double bond at index 20.
[14:54:34]   BondStereo set to STEREONONE and single bond directions set to NONE.
100%|██████████| 21812332/21812332 [59:14<00:00, 6137.02it/s]


In [6]:
# Outcome of the filtering pass: (skipped SMILES, skipped by molecular weight, kept records).
skipped_smiles, skipped_mol_wt, len(liste)

(30, 11610973, 10201329)

In [7]:
# Turn the kept records (one dict per scaffold row) into a DataFrame.
df = pd.DataFrame(data=liste)

In [24]:
# Display the filtered frame (columns: smiles, patent_id, mol_wt).
df

Unnamed: 0,smiles,patent_id,mol_wt
0,O=C(CCc1ccccc1)NCC(=O)N[C@@H](Cc1c[nH]cn1)C(=O...,EP-2842582-A2,397.479
1,c1ccc(-c2nnn[nH]2)c(-c2ccc(Cn3ccnc3)cc2)c1,EP-2842582-A2,302.341
2,c1ccc(-c2nnn[nH]2)c(-c2ccc(Cn3ccnc3)cc2)c1,US-20150065543-A1,302.341
3,c1ccc(C[P+](c2ccccc2)(c2ccccc2)c2ccccc2)cc1,EP-2670725-B1,353.425
4,c1ccc(C(c2ccccc2)(c2ccccc2)n2nnnc2-c2ccccc2-c2...,US-20150065543-A1,544.662
...,...,...,...
10201324,O=C(NCc1ccc2cc(CNCC34CC(C3)C4)[nH]c2c1)c1cc(=O...,WO-2022254216-A1,427.508
10201325,O=S(=O)(OCCc1cc2ccccc2[nH]1)c1ccccc1,EP-4069694-A1,301.367
10201326,O=C(NCc1ccc2cc(CNCC3CCC3)[nH]c2c1)c1cc(=O)n2c(...,EP-4069694-A1,419.529
10201327,c1cc(N2CCN(CCC3CCCCC3)CC2)c2ccsc2c1,EP-4056568-A1,328.525


In [22]:
# Distinct patent IDs present in the filtered frame.
g = df.patent_id.unique()

In [23]:
# How many distinct patents remain after filtering.
g.shape[0]

271283

In [28]:
# Share (in %) of all patents that still appear in the filtered frame.
# Computed from the frames instead of numbers pasted from earlier outputs, so it
# cannot go stale when the data changes. dropna=False mirrors len(<col>.unique()).
df['patent_id'].nunique(dropna=False) / scaffold_df['patent_ID'].nunique(dropna=False) * 100

54.02160601383979

In [29]:
# Number of patents that no longer appear after filtering.
# Derived from the frames rather than hard-coded counts; dropna=False mirrors
# len(<col>.unique()).
scaffold_df['patent_ID'].nunique(dropna=False) - df['patent_id'].nunique(dropna=False)

230892

In [30]:
# Share (in %) of all patents that no longer appear after filtering.
# Derived from the frames rather than hard-coded counts; dropna=False mirrors
# len(<col>.unique()).
n_patents_total = scaffold_df['patent_ID'].nunique(dropna=False)
n_patents_dropped = n_patents_total - df['patent_id'].nunique(dropna=False)
n_patents_dropped / n_patents_total * 100

45.97839398616021

In [25]:
# Distinct patent IDs present in the unfiltered scaffold table.
h = scaffold_df.patent_ID.unique()

In [26]:
# How many distinct patents the unfiltered table contains.
h.shape[0]

502175

In [27]:
# Patents present in the full table but absent from the filtered frame.
# Computed from the data instead of pasted counts; dropna=False mirrors
# len(<col>.unique()).
scaffold_df['patent_ID'].nunique(dropna=False) - df['patent_id'].nunique(dropna=False)

230892

In [18]:
# Share (in %) of scaffold rows that were kept by the molecular-weight filter.
# Rows counted in skipped_smiles are excluded from the denominator. The counts
# come from the filtering pass itself rather than numbers copied from its output.
mol_wt_above_300 = len(liste) / (len(scaffold_df) - skipped_smiles) * 100
mol_wt_above_300

46.76869502356973

In [19]:
# Share (in %) of scaffold rows that were rejected by the molecular-weight filter.
# Rows counted in skipped_smiles are excluded from the denominator. The counts
# come from the filtering pass itself rather than numbers copied from its output.
mol_wt_below_300 = skipped_mol_wt / (len(scaffold_df) - skipped_smiles) * 100
mol_wt_below_300

53.23130497643027

In [None]:
# For every scaffold, gather the distinct patent IDs it occurs in as a list.
patent_ids_list = (
    scaffold_df
    .groupby('scaffold')['patent_ID']
    .unique()
    .apply(list)
    .reset_index(name='patent_IDs')
)

In [None]:
# Add the length of each patent list, i.e. in how many distinct patents the scaffold occurs.
patent_ids_list = patent_ids_list.assign(
    **{'Number of patents': patent_ids_list['patent_IDs'].str.len()}
)

In [None]:
# Order scaffolds from the most to the least widely patented, then show the frame.
patent_ids_list = patent_ids_list.sort_values('Number of patents', ascending=False)
patent_ids_list

In [None]:
# First ten rows of the patent-count table.
patent_ids_list.iloc[:10]

In [None]:
# Histogram of patent counts, restricted to scaffolds found in fewer than 10 patents.
m = patent_ids_list['Number of patents'].lt(10)
patent_ids_list.loc[m, 'Number of patents'].hist()

In [None]:
# Preview the first ten rows of the patent-count table.
patent_ids_list.head(n=10)

# Calculating Scaffolds per year

In [None]:
# Number of scaffold rows per year, ordered chronologically.
# The column names are set explicitly: what value_counts() calls its result and
# its index differs between pandas releases (2.0 changed it), while the plotting
# code below reads the 'year' and 'count' columns.
scaffold_count_df = (
    scaffold_df['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
    .sort_values('year', ascending=True)
)
scaffold_count_df

In [None]:
# Count scaffold rows per patent; after reset_index the 'scaffold' column holds that count.
grouped_df = (
    scaffold_df
    .groupby('patent_ID')['scaffold']
    .count()
    .reset_index()
)
grouped_df

In [None]:
# Distribution of patents by scaffold count:
#   'scaffold'   number of scaffold rows in a patent
#   'patent_ID'  number of patents having that many scaffold rows
# Built from scaffold_df rather than from the previous value of grouped_df, so
# re-running this cell always yields the same frame instead of aggregating an
# already aggregated table.
grouped_df = (
    scaffold_df
    .groupby('patent_ID')['scaffold'].count()   # scaffold rows per patent
    .reset_index()
    .groupby('scaffold')['patent_ID'].count()   # patents per scaffold count
    .reset_index()
)
grouped_df

# Visualisation

In [None]:
# Bar chart: number of scaffold rows per year.
# The seaborn theme is configured before the figure is created. Set after the
# bars are drawn (as before), it only took effect for later figures, so the
# chart looked different on the first run than on a re-run.
sns.set(context='poster')
sns.set_style("white")

plt.figure(figsize = (18,11))
ax = sns.barplot(x= scaffold_count_df['year'], y= scaffold_count_df['count'], color='#ffbe7d')
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.xlabel("Year", fontsize=18)
plt.ylabel("Number of scaffolds", fontsize=18)
# Annotate every bar with its value, using thousands separators.
for c in ax.containers:
    ax.bar_label(c, labels=[f'{x:,.0f}' for x in c.datavalues], padding= 2, fontsize=14)
plt.show()

In [None]:
# Bar chart: number of patents (y) for each per-patent scaffold count (x).
# The seaborn theme is configured before the figure is created, so this cell
# renders the same way regardless of which plotting cells ran before it.
sns.set(context='poster')
sns.set_style("white")

plt.figure(figsize = (18,11))
ax1 = sns.barplot(x= grouped_df['scaffold'], y= grouped_df['patent_ID'], color='#ffbe7d')
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.xlabel("Number of scaffold", fontsize=18)
plt.ylabel("Number of Patents", fontsize=18)
# Annotate every bar with its value, using thousands separators.
for c in ax1.containers:
    ax1.bar_label(c, labels=[f'{x:,.0f}' for x in c.datavalues], padding= 2, fontsize=14)

# Bars sit at integer positions, so this window shows only the first 21 categories.
plt.xlim(-0.5,20.5)
plt.show()