In [None]:
# Start by checking which molecules are duplicates.
# For example, molecule IDs 241905 and 1497.

In [3]:
import psycopg2
import pandas as pd
from rdkit import Chem

In [33]:
from tqdm import tqdm

In [6]:
# Load the molecule table from the local PostgreSQL database into a DataFrame.
# You must have the DB container running to run this cell successfully.

# Connection parameters
db_params = {
    'dbname': 'postgres',
    'user': 'postgres',
    'password': '',
    'host': '127.0.0.1',
    'port': '5432'
}

# SQL statement selecting the three columns used by the rest of the notebook
query = "SELECT molecule_id, smiles, molecular_weight FROM molecule"

# Establish a connection to the PostgreSQL database
connection = psycopg2.connect(**db_params)
try:
    # NOTE(review): pandas documents SQLAlchemy connectables as the supported input here;
    # a raw psycopg2 connection works but makes pandas emit a UserWarning.
    df = pd.read_sql_query(query, connection)
finally:
    # Always release the connection, even if the query raises.
    # (psycopg2's `with connection:` only scopes a transaction and does not close it,
    # hence the explicit try/finally.)
    connection.close()

  df = pd.read_sql_query(query, connection)


In [7]:
# Preview the loaded table; a bare last-line expression gets the rich DataFrame display.
df

Unnamed: 0,molecule_id,smiles,molecular_weight
0,331406,COc1cccc(c1c1ccccc1P(c1ccccc1)c1ccccc1)OC,398.441986
1,140360,COc1ccc(P(c2ccccc2SC)c2ccccc2SC)c(C)c1,398.532990
2,331409,C1CCC(CC1)P(c1ccccc1)C1CCCCC1,274.388000
3,2027,CN(c1ccccc1c1ccccc1P(C12CC3CC(C2)CC(C1)C3)C12C...,497.707001
4,2036,CCC1(CC)O[C@@H]2[C@@H](O1)C(c1cc(C(C)(C)C)cc(C...,1049.558960
...,...,...,...
330962,608,Cc1cc(C)cc(P(c2cc(C)cc(C)c2)c2ccccc2C=O)c1,346.410004
330963,461,CCO[Si](CCP(c1ccccc1)c1ccccc1)(OCC)OCC,376.509003
330964,1064,Cc1c(C)n(C(C)C)c(=NP(N=c2n(C(C)C)c(C)c(C)n2C(C...,462.666992
330965,523,CN(C)/N=C/c1ccc(P(c2ccc(/C=N/N(C)C)s2)c2ccc(/C...,490.664001


In [15]:
# Sanity check: pull the two example records to confirm the data was loaded correctly
df.loc[df["molecule_id"].isin([241905, 1497])]

Unnamed: 0,molecule_id,smiles,molecular_weight
143223,241905,[H]P([H])C,48.025002
329868,1497,CP,48.025002


In [27]:
# These two records should describe the same molecule, so let's check whether RDKit
# returns the same SMILES string when canonicalizing each of them.
mol_241905 = df[df["molecule_id"]==241905]["smiles"].to_list()[0]
mol_1497 = df[df["molecule_id"]==1497]["smiles"].to_list()[0]

# Double-check with the molecular weight: the two weights should differ by less
# than a tolerance of 1e-6.
mol_241905_weight = df[df["molecule_id"]==241905]["molecular_weight"].to_list()[0]
mol_1497_weight = df[df["molecule_id"]==1497]["molecular_weight"].to_list()[0]

# Canonicalize each record's own SMILES. Both inputs must differ, otherwise the
# `a == b` comparison is trivially True and proves nothing.
a = Chem.CanonSmiles(mol_241905)
b = Chem.CanonSmiles(mol_1497)

In [29]:
# Show each canonical SMILES next to the stored molecular weight of its record.
print(a, mol_241905_weight)
print(b, mol_1497_weight)

# The records match when the canonical SMILES are identical and the weights
# agree to within 1e-6.
same_smiles = a == b
weight_gap = abs(mol_241905_weight - mol_1497_weight)
print(same_smiles)
print(weight_gap < 1e-6)

CP 48.025001525878906
CP 48.025001525878906
True
True


In [31]:
# Convert the frame into a list with one {column: value} dict per row for plain-Python iteration.
all_data_dict = df.to_dict("records")

In [41]:
# Group records by their CANONICAL SMILES.
# `hashmap` maps each canonical SMILES to the first record seen with it, stored as a
# tuple of (molecule_id, original smiles, molecular_weight). Every later record with
# the same canonical SMILES is paired with that first record and appended to
# `duplicates`. The weights are carried along so each pair can be double-checked
# afterwards; they are not compared inside this loop.
hashmap = {}
duplicates = []
failed_smiles = []  # SMILES that could not be canonicalized; deal with them later
for entry in tqdm(all_data_dict):
    try:
        # Canonicalize smiles (only this call is guarded, so bookkeeping bugs are not hidden)
        canonical_smile = Chem.CanonSmiles(entry["smiles"])
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt still stops the scan.
        # See which molecules cause issues, if any.
        print(entry["smiles"])
        failed_smiles.append(entry["smiles"])
        continue

    record = (entry["molecule_id"], entry["smiles"], entry["molecular_weight"])
    if canonical_smile not in hashmap:
        # First time this canonical SMILES is seen: remember the record
        hashmap[canonical_smile] = record
    else:
        # Pair the first-seen record with the current one
        duplicates.append((hashmap[canonical_smile], record))

  0%|                                                                                        | 0/330967 [00:00<?, ?it/s][23:43:15] Explicit valence for atom # 7 B, 6, is greater than permitted
  0%|▏                                                                           | 653/330967 [00:00<01:38, 3339.89it/s]

c1ccc(cc1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)c1ccccc1


  1%|▌                                                                          | 2331/330967 [00:00<01:20, 4099.64it/s][23:43:15] Explicit valence for atom # 7 C, 6, is greater than permitted
  1%|▋                                                                          | 3150/330967 [00:00<01:24, 3897.22it/s]

c1ccc(cc1)P([C]1234[BH]567[BH]891[BH]1%103[BH]3%112[BH]245[BH]456[CH]678[BH]791[BH]1%10%11[BH]324[BH]5671)c1ccccc1


  2%|█▎                                                                         | 5580/330967 [00:01<01:22, 3940.69it/s][23:43:16] Explicit valence for atom # 7 B, 6, is greater than permitted
[23:43:16] Explicit valence for atom # 7 C, 6, is greater than permitted
  2%|█▍                                                                         | 6369/330967 [00:01<01:23, 3905.70it/s]

C1CCC(CC1)P([B]1234[BH]567[BH]891[BH]1%102[BH]2%114[BH]435[CH]357[BH]768[CH]691[BH]1%102[BH]%1143[BH]5761)C1CCCCC1
c1ccc(cc1)P([C]1234[CH]567[BH]893[BH]3%102[BH]2%111[BH]145[BH]456[BH]678[BH]79%10[BH]83%11[BH]214[BH]5678)c1ccccc1


100%|█████████████████████████████████████████████████████████████████████████| 330967/330967 [02:02<00:00, 2711.21it/s]


In [43]:
# Number of duplicate pairs collected in `duplicates`.
len(duplicates)

78

In [44]:
# Inspect the duplicate pairs: each item is a pair of (molecule_id, smiles, molecular_weight) tuples.
duplicates

[((1519, 'COP', 64.02400207519531),
  (241851, '[H]P([H])OC', 64.02400207519531)),
 ((241739, '[H]P([H])C(C)C', 76.0790023803711),
  (1495, 'CC(C)P', 76.0790023803711)),
 ((2045, 'C1CC2CCCC(C1)P2B1Nc2ccccc2c2c1cccc2', 319.1969909667969),
  (1006, 'c1ccc2c(c1)NB(P1C3CCCC1CCC3)c1ccccc1-2', 319.1969909667969)),
 ((1521, 'CSP', 80.09200286865234),
  (241909, '[H]P([H])SC', 80.09200286865234)),
 ((2063,
   'COC1=CC=C(OC)C(C2=C(C(C)C)C=C(C(C)C)C=C2C(C)C)=C1P3C(C)(C)CC4(OCCO4)CC3(C)C',
   554.7520141601562),
  (1900,
   'COc1ccc(OC)c(P2C(C)(C)CC3(CC2(C)C)OCCO3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C',
   554.7520141601562)),
 ((1494, 'CC(C)PC(C)C', 118.16000366210938),
  (41951, '[H]P(C(C)C)C(C)C', 118.16000366210938)),
 ((2054, 'PC12CC3CC(C2)CC(C1)C3', 168.22000122070312),
  (1296, 'PC12CC3CC(CC(C3)C1)C2', 168.22000122070312)),
 ((820,
   'COC1=[C@@](C2=CC=CC=C2C=C1)[C@@]3=C4C(C=CC=C4)=CC=C3P(OC5=CC=C(C=CC=C6)C6=[C@]57)OC8=[C@]7C(C=CC=C9)=C9C=C8',
   598.6380004882812),
  (821,
   'COC1=[C@@](C2=CC=C

In [45]:
# Save the object
import pickle

In [46]:
# Serialize the list of duplicate pairs to duplicates.pkl in the working directory.
with open('duplicates.pkl', 'wb') as pkl_out:
    pickle.dump(duplicates, pkl_out)