In [1]:
# Imports for the compound database, fingerprinting and clustering steps below
import os
import re
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd
import rdkit
from rdkit.Chem import Descriptors, AllChem
from rdkit.DataManip.Metric import GetTanimotoSimMat
from sklearn.cluster import AffinityPropagation

In [2]:
# Start from a clean slate: delete the database file left behind by a previous run, if any
stale_db = "./MPro_database.db"
if os.path.exists(stale_db):
    os.remove(stale_db)

In [3]:
# Table names listed for a schema reset; only the (currently commented-out)
# DROP TABLE loop further down iterates over this tuple.
drop_order = ('compounds', 'assays')

# Open the SQLite database file (sqlite3 creates it if it does not exist yet)
database_path = "./MPro_database.db"
conn = sqlite3.connect(database_path)

In [4]:
# Schema of the compounds table: one row per compound ID (CID) with its SMILES string
create_compounds_sql = '''
CREATE TABLE compounds
(
    CID PRIMARY KEY,
    SMILES
)
        '''
conn.execute(create_compounds_sql)

<sqlite3.Cursor at 0x7ff44dc93dc0>

In [5]:
# Load the activity data and copy every compound's ID and SMILES string into the compounds table
df = pd.read_csv('activity_data.csv')

# itertuples(name=None) yields plain (CID, SMILES) tuples, so all rows go through a single
# executemany call instead of one DataFrame row lookup and one execute per compound.
compound_rows = df[['CID', 'SMILES']].itertuples(index=False, name=None)

# "with conn" commits when the block succeeds and rolls back if an insert fails,
# so the rows are actually persisted to the database file.
with conn:
    conn.executemany('INSERT INTO compounds (CID, SMILES) VALUES (?,?)', compound_rows)

In [6]:
# Create the (still empty) assays table. Its compoundID column is declared as a foreign key
# to compounds.CID, which links each assay row back to its compound.
# NOTE: SQLite only enforces foreign keys on connections that have run
# "PRAGMA foreign_keys = ON"; no cell shown in this notebook does so.
create_assays_sql = '''
CREATE TABLE assays
(
    CID PRIMARY KEY,
    canonical_CID,
    r_inhibition_at_20_uM,
    r_inhibition_at_50_uM,
    r_avg_IC50,
    f_inhibition_at_20_uM,
    f_inhibition_at_50_uM,
    f_avg_IC50,
    f_avg_pIC50,
    relative_solubility_at_20_uM,
    relative_solubility_at_100_uM,
    trypsin_IC50,
    NMR_std_ratio,
    acrylamide,
    chloroacetamide,
    series,
    frag_id,
    compoundID,
    FOREIGN KEY (CompoundID) REFERENCES compounds (CID)
)
        '''
conn.execute(create_assays_sql)


<sqlite3.Cursor at 0x7ff44da793b0>

In [7]:
# Populate the assays table from every CSV column after the first one, and repeat the
# first remaining column's value in the compoundID foreign-key column.
df_short = df.iloc[:,1:]

# Column order must match the order of the values bound for each row below.
assay_columns = (
    'CID', 'canonical_CID',
    'r_inhibition_at_20_uM', 'r_inhibition_at_50_uM', 'r_avg_IC50',
    'f_inhibition_at_20_uM', 'f_inhibition_at_50_uM', 'f_avg_IC50', 'f_avg_pIC50',
    'relative_solubility_at_20_uM', 'relative_solubility_at_100_uM',
    'trypsin_IC50', 'NMR_std_ratio', 'acrylamide', 'chloroacetamide',
    'series', 'frag_id', 'compoundID',
)
insert_assay_sql = 'INSERT INTO assays ({}) VALUES ({})'.format(
    ', '.join(assay_columns),
    ','.join('?' * len(assay_columns)),
)

# "with conn" commits all inserts together on success and rolls them back on error.
with conn:
    for i in range(len(df_short)):
        row = df_short.iloc[i]
        # row.iloc[0] is explicit positional access; plain row[0] on a label-indexed
        # Series relies on a positional fallback that pandas has deprecated.
        conn.execute(insert_assay_sql, [*row, row.iloc[0]])

In [8]:
def retrieve_SMILES(compoundID: str):
    """Return the SMILES value stored in the compounds table for a compound ID.

    Parameters
    ----------
    compoundID : str
        Value matched against the CID column of the compounds table.

    Returns
    -------
    The SMILES column of the first matching row.

    Raises
    ------
    IndexError
        If no row has this CID. IndexError is used so that code which already
        handled a failed lookup keeps working, but it now names the missing ID.
    """
    # fetchone() reads just the first matching row instead of copying the whole cursor into a list
    match = conn.execute('''
        SELECT SMILES
        FROM compounds
        WHERE
            CID = ?
    ''', (compoundID,)).fetchone()
    if match is None:
        raise IndexError(f"no compound with CID {compoundID!r} in the compounds table")
    return match[0]
        
def retrieve_assays(compoundID: str):
    """Return the assay measurements stored for a compound ID.

    Parameters
    ----------
    compoundID : str
        Value matched against the compoundID column of the assays table.

    Returns
    -------
    The first matching row with its first two columns (CID, canonical_CID) and its
    last column (compoundID) removed, i.e. r_inhibition_at_20_uM through frag_id.

    Raises
    ------
    IndexError
        If no assay row has this compoundID. IndexError is used so that code which
        already handled a failed lookup keeps working, but it now names the missing ID.
    """
    # fetchone() reads just the first matching row instead of copying the whole cursor into a list
    match = conn.execute('''
        SELECT *
        FROM assays
        WHERE
            compoundID = ?
    ''', (compoundID,)).fetchone()
    if match is None:
        raise IndexError(f"no assay row with compoundID {compoundID!r} in the assays table")
    return match[2:-1]

In [9]:
# Demonstrate we can retrieve the assay data and SMILES string from an ID
example_cid = 'MAT-POS-be048f2c-5'

print(retrieve_SMILES(example_cid))
# The assay row is a tuple, so it is passed as a separate print argument:
# concatenating it to the label with "+" raises TypeError.
print("Assay results equal", retrieve_assays(example_cid))

CCn1nccc1C(=O)N1Cc2ccc(Cl)cc2C2(CCN(c3cncc4ccccc34)C2=O)C1
(None, None, None, None, None, 0.297012069374338, None, None, None, None, None, 0, 0, '3-aminopyridine-like', None)


In [10]:
# Optional cleanup (left disabled): uncomment to drop every table named in drop_order.
# for table_name in drop_order:
#     conn.execute('DROP TABLE IF EXISTS ' + table_name)

In [11]:
# Molecular representations: add a "fingerprint" column to the compounds table
add_fingerprint_sql = '''
    ALTER TABLE compounds
    ADD fingerprint'''
conn.execute(add_fingerprint_sql)

# TODO: this cell only adds the column and does not fill it. Storing a value for an
# existing compound would need an UPDATE ... WHERE CID = ? statement rather than an INSERT.


<sqlite3.Cursor at 0x7ff44da79730>

In [23]:
# RDKit can represent a Morgan fingerprint as a fixed-length bit vector. Each compound's
# bit vector (radius 2, 2048 bits) is kept in a dictionary keyed by compound ID, and the
# same fingerprints converted to numpy arrays are collected in a list.
fingerprint_dict = {}
fingerprint_bit_list =[]

# `index` is kept as the loop variable name because other cells may read it after the loop.
for index, row in df.iterrows():
    cid = row['CID']  # named cid rather than id so the id() builtin is not shadowed
    smiles_str = retrieve_SMILES(cid)
    mol = rdkit.Chem.MolFromSmiles(smiles_str)
    if mol is None:
        # MolFromSmiles returns None for a string it cannot parse; fail here and name
        # the offending compound instead of erroring inside the fingerprint call.
        raise ValueError(f"RDKit could not parse the SMILES for {cid!r}: {smiles_str!r}")
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    fingerprint_dict[cid] = fingerprint
    fingerprint_bit_list.append(np.array(fingerprint))

In [26]:
# Pairwise Tanimoto similarity between the Morgan fingerprints (radius 2) of all compounds.
# GetTanimotoSimMat (imported in the top import cell) returns only the lower triangle of the
# symmetric similarity matrix, flattened into a 1-D array.
mols = [rdkit.Chem.MolFromSmiles(retrieve_SMILES(row['CID'])) for _, row in df.iterrows()]
morganfps = [AllChem.GetMorganFingerprintAsBitVect(m,2) for m in mols]
sim_matrix = GetTanimotoSimMat(morganfps)

In [31]:
def tri2mat(tri_arr):
    """Expand a flattened strict lower triangle into a full symmetric matrix.

    ``tri_arr`` holds the below-diagonal entries row by row, i.e. in the order
    (1,0), (2,0), (2,1), (3,0), (3,1), (3,2), ... so entry (i, j) with j < i
    sits at flat position i*(i-1)/2 + j.

    Parameters
    ----------
    tri_arr : 1-D sequence of numbers
        Its length must be m*(m-1)/2 for some integer m >= 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape (m, m), symmetric, with ones on the diagonal.

    Raises
    ------
    ValueError
        If ``tri_arr`` is not one-dimensional or its length is not m*(m-1)/2
        for any integer m.
    """
    tri = np.asarray(tri_arr, dtype=float)
    if tri.ndim != 1:
        raise ValueError("tri_arr must be a 1-D sequence of lower-triangle values")

    n = len(tri)
    # Solve n = m*(m-1)/2 for m, then verify the result to reject impossible lengths.
    m = (1 + int(round(np.sqrt(1 + 8 * n)))) // 2
    if m * (m - 1) // 2 != n:
        raise ValueError(f"length {n} is not a valid lower-triangle size m*(m-1)/2")

    arr = np.ones([m, m])
    # tril_indices(m, k=-1) enumerates the below-diagonal positions row by row,
    # which is exactly the order the flattened values are stored in.
    rows, cols = np.tril_indices(m, k=-1)
    arr[rows, cols] = tri
    arr[cols, rows] = tri
    return arr

# Expand the flattened lower triangle into the full square similarity matrix.
# The ndim guard keeps this cell safe to re-run: once sim_matrix is already 2-D
# it is left untouched instead of being fed back into tri2mat.
if np.ndim(sim_matrix) == 1:
    sim_matrix = tri2mat(sim_matrix)

In [13]:
# Use RDKit's built-in Dice similarity metric to build a pairwise similarity matrix
# (not a distance matrix: identical fingerprints score 1) between the compounds.
id_list = fingerprint_dict.keys()

# Size the matrix from the fingerprints themselves rather than from a loop variable left
# over from another cell. num_data keeps its previous meaning: the largest row position.
num_data = len(id_list) - 1
sim_matrix = np.zeros((num_data+1, num_data+1))

# Dict order is stable, so row/column i corresponds to the i-th key of id_list.
fingerprints = list(fingerprint_dict.values())
for i, fingerprint in enumerate(fingerprints):
    # One bulk call fills a whole row instead of one Python-level call per pair.
    sim_matrix[i, :] = rdkit.DataStructs.BulkDiceSimilarity(fingerprint, fingerprints)
print(sim_matrix)

[[1.         0.82352941 0.83823529 ... 0.21238938 0.38938053 0.40776699]
 [0.82352941 1.         0.86567164 ... 0.1981982  0.37837838 0.3960396 ]
 [0.83823529 0.86567164 1.         ... 0.1981982  0.37837838 0.3960396 ]
 ...
 [0.21238938 0.1981982  0.1981982  ... 1.         0.59090909 0.35897436]
 [0.38938053 0.37837838 0.37837838 ... 0.59090909 1.         0.82051282]
 [0.40776699 0.3960396  0.3960396  ... 0.35897436 0.82051282 1.        ]]


In [32]:
# Cluster with a scikit-learn method that accepts a precomputed similarity matrix as input.
# fit_predict returns one cluster label per row of sim_matrix.
# random_state is fixed so that re-running the notebook reproduces the same clusters
# (the parameter exists in scikit-learn 0.23 and later).
cluster = AffinityPropagation(affinity = 'precomputed', random_state = 0).fit_predict(sim_matrix)




In [15]:
# Show the largest cluster label produced by the clustering step
highest_label = max(cluster)
print(highest_label)


174
