In [1]:
import requests
import string
import os

In [2]:
# ZINC 2D tranche lookup tables.
# Each bin label (as a string) is paired with a tranche letter, A through K.

mw_values = "200 250 300 325 350 375 400 425 450 500 >500".split()
mw_cols = list(string.ascii_uppercase)[:len(mw_values)]

molecular_weight = {label: letter for label, letter in zip(mw_values, mw_cols)}
print(molecular_weight)

logp_values = "-1 0 1 2 2.5 3 3.5 4 4.5 5 >5".split()
# The logP axis reuses the very same letter list as the molecular-weight axis.
logp_rows = mw_cols

logp = {label: letter for label, letter in zip(logp_values, logp_rows)}
print(logp)

{'200': 'A', '250': 'B', '300': 'C', '325': 'D', '350': 'E', '375': 'F', '400': 'G', '425': 'H', '450': 'I', '500': 'J', '>500': 'K'}
{'-1': 'A', '0': 'B', '1': 'C', '2': 'D', '2.5': 'E', '3': 'F', '3.5': 'G', '4': 'H', '4.5': 'I', '5': 'J', '>5': 'K'}


In [7]:
# Download Smiles
def fetch_zinc2D_smiles(mol_weight_range, logp_range, download_path, subset=None):
    """Download ZINC 2D SMILES tranche files into ``download_path``.

    Tranches are selected either by a named subset or, when ``subset`` is
    None, by a molecular-weight range and a logP range. The range bounds are
    snapped outwards to the bin edges listed in the function body (lower
    bounds down, upper bounds up). When ``subset`` is given, the two ranges
    are ignored.

    Parameters
    ----------
    mol_weight_range : tuple or None
        ``(lower, upper)`` molecular-weight bounds. Only used when ``subset``
        is None.
    logp_range : tuple or None
        ``(lower, upper)`` logP bounds. Only used when ``subset`` is None.
    download_path : str
        Directory that receives the ``.smi`` files. It is created if it does
        not exist yet.
    subset : str, optional
        One of the keys of the ``subsets`` table below.

    Raises
    ------
    ValueError
        If ``subset`` is None and either range is missing. Also propagated
        from ``discretize_values`` when a bound is below the first bin.
    KeyError
        If ``subset`` is not a key of the ``subsets`` table.

    Notes
    -----
    ``discretize_values`` must already be defined when this function is
    called with ``subset=None``.

    A file that cannot be downloaded (connection error, timeout or HTTP error
    status) is reported with ``print`` and skipped; nothing is written for it.
    """
    if (mol_weight_range is None or logp_range is None) and subset is None:
        raise ValueError("Missing parameters")

    # Subsets defined by ZINC
    subsets = {
        # First tuple is start and end columns, second tuple is start and end rows
        "Drug-Like": ((1, 9), (0, 9)),
        "Lead-Like": ((2, 4), (0, 7)),
        "Lugs": ((4, 8), (0, 7)),
        "Goldilocks": ((2, 4), (3, 5)),
        "Fragments": ((0, 1), (0, 6)),
        "Flagments": ((1, 3), (0, 6)),
        "Big-n-Greasy": ((9, 10), (8, 10)),
        "Shards": ((0, 0), (0, 10)),
    }

    # These are the bin edges that ZINC accepts
    mw_values = [200, 250, 300, 325, 350, 375, 400, 425, 450, 500, 550]
    logp_values = [-1, 0, 1, 2, 2.5, 3, 3.5, 4, 4.5, 5, 6]
    # ZINC molecular weight is categorized in columns from A to K
    mw_cols = list(string.ascii_uppercase[:11])
    # LogP is categorized in rows from A to K
    logp_rows = mw_cols

    if subset is None:
        mw_lower_bound, mw_upper_bound = mol_weight_range
        logp_lower_bound, logp_upper_bound = logp_range

        # Discretize mol weight and logp values:
        mw_lower_bound = discretize_values(mw_lower_bound, mw_values, "Molecular weight")
        mw_upper_bound = discretize_values(mw_upper_bound, mw_values, "Molecular weight", lower=False)
        logp_lower_bound = discretize_values(logp_lower_bound, logp_values, "LogP")
        logp_upper_bound = discretize_values(logp_upper_bound, logp_values, "LogP", lower=False)

        # The bin lists and the letter lists are parallel, so the position of
        # a bin edge is directly the column/row index.
        start_col = mw_values.index(mw_lower_bound)
        end_col = mw_values.index(mw_upper_bound)

        start_row = logp_values.index(logp_lower_bound)
        end_row = logp_values.index(logp_upper_bound)
    else:
        start_col, end_col = subsets[subset][0]
        start_row, end_row = subsets[subset][1]

    col_list = mw_cols[start_col:end_col + 1]
    row_list = logp_rows[start_row:end_row + 1]

    # An empty path means "current directory"; os.makedirs("") would fail.
    if download_path:
        os.makedirs(download_path, exist_ok=True)

    base_url = "http://files.docking.org/2D/"
    # Get urls and download files
    for col in col_list:
        for row in row_list:
            tranche = col + row
            url = base_url + tranche + "/" + tranche
            # Each tranche is split into several files named by two extra
            # letters. Only these combinations are requested; "D" is not in
            # the list. NOTE(review): presumably these are ZINC's reactivity
            # and purchasability codes - confirm against the ZINC docs.
            for f in ["A", "B", "C", "E"]:
                for j in ["A", "B"]:
                    file_name = tranche + f + j + ".smi"
                    url_download = url + f + j + ".smi"
                    try:
                        # The timeout keeps a stalled connection from hanging
                        # the notebook; raise_for_status turns 4xx/5xx replies
                        # into exceptions so error pages are never saved.
                        r = requests.get(url_download, allow_redirects=True, timeout=60)
                        r.raise_for_status()
                    except requests.RequestException:
                        print("Could not download file from {}".format(url_download))
                        # Skip the write: there is no valid response for this file.
                        continue

                    file_path = os.path.join(download_path, file_name)
                    with open(file_path, "wb") as file:
                        file.write(r.content)
                

In [9]:
# Download the "Goldilocks" subset into ./zinc. The explicit ranges are passed
# as well, but fetch_zinc2D_smiles selects by subset whenever one is given.
path = "./zinc"
fetch_zinc2D_smiles(
    mol_weight_range=(300, 350),
    logp_range=(2, 3),
    download_path=path,
    subset="Goldilocks",
)

In [75]:
def ZINC(download_path, subset="Lead-Like", mol_weight_range=None, logp_range=None):
    """Snap the requested molecular-weight and logP ranges to ZINC bin edges and print them.

    Lower bounds are snapped down and upper bounds up, using
    ``discretize_values`` with the bin lists defined in the function body.
    The four resulting bounds are printed in the order: molecular-weight
    lower, molecular-weight upper, logP lower, logP upper.

    Parameters
    ----------
    download_path : str
        Currently unused by this function.
    subset : str or None
        Currently only checked for being None; the subset itself is not
        resolved here.
    mol_weight_range : sequence of two numbers, optional
        ``(lower, upper)`` molecular-weight bounds.
    logp_range : sequence of two numbers, optional
        ``(lower, upper)`` logP bounds.

    Raises
    ------
    ValueError
        If neither a subset nor both ranges are given, or if a range does not
        hold exactly two numbers. Also propagated from ``discretize_values``
        when a bound lies below the first bin.

    Notes
    -----
    ``discretize_values`` must already be defined when this function is
    called with both ranges.
    """
    ranges_given = mol_weight_range is not None and logp_range is not None

    if not ranges_given and subset is None:
        raise ValueError("Missing parameters")

    if not ranges_given:
        # Subset-only call: there are no bounds to discretize, and calling
        # len() on None would raise TypeError.
        # NOTE(review): subset handling is not implemented in this function yet.
        return

    # Exactly two numbers are required; a shorter range would otherwise fail
    # later with an unhelpful unpacking error.
    if len(mol_weight_range) != 2 or len(logp_range) != 2:
        raise ValueError("Range only takes two numbers")

    mw_lower_bound, mw_upper_bound = mol_weight_range
    logp_lower_bound, logp_upper_bound = logp_range

    mw_values = [200, 250, 300, 325, 350, 375, 400, 425, 450, 500, 550]
    logp_values = [-1, 0, 1, 2, 2.5, 3, 3.5, 4, 4.5, 5, 6]

    # Discretize mol weight and logp values:
    mw_lower_bound = discretize_values(mw_lower_bound, mw_values, "Molecular weight")
    mw_upper_bound = discretize_values(mw_upper_bound, mw_values, "Molecular weight", lower=False)
    logp_lower_bound = discretize_values(logp_lower_bound, logp_values, "LogP")
    logp_upper_bound = discretize_values(logp_upper_bound, logp_values, "LogP", lower=False)

    print(mw_lower_bound)
    print(mw_upper_bound)
    print(logp_lower_bound)
    print(logp_upper_bound)
    
# Try the discretization with bounds that fall between bin edges.
ZINC(
    download_path="./",
    subset=None,
    mol_weight_range=(215, 334),
    logp_range=(1.25, 3.75),
)

200
350
1
4


In [74]:
def discretize_values(value, bins, name, lower=True):
    """Snap ``value`` to one of the edges in ``bins``.

    Parameters
    ----------
    value : number
        The value to discretize.
    bins : sequence of numbers
        Bin edges in ascending order.
    name : str
        Human-readable name of the quantity, used in error messages.
    lower : bool, optional
        When ``value`` falls strictly between two neighbouring edges, return
        the lower edge if True (default), otherwise the upper edge.

    Returns
    -------
    number
        ``bins[-1]`` if ``value`` is at or above the last edge; the
        neighbouring lower/upper edge if ``value`` lies strictly between two
        edges; ``value`` itself if it coincides with an edge.

    Raises
    ------
    ValueError
        If ``bins`` is empty or ``value`` is below the first edge.
    """
    if not bins:
        raise ValueError("No bins given for {}".format(name))

    # Range checks are done once, up front, so they also apply when ``bins``
    # holds a single edge and there are no neighbouring pairs to scan.
    if value < bins[0]:
        raise ValueError("{} must be at least {}".format(name, bins[0]))
    if value >= bins[-1]:
        return bins[-1]

    for lower_edge, upper_edge in zip(bins, bins[1:]):
        if lower_edge < value < upper_edge:
            return lower_edge if lower else upper_edge

    # ``value`` sits exactly on an edge.
    return value

In [80]:
!ls -l 

total 96
-rw-rw-r-- 1 daniel daniel  6712 sep  7 18:55 chembl.ipynb
-rw-rw-r-- 1 daniel daniel  2880 sep  8 09:47 goldilocks_inStock.uri
-rw-rw-r-- 1 daniel daniel 58866 sep  8 08:35 mols.smi
-rw-rw-r-- 1 daniel daniel    72 sep  7 17:29 pubchem.ipynb
drwxrwxr-x 3 daniel daniel  4096 sep  8 09:37 zinc
-rw-rw-r-- 1 daniel daniel  2880 sep  7 19:27 ZINC-downloader-2D-smi.uri
-rw-rw-r-- 1 daniel daniel  8649 sep  8 11:18 zinc.ipynb


In [89]:
# Check whether the two downloader manifests have identical contents.
file_wait_ok, file_in_stock = "./ZINC-downloader-2D-smi.uri", "./goldilocks_inStock.uri"

with open(file_wait_ok, "r") as wait_ok_handle:
    urls_ok = wait_ok_handle.read()

with open(file_in_stock, "r") as in_stock_handle:
    urls_stock = in_stock_handle.read()

# Last expression of the cell, so the notebook displays the result.
urls_ok == urls_stock

True

In [90]:
# Check whether the two SMILES files contain the same number of lines.
smi_wait_ok, smi_in_stock = "./wait_ok.smi", "./in_stock.smi"

with open(smi_wait_ok, "r") as wait_ok_handle:
    smi_ok = list(wait_ok_handle)

with open(smi_in_stock, "r") as in_stock_handle:
    smi_stock = list(in_stock_handle)

# Last expression of the cell, so the notebook displays the result.
len(smi_ok) == len(smi_stock)

True