In [1]:
import os
import re
import numpy
import pandas
import zipfile
import argparse
import tempfile
import threading
import subprocess
from glob import glob
from tqdm import tqdm
from pathlib import Path
from random import shuffle

In [21]:
def read_shuffled_sample_compounds(input_file):
    """Read a comma-separated compound list into a list of field lists.

    Each line of ``input_file`` is stripped of surrounding whitespace and
    split on ``','``; no header handling or type conversion is performed.

    Parameters
    ----------
    input_file : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One list of fields per line, in file order.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(input_file) as compound_file:
        return [compound.strip().split(',') for compound in compound_file]

def extract_resorted_pdbqt_archives(compounds,tranche):
    """Read the PDBQT files of all sampled compounds belonging to one tranche.

    Parameters
    ----------
    compounds : 2-D array-like
        Table of sampled compounds. Column 1 is read as the ZINC ID and
        column 2 as the tranche name (the layout the calling cell uses when
        it derives the tranche list from ``compounds[:,2]``).
    tranche : str
        Tranche name; also the stem of the archive that is opened,
        ``./tinyPDBQT_filtered/{tranche}.zip``.

    Returns
    -------
    list of bytes
        Raw contents of ``<zincID>.pdbqt`` for every selected compound,
        ordered by the members' position in the archive's name list.

    Raises
    ------
    KeyError
        If a requested ``<zincID>.pdbqt`` member is not in the archive.
    """
    compounds = numpy.asarray(compounds)
    # Match on the tranche column only. Comparing the whole table would also
    # select rows where a different column happens to equal the tranche name
    # and would repeat a row that matches in more than one column.
    in_tranche = compounds[:, 2] == tranche
    tranche_zincIDs = [f'{zincID}.pdbqt' for zincID in compounds[in_tranche, 1]]

    # The context manager closes the archive even if a read fails; without it
    # one file handle per tranche stays open.
    with zipfile.ZipFile(f'./tinyPDBQT_filtered/{tranche}.zip', 'r') as tranche_zip:
        pdbqt_fileslist = tranche_zip.namelist()

        # Fail with a message that names the archive and the missing members
        # rather than a bare key from the ordering lookup.
        missing = sorted(set(tranche_zincIDs).difference(pdbqt_fileslist))
        if missing:
            raise KeyError(f'{len(missing)} compound(s) not found in {tranche}.zip: {missing[:5]}')

        # Reading in archive order, as the original implementation did.
        return [
            tranche_zip.read(member)
            for member in sort_sublist_by_original_list(pdbqt_fileslist, tranche_zincIDs)
        ]

def sort_sublist_by_original_list(full_list, sublist):
    """Order the elements of ``sublist`` by where they occur in ``full_list``.

    Parameters
    ----------
    full_list : iterable of hashable
        Reference sequence defining the order. If an element occurs more than
        once, its last position is the one used.
    sublist : iterable of hashable
        Elements to reorder; every element must be present in ``full_list``.

    Returns
    -------
    list
        A new list with the elements of ``sublist`` in reference order.

    Raises
    ------
    KeyError
        If an element of ``sublist`` does not occur in ``full_list``.
    """
    position_of = {}
    for position, item in enumerate(full_list):
        # Later duplicates overwrite earlier ones, so the last position wins.
        position_of[item] = position

    reordered = list(sublist)
    reordered.sort(key=position_of.__getitem__)
    return reordered

In [20]:
# Command-line style configuration. The argument list is passed explicitly to
# parse_args() below, so this cell runs inside a notebook kernel as well.
parser = argparse.ArgumentParser(description='Extract random lines from a file by chance and print the last column.')
parser.add_argument('-i','--input_file', type=str, required=True, help='Path to the input file containing the smiles String, the Tranche and the zinc-ID.')
parser.add_argument('-N','--random_lines',type=int, default=100, help='Number of random compounds to be extraced from the compressed database.')
parser.add_argument('-d','--outputDirectory',type=str, default='./tempPDBQT/', help='Output directory that will contain the random subsample of compounds.')
# Optional seed so a subsample can be reproduced; None keeps the draw random.
parser.add_argument('-s','--seed',type=int, default=None, help='Optional random seed for a reproducible subsample.')

# set argument variables
args = parser.parse_args(["-i", "./tinyT38DrugDB.csv", 
                          "-N", "10000",
                          "-d", "./myPDBQTdockingFiles"])
input_file = args.input_file
random_lines = args.random_lines
outputDirectory = args.outputDirectory

# Progress-bar layout shared by both loops below.
BAR_FORMAT = '{desc}|{bar:60}|{n}/{total} [{elapsed}<{remaining}]'

# subsample the N random compounds from the list of molecules
compound_table = pandas.read_csv(input_file)
if random_lines > len(compound_table):
    # DataFrame.sample() raises when asked for more rows than exist (without
    # replacement), so fall back to taking every available compound.
    print(f'Requested {random_lines} compounds but {input_file} only lists {len(compound_table)}; using all of them.')
    random_lines = len(compound_table)
compounds = compound_table.sample(random_lines, random_state=args.seed).values
# Column 2 holds the tranche name of each sampled compound.
tranches = numpy.unique(compounds[:,2])
pdbqt_ligands = []

progress_bar = tqdm(tranches, position=1, bar_format=BAR_FORMAT)
for tranche in progress_bar:
    progress_bar.set_description(f'Extracting from {tranche}...\n')
    pdbqt_ligands.extend(extract_resorted_pdbqt_archives(compounds,tranche))

# writing compounds to files in specified directory
output_path = Path(outputDirectory)
output_path.mkdir(parents=True, exist_ok=True)
# Search the raw bytes directly instead of the b'...' repr produced by str().
zinc_id_pattern = re.compile(rb'ZINC[0-9]+')
for pdbqt_ligand in tqdm(pdbqt_ligands, desc=f'Writing Files to {outputDirectory}...\n', bar_format=BAR_FORMAT):
    zinc_match = zinc_id_pattern.search(pdbqt_ligand)
    if zinc_match is None:
        # Without an ID there is no file name to write to; fail with an
        # explicit message rather than an AttributeError on None.
        raise ValueError('No ZINC ID found in PDBQT file contents; cannot derive an output file name.')
    zincID = zinc_match.group(0).decode('ascii')
    with open(output_path / f'{zincID}.pdbqt','wb') as outputFile:
        outputFile.write(pdbqt_ligand)

Extracting from IFAARN...: |████████████████████████████████████████████████████████████|149/149


 Fetched 10000 Compounds from the PDBQT zip archives!


Fetching Compounds from zip...|████████████████████████████████████████████████████████████|10000/10000
