In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import multiprocessing
import time
from tqdm import tqdm_notebook as tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [7]:
def get_fps(candidates, radius=2, n_bits=1024):
    """Compute Morgan bit-vector fingerprints for the parsable SMILES in ``candidates``.

    Parameters
    ----------
    candidates : iterable
        SMILES strings. Entries that are not ``str`` (such as the float NaN
        pandas uses for empty CSV cells) and entries for which
        ``Chem.MolFromSmiles`` returns ``None`` are skipped.
    radius : int, optional
        Morgan radius passed to RDKit. The default of 2 corresponds to ECFP4.
    n_bits : int, optional
        Length of the bit vector. Defaults to 1024.

    Returns
    -------
    (list, list)
        ``smiles`` and ``fps``: the kept SMILES strings and their
        fingerprints, index-aligned (``fps[k]`` belongs to ``smiles[k]``).
    """
    smiles, fps = [], []
    for sm in tqdm(candidates):
        # Passing a non-string to MolFromSmiles raises instead of returning
        # None, so filter those out before parsing.
        if not isinstance(sm, str):
            continue
        mol = Chem.MolFromSmiles(sm)
        if mol is None:
            continue
        smiles.append(sm)
        fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))
    return smiles, fps

In [49]:
_PAIR_CSV = '../data/chembl_24_bert.csv'


def _write_nearest_pairs(smiles, fps, n, m, out_path):
    """Append the most similar partner of each molecule in chunk ``m`` of ``n``.

    The index range ``[(m*N)//n, ((m+1)*N)//n)`` of ``smiles`` is processed.
    For every molecule in it, the Tanimoto similarity against all of ``fps``
    is computed and one CSV row ``first,second,,,tanimoto`` is appended to
    ``out_path``, where ``second`` is the most similar *other* molecule.
    Nothing is written when fewer than two molecules are given, because no
    partner exists.
    """
    N = len(smiles)
    if N < 2:
        return
    lo, hi = (m * N) // n, ((m + 1) * N) // n
    # Open once instead of once per row. Line buffering keeps the previous
    # property that every row is flushed on its own, which matters because
    # several worker processes append to the same file.
    with open(out_path, 'a', buffering=1) as f:
        for i in tqdm(range(lo, hi)):
            tanimotos = np.array(DataStructs.BulkTanimotoSimilarity(fps[i], fps))
            # Mask the self-similarity with a value below any real score so
            # argmax can never return the molecule itself, even when every
            # other similarity is 0.
            tanimotos[i] = -1.0
            arg_i = int(np.argmax(tanimotos))
            f.write('%s,%s,,,%f\n' % (smiles[i], smiles[arg_i], tanimotos[arg_i]))


def choose_pair_1(smiles, fps, n, out_path=_PAIR_CSV):
    """Write nearest-neighbour pairs for the first of ``n`` chunks (``m = 0``).

    Parameters
    ----------
    smiles : list of str
        SMILES strings, index-aligned with ``fps``.
    fps : list
        Fingerprints accepted by ``DataStructs.BulkTanimotoSimilarity``.
    n : int
        Number of chunks the molecule list is divided into.
    out_path : str, optional
        CSV file that rows are appended to.
    """
    _write_nearest_pairs(smiles, fps, n, 0, out_path)


def choose_pair_2(smiles, fps, n, out_path=_PAIR_CSV):
    """Write nearest-neighbour pairs for the second of ``n`` chunks (``m = 1``).

    Takes the same parameters as ``choose_pair_1``.
    """
    _write_nearest_pairs(smiles, fps, n, 1, out_path)

In [50]:
num_process = 2  # number of chunks; one worker process handles each chunk

df = pd.read_csv('../data/chembl_24.csv')
candidates = df['canonical_smiles'].values
start = time.time()
# Only the first 10000 entries are fingerprinted here.
smiles, fps = get_fps(candidates[:10000])
elapsed = time.time() - start
print('FP: %f seconds elapsed.' % elapsed)
N = len(smiles)
print('N=%d' % N)

# Open with 'w' so each run starts from a fresh file containing exactly one
# header. Append mode would add a second header, and stack new rows on top of
# the previous run's rows, whenever this cell is re-executed.
with open('../data/chembl_24_bert.csv', 'w') as f:
    f.write('first,second,first_sp,second_sp,tanimoto\n')

# Execute multiprocessed choose_pair: each worker appends its chunk's rows.
start = time.time()
process_1 = multiprocessing.Process(target=choose_pair_1, args=(smiles, fps, num_process))
process_2 = multiprocessing.Process(target=choose_pair_2, args=(smiles, fps, num_process))
process_1.start()
process_2.start()
process_1.join()
process_2.join()
elapsed = time.time() - start
print('Pair: %f seconds elapsed.' % elapsed)

# A worker that died leaves the CSV incomplete without raising here, so
# surface any non-zero exit code explicitly.
failed_workers = [(p.name, p.exitcode) for p in (process_1, process_2) if p.exitcode != 0]
if failed_workers:
    print('WARNING: worker(s) exited abnormally, pair file may be incomplete: %s' % failed_workers)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

FP: 5.899448 seconds elapsed.
N=10000
Pair: 14.623738 seconds elapsed.


In [36]:
# Scratch arithmetic: 100000 divided by 32.42.
# NOTE(review): presumably a throughput estimate (items per second of a timed run) — confirm what was measured.
100000/32.42

3084.515731030228

In [27]:
import sys
# Scratch check: allocate a 1000x1000 array of zeros (np.zeros defaults to float64)
# and ask sys.getsizeof for the size in bytes of the array object.
a = np.zeros([1000,1000])
sys.getsizeof(a)

8000112

In [None]:
def choose_pair(smiles, fps, n, m, out_path='../data/chembl_24_bert.csv'):
    """Append the most similar partner of each molecule in chunk ``m`` of ``n``.

    The molecule list is cut into ``n`` chunks of ``N // n`` entries, with the
    last chunk also taking the remainder. For every molecule in chunk ``m``,
    the Tanimoto similarity against all of ``fps`` is computed and one CSV row
    ``first,second,,,tanimoto`` is appended to ``out_path``, where ``second``
    is the most similar *other* molecule.

    Parameters
    ----------
    smiles : list of str
        SMILES strings, index-aligned with ``fps``.
    fps : list
        Fingerprints accepted by ``DataStructs.BulkTanimotoSimilarity``.
    n : int
        Total number of chunks.
    m : int
        Zero-based index of the chunk to process.
    out_path : str, optional
        CSV file that rows are appended to.

    Nothing is written when fewer than two molecules are given, because no
    partner exists.
    """
    N = len(smiles)
    if N < 2:
        return
    chunk = N // n
    start = m * chunk
    stop = (m + 1) * chunk if m < n - 1 else N
    # Line-buffered so every row is flushed on its own; several workers may
    # append to the same file.
    with open(out_path, 'a', buffering=1) as f:
        # Iterate over global indices: the fingerprint must be looked up with
        # the molecule's position in the full list, not its position inside
        # the chunk.
        for i in tqdm(range(start, stop)):
            tanimotos = np.array(DataStructs.BulkTanimotoSimilarity(fps[i], fps))
            # Mask self explicitly rather than taking "the second largest":
            # when another molecule ties at 1.0, the second largest entry may
            # be the molecule itself.
            tanimotos[i] = -1.0
            idx = int(np.argmax(tanimotos))
            f.write('%s,%s,,,%f\n' % (smiles[i], smiles[idx], tanimotos[idx]))

In [51]:
import numpy as np
from sklearn.model_selection import train_test_split

In [56]:
# Split the full table 50/50 into train and test. A fixed random_state makes
# the shuffle reproducible, so re-running the notebook regenerates the same
# two files instead of a different partition each time.
df_train, df_test = train_test_split(df, test_size=0.5, random_state=42)
print('Train size: %d' %len(df_train))
print('Test size: %d' %len(df_test))

Train size: 861147
Test size: 861148


In [57]:
# Persist the two halves of the split as CSV files; index=False leaves the
# DataFrame index out of the written files.
df_train.to_csv('../data/chembl24_train.csv', index=False)
df_test.to_csv('../data/chembl24_test.csv', index=False)