In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
import seaborn as sns

import tmap, os

from chembench import dataset

from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Register tqdm's progress-bar helpers on pandas objects, drawing bars with ASCII characters.
tqdm.pandas(ascii=True)
# Seed NumPy's global RNG. The KFold splits built in this notebook pass their own
# random_state, so they do not depend on this seed.
np.random.seed(123)





In [2]:
# Number of cross-validation folds; equals the 5-way KFold split built later in this notebook.
n_fold = 5

In [5]:
# Load the six chembench datasets that the split-generation loop iterates over.
# Each loader appears to print a "total samples: N" line (see the cell output),
# so the call order determines the order of that output.
# NOTE(review): the returned objects are expected to expose .task_name, .x and .y,
# which the split-generation cell reads — confirm against chembench.
esol = dataset.load_ESOL()
lipop = dataset.load_Lipop()
malaria = dataset.load_Malaria()
hiv = dataset.load_HIV()
bace = dataset.load_BACE()
bbbp = dataset.load_BBBP()

total samples: 1128
total samples: 4200
total samples: 9999
total samples: 41127
total samples: 1513
total samples: 2039


In [6]:
# Seeds used for the repeated random splits; each seed yields one fold-assignment
# column named 'rd_<seed>' in the saved CSV.
random_seeds = [8, 32, 64, 128, 512, 1024, 2048, 4096, 8192, 16384]
cols = ['rd_%s' % i for i in random_seeds]

data_save_folder = './rand_split_results/'
# exist_ok replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(data_save_folder, exist_ok=True)


def _fold_assignments(n_samples, seed, n_splits):
    """Assign every sample to one fold of a shuffled K-fold split.

    Parameters
    ----------
    n_samples : int
        Number of rows to split.
    seed : int
        Passed to KFold as ``random_state`` so the split is reproducible.
    n_splits : int
        Number of folds.

    Returns
    -------
    numpy.ndarray
        int64 array of length ``n_samples``; entry ``k`` is the 1-based id of
        the fold whose test set contains row ``k``.

    Raises
    ------
    ValueError
        Propagated from KFold, e.g. when ``n_splits`` exceeds ``n_samples``.
    """
    fold_ids = np.zeros(n_samples, dtype=np.int64)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Every row lands in exactly one test fold, so each entry is written once.
    for fold_no, (_, test_idx) in enumerate(splitter.split(range(n_samples)), start=1):
        fold_ids[test_idx] = fold_no
    return fold_ids


for data in [esol, lipop, malaria, hiv, bace, bbbp]:

    task_name = data.task_name
    # NOTE(review): data.x is presumably the SMILES strings (per the column name)
    # and data.y the target values — confirm against chembench.
    df = pd.DataFrame(data.x, columns=['smiles'])
    df[task_name] = data.y

    # One fold-id column per seed. Use the notebook-level n_fold constant instead
    # of a hard-coded 5 so the fold count is configured in a single place.
    dfc = pd.DataFrame(
        {col: _fold_assignments(len(df), seed, n_fold)
         for seed, col in zip(random_seeds, cols)},
        columns=cols,
    )

    # The index is written as the first (unnamed) CSV column, as before.
    df.join(dfc).to_csv(os.path.join(data_save_folder, 'rd_split_%s.csv' % task_name))

In [7]:
# Show the fold table still bound to `dfc` after the loop, i.e. the one built for the
# last dataset iterated (bbbp); tables for the earlier datasets exist only in the CSV files.
dfc

Unnamed: 0,rd_8,rd_32,rd_64,rd_128,rd_512,rd_1024,rd_2048,rd_4096,rd_8192,rd_16384
0,3,4,3,2,2,1,2,1,5,4
1,4,3,5,3,2,3,4,3,3,2
2,5,3,1,4,4,3,3,3,4,4
3,4,3,3,1,3,4,4,4,1,3
4,3,3,4,2,5,3,2,4,5,5
...,...,...,...,...,...,...,...,...,...,...
2034,3,5,5,3,1,3,4,5,3,2
2035,4,2,2,2,1,1,2,2,1,4
2036,3,4,2,3,4,5,2,4,3,4
2037,3,4,4,5,4,5,1,2,5,5
