In [4]:
import pandas as pd

# Dataset for training and testing preparation

## Reading the files

In [5]:
df_actives = pd.read_csv('./procesed/actives/final_actives.csv')
df_inactives = pd.read_csv('./procesed/inactives/final_inactives.csv')
df_new_positives = pd.read_csv('./procesed/inconclusives/new_positives.csv')

print(f'There are {len(df_actives) + len(df_new_positives)} active molecules in the dataset')
print(f'There are {len(df_inactives)} inactive molecules in the dataset')


There are 203 active molecules in the dataset
There are 4681 inactive molecules in the dataset


In [14]:
df_concat = pd.concat([df_actives, df_inactives, df_new_positives]).sample(frac=1).sample(frac=1).sample(frac=1).sample(frac=1).reset_index(drop=True)
df_concat_wo_dup = df_concat.drop_duplicates(subset=['Smiles'])

## Spliting the dataset for training and teting with sklearn

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
df = df_concat_wo_dup.sample(frac = 1).sample(frac = 1).sample(frac = 1).sample(frac = 1).sample(frac = 1)

# Spliting the data
df_train, df_test = train_test_split(df, test_size=0.2)

print(f'There are {len(df)} molecules in the dataset')
print(f'There are {len(df_train)} molecules in training dataset')
print(f'There are {len(df_test)} molecules in test dataset')

There are 4881 molecules in the dataset
There are 3904 molecules in training dataset
There are 977 molecules in test dataset


### Oversampling the training dataset

In [10]:
df = df_train.copy()
df = df.sample(frac=1).sample(frac=1) # Just a casual shuffling



active = df['is_active'] == 1
inactive = df['is_active'] == 0
print(f'Ther are {len(df[active])} different active molecules')
print(f'Ther are {len(df[inactive])} different inactive molecules')

print('Oversampling active ones')

# Actives
df_active = df[active].sample(frac=2, replace=True)

# Inactives
df_inactive = df[inactive].sample(frac=1)

# Concatening
df = pd.concat([df_active, df_inactive]).sample(frac=1).sample(frac=1).sample(frac=1)

# Analizing again the active an inactive molecules
active = df['is_active'] == 1
inactive = df['is_active'] == 0
print(f'Ther are {len(df[active])} different active molecules')
print(f'Ther are {len(df[inactive])} different inactive molecules')

print(' - - - - - - - - - - - - ')

df_train_labeled = df[['Smiles', 'is_active']]
df_train_labeled.to_csv('./data_for_training_and_testing/training_ds.csv', index=False)

Ther are 161 different active molecules
Ther are 3743 different inactive molecules
Oversampling active ones
Ther are 322 different active molecules
Ther are 3743 different inactive molecules
 - - - - - - - - - - - - 


### Saving the test set

In [11]:
df = df_test.copy()
df = df.sample(frac=1) # Just a casual shuffling

active = df['is_active'] == 1
inactive = df['is_active'] == 0
print(f'Ther are {len(df[active])} different active molecules')
print(f'Ther are {len(df[inactive])} different inactive molecules')

#df['is_active'] = np.where(df['pChEMBL Value'] >= active_cutoff, 1, 0)
df_train_labeled = df[['Smiles', 'is_active']]
df_train_labeled.to_csv('./data_for_training_and_testing/test_ds.csv', index=False)

Ther are 39 different active molecules
Ther are 938 different inactive molecules


### Creating and saving a dummy training set

In [12]:
df_actives = df[active]
num_actives = len(df_actives)

df_inactives = df[inactive].sample(num_actives)

# Concatening
df_dummy_training = pd.concat([df_actives, df_inactives]).sample(frac=1).sample(frac=1).sample(frac=1)
df_dummy_training.to_csv('./data_for_training_and_testing/dummy_training_ds.csv', index=False)

---

## actives + inconclusives + decoys

Decoy molecules where generated with DUD-E to enhance the feature extraction from the molecules. As seen in this paper, this aproach is used to make benchmark dataset, since this aproach test the models in its performance to distinguish between active molecules from similar inactive molecules with out bias.
* Réau M, Langenfeld F, Zagury J-F, Lagarde N and Montes M (2018) Decoys Selection in Benchmarking Datasets: Overview and Perspectives. Front. Pharmacol. 9:11. doi: 10.3389/fphar.2018.00011

Decoys were generated from the active molecules.

In [15]:
df_actives = pd.read_csv('./procesed/actives/final_actives.csv')
df_decoys = pd.read_csv('./procesed/actives/decoys/decoy_final.csv')
df_inactives = pd.read_csv('./procesed/inactives/final_inactives.csv')
df_new_positives = pd.read_csv('./procesed/inconclusives/new_positives.csv')

print(f'There are {len(df_actives) + len(df_new_positives)} active molecules in the dataset')
print(f'There are {len(df_inactives) + len(df_decoys)} inactive molecules in the dataset')

There are 203 active molecules in the dataset
There are 5246 inactive molecules in the dataset


In [16]:
df_concat = pd.concat([df_actives, df_decoys, df_inactives, df_new_positives]).sample(frac=1).sample(frac=1).sample(frac=1).sample(frac=1).reset_index(drop=True)
df_concat_wo_dup = df_concat.drop_duplicates(subset=['Smiles'])

In [17]:
df = df_concat_wo_dup.sample(frac = 1).sample(frac = 1).sample(frac = 1).sample(frac = 1).sample(frac = 1)

# Spliting the data
df_train, df_test = train_test_split(df, test_size=0.2)

print(f'There are {len(df)} molecules in the dataset')
print(f'There are {len(df_train)} molecules in training dataset')
print(f'There are {len(df_test)} molecules in test dataset')


There are 5446 molecules in the dataset
There are 4356 molecules in training dataset
There are 1090 molecules in test dataset


In [20]:
df = df_train.copy()
df = df.sample(frac=1).sample(frac=1) # Just a casual shuffling

active = df['is_active'] == 1
inactive = df['is_active'] == 0
print(f'Ther are {len(df[active])} different active molecules')
print(f'Ther are {len(df[inactive])} different inactive molecules')

print('Oversampling active ones')

# Actives
# NOT OVERSAMPLING
df_active = df[active].sample(frac=1, replace=True)

# Inactives
df_inactive = df[inactive].sample(frac=1)

# Concatening
df = pd.concat([df_active, df_inactive]).sample(frac=1).sample(frac=1).sample(frac=1)

# Analizing again the active an inactive molecules
active = df['is_active'] == 1
inactive = df['is_active'] == 0
print(f'Ther are {len(df[active])} different active molecules')
print(f'Ther are {len(df[inactive])} different inactive molecules')

print(' - - - - - - - - - - - - ')

df_train_labeled = df[['Smiles', 'is_active']]
df_train_labeled.to_csv('./data_for_training_and_testing/act_inc_dec/training_wdec_ds.csv', index=False)

Ther are 163 different active molecules
Ther are 4193 different inactive molecules
Oversampling active ones
Ther are 163 different active molecules
Ther are 4193 different inactive molecules
 - - - - - - - - - - - - 


In [19]:
df = df_test.copy()
df = df.sample(frac=1) # Just a casual shuffling

active = df['is_active'] == 1
inactive = df['is_active'] == 0
print(f'Ther are {len(df[active])} different active molecules')
print(f'Ther are {len(df[inactive])} different inactive molecules')

#df['is_active'] = np.where(df['pChEMBL Value'] >= active_cutoff, 1, 0)
df_train_labeled = df[['Smiles', 'is_active']]
df_train_labeled.to_csv('./data_for_training_and_testing/act_inc_dec/test_wdec_ds.csv', index=False)

Ther are 37 different active molecules
Ther are 1053 different inactive molecules
