The goal of this notebook is to explore the distributions of the test, validation and training data sets.

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the three random_split_* CSV files (test / valid / train).
# Forward slashes are used because Python resolves them on Windows as well as on
# Linux/macOS, whereas a backslash-separated relative path only works on Windows.
DATA_DIR = '../shareddata'
test_df = pd.read_csv(f'{DATA_DIR}/random_split_enantiomer_data_test.csv')
valid_df = pd.read_csv(f'{DATA_DIR}/random_split_enantiomer_data_valid.csv')
train_df = pd.read_csv(f'{DATA_DIR}/random_split_enantiomer_data_train.csv')

In [None]:
test_df.head()
#### Notes ####
# The 'index' column has the same value for the two enantiomers of a pair, so
# pairs can be counted by looking for repeated 'index' values.
# 'Unnamed: 0' is the true (original) row index.
# NOTE(review): 'Unnamed: 0.1' / 'Unnamed: 0.2' are presumably index columns left
# over from earlier CSV save/load round trips -- confirm before relying on them.

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,SMILES,RT,Speed,i-PrOH_proportion,Literature,Column
0,14433,14433,7721,COc1ccc(cc1)C(=O)[C@H](Br)[C@H](O)c2ccccc2,14.3,1.0,0.1,10.1002/adsc.202000080,IC
1,24541,24541,13412,CCCCC[C@@H](C=C)n1ccc2cc(ccc12)C#Cc3ccccc3,8.8,1.0,0.001,10.1002/chem.202004613,ODH
2,24868,24868,13597,O[C@]1(Cc2ccccc2C1=O)[C@@H](C=C)c3ccccc3,12.5,1.0,0.2,10.1021/acs.orglett.7b02577,ADH
3,13202,13202,7048,COC(=O)C[C@@]1(C(=O)N(C(=O)OC(C)(C)C)c2ccccc12...,17.1,1.0,0.2,10.31635/ccschem.020.202000443,IB
4,12373,12373,6593,COC(=O)[C@@H](CN(Cc1ccccc1)Cc2ccccc2)Cc3ccc(OC...,16.0,0.5,0.02,10.1002/anie.201412132,IA


In [8]:
# Report the (rows, columns) shape of the test and validation frames.
shape_report = f'test shape {test_df.shape} and valid shape {valid_df.shape} '
print(shape_report)

test shape (1278, 9) and valid shape (1278, 9) 


In [7]:
# Count repeated 'index' values in the test and validation splits.
# Every occurrence of an 'index' value after its first is counted once, so the
# result equals the number of enantiomer pairs only if no 'index' value occurs
# more than twice -- TODO confirm against the data.
test_enantiomer_pair_count = int(test_df['index'].duplicated().sum())
valid_enantiomer_pair_count = int(valid_df['index'].duplicated().sum())

pair_report = f'the number of enantiomer pairs in the test and valid set are {test_enantiomer_pair_count} and {valid_enantiomer_pair_count} respectively'
print(pair_report)

the number of enantiomer pairs in the test and valid set are 32 and 25 respectively


Is it a problem that the test and validation data sets contain such a low proportion of enantiomer pairs?
Are they a fair test if the training data contains the other enantiomer?
Maybe we should try splitting the data on the enantiomer index to better test model generalizability?

In [9]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import BulkTanimotoSimilarity, TanimotoSimilarity

# Parse every test-set SMILES string into an RDKit molecule.
mol_list = [Chem.MolFromSmiles(smiles) for smiles in test_df['SMILES']]

# MolFromSmiles returns None (instead of raising) for SMILES it cannot parse.
# Fail here, naming the offending rows, rather than with an opaque error inside
# the fingerprint call below. Rows are not dropped, so the matrix stays aligned
# with test_df.
unparsed_rows = [row for row, mol in zip(test_df.index, mol_list) if mol is None]
if unparsed_rows:
    raise ValueError(f'could not parse SMILES for test_df rows: {unparsed_rows}')

# Generate Morgan fingerprints
radius = 2  # Define the radius of the fingerprint
num_bits = 2048  # Define the number of bits in the fingerprint
# NOTE(review): useChirality is left at its RDKit default (False), so stereo
# information is not encoded and the two enantiomers of a pair would receive
# identical fingerprints (similarity 1.0) -- confirm this is intended here.
fp_list = [AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits) for mol in mol_list]

# Calculate Tanimoto similarities
# One bulk call per fingerprint compares it against the whole list, replacing the
# hand-written nested loop; the result is still a list of rows (list of lists).
similarity_matrix = [BulkTanimotoSimilarity(fp, fp_list) for fp in fp_list]

# Print the similarity matrix
print(np.array(similarity_matrix))



[[1.         0.13235294 0.19298246 ... 0.0754717  0.13114754 0.125     ]
 [0.13235294 1.         0.16176471 ... 0.0625     0.09589041 0.13846154]
 [0.19298246 0.16176471 1.         ... 0.09259259 0.14516129 0.12068966]
 ...
 [0.0754717  0.0625     0.09259259 ... 1.         0.05263158 0.10204082]
 [0.13114754 0.09589041 0.14516129 ... 0.05263158 1.         0.1       ]
 [0.125      0.13846154 0.12068966 ... 0.10204082 0.1        1.        ]]
