# Import packages

In [1]:
import pandas as pd
import numpy as np

# Import data

In [4]:
# Load the raw dataset from the Excel workbook into `df`.
# The path is relative, so the workbook must be in the notebook's working directory.
df = pd.read_excel('abdollahzadeh2022_SI.xlsx')

In [5]:
# Show five randomly chosen rows as a quick look at the raw columns
# (no random_state is passed, so the rows shown differ between runs).
df.sample(5)

Unnamed: 0,HBA,HBD,HBA:HBD,T(K),Tc(K),Pc(bar),ω,Density (kg/m3),LSSVR,ARD%,AARD%
858,Methyl triphenylphosphonium bromide,Ethylene glycol,1;5.25,333.15,666.5,42.132,0.9354,1194.7,1194.233058,0.039084,
1048,Tetrabutylammonium chloride,Aspartic acid,9;1,323.15,785.64,14.575,0.8843,950.1,948.392188,0.179751,
1005,"N,N diethylenethanol ammonium chloride",Glycerol,1;3,333.15,695.61,33.703,1.2865,1184.0,1181.93775,0.174177,
343,Choline chloride,"1,2-propanediol",1;4,293.15,621.24,39.62,0.9402,1066.3,1070.790251,0.421106,
92,Acetyl choline chloride,Imidazole,1;2,313.15,641.14,43.55,0.4345,1114.1,1115.550655,0.130209,


# Select descriptors for the model and add SMILES

## Select descriptors

In [6]:
# Remove the columns that are not kept for the rest of the notebook.
df = df.drop(columns=['Tc(K)', 'Pc(bar)', 'ω', 'LSSVR', 'ARD%', 'AARD%'])

# 'HBA:HBD' stores the ratio as text of the form 'a;b'. Take the column out of the
# frame and split it once on ';' into two new string columns appended at the end.
df[['rate_HBA', 'rate_HBD']] = df.pop('HBA:HBD').str.split(';', n=1, expand=True)

In [7]:
# Spot-check five random rows to confirm the new rate_HBA / rate_HBD columns
# (rows differ between runs because no random_state is set).
df.sample(5)

Unnamed: 0,HBA,HBD,T(K),Density (kg/m3),rate_HBA,rate_HBD
895,Methyl triphenylphosphonium bromide,Glycerol,333.15,1273.6,1,3.0
1168,Tetrabutylammonium chloride,Triethylene glycol,303.15,985.0,4,1.0
109,Acetyl choline chloride,Levulinic acid,313.15,1129.4,1,1.0
1036,Tetrabutylammonium chloride,Asinine,323.15,977.3,7,1.0
929,"N,N diethylenethanol ammonium chloride",Ethylene glycol,303.15,1096.5,1,2.03


In [8]:
from urllib.request import urlopen
from urllib.parse import quote

def CIRconvert(ids, timeout=30):
    """Look up the SMILES string for a chemical name via the cactus.nci.nih.gov resolver.

    Parameters
    ----------
    ids : str
        Chemical name or identifier. It is URL-quoted before being placed in the
        request path.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30). Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str
        The response body decoded as UTF-8, or the sentinel string 'Did not work'
        when the lookup fails (unknown name, network error, timeout, or an `ids`
        value that cannot be quoted, such as None/NaN).
    """
    try:
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
        # The context manager closes the HTTP response even if read/decode raises.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8')
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still stop a long batch of lookups instead of being swallowed.
        return 'Did not work'


## Add SMILES of HBAs

In [9]:
# Step 1: list each distinct HBA name once, so every compound is looked up only once.
HBA_unique_values = df['HBA'].unique().tolist()

# Step 2: resolve every distinct HBA name to a SMILES string (one CIRconvert call
# per name, in list order) and keep the results as a name -> SMILES dictionary.
HBA_smiles_dict = dict(zip(HBA_unique_values, map(CIRconvert, HBA_unique_values)))



In [10]:
# Collect the HBA names whose lookup returned the failure sentinel 'Did not work'.
HBA_failed_conversions = [
    name for name in HBA_smiles_dict
    if HBA_smiles_dict[name] == 'Did not work'
]

# Display the names that still need a manual SMILES.
HBA_failed_conversions

['N,N diethylenethanol ammonium chloride', 'Trimethylglicine']

In [11]:
# Manually repair the two HBA names the resolver could not convert.
# First rename them in the data to names that have (or are given) a SMILES entry.
df['HBA'] = df['HBA'].replace('N,N diethylenethanol ammonium chloride', 'n,n-Diethylethanolammonium chloride')
df['HBA'] = df['HBA'].replace('Trimethylglicine', 'Betaine')

# N,N-diethylethanolammonium chloride: nitrogen carries two ethyl groups and one
# 2-hydroxyethyl group. The earlier value 'CC(C)(C)NCCO.Cl' encodes the tert-butyl
# isomer (2-(tert-butylamino)ethanol hydrochloride), i.e. a different molecule.
# Written as an ion pair to match the other chloride salts in the dictionary.
HBA_smiles_dict['n,n-Diethylethanolammonium chloride'] = '[Cl-].CC[NH+](CC)CCO' # n,n-Diethylethanolammonium chloride

# 'Trimethylglicine' is betaine. Make sure the 'Betaine' key exists even if the
# dataset did not already contain that name; an existing entry is left untouched.
HBA_smiles_dict.setdefault('Betaine', 'C[N+](C)(C)CC([O-])=O')

# Drop the failed entries. pop(..., None) instead of del so the cell can be re-run
# without raising KeyError once the keys are already gone.
HBA_smiles_dict.pop('Trimethylglicine', None)
HBA_smiles_dict.pop('N,N diethylenethanol ammonium chloride', None)

In [12]:
# Step 3: Use map() to create the new column.
# Each HBA name is looked up in HBA_smiles_dict; a name missing from the
# dictionary would produce NaN in 'HBA_SMILES'.
df['HBA_SMILES'] = df['HBA'].map(HBA_smiles_dict)

## Add SMILES of HBDs

In [13]:
# Step 1: list each distinct HBD name once, so every compound is looked up only once.
HBD_unique_values = df['HBD'].unique().tolist()

# Step 2: resolve every distinct HBD name to a SMILES string (one CIRconvert call
# per name, in list order) and keep the results as a name -> SMILES dictionary.
HBD_smiles_dict = dict(zip(HBD_unique_values, map(CIRconvert, HBD_unique_values)))

In [14]:
# Collect the HBD names whose lookup returned the failure sentinel 'Did not work'.
HBD_failed_conversions = [
    name for name in HBD_smiles_dict
    if HBD_smiles_dict[name] == 'Did not work'
]

# Display the names that still need a manual SMILES.
HBD_failed_conversions



['Nfurfuryl alcohol', 'Asinine']

In [15]:
# Manually repair the two HBD names the resolver could not convert: rename them in
# the data, then register a hand-entered SMILES under each new name.
# NOTE(review): 'Furfulyl alcohol' looks like a misspelling of 'Furfuryl alcohol'. It is
# kept unchanged because HBD names are exported to the CSVs and used as grouping keys
# for the splits — confirm before renaming.
# NOTE(review): 'Asinine' is assumed to mean arginine — confirm against the source data.
df['HBD'] = df['HBD'].replace('Nfurfuryl alcohol', 'Furfulyl alcohol')
df['HBD'] = df['HBD'].replace('Asinine', 'Arginine')

HBD_smiles_dict['Furfulyl alcohol'] = 'C1=COC(=C1)CO' # Furfulyl alcohol
HBD_smiles_dict['Arginine'] = 'C(C[C@@H](C(=O)O)N)CN=C(N)N' # Arginine

# Drop the failed entries. pop(..., None) instead of del so the cell can be re-run
# without raising KeyError once the keys are already gone.
HBD_smiles_dict.pop('Nfurfuryl alcohol', None)
HBD_smiles_dict.pop('Asinine', None)

In [16]:
# Step 3: Use map() to create the new column.
# Each HBD name is looked up in HBD_smiles_dict; a name missing from the
# dictionary would produce NaN in 'HBD_SMILES'.
df['HBD_SMILES'] = df['HBD'].map(HBD_smiles_dict)

In [17]:
# Spot-check five random rows to confirm both SMILES columns are filled in
# (rows differ between runs because no random_state is set).
df.sample(5)

Unnamed: 0,HBA,HBD,T(K),Density (kg/m3),rate_HBA,rate_HBD,HBA_SMILES,HBD_SMILES
251,Betaine,Lactic acid,343.15,1169.9,1,2,C[N+](C)(C)CC([O-])=O,CC(O)C(O)=O
307,benzyldimethyl(2-hydroxyethyl) ammonium chloride,D-Mannose,328.15,1243.0,1,1,[Cl-].C[N+](C)(CCO)Cc1ccccc1,OC[C@@H](O)[C@@H](O)[C@H](O)[C@H](O)C=O
341,Choline chloride,"1,2-propanediol",313.15,1053.3,1,3,[Cl-].C[N+](C)(C)CCO,CC(O)CO
1152,Tetrabutylammonium chloride,Triethylene glycol,353.15,991.0,1,1,[Cl-].CCCC[N+](CCCC)(CCCC)CCCC,OCCOCCOCCO
77,Acetyl choline chloride,Guaiacol,303.15,1144.8,1,4,[Cl-].CC(=O)OCC[N+](C)(C)C,COc1ccccc1O


# Statistics of the dataset

In [22]:
# Summary statistics for every column: include='all' reports count/unique/top/freq
# for the text columns alongside mean/std/quantiles for the numeric ones.
df.describe(include='all')

Unnamed: 0,HBA,HBD,T(K),Density (kg/m3),rate_HBA,rate_HBD,HBA_SMILES,HBD_SMILES
count,1239,1239,1239.0,1239.0,1239.0,1239.0,1239,1239
unique,16,45,,,12.0,15.0,16,42
top,Choline chloride,Ethylene glycol,,,1.0,1.0,[Cl-].C[N+](C)(C)CCO,OCCO
freq,439,214,,,1133.0,428.0,439,225
mean,,,323.330589,1145.145359,,,,
std,,,20.084247,88.511771,,,,
min,,,283.15,928.0,,,,
25%,,,305.675,1089.65,,,,
50%,,,323.15,1149.0,,,,
75%,,,338.15,1217.15,,,,


# Splitting the dataset into train-test

## Random split

In [23]:
# Row-wise random split: every row is independently labelled 'Train' (p=0.8) or
# 'Test' (p=0.2).
# Work on a copy: plain assignment would only alias `df`, so the 'Split' column
# written here would also appear in `df` and then be overwritten by the grouped
# splits below, leaving df_random with the wrong labels in memory.
df_random = df.copy()
np.random.seed(42)  # fixed seed so the labelling is reproducible
df_random['Split'] = np.random.choice(['Train', 'Test'], size=len(df_random), p=[0.8, 0.2])

In [24]:
# Save the randomly split dataset; index=False leaves the row index out of the file.
df_random.to_csv('DES_data_clean_random.csv', index=False)

## Same compositions (same HBA, HBD and HBA:HBD ratio) are kept together

In [36]:
# Grouped split: all rows that share the same HBA, HBD and HBA:HBD ratio are placed
# in the same set, so one mixture never appears in both Train and Test.
# Work on a copy: plain assignment would only alias `df`, and the helper column and
# 'Split' labels created here would then be written into `df` as well.
df_temp = df.copy()

# Group key: names and ratio parts joined as text (the rate columns are strings).
df_temp['HBA_HBD__comp_unique'] = df_temp['HBA'] + '||' + df_temp['HBD'] + '||' + df_temp['rate_HBA']  + ':' + df_temp['rate_HBD']

grouped = df_temp.groupby('HBA_HBD__comp_unique')

unique_pairs = list(grouped.groups.keys())
np.random.seed(42)  # for reproducibility
np.random.shuffle(unique_pairs)

# Aim for about 20% of the rows in the test set.
n_total = len(df_temp)
n_test_target = int(0.2 * n_total)
n_test_current = 0

# Dictionary to tag group membership
set_assignment = {}

# Walk the shuffled groups and send whole groups to Test until the row target is
# reached; the last group added may push the test set slightly past the target.
for pair in unique_pairs:
    group_size = len(grouped.groups[pair])
    if n_test_current < n_test_target:
        set_assignment[pair] = 'Test'
        n_test_current += group_size
    else:
        set_assignment[pair] = 'Train'

# Apply assignment to a new column
df_temp['Split'] = df_temp['HBA_HBD__comp_unique'].map(set_assignment)

# Drop the helper column (reassignment rather than inplace mutation)
df_temp = df_temp.drop(columns='HBA_HBD__comp_unique')

df_temp.to_csv('DES_data_clean_temp.csv', index=False)

In [37]:
# Display the composition-grouped split to inspect the 'Split' labels.
df_temp

Unnamed: 0,HBA,HBD,T(K),Density (kg/m3),rate_HBA,rate_HBD,HBA_SMILES,HBD_SMILES,Split
0,Acetyl choline chloride,"1,2,4-triazole",303.15,1129.3,1,1,[Cl-].CC(=O)OCC[N+](C)(C)C,[nH]1cncn1,Train
1,Acetyl choline chloride,"1,2,4-triazole",313.15,1124.9,1,1,[Cl-].CC(=O)OCC[N+](C)(C)C,[nH]1cncn1,Train
2,Acetyl choline chloride,"1,2,4-triazole",323.15,1119.3,1,1,[Cl-].CC(=O)OCC[N+](C)(C)C,[nH]1cncn1,Train
3,Acetyl choline chloride,"1,2,4-triazole",333.15,1113.3,1,1,[Cl-].CC(=O)OCC[N+](C)(C)C,[nH]1cncn1,Train
4,Acetyl choline chloride,"1,2,4-triazole",343.15,1108.1,1,1,[Cl-].CC(=O)OCC[N+](C)(C)C,[nH]1cncn1,Train
...,...,...,...,...,...,...,...,...,...
1234,Betaine,Phenylacetic acid,298.15,1160.0,1,2,C[N+](C)(C)CC([O-])=O,OC(=O)Cc1ccccc1,Test
1235,Betaine,Phenylacetic acid,323.15,1150.0,1,2,C[N+](C)(C)CC([O-])=O,OC(=O)Cc1ccccc1,Test
1236,Betaine,Phenylacetic acid,333.15,1130.0,1,2,C[N+](C)(C)CC([O-])=O,OC(=O)Cc1ccccc1,Test
1237,Betaine,Phenylacetic acid,348.15,1120.0,1,2,C[N+](C)(C)CC([O-])=O,OC(=O)Cc1ccccc1,Test


## Same compound pairs (HBA + HBD) are kept together, regardless of composition

In [40]:
# Grouped split: all rows that share the same HBA/HBD pair are placed in the same
# set, whatever their ratio, so a pair never appears in both Train and Test.
# Work on a copy: plain assignment would only alias `df`, and the helper column and
# 'Split' labels created here would then be written into `df` as well.
df_comp = df.copy()

# Build the group key from df_comp's own columns instead of reaching into df_temp,
# so this cell does not depend on the previous split having been run.
df_comp['HBA_HBD_unique'] = df_comp['HBA'] + '||' + df_comp['HBD']

grouped = df_comp.groupby('HBA_HBD_unique')

unique_pairs = list(grouped.groups.keys())
np.random.seed(42)  # for reproducibility
np.random.shuffle(unique_pairs)

# Aim for about 20% of the rows in the test set.
n_total = len(df_comp)
n_test_target = int(0.2 * n_total)
n_test_current = 0

# Dictionary to tag group membership
set_assignment = {}

# Walk the shuffled groups and send whole groups to Test until the row target is
# reached; the last group added may push the test set slightly past the target.
for pair in unique_pairs:
    group_size = len(grouped.groups[pair])
    if n_test_current < n_test_target:
        set_assignment[pair] = 'Test'
        n_test_current += group_size
    else:
        set_assignment[pair] = 'Train'

# Apply assignment to a new column
df_comp['Split'] = df_comp['HBA_HBD_unique'].map(set_assignment)

# Drop the helper column (reassignment rather than inplace mutation)
df_comp = df_comp.drop(columns='HBA_HBD_unique')

df_comp.to_csv('DES_data_clean_comp.csv', index=False)

In [41]:
# Print how many rows received each 'Split' label in the pair-grouped split.
print(df_comp['Split'].value_counts())

Split
Train    986
Test     253
Name: count, dtype: int64
