In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel('original/full_original_dataset.xlsx')
print(df.shape)
df.head(3)

(1258, 44)


Unnamed: 0,Entry ID,Ligand conc. (mM),Volume ratio of solvent A,Volume ratio of solvent B,Molar mass of solvent A,Density of solvent A,Boiling point of solvent A,Melting point of solvent A,Dipole moment of solvent A,Solubility in water of solvent A,...,Ionic radius of metal,Standard entropy of metal (J/mol.K),log D,Reference,train-val-test,Ligand ID,Ligand SMILES,Metal,Solvent A,Solvent B
0,0,500.0,10,0,50.49,1.003,249.3,175.8,1.9,5.325,...,1.03,56.9,0.477121,1,train,LigID_0,CN(C(C1=CC=CC(C(N(C2=CC=CC=C2)C)=O)=N1)=O)C3=C...,La,CH3Cl,
1,1,500.0,10,0,50.49,1.003,249.3,175.8,1.9,5.325,...,1.02,72.0,0.30103,1,train,LigID_0,CN(C(C1=CC=CC(C(N(C2=CC=CC=C2)C)=O)=N1)=O)C3=C...,Ce,CH3Cl,
2,2,500.0,10,0,50.49,1.003,249.3,175.8,1.9,5.325,...,0.99,73.9,0.477121,1,train,LigID_0,CN(C(C1=CC=CC(C(N(C2=CC=CC=C2)C)=O)=N1)=O)C3=C...,Pr,CH3Cl,


In [3]:
from rdkit import Chem

In [4]:
def get_canonical_smiles(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(molecule, canonical=True)

df['SMILES'] = df['Ligand SMILES'].apply(get_canonical_smiles)

In [5]:
# Convert ligand concentration
df['Ligand_conc_M'] = df['Ligand conc. (mM)']/1000

In [6]:
# Convert Temperature to Kelvin and create a new column 'Temperature_K'
df['Temperature_K'] = df['Temperature'] - 273 + 273.15

In [7]:
# Merge DOI

df_ref = pd.read_excel('original/full_original_dataset.xlsx', sheet_name = 'references')
df_ref.head(3)

Unnamed: 0,ID,Reference,DOI
0,1,"A. Shimada, T. Yaita, H. Narita, S. Tachimori,...",https://doi.org/10.1081/SEI-120030392
1,2,"M. R. Healy, A. S. Ivanov, Y. Karslyan, V. S. ...",https://doi.org/10.1002/chem.201806443
2,3,"E. Mowafy, D. Mohamed, Sep. Purif. Technol. 20...",https://doi.org/10.1016/j.seppur.2014.03.005


In [8]:
# Sort by DOI ID
merged_df = pd.merge(df, df_ref, left_on='Reference', right_on='ID', how='left')

merged_df = merged_df.sort_values(by='ID')

In [9]:
# Merge acid type
acid_type_data = {
    'Acid_type': ['HNO3', 'HCl', 'H2SO4', 'Citric Acid', 'organic acid'],
    'Dipole_moment_D': [2.17, 1.05, 2.72, 3.33, 1.4]
}
acid_type_df = pd.DataFrame(acid_type_data)

merged_df = pd.merge(merged_df, acid_type_df, left_on='Acid dipole', right_on='Dipole_moment_D', how='left')

In [10]:
merged_df.head(3)

Unnamed: 0,Entry ID,Ligand conc. (mM),Volume ratio of solvent A,Volume ratio of solvent B,Molar mass of solvent A,Density of solvent A,Boiling point of solvent A,Melting point of solvent A,Dipole moment of solvent A,Solubility in water of solvent A,...,Solvent A,Solvent B,SMILES,Ligand_conc_M,Temperature_K,ID,Reference_y,DOI,Acid_type,Dipole_moment_D
0,0,500.0,10,0,50.49,1.003,249.3,175.8,1.9,5.325,...,CH3Cl,,CN(C(=O)c1cccc(C(=O)N(C)c2ccccc2)n1)c1ccccc1,0.5,273.15,1,"A. Shimada, T. Yaita, H. Narita, S. Tachimori,...",https://doi.org/10.1081/SEI-120030392,HNO3,2.17
1,50,500.0,10,0,50.49,1.003,249.3,175.8,1.9,5.325,...,CH3Cl,,CN(C(=O)c1cccc(C(=O)N(C)c2ccccc2)n1)c1ccccc1,0.5,273.15,1,"A. Shimada, T. Yaita, H. Narita, S. Tachimori,...",https://doi.org/10.1081/SEI-120030392,HNO3,2.17
2,49,500.0,10,0,50.49,1.003,249.3,175.8,1.9,5.325,...,CH3Cl,,CN(C(=O)c1cccc(C(=O)N(C)c2ccccc2)n1)c1ccccc1,0.5,273.15,1,"A. Shimada, T. Yaita, H. Narita, S. Tachimori,...",https://doi.org/10.1081/SEI-120030392,HNO3,2.17


In [11]:
merged_df.to_excel('output_cleaned_data.xlsx', index=False)