In [1]:
import pandas as pd

In [2]:
# Read the tabularized results workbook (path is relative to the working directory),
# then report its (rows, columns) shape and preview the first three rows.
df = pd.read_excel(io='original/tabularized_results.xlsx')
print(df.shape)
df.head(n=3)

(1104, 17)


Unnamed: 0,file,extractant1,extractant1 conc. (M),extractant2,extractant2 conc. (M),holdback,holdback conc. (M),acid conc. (M),nitrate conc. (M),solvent1,solvent1 vol fraction,solvent2,solvent2 vol fraction,temperature (C),time (min),metal,logD
0,./ST100.json,CCCCCCCCN(CCCCCCCC)C(=O)COCC(=O)N(CCCCCCCC)CCC...,0.2,,0.0,O=C(N1CCOCC1)c2nc(P(O)([O-])=O)cnc2.[Na+],0.1,0.097724,0.597724,tph,0.95,1-octanol,0.05,22.0,60.0,Am,1.033585
1,./ST100.json,CCCCCCCCN(CCCCCCCC)C(=O)COCC(=O)N(CCCCCCCC)CCC...,0.2,,0.0,O=C(N1CCOCC1)c2nc(P(O)([O-])=O)cnc2.[Na+],0.1,0.097724,0.597724,tph,0.95,1-octanol,0.05,22.0,60.0,Eu,2.058418
2,./ST100.json,CCCCCCCCN(CCCCCCCC)C(=O)COCC(=O)N(CCCCCCCC)CCC...,0.2,,0.0,O=C(N1CCOCC1)c2nc(P(O)([O-])=O)cnc2.[Na+],0.1,0.009772,0.509772,tph,0.95,1-octanol,0.05,22.0,60.0,Am,-0.201349


### Filter metals

In [3]:
# Element symbols of the actinide series, Ac through Lr.
An_list = [
    'Ac', 'Th', 'Pa', 'U', 'Np',
    'Pu', 'Am', 'Cm', 'Bk', 'Cf',
    'Es', 'Fm', 'Md', 'No', 'Lr',
]
# Regex alternation of the symbols: 'Ac|Th|...|Lr'.
metal_pattern = '|'.join(An_list)

# Keep the rows whose 'metal' value matches the pattern; missing values are
# treated as non-matches (na=False).
# NOTE(review): this is a case-sensitive substring search, so a value that merely
# contains one of the symbols (e.g. 'UO2') is kept as well -- confirm that is intended.
df_filtered = df.loc[df['metal'].str.contains(pat=metal_pattern, na=False, regex=True)]

print(df_filtered.shape)
df_filtered.head(n=2)

(592, 17)


Unnamed: 0,file,extractant1,extractant1 conc. (M),extractant2,extractant2 conc. (M),holdback,holdback conc. (M),acid conc. (M),nitrate conc. (M),solvent1,solvent1 vol fraction,solvent2,solvent2 vol fraction,temperature (C),time (min),metal,logD
0,./ST100.json,CCCCCCCCN(CCCCCCCC)C(=O)COCC(=O)N(CCCCCCCC)CCC...,0.2,,0.0,O=C(N1CCOCC1)c2nc(P(O)([O-])=O)cnc2.[Na+],0.1,0.097724,0.597724,tph,0.95,1-octanol,0.05,22.0,60.0,Am,1.033585
2,./ST100.json,CCCCCCCCN(CCCCCCCC)C(=O)COCC(=O)N(CCCCCCCC)CCC...,0.2,,0.0,O=C(N1CCOCC1)c2nc(P(O)([O-])=O)cnc2.[Na+],0.1,0.009772,0.509772,tph,0.95,1-octanol,0.05,22.0,60.0,Am,-0.201349


### Remove rows with a second extractant, a holdback, or a missing temperature

In [4]:
# Keep only the rows whose 'extractant2' entry is missing (NaN), i.e. drop every
# row that specifies a second extractant.
df_filtered = df_filtered.loc[lambda frame: frame['extractant2'].isna()]

df_filtered.shape

(507, 17)

In [5]:
# Keep only the rows whose 'holdback' entry is missing (NaN), i.e. drop every
# row that specifies a holdback.
df_filtered = df_filtered.loc[lambda frame: frame['holdback'].isna()]

df_filtered.shape

(312, 17)

In [6]:
# Coerce 'temperature (C)' to numeric (entries that cannot be parsed become NaN)
# and drop the rows left without a usable temperature.
#
# The coerced values are computed first and applied through assign(), so nothing
# is written into the boolean-filtered slice this cell receives (doing so raises
# SettingWithCopyWarning on pandas versions without copy-on-write). The trailing
# copy() leaves an independent frame that later cells can add columns to.
temperature_c = pd.to_numeric(df_filtered['temperature (C)'], errors='coerce')

df_filtered = (
    df_filtered
    .assign(**{'temperature (C)': temperature_c})
    .dropna(subset=['temperature (C)'])
    .copy()
)

df_filtered.shape

(307, 17)

### Calculation

In [7]:
from rdkit import Chem

In [8]:
def get_canonical_smiles(smiles):
    """Return the RDKit canonical SMILES string for ``smiles``.

    Parameters
    ----------
    smiles : str
        SMILES string to canonicalize.

    Returns
    -------
    str
        Canonical SMILES produced by ``Chem.MolToSmiles(..., canonical=True)``.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` cannot parse ``smiles`` and returns ``None``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # MolFromSmiles reports a parse failure by returning None; handing that to
        # MolToSmiles fails with an error that does not name the offending input.
        raise ValueError(f'Could not parse SMILES: {smiles!r}')
    return Chem.MolToSmiles(molecule, canonical=True)

# Add a 'SMILES' column holding the canonical form of each 'extractant1' entry.
# assign() builds a new frame instead of writing into the filtered slice, which
# avoids SettingWithCopyWarning on pandas versions without copy-on-write.
df_filtered = df_filtered.assign(
    SMILES=df_filtered['extractant1'].apply(get_canonical_smiles)
)

df_filtered.shape

(307, 18)

In [9]:
# Add 'Temperature_K': the Celsius temperature shifted by 273.15 to give Kelvin.
# assign() builds a new frame instead of writing into the filtered slice, which
# avoids SettingWithCopyWarning on pandas versions without copy-on-write.
df_filtered = df_filtered.assign(
    Temperature_K=df_filtered['temperature (C)'] + 273.15
)

df_filtered.shape

(307, 19)

In [10]:
# Expose the 'solvent2 vol fraction' values under the name 'volume_fraction_B'.
# This is a plain copy of the column; no value conversion is applied.
# assign() builds a new frame instead of writing into the filtered slice, which
# avoids SettingWithCopyWarning on pandas versions without copy-on-write.
df_filtered = df_filtered.assign(
    volume_fraction_B=df_filtered['solvent2 vol fraction']
)

df_filtered.shape

(307, 20)

In [11]:
# Write the cleaned table to an Excel workbook, leaving out the index column.
df_filtered.to_excel(excel_writer='output_cleaned_data.xlsx', index=False)