In [1]:
import pandas as pd

In [2]:
df_template = pd.read_excel('../data_template/data_template.xlsx', header=1)
df_template

Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments


In [3]:
df_cleanData = pd.read_excel('output_cleaned_data.xlsx')
print(df_cleanData.shape)
df_cleanData.columns

(1300, 34)


Index(['SMILES_orig', 'Name', 'Extractant Concentration (M)', 'Solvent',
       'Metal Identity', 'Oxidation Number', 'Isotope', 'Acid Type',
       'Acid Concentration (M)', 'Add. Ligand',
       'Add. ligand Concentration (M)', 'Temperature (Â°C)',
       'Contact Time (Min)', 'Distribution Coefficient', 'Chemical Name',
       'Abbreviation', 'Formula', 'Molecular Mass (g/mol)', 'Chemical Group',
       'Additional', 'InChI', 'InChIKey', 'Removed Values from Du', 'Date',
       'Reference', 'Link', 'SMILES', 'Temperature_K', 'log_D', 'Solvent A',
       'Solvent B', 'volume ratio', 'volume fraction B', 'DOI'],
      dtype='object')

In [4]:
df_final = df_template.copy()
df_final['Entry_ID'] = [i for i in range(df_cleanData.shape[0])]
df_final['Extractants_count'] = [1] * df_cleanData.shape[0]
df_final['Comments'] = ['IDEaL'] * df_cleanData.shape[0]
df_final.head(3)

Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments
0,0,1,,,,,,,,,,,,,,,IDEaL
1,1,1,,,,,,,,,,,,,,,IDEaL
2,2,1,,,,,,,,,,,,,,,IDEaL


In [5]:
df_final['SMILES'] = df_cleanData['SMILES']
df_final['Extractant_conc_M'] = df_cleanData['Extractant Concentration (M)']
df_final['Solvent_A'] = df_cleanData['Solvent A']
df_final['Solvent_B'] = df_cleanData['Solvent B']

df_cleanData['Oxidation Number'] = df_cleanData['Oxidation Number'].fillna('')
def get_metal_string(row):
    # If oxidation number is available, use it
    if row['Oxidation Number'] != '':
        return f"{row['Metal Identity']}({row['Oxidation Number']})"
    # common case. Note that we introduce approximation here
    if row['Metal Identity'] == 'Pu':
        return f"{row['Metal Identity']}(IV)"
    elif row['Metal Identity'] == 'U':
        return f"{row['Metal Identity']}(VI)"
    elif row['Metal Identity'] == 'Th':
        return f"{row['Metal Identity']}(IV)"
    elif row['Metal Identity'] == 'Np':
        return f"{row['Metal Identity']}(VI)"
    elif row['Metal Identity'] == 'Am':
        return f"{row['Metal Identity']}(III)"
    else:
        return f"{row['Metal Identity']}(III)"

# Apply the function to each row in df_cleanData
df_final['Metal'] = df_cleanData.apply(get_metal_string, axis=1)


df_final['Metal_conc_mM'] = [0] * df_cleanData.shape[0]

df_final['Acid_type'] = df_cleanData['Acid Type']
df_final['Acid_conc_M'] = df_cleanData['Acid Concentration (M)']
df_final['Temperature_K'] = df_cleanData['Temperature_K']
df_final['Distribution_ratio'] = df_cleanData['Distribution Coefficient']
df_final['Log_D'] = df_cleanData['log_D']
df_final['DOI'] = df_cleanData['DOI']

df_final['Volume_fraction_B'] = df_cleanData['volume fraction B']
df_final['Volume_fraction_A'] = 1- df_cleanData['volume fraction B']

df_final.head(3)

Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments
0,0,1,S=P(S)(c1ccccc1Cl)c1ccccc1Cl,0.5,toluene,,1.0,0.0,Am(III),0,HNO3,0.4,294.15,0.4,-0.39794,https://doi.org/10.1524/ract.2010.1708,IDEaL
1,1,1,CCCCC(CC)CP(=O)(CC(=O)N(CC(C)C)CC(C)C)c1ccccc1,0.25,Tetrachloroethylene,,1.0,0.0,Am(III),0,HNO3,3.0,298.15,1.6,0.20412,https://doi.org/10.1080/07366298608917877,IDEaL
2,2,1,CC(C)CCCCCP(=O)(CC(=O)N(CC(C)C)CC(C)C)c1ccccc1,0.25,Tetrachloroethylene,,1.0,0.0,Am(III),0,HNO3,3.0,298.15,3.6,0.556303,https://doi.org/10.1080/07366298608917877,IDEaL


In [6]:
output_file = "output_db_An_IDEaL.xlsx"
df_final.to_excel(output_file, index=False)

### Check Nan

In [7]:
# Check for NaN values
has_nan = df_final[df_final.columns].isnull().any()

print("Does the column contain Nan values?")
print(has_nan)

Does the column contain Nan values?
Entry_ID              False
Extractants_count     False
SMILES                False
Extractant_conc_M     False
Solvent_A             False
Solvent_B              True
Volume_fraction_A     False
Volume_fraction_B     False
Metal                 False
Metal_conc_mM         False
Acid_type             False
Acid_conc_M           False
Temperature_K         False
Distribution_ratio    False
Log_D                 False
DOI                   False
Comments              False
dtype: bool


In [8]:
# Check for empty strings
has_empty_strings = (df_final[df_final.columns] == '').any()

print("Does the column contain empty strings?")
print(has_nan)

Does the column contain empty strings?
Entry_ID              False
Extractants_count     False
SMILES                False
Extractant_conc_M     False
Solvent_A             False
Solvent_B              True
Volume_fraction_A     False
Volume_fraction_B     False
Metal                 False
Metal_conc_mM         False
Acid_type             False
Acid_conc_M           False
Temperature_K         False
Distribution_ratio    False
Log_D                 False
DOI                   False
Comments              False
dtype: bool


Note that if the boolean is not False (except Solvent_B), we need to go back to clean

### Check data type

In [9]:
from pandas.api.types import is_numeric_dtype

print(is_numeric_dtype(df_final['Extractant_conc_M']))
print(is_numeric_dtype(df_final['Metal_conc_mM']))
print(is_numeric_dtype(df_final['Acid_conc_M']))
print(is_numeric_dtype(df_final['Temperature_K']))
print(is_numeric_dtype(df_final['Distribution_ratio']))
print(is_numeric_dtype(df_final['Log_D']))
print(is_numeric_dtype(df_final['Volume_fraction_B']))
print(is_numeric_dtype(df_final['Volume_fraction_A']))

True
True
True
True
True
True
True
True
