In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the literature-review workbook; header=1 takes the column names from
# the second row of the sheet (row index 1).
df = pd.read_excel(
    io='original/Monoamide_Literature_Review.xlsx',
    header=1,
)
df.shape

(1006, 24)

### Filter metals

In [3]:
# Element symbols of the actinide series, joined below into a regex alternation.
An_list = [
    'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm',
    'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
]
metal_pattern = '|'.join(An_list)

# Keep rows whose 'Metal Identity' contains at least one of the symbols;
# missing values count as "no match" (na=False).
# NOTE(review): this is an unanchored substring search, and the resulting
# shape equals the input shape, so no row is removed here -- confirm that
# the filter is as strict as intended.
is_actinide = df['Metal Identity'].str.contains(metal_pattern, na=False, regex=True)
df_filtered = df[is_actinide]

df_filtered.shape

(1006, 24)

### Remove NaN

In [4]:
# A comma in 'Metal Identity' means several metals are listed in one entry;
# those rows are discarded.
lists_several_metals = df_filtered['Metal Identity'].str.contains(',', na=False)
df_filtered = df_filtered[~lists_several_metals]

df_filtered.shape

(807, 24)

In [5]:
# Exclude the rows whose metal is exactly 'Am(VI)'.
is_not_am_vi = df_filtered['Metal Identity'] != 'Am(VI)'
df_filtered = df_filtered[is_not_am_vi]

df_filtered.shape

(780, 24)

In [6]:
# Remove rows where 'Distribution Coefficient' is NaN, non-numeric, or not positive.
# The column is coerced to numeric first (unparsable entries become NaN).
# .assign() builds a new frame instead of writing into a filtered slice, which
# would raise SettingWithCopyWarning on pandas < 3.
df_filtered = df_filtered.assign(
    **{
        'Distribution Coefficient': pd.to_numeric(
            df_filtered['Distribution Coefficient'], errors='coerce'
        )
    }
)
# NaN compares False against 0, so this single test also drops missing values.
df_filtered = df_filtered[df_filtered['Distribution Coefficient'] > 0]

df_filtered.shape

(758, 24)

In [7]:
# Remove rows where 'Extractant Concentration (M)' is NaN or non-numeric.
# The column is coerced to numeric first (unparsable entries become NaN).
# .assign() builds a new frame instead of writing into a filtered slice, which
# would raise SettingWithCopyWarning on pandas < 3.
df_filtered = df_filtered.assign(
    **{
        'Extractant Concentration (M)': pd.to_numeric(
            df_filtered['Extractant Concentration (M)'], errors='coerce'
        )
    }
)
df_filtered = df_filtered[df_filtered['Extractant Concentration (M)'].notna()]

df_filtered.shape

(684, 24)

In [8]:
# Remove rows where 'Temperature (C)' is NaN or non-numeric.
# The column is coerced to numeric first (unparsable entries become NaN).
# .assign() builds a new frame instead of writing into a filtered slice, which
# would raise SettingWithCopyWarning on pandas < 3.
df_filtered = df_filtered.assign(
    **{
        'Temperature (C)': pd.to_numeric(
            df_filtered['Temperature (C)'], errors='coerce'
        )
    }
)
df_filtered = df_filtered[df_filtered['Temperature (C)'].notna()]

df_filtered.shape

(525, 24)

In [9]:
# Remove rows where 'Acid Concentration (M)' is NaN or non-numeric.
# The column is coerced to numeric first (unparsable entries become NaN).
# .assign() builds a new frame instead of writing into a filtered slice, which
# would raise SettingWithCopyWarning on pandas < 3.
df_filtered = df_filtered.assign(
    **{
        'Acid Concentration (M)': pd.to_numeric(
            df_filtered['Acid Concentration (M)'], errors='coerce'
        )
    }
)
df_filtered = df_filtered[df_filtered['Acid Concentration (M)'].notna()]

df_filtered.shape

(525, 24)

In [10]:
# Keep only the rows that carry a DOI; entries without one are dropped.
has_doi = df_filtered['DOI'].notna()
df_filtered = df_filtered[has_doi]

df_filtered.shape

(313, 24)

### Fill NaN

In [11]:
# If metal concentration is empty, replace with zero.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selected from the frame is chained assignment, which warns on
# recent pandas and leaves the frame unchanged under Copy-on-Write.
df_filtered = df_filtered.assign(
    **{'Metal Concentration (mM)': df_filtered['Metal Concentration (mM)'].fillna(0)}
)

### Calculation

In [12]:
from rdkit import Chem

In [13]:
def get_canonical_smiles(smiles):
    """Return the RDKit canonical SMILES string for ``smiles``.

    Args:
        smiles: SMILES string to canonicalise.

    Returns:
        The canonical SMILES produced by ``Chem.MolToSmiles``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles`` into a molecule.
    """
    molecule = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; report the
    # offending input instead of failing later inside MolToSmiles.
    if molecule is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    return Chem.MolToSmiles(molecule, canonical=True)

# Preserve the SMILES as read from the sheet under 'SMILES_orig', then add a
# new 'SMILES' column holding the canonical form of each entry.
df_filtered = (
    df_filtered
    .rename(columns={'SMILES': 'SMILES_orig'})
    .assign(SMILES=lambda frame: frame['SMILES_orig'].apply(get_canonical_smiles))
)

df_filtered.shape

(313, 25)

In [14]:
import re

In [15]:
# Add 'Temperature_K': the Celsius temperature shifted by 273.15 to Kelvin.
df_filtered = df_filtered.assign(
    Temperature_K=df_filtered['Temperature (C)'] + 273.15
)

df_filtered.shape

(313, 26)

In [16]:
# Add 'log_D': the base-10 logarithm of the distribution coefficient.
df_filtered = df_filtered.assign(
    log_D=np.log10(df_filtered['Distribution Coefficient'])
)

df_filtered.shape

(313, 27)

In [17]:
# Write the cleaned table to an Excel workbook, leaving out the index column.
df_filtered.to_excel(
    excel_writer='output_cleaned_data.xlsx',
    index=False,
)