In [53]:
import pandas as pd

### Utility to standardize solvent names

In [54]:
# Read the 'solvents' sheet of the conditions workbook; its 'Solvent' and
# 'Name_in_other_dataset' columns are used below to build the alias lookup.
solvent_mapping_df = pd.read_excel(
    io='../db_Ln_conditions_desp.xlsx',
    sheet_name='solvents',
)
solvent_mapping_df.shape

(44, 5)

In [55]:
# Build an alias -> canonical-name lookup. Each 'Name_in_other_dataset' cell may hold
# several ';'-separated aliases; every non-blank alias (whitespace-trimmed) points to
# that row's 'Solvent' value. Rows with a missing alias cell contribute nothing, and if
# an alias appears on more than one row the last row wins.
alias_and_canonical = zip(
    solvent_mapping_df['Name_in_other_dataset'],
    solvent_mapping_df['Solvent'],
)
solvent_mapping = {
    alias.strip(): canonical
    for alias_cell, canonical in alias_and_canonical
    if pd.notna(alias_cell)
    for alias in alias_cell.split(';')
    if alias.strip()
}

solvent_mapping

{'hydrogenated tetrapropene': 'hydrogenated tetrapropylene',
 'TPH (hydrogenated tetrapropene)': 'hydrogenated tetrapropylene',
 'tph': 'hydrogenated tetrapropylene',
 'HPT': 'hydrogenated tetrapropylene',
 'Chloroform': 'chloroform',
 'Toluene': 'toluene',
 'n-Dodecane': 'n-dodecane',
 '1-butyl-3-methylimidazolium bis(trifluoromethane)sulfonimide': '[C4mim][Tf2N]',
 'Nitrobenzene': 'nitrobenzene',
 '1,2-Dichloroethane': '1,2-dichloroethane',
 'Octanol': '1-octanol',
 'octanol': '1-octanol',
 '1-Octanol': '1-octanol',
 'n-octanol': '1-octanol',
 'n-octan-1-ol': '1-octanol',
 'n-Hexane': 'n-hexane',
 'Diethylether': 'diethylether',
 'Benzene': 'benzene',
 'phenyl trifluoromethyl sulfone (FS-13)': 'phenyl trifluoromethyl sulfone',
 'FS-13': 'phenyl trifluoromethyl sulfone',
 'tetrachloromethane': 'CCl4',
 'Tetrachloromethane': 'CCl4',
 'Tetrachloroethane': 'tetrachloroethane',
 '1,1,2,2-tetrachloroethane': 'tetrachloroethane',
 'nphe': '2-nitrophenyl hexyl ether',
 '2-nphe': '2-nitrophen

In [56]:
# Define a function to standardize solvent names
def standardize_solvent(solvent, mapping=None):
    """Return the canonical solvent name for ``solvent``.

    Parameters
    ----------
    solvent : object
        A raw cell value from a solvent column. Strings are looked up in the
        alias table; anything else (e.g. NaN or None for an empty cell) is
        returned unchanged.
    mapping : dict, optional
        Alias -> canonical-name lookup. Defaults to the notebook-level
        ``solvent_mapping`` when omitted.

    Returns
    -------
    object
        The mapped canonical name when the alias is known; otherwise the
        input with surrounding whitespace removed (strings) or the input
        itself (non-strings).
    """
    lookup = solvent_mapping if mapping is None else mapping

    # Non-text cells have no alias to resolve; pass them through untouched
    # instead of probing the dict with them.
    if not isinstance(solvent, str):
        return solvent

    # Exact hit first, so every value that matched before still resolves identically.
    if solvent in lookup:
        return lookup[solvent]

    # The lookup keys are stripped when the table is built, so the query must be
    # stripped too; otherwise 'Octanol ' would silently miss 'Octanol'.
    cleaned = solvent.strip()
    return lookup.get(cleaned, cleaned)

### Integrate the ORNL and DGA datasets

In [57]:
# Load the DGA dataset and run both solvent columns through standardize_solvent
# so solvent names are comparable across datasets.
df_dga = pd.read_excel('../dataset_DGA/output_db_Ln_DGA.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_dga.shape)

(3113, 17)


In [58]:
# Load the ORNL dataset and run both solvent columns through standardize_solvent
# so solvent names are comparable across datasets.
df_ornl = pd.read_excel('../dataset_ORNL_update/output_db_Ln_ORNL.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_ornl.shape)

(1258, 17)


In [59]:
# Columns whose combined value identifies a record for de-duplication
cols_to_compare = ['SMILES', 'Solvent_A', 'Metal']

# Distinct key combinations already present in the DGA data, as one tuple per row
unique_combinations_df = df_dga[cols_to_compare].drop_duplicates()
dga_keys = unique_combinations_df.apply(tuple, axis=1)

# Keep only the ORNL rows whose key combination does not occur in DGA
ornl_keys = df_ornl[cols_to_compare].apply(tuple, axis=1)
ornl_already_in_dga = ornl_keys.isin(dga_keys)
df_filtered = df_ornl[~ornl_already_in_dga]

# Stack DGA on top of the remaining ORNL rows with a fresh 0..n-1 index
frames_to_stack = [df_dga, df_filtered]
df_combined_dga_ornal = pd.concat(frames_to_stack, ignore_index=True)

print(df_combined_dga_ornal.shape)

(3986, 17)


### Integrate the ACSEPT dataset

In [60]:
# Load the ACSEPT dataset and run both solvent columns through standardize_solvent
# so solvent names are comparable across datasets.
df_acsept = pd.read_excel('../dataset_ACSEPT/output_db_Ln_ACSEPT.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_acsept.shape)

(290, 17)


In [61]:
# Columns whose combined value identifies a record for de-duplication
cols_to_compare = ['SMILES', 'Solvent_A', 'Metal']

# Distinct key combinations already present in the DGA+ORNL table, as one tuple per row
unique_combinations_2 = df_combined_dga_ornal[cols_to_compare].drop_duplicates()
combined_keys = unique_combinations_2.apply(tuple, axis=1)

# Keep only the ACSEPT rows whose key combination is not in the combined table yet
acsept_keys = df_acsept[cols_to_compare].apply(tuple, axis=1)
acsept_already_present = acsept_keys.isin(combined_keys)
df_filtered_2 = df_acsept[~acsept_already_present]

# Append the remaining ACSEPT rows and renumber the index
frames_to_stack = [df_combined_dga_ornal, df_filtered_2]
df_combined_ornl_dga_acsept = pd.concat(frames_to_stack, ignore_index=True)

print(df_combined_ornl_dga_acsept.shape)

(4267, 17)


### Integrate the IDEaL dataset

In [62]:
# Load the IDEaL dataset and run both solvent columns through standardize_solvent
# so solvent names are comparable across datasets.
df_ideal = pd.read_excel('../dataset_IDEaL/output_db_Ln_IDEaL.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_ideal.shape)

(1492, 17)


In [63]:
# Columns whose combined value identifies a record for de-duplication.
# NOTE(review): 'Metal' is not part of the key here, although the two earlier merges
# compare on ['SMILES', 'Solvent_A', 'Metal'] -- confirm this is intentional.
cols_to_compare = ['SMILES', 'Solvent_A']

# Distinct key combinations already present in the DGA+ORNL+ACSEPT table, one tuple per row
unique_combinations_3 = df_combined_ornl_dga_acsept[cols_to_compare].drop_duplicates()
combined_keys = unique_combinations_3.apply(tuple, axis=1)

# Keep only the IDEaL rows whose key combination is not in the combined table yet
ideal_keys = df_ideal[cols_to_compare].apply(tuple, axis=1)
ideal_already_present = ideal_keys.isin(combined_keys)
df_filtered_3 = df_ideal[~ideal_already_present]

# Append the remaining IDEaL rows and renumber the index
frames_to_stack = [df_combined_ornl_dga_acsept, df_filtered_3]
df_combined_all = pd.concat(frames_to_stack, ignore_index=True)

print(df_combined_all.shape)

(5086, 17)


In [64]:
# Write the merged table to an Excel file (relative path), leaving out the index column
output_path = "output_integ_data.xlsx"
df_combined_all.to_excel(output_path, index=False)