In [8]:
import pandas as pd

### Utility to standardize solvent names

In [9]:
# Load the 'solvents' sheet of the conditions workbook; its 'Name_in_other_dataset' and
# 'Solvent' columns are used below to build the alias -> canonical-name lookup.
solvent_mapping_df = pd.read_excel('../db_An_conditions_desp.xlsx', sheet_name='solvents')
# Display (rows, columns) as a quick sanity check on what was read.
solvent_mapping_df.shape

(36, 5)

In [10]:
# Build the alias -> canonical-name lookup: each 'Name_in_other_dataset' cell may hold several
# semicolon-separated aliases, all of which point at that row's 'Solvent' value.
rows_with_aliases = solvent_mapping_df[solvent_mapping_df['Name_in_other_dataset'].notna()]
solvent_mapping = {
    alias.strip(): canonical_name
    for alias_field, canonical_name in zip(
        rows_with_aliases['Name_in_other_dataset'], rows_with_aliases['Solvent']
    )
    for alias in alias_field.split(';')
    if alias.strip()  # skip fragments that are empty once whitespace is removed
}

solvent_mapping

{'hydrogenated tetrapropene': 'hydrogenated tetrapropylene',
 'TPH (hydrogenated tetrapropene)': 'hydrogenated tetrapropylene',
 'tph': 'hydrogenated tetrapropylene',
 'HPT': 'hydrogenated tetrapropylene',
 'Chloroform': 'chloroform',
 'Toluene': 'toluene',
 'p-diisopropylbenzene': '1,4-diisopropylbenzene',
 'n-Dodecane': 'n-dodecane',
 '1-butyl-3-methylimidazolium bis(trifluoromethane)sulfonimide': '[C4mim][Tf2N]',
 'Nitrobenzene': 'nitrobenzene',
 '1,2-Dichloroethane': '1,2-dichloroethane',
 'Octanol': '1-octanol',
 'octanol': '1-octanol',
 '1-Octanol': '1-octanol',
 'n-octanol': '1-octanol',
 'n-octan-1-ol': '1-octanol',
 'n-Hexane': 'n-hexane',
 'Diethylether': 'diethylether',
 'Benzene': 'benzene',
 'phenyl trifluoromethyl sulfone (FS-13)': 'phenyl trifluoromethyl sulfone',
 'FS-13': 'phenyl trifluoromethyl sulfone',
 'tetrachloromethane': 'CCl4',
 'Tetrachloromethane': 'CCl4',
 'tetrachloroethylene': 'Tetrachloroethylene',
 'Tetrachloroethane': 'tetrachloroethane',
 '1,1,2,2-tetr

In [11]:
# Define a function to standardize solvent names
def standardize_solvent(solvent):
    """Return the canonical solvent name for ``solvent``.

    The value is looked up in the module-level ``solvent_mapping`` dictionary.
    An exact match is tried first. If that fails and ``solvent`` is a string,
    the lookup is retried with surrounding whitespace removed, because the
    mapping keys are stored stripped and a padded cell value such as
    ``' Toluene '`` would otherwise never match.

    Parameters
    ----------
    solvent : object
        Raw cell value from a solvent column. Non-string values (for example
        a missing value) only take part in the exact-match lookup.

    Returns
    -------
    object
        The mapped name when a match is found, otherwise ``solvent`` itself,
        unchanged.
    """
    if solvent in solvent_mapping:
        return solvent_mapping[solvent]
    if isinstance(solvent, str):
        # Fallback: ignore leading/trailing whitespace in the incoming value.
        return solvent_mapping.get(solvent.strip(), solvent)
    return solvent

### Integrate the DGA and Logan datasets

In [12]:
# Read the DGA workbook and replace both solvent columns with their standardized names
df_dga = pd.read_excel('../dataset_DGA/output_db_An_DGA.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_dga.shape)

(1466, 17)


In [13]:
# Read the Logan (monoamide) workbook and replace both solvent columns with their standardized names
df_logan = pd.read_excel('../dataset_monoamide/output_db_An_Logan.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_logan.shape)

(313, 17)


In [14]:
# Two rows count as the same entry when these columns all coincide
cols_to_compare = ['SMILES', 'Solvent_A', 'Metal']
unique_combinations_df = df_dga[cols_to_compare].drop_duplicates()

# Collapse each key row into a tuple so whole combinations can be tested for membership
dga_keys = unique_combinations_df.apply(tuple, axis=1)
logan_keys = df_logan[cols_to_compare].apply(tuple, axis=1)

# Keep only the Logan rows whose key does not already occur in the DGA data
logan_already_in_dga = logan_keys.isin(dga_keys)
df_filtered = df_logan[~logan_already_in_dga]

# Stack the remaining Logan rows under the DGA rows and renumber the index
df_combined_dga_logan = pd.concat([df_dga, df_filtered], ignore_index=True)

print(df_combined_dga_logan.shape)

(1779, 17)


### Integrate the ACSEPT dataset

In [15]:
# Read the ACSEPT workbook and replace both solvent columns with their standardized names
df_acsept = pd.read_excel('../dataset_ACSEPT/output_db_An_ACSEPT.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_acsept.shape)

(307, 17)


In [16]:
# Two rows count as the same entry when these columns all coincide
cols_to_compare = ['SMILES', 'Solvent_A', 'Metal']
unique_combinations_2 = df_combined_dga_logan[cols_to_compare].drop_duplicates()

# Collapse each key row into a tuple so whole combinations can be tested for membership
existing_keys = unique_combinations_2.apply(tuple, axis=1)
acsept_keys = df_acsept[cols_to_compare].apply(tuple, axis=1)

# Keep only the ACSEPT rows whose key is not yet present in the DGA + Logan table
acsept_already_present = acsept_keys.isin(existing_keys)
df_filtered_2 = df_acsept[~acsept_already_present]

# Stack the remaining ACSEPT rows under the combined table and renumber the index
df_combined_dga_logan_acsept = pd.concat([df_combined_dga_logan, df_filtered_2], ignore_index=True)

print(df_combined_dga_logan_acsept.shape)

(2083, 17)


### Integrate the IDEaL dataset

In [17]:
# Read the IDEaL workbook and replace both solvent columns with their standardized names
df_ideal = pd.read_excel('../dataset_IDEaL/output_db_An_IDEaL.xlsx').assign(
    Solvent_A=lambda frame: frame['Solvent_A'].apply(standardize_solvent),
    Solvent_B=lambda frame: frame['Solvent_B'].apply(standardize_solvent),
)

print(df_ideal.shape)

(1300, 17)


In [18]:
# Two rows count as the same entry when these columns coincide.
# NOTE(review): the DGA/Logan and ACSEPT merges in this notebook also compare 'Metal';
# here the key is only SMILES + Solvent_A -- confirm that dropping 'Metal' is intentional.
cols_to_compare = ['SMILES', 'Solvent_A']
unique_combinations_3 = df_combined_dga_logan_acsept[cols_to_compare].drop_duplicates()

# Collapse each key row into a tuple so whole combinations can be tested for membership
existing_keys = unique_combinations_3.apply(tuple, axis=1)
ideal_keys = df_ideal[cols_to_compare].apply(tuple, axis=1)

# Keep only the IDEaL rows whose key is not yet present in the combined table
ideal_already_present = ideal_keys.isin(existing_keys)
df_filtered_3 = df_ideal[~ideal_already_present]

# Stack the remaining IDEaL rows under the combined table and renumber the index
df_combined_all = pd.concat([df_combined_dga_logan_acsept, df_filtered_3], ignore_index=True)

print(df_combined_all.shape)

(2989, 17)


In [19]:
# Write the merged table to an Excel file (path is relative to the working directory);
# index=False keeps the DataFrame's row index out of the sheet.
df_combined_all.to_excel("output_integ_data.xlsx", index=False)