In [1]:
import pandas as pd

df = pd.read_csv("combined_data_no_duplicates_and_ave_duplicates.csv")

# Remove rows where 'solvent_name' contains '/'
df = df[~df['solvent_name'].str.contains('/', na=False)]

# Remove specific rows based on solvent_name and solvent_smiles
rows_to_remove = [
    ("ethyl acetate", "CCOC(=O)C1=NN(C(=N1)C(Cl)(Cl)Cl)C2=C(C=C(C=C2)Cl)Cl"),
    ("acetonitrile", "CCN"),
    ("trifluoroethanol", "FCC(F)(F)O"),
    ("ethanol", "#NAME?")
]
df = df[~df[['solvent_name', 'solvent_smiles']].apply(tuple, axis=1).isin(rows_to_remove)]

# Remove duplicate rows based on solvent_name, solute_name, and solvent_smiles
df = df.drop_duplicates(subset=['solvent_name', 'solute_name', 'solvent_smiles'])

# Ensure LogS is numeric
df['LogS'] = pd.to_numeric(df['LogS'], errors='coerce')

#remove any solute_name" which contain "DONOTUSE"
df = df[~df['solute_name'].str.contains('DONOTUSE', na=False)]

#remove any solvent_smiles" which have "#NAME?"
df = df[~df['solvent_smiles'].str.contains('#NAME?', na=False)]

# Remove rows where 'LogS' column is NaN or missing
df = df.dropna(subset=["LogS"])

# Save the cleaned dataset
df.to_csv("cleaned_data.csv", index=False)

In [2]:
import pandas as pd

data = pd.read_csv('cleaned_data.csv')

# Replace the solute_name values according to your specifications
data['solute_name'] = data['solute_name'].replace({
    'paracetamol': 'acetaminophen',
    'isophthalic acid': '1,3-benzenedicarboxylic acid',
    'citric acid monohydrate': 'citric acid',  
    'citric acid anhydrous': 'citric acid',  
    '2-hydroxypropane-1,2,3-tricarboxylic acid': 'citric acid',
    'benzoin': '2-hydroxy-1,2-diphenylethanone',
    '3-((6-O-(6-deoxy-??-L-mannopyranosyl)-??-D-glucopyranosyl)oxy)-2-(3,4-dihydroxyphenol)-5,7-dihydroxy-4H-1-benzopyran-4-one': 'rutin',
    '2-(2,4,6-trichlorophenoxy)ethyl bromide': '2-(2-bromoethoxy)-1,3,5-trichlorobenzene',
    '4-Hydroxybenzoic acid': '4-hydroxybenzoic acid',
    '(2S)-5,7-dihydroxy-2-(3-hydroxy-4-methoxyphenyl)-2,3-dihydrochromen-4-one': 'hesperetin',
    'Benzoic acid': 'benzoic acid',
    'hexanedioic acid': 'adipic acid'
})

# Replace the inchikey for "salicylic acid" with the exact match for the specified InChIKey
data.loc[(data['solute_name'] == 'salicylic acid') & (data['solute_inchikey'] == 'BSYNRYMUTXBXSQ-WXRBYKJCNA-N'), 'solute_inchikey'] = 'YGSDEFSMJLZEOE-BGGKNDAXNA-N'

data.to_csv('cleaned_data_2.csv', index=False)

In [3]:
import pandas as pd
df = pd.read_csv("cleaned_data_2.csv")

# Handle duplicates by averaging LogS values
def aggregate_duplicates(group):
    first_smiles = group['solvent_smiles'].iloc[0]  # Take the first solvent_smiles
    avg_logs = group['LogS'].mean()  # Average LogS values
    group = group.iloc[0:1]  # Keep only the first row
    group['LogS'] = avg_logs  # Assign averaged LogS
    group['solvent_smiles'] = first_smiles  # Ensure first solvent_smiles is kept
    return group

df = df.groupby(['solvent_name', 'solute_name', 'solute_inchikey'], as_index=False).apply(aggregate_duplicates)
print(df)
df.to_csv("cleaned_data_3.csv", index=False)

                                                 solute_name  \
0    937                       phenanthrene-9-carboxaldehyde   
1    638                                            biphenyl   
2    694                                  decafluorobiphenyl   
3    811                                    hexachloroethane   
4    884                                         naphthalene   
...                                                      ...   
7197 7363                                           warfarin   
7198 7396                                           xanthine   
7199 7397                                       xanthopterin   
7200 7398  {(2Z)-1-[(6-Chloro-3-pyridinyl)methyl]-3-methy...   
7201 7399  {1-[(6-Chloro-3-pyridinyl)methyl]-4,5-dihydro-...   

                             solvent_name       solvent_smiles  \
0    937   1,1,2-trichlorotrifluoroethane  ClC(F)(F)C(Cl)(Cl)F   
1    638                1,2-dibromoethane               BrCCBr   
2    694                1,2-dibro