In [20]:
import pandas as pd
import numpy as np
import sys

def build_frontier_from_offsprings(all_offsprings, criteria):
    try:
        frontier = all_offsprings.iloc[[0]]  # initialises frontier with first compound in sim_props
    except:
        print(all_offsprings)
        sys.exit()
    for i in range(1, all_offsprings.shape[0]): # sweep sim_props: for each compound i in sim_props...
        s = ((all_offsprings.iloc[i][criteria.iloc[:,0]].values >= frontier[criteria.iloc[:,0]].values) & 
             (criteria.iloc[:,1].values == 'max')) | ((all_offsprings.iloc[i][criteria.iloc[:,0]].values <= frontier[criteria.iloc[:,0]].values) & 
                                                       (criteria.iloc[:,1].values == 'min'))
        domij = np.all(s, axis=1)
        domedij = np.any(s, axis=1)        
        if domij.any():
            frontier = frontier.drop(index=frontier[domij].index)  # remove dominated compounds
            frontier = pd.concat([frontier, all_offsprings.iloc[[i]]], ignore_index=True)  # add new compound
        elif domedij.any():
            if any(all_offsprings.iloc[[i]][col].isin(frontier[col]).any() for col in frontier.columns):
                s = ((all_offsprings.iloc[i][criteria.iloc[:,0]].values <= frontier[criteria.iloc[:,0]].values) & 
                    (criteria.iloc[:,1].values == 'max')) | ((all_offsprings.iloc[i][criteria.iloc[:,0]].values >= frontier[criteria.iloc[:,0]].values) & 
                                                       (criteria.iloc[:,1].values == 'min'))
                domedij = np.all(s, axis=1)
                if domedij.any():
                    continue                
            frontier = pd.concat([frontier, all_offsprings.iloc[[i]]], ignore_index=True)   
    return frontier

def process_and_compare_files(file_paths, criteria):
    combined_data = pd.DataFrame()
    file_origin_dict = {}  # Dictionary to track the origin of each compound

    # Combine all files into a single DataFrame and track origins
    for file_path in file_paths:
        try:
            data = pd.read_csv(file_path)
            for index, row in data.iterrows():
                compound_key = tuple(row.tolist())
                file_origin_dict.setdefault(compound_key, []).append(file_path)
            combined_data = pd.concat([combined_data, data], ignore_index=True)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

    # Remove duplicate compounds while preserving order
    combined_data = combined_data.drop_duplicates().reset_index(drop=True)

    # Map origins to the combined data
    combined_data['FileOrigin'] = [file_origin_dict[tuple(row.tolist())] for _, row in combined_data.iterrows()]

    non_dominated = build_frontier_from_offsprings(combined_data, criteria)

    # Count contributions from each file
    file_contributions = {}
    for origins in non_dominated['FileOrigin']:
        for origin in origins:
            file_contributions[origin] = file_contributions.get(origin, 0) + 1

    # Print file contributions
    for file_path in file_paths:
        count = file_contributions.get(file_path, 0)
        print(f"File: {file_path}, Non-dominated results: {count}")

    # Print non-dominating molecules
    print("Non-dominating molecules:")
    non_dominated.to_csv("non_dominated_output.csv", index=False)
    print(non_dominated)
    print(combined_data[~combined_data.index.isin(non_dominated.index)])



# Example usage
file_names = [
    './resultscmaes/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10_cleaned_final.csv',
    './resultscmaes_around/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCC=CCC=CCC=CCCCCCCC(C)C(=O)S(=O)(=O)O_0.9_50_10_cleaned_final.csv',
    './resultsnsga/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10_cleaned_final.csv',  
    './resultsnsga_around/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCC=CCC=CCC=CCCCCCCC(C)C(=O)S(=O)(=O)O_0.9_50_10_cleaned_final.csv'
]

criteria = pd.DataFrame(columns=['variable_name', 'rule'])
criteria.loc[0] = ['XLogP', 'max']
criteria.loc[1] = ['Complexity', 'min']
criteria.loc[2] = ['MolecularWeight', 'min']
criteria.loc[3] = ['RefLikeness', 'min']

process_and_compare_files(file_names, criteria)


File: ./resultscmaes/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10_cleaned_final.csv, Non-dominated results: 12
File: ./resultscmaes_around/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCC=CCC=CCC=CCCCCCCC(C)C(=O)S(=O)(=O)O_0.9_50_10_cleaned_final.csv, Non-dominated results: 15
File: ./resultsnsga/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10_cleaned_final.csv, Non-dominated results: 1
File: ./resultsnsga_around/CCCCCCC(C)CCCCCCCCCOS(=O)(=O)O_0.9_50_10/CCC=CCC=CCC=CCCCCCCC(C)C(=O)S(=O)(=O)O_0.9_50_10_cleaned_final.csv, Non-dominated results: 0
Non-dominating molecules:
                             SMILES  Complexity  MolecularWeight  XLogP  \
0    CCCCCCCCC(CCCCCCCC)OS(=O)(=O)O       311.6            336.5    5.7   
1        CCCCCCCCCCCCCCSOS(=O)(=O)O       291.5            326.5    5.2   
2       CCCCCCCCCCCCCCCCCS(=O)(=O)O       299.2            320.5    5.7   
3     CCCCCCCCCCCCCCCC=COS(=O)(=O)O       349.7         