In [7]:
import os
import pandas as pd


def _parse_csv_name(filename):
    """Split a ``<metric>[_pareto]_<protein>.csv`` filename into its parts.

    Args:
        filename: Name of a file ending in ``.csv``.

    Returns:
        tuple: ``(protein_name, metric, is_pareto)`` where ``protein_name`` is
        the last underscore-separated token of the name (without the ``.csv``
        extension), ``metric`` is everything before it with the ``pareto``
        marker removed, and ``is_pareto`` tells whether that marker was present.
    """
    # Strip only the trailing extension so dots inside the protein token
    # (e.g. "P.v2") are kept and do not make distinct files collide.
    stem = filename[: -len(".csv")]
    tokens = stem.split("_")
    protein_name = tokens[-1]
    metric_tokens = tokens[:-1]

    # The marker is a whole "pareto" token preceded by an underscore, i.e. the
    # "_pareto_" substring; a leading "pareto" token is part of the metric name.
    is_pareto = "pareto" in metric_tokens[1:]
    if is_pareto:
        # Remove exactly the tokens that were detected, nothing else, so that
        # detection and stripping can never disagree.
        metric_tokens = metric_tokens[:1] + [
            token for token in metric_tokens[1:] if token != "pareto"
        ]
    return protein_name, "_".join(metric_tokens), is_pareto


def merge_datasets(directory, subdirectory="concat"):
    """Concatenate every main CSV with its matching pareto CSV.

    Files in ``directory`` are expected to be named
    ``<metric>_<protein>.csv`` (main) and ``<metric>_pareto_<protein>.csv``
    (pareto). For each (protein, metric) pair that has both files, the rows of
    the main file followed by the rows of the pareto file are written to
    ``<directory>/<subdirectory>/<metric>_concat_<protein>.csv``.
    Files without a partner are skipped.

    Args:
        directory: Directory that is scanned (non-recursively) for CSV files.
        subdirectory: Name of the output folder created inside ``directory``.

    Returns:
        list[str]: Paths of the merged CSV files that were written, in the
        order they were produced.
    """
    # {protein_name: {metric: [main_file, pareto_file]}}
    file_pairs = {}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing (and printing) order reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        # Skip directories that merely happen to be named "*.csv".
        if not os.path.isfile(os.path.join(directory, filename)):
            continue

        protein_name, metric, is_pareto = _parse_csv_name(filename)
        slots = file_pairs.setdefault(protein_name, {}).setdefault(
            metric, [None, None]
        )
        slots[1 if is_pareto else 0] = filename

    output_dir = os.path.join(directory, subdirectory)
    written_paths = []

    for protein, metrics in file_pairs.items():
        for metric, (main_file, pareto_file) in metrics.items():
            # Only merge when both halves of the pair are available.
            if not (main_file and pareto_file):
                continue

            df_main = pd.read_csv(os.path.join(directory, main_file))
            df_pareto = pd.read_csv(os.path.join(directory, pareto_file))
            df_merged = pd.concat([df_main, df_pareto], ignore_index=True)

            output_path = os.path.join(output_dir, f"{metric}_concat_{protein}.csv")
            # Created lazily so no empty folder appears when nothing is merged.
            os.makedirs(output_dir, exist_ok=True)
            df_merged.to_csv(output_path, index=False)
            print(f"Merged dataset saved to {output_path}")
            written_paths.append(output_path)

    return written_paths


if __name__ == "__main__":
    # Scan the current working directory for the main/pareto CSV pairs.
    directory = "./"
    merge_datasets(directory=directory)

Merged dataset saved to ./concat/strain_enrichment_metrics_concat_S1PR1.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_S1PR1.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_S1PR1.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_GPR40.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_GPR40.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_GPR40.csv


In [6]:
pd.read_csv("concat/energy

['concat', 'strain_enrichment_metrics_S1PR1.csv', 'temp', 'strain_roc_metrics_S1PR1.csv', 'combine_data.ipynb', 'strain_roc_metrics_GPR40.csv', 'strain_enrichment_metrics_GPR40.csv', 'subdir', 'strain_log_aucs_S1PR1.csv', 'strain_roc_metrics_pareto_S1PR1.csv', 'strain_enrichment_metrics_merged_GPR40.csv', 'strain_log_aucs_merged_S1PR1.csv', 'strain_enrichment_metrics_pareto_S1PR1.csv', 'strain_roc_metrics_merged_GPR40.csv', 'combined_data.csv', 'strain_log_aucs_pareto_GPR40.csv', 'strain_roc_metrics_merged_S1PR1.csv', 'strain_enrichment_metrics_pareto_GPR40.csv', 'strain_log_aucs_pareto_S1PR1.csv', 'strain_enrichment_metrics_merged_S1PR1.csv', 'strain_roc_metrics_pareto_GPR40.csv', 'strain_log_aucs_merged_GPR40.csv', 'strain_log_aucs_GPR40.csv']


Unnamed: 0,Protein,Strain Energy Cutoff_x,EF1%,EF5%,deltaEF1%,deltaEF5%,Strain Energy Cutoff_y,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),Strain Energy Cutoff,ROC_AUC,Actives,Total Count,deltaAUC
0,S1PR1,No Cutoff,3.271028,21.028037,0.000000,0.000000,No Cutoff,3.133979,0.000000,No Cutoff,0.800310,214,9107,0.000000
1,S1PR1,No Cutoff,3.271028,21.028037,0.000000,0.000000,No Cutoff,3.133979,0.000000,4,0.867996,23,1436,0.067686
2,S1PR1,No Cutoff,3.271028,21.028037,0.000000,0.000000,No Cutoff,3.133979,0.000000,4.5,0.866756,33,1859,0.066446
3,S1PR1,No Cutoff,3.271028,21.028037,0.000000,0.000000,No Cutoff,3.133979,0.000000,5.0,0.866626,37,2308,0.066316
4,S1PR1,No Cutoff,3.271028,21.028037,0.000000,0.000000,No Cutoff,3.133979,0.000000,5.5,0.864223,45,2773,0.063913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31153,GPR40,Top 20 Pareto Ranks,2.702703,13.513514,-5.084183,-29.109437,8.0,4.208038,-0.062802,Top 10 Pareto Ranks,0.811881,34,337,-0.061697
31154,GPR40,Top 20 Pareto Ranks,2.702703,13.513514,-5.084183,-29.109437,8.0,4.208038,-0.062802,Top 20 Pareto Ranks,0.821776,74,896,-0.051802
31155,GPR40,Top 20 Pareto Ranks,2.702703,13.513514,-5.084183,-29.109437,8.0,4.208038,-0.062802,No Cutoff,0.873578,244,8762,0.000000
31156,GPR40,Top 20 Pareto Ranks,2.702703,13.513514,-5.084183,-29.109437,8.0,4.208038,-0.062802,Top 10 Pareto Ranks,0.811881,34,337,-0.061697
