In [4]:
import os
import pandas as pd


def merge_datasets(directory):
    # Dictionary to hold pairs of filenames: {protein_name: {metric: [main_file, pareto_file]}}
    file_pairs = {}

    # List all CSV files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            # Determine if the file is a pareto file
            is_pareto = "_pareto_" in filename
            # Extract the metric and protein name from the filename
            parts = filename.replace("_pareto", "").split("_")
            protein_name = parts[-1].split(".")[0]
            metric = "_".join(parts[:-1])

            # Initialize dictionary structure
            if protein_name not in file_pairs:
                file_pairs[protein_name] = {}
            if metric not in file_pairs[protein_name]:
                file_pairs[protein_name][metric] = [
                    None,
                    None,
                ]  # [main_file, pareto_file]

            # Assign filename to the correct slot
            if is_pareto:
                file_pairs[protein_name][metric][1] = filename
            else:
                file_pairs[protein_name][metric][0] = filename

    # Define the subdirectory
    subdirectory = "concat"

    # Merge datasets
    for protein, metrics in file_pairs.items():
      for metric, files in metrics.items():
        main_file, pareto_file = files
        if main_file and pareto_file:  # Check if both files are available
          df_main = pd.read_csv(os.path.join(directory, main_file))
          df_pareto = pd.read_csv(os.path.join(directory, pareto_file))
          # Merge the main and pareto datasets
          df_merged = pd.concat([df_main, df_pareto], ignore_index=True)
          # Optional: Save the merged dataset to a new CSV file in the subdirectory
          output_filename = f"{metric}_concat_{protein}.csv"
          output_path = os.path.join(directory, subdirectory, output_filename)
          os.makedirs(os.path.dirname(output_path), exist_ok=True)
          df_merged.to_csv(output_path, index=False)
          print(f"Merged dataset saved to {output_path}")


if __name__ == "__main__":
    # Specify the directory containing your CSV files
    directory = "./"
    merge_datasets(directory)

Merged dataset saved to ./concat/strain_enrichment_metrics_concat_CXCR4.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_CXCR4.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_CXCR4.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_CRFR1.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_CRFR1.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_CRFR1.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_OPRK.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_OPRK.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_OPRK.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_ACM2.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_ACM2.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_ACM2.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_OPRM.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_OPRM.csv
Merged dataset save

In [5]:
import pandas as pd
import os

pd.set_option("display.max_rows", 200)

# Get all files in the 'concat' subdirectory of the current working directory
files = os.listdir("concat")

print(files)
# Step 1: Identify Unique Proteins

proteins = set(file.split("_")[3] for file in files if len(file.split("_")) >= 4)

# Initialize an empty DataFrame for the final combined data
combined_df = pd.DataFrame()

# Step 2 & 3: Process each protein
for protein in proteins:
    # Initialize empty lists to store DataFrames for each type
    enrichment_metrics_dfs = []
    log_aucs_dfs = []
    roc_metrics_dfs = []
    
    # Filter files for the current protein and process according to type
    for file in files:
        if protein in file:
            file_path = os.path.join("concat", file)  # Include the 'concat' directory in the file path
            if "enrichment_metrics" in file:
                enrichment_metrics_dfs.append(pd.read_csv(file_path))
            elif "log_aucs" in file:
                log_aucs_dfs.append(pd.read_csv(file_path))
            elif "roc_metrics" in file:
                roc_metrics_dfs.append(pd.read_csv(file_path))

    # Concatenate DataFrames of the same type
    enrichment_df = pd.concat(enrichment_metrics_dfs)
    log_aucs_df = pd.concat(log_aucs_dfs)
    roc_metrics_df = pd.concat(roc_metrics_dfs)

    # Step 4: Merge the three DataFrames for each protein
    merged_df = enrichment_df.merge(
        log_aucs_df, on=["Protein", "Strain Energy Cutoff"]
    ).merge(roc_metrics_df, on=["Protein", "Strain Energy Cutoff"])

    # Append to the final DataFrame
    combined_df = pd.concat([combined_df, merged_df], ignore_index=True)

# Resulting combined_df contains all data
display(combined_df)

# Save the final DataFrame to a CSV file
# combined_df.to_csv("combined_data.csv", index=False)

data = combined_df

['strain_enrichment_metrics_concat_CCR5.csv', 'strain_roc_metrics_concat_CXCR4.csv', 'strain_roc_metrics_concat_CRFR1.csv', 'strain_roc_metrics_concat_OPRX.csv', 'strain_log_aucs_concat_AA2AR.csv', 'strain_enrichment_metrics_concat_OPRK.csv', 'strain_roc_metrics_concat_SMO.csv', 'strain_enrichment_metrics_concat_CRFR1.csv', 'strain_roc_metrics_concat_OPRM.csv', 'strain_enrichment_metrics_concat_CXCR4.csv', 'strain_log_aucs_concat_ADRB1.csv', 'strain_enrichment_metrics_concat_OPRM.csv', 'strain_enrichment_metrics_concat_OPRX.csv', 'strain_log_aucs_concat_ACM2.csv', 'strain_roc_metrics_concat_OPRK.csv', 'strain_log_aucs_concat_ACM3.csv', 'strain_roc_metrics_concat_CCR5.csv', 'strain_log_aucs_concat_ADRB2.csv', 'strain_enrichment_metrics_concat_ADRB2.csv', 'strain_roc_metrics_concat_ADRB1.csv', 'strain_log_aucs_concat_OPRM.csv', 'strain_log_aucs_concat_OPRX.csv', 'strain_enrichment_metrics_concat_ADRB1.csv', 'strain_enrichment_metrics_concat_ACM2.csv', 'strain_roc_metrics_concat_ADRB2.csv

Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,CCR5,No Cutoff,2.439024,3.902439,0.000000,0.000000,1.343819,0.000000,0.483312,205,10379,0.000000
1,CCR5,No Cutoff,2.439024,3.902439,0.000000,0.000000,1.343819,0.000000,0.483312,205,10379,0.000000
2,CCR5,No Cutoff,2.439024,3.902439,0.000000,0.000000,1.343819,0.000000,0.483312,205,10379,0.000000
3,CCR5,No Cutoff,2.439024,3.902439,0.000000,0.000000,1.343819,0.000000,0.483312,205,10379,0.000000
4,CCR5,No Cutoff,2.439024,3.902439,0.000000,0.000000,1.343819,0.000000,0.483312,205,10379,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
391,P2Y12,7.0,21.296296,50.000000,11.002179,4.411765,4.585583,0.327116,0.872398,108,7946,0.000347
392,P2Y12,7.5,21.008403,50.420168,10.714286,4.831933,4.554007,0.295540,0.866973,119,8530,-0.005077
393,P2Y12,8.0,19.696970,50.000000,9.402852,4.411765,4.490139,0.231672,0.865426,132,9067,-0.006625
394,P2Y12,Top 10 Pareto Ranks,0.000000,0.000000,-10.294118,-45.588235,2.707243,-1.551224,0.826221,23,347,-0.045830


In [9]:
# Assuming df is your DataFrame
df = data.drop_duplicates(subset=["Protein", "Strain Energy Cutoff"])
display(df)
#save to directory above cwd
df.to_csv("../combined_data.csv", index=False)

Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,CCR5,No Cutoff,2.439024,3.902439,0.000000,0.000000,1.343819,0.000000,0.483312,205,10379,0.000000
8,CCR5,4,4.166667,4.166667,1.727642,0.264228,1.291297,-0.052522,0.499054,24,2756,0.015742
9,CCR5,4.5,2.380952,2.380952,-0.058072,-1.521487,1.218683,-0.125136,0.498375,42,3455,0.015062
10,CCR5,5.0,5.084746,8.474576,2.645721,4.572137,1.461704,0.117885,0.528920,59,4167,0.045608
11,CCR5,5.5,4.166667,8.333333,1.727642,4.430894,1.491911,0.148092,0.530043,72,4843,0.046730
...,...,...,...,...,...,...,...,...,...,...,...,...
391,P2Y12,7.0,21.296296,50.000000,11.002179,4.411765,4.585583,0.327116,0.872398,108,7946,0.000347
392,P2Y12,7.5,21.008403,50.420168,10.714286,4.831933,4.554007,0.295540,0.866973,119,8530,-0.005077
393,P2Y12,8.0,19.696970,50.000000,9.402852,4.411765,4.490139,0.231672,0.865426,132,9067,-0.006625
394,P2Y12,Top 10 Pareto Ranks,0.000000,0.000000,-10.294118,-45.588235,2.707243,-1.551224,0.826221,23,347,-0.045830


In [8]:
# list unique names in the 'Protein' column
unique_proteins = df["Protein"].unique()
# alphabetically sorted
unique_proteins.sort()
print(unique_proteins)

['AA2AR' 'ACM2' 'ACM3' 'ADRB1' 'ADRB2' 'CCR5' 'CRFR1' 'CXCR4' 'DRD3'
 'GPR40' 'HRH1' 'MGLUR1' 'MGLUR5' 'OPRD' 'OPRK' 'OPRM' 'OPRX' 'OX2R'
 'P2Y12' 'PAR1' 'S1PR1' 'SMO']
