In [1]:
import os
import pandas as pd


def merge_datasets(directory):
    # Dictionary to hold pairs of filenames: {protein_name: {metric: [cutoff_file, pareto_file]}}
    file_pairs = {}

    # List all CSV files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            # Determine if the file is a pareto file
            is_pareto = "_pareto_" in filename
            # Extract the metric and protein name from the filename
            parts = filename.replace("_pareto", "").split("_")
            protein_name = parts[-1].split(".")[0]
            metric = "_".join(parts[:-1])

            # Initialize dictionary structure
            if protein_name not in file_pairs:
                file_pairs[protein_name] = {}
            if metric not in file_pairs[protein_name]:
                file_pairs[protein_name][metric] = [
                    None,
                    None,
                ]  # [main_file, pareto_file]

            # Assign filename to the correct slot
            if is_pareto:
                file_pairs[protein_name][metric][1] = filename
            else:
                file_pairs[protein_name][metric][0] = filename

    # Define the subdirectory
    subdirectory = "concat"

    # Merge datasets
    for protein, metrics in file_pairs.items():
      for metric, files in metrics.items():
        main_file, pareto_file = files
        if main_file and pareto_file:  # Check if both files are available
          df_main = pd.read_csv(os.path.join(directory, main_file))
          df_pareto = pd.read_csv(os.path.join(directory, pareto_file))
          # Merge the main and pareto datasets
          df_merged = pd.concat([df_main, df_pareto], ignore_index=True)
          # Optional: Save the merged dataset to a new CSV file in the subdirectory
          output_filename = f"{metric}_concat_{protein}.csv"
          output_path = os.path.join(directory, subdirectory, output_filename)
          os.makedirs(os.path.dirname(output_path), exist_ok=True)
          df_merged.to_csv(output_path, index=False)
          print(f"Merged dataset saved to {output_path}")


if __name__ == "__main__":
    # Specify the directory containing your CSV files
    directory = "./"
    merge_datasets(directory)

Merged dataset saved to ./concat/strain_enrichment_metrics_concat_CXCR4.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_CXCR4.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_CXCR4.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_CRFR1.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_CRFR1.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_CRFR1.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_OPRK.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_OPRK.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_OPRK.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_ACM2.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_ACM2.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_ACM2.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_OPRM.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_OPRM.csv
Merged dataset save

In [2]:
import pandas as pd
import os

pd.set_option("display.max_rows", 200)

# Get all files in the 'concat' subdirectory of the current working directory
files = os.listdir("concat")

print(files)
# Step 1: Identify Unique Proteins

proteins = set(file.split("_")[3] for file in files if len(file.split("_")) >= 4)

# Initialize an empty DataFrame for the final combined data
combined_df = pd.DataFrame()

# Step 2 & 3: Process each protein
for protein in proteins:
    # Initialize empty lists to store DataFrames for each type
    enrichment_metrics_dfs = []
    log_aucs_dfs = []
    roc_metrics_dfs = []
    
    # Filter files for the current protein and process according to type
    for file in files:
        if protein in file:
            file_path = os.path.join("concat", file)  # Include the 'concat' directory in the file path
            if "enrichment_metrics" in file:
                enrichment_metrics_dfs.append(pd.read_csv(file_path))
            elif "log_aucs" in file:
                log_aucs_dfs.append(pd.read_csv(file_path))
            elif "roc_metrics" in file:
                roc_metrics_dfs.append(pd.read_csv(file_path))

    # Concatenate DataFrames of the same type
    # Drop duplicates to avoid merge issues
    enrichment_df = pd.concat(enrichment_metrics_dfs)
    enrichment_df.drop_duplicates(inplace=True)
    log_aucs_df = pd.concat(log_aucs_dfs)
    log_aucs_df.drop_duplicates(inplace=True)
    roc_metrics_df = pd.concat(roc_metrics_dfs)
    roc_metrics_df.drop_duplicates(inplace=True)

    # Step 4: Merge the three DataFrames for each protein
    merged_df = enrichment_df.merge(
        log_aucs_df, on=["Protein", "Strain Energy Cutoff"]
    ).merge(roc_metrics_df, on=["Protein", "Strain Energy Cutoff"])

    # Append to the final DataFrame
    combined_df = pd.concat([combined_df, merged_df], ignore_index=True)

print(combined_df.shape)
print(merged_df.shape)
display(merged_df.head(10))
# Resulting combined_df contains all data
display(combined_df)

# Save the final DataFrame to a CSV file
combined_df.to_csv("combined_data.csv", index=False)

data = combined_df

['strain_enrichment_metrics_concat_CCR5.csv', 'strain_roc_metrics_concat_CXCR4.csv', 'strain_roc_metrics_concat_CRFR1.csv', 'strain_roc_metrics_concat_OPRX.csv', 'strain_log_aucs_concat_AA2AR.csv', 'strain_enrichment_metrics_concat_OPRK.csv', 'strain_roc_metrics_concat_SMO.csv', 'strain_enrichment_metrics_concat_CRFR1.csv', 'strain_roc_metrics_concat_OPRM.csv', 'strain_enrichment_metrics_concat_CXCR4.csv', 'strain_log_aucs_concat_ADRB1.csv', 'strain_enrichment_metrics_concat_OPRM.csv', 'strain_enrichment_metrics_concat_OPRX.csv', 'strain_log_aucs_concat_ACM2.csv', 'strain_roc_metrics_concat_OPRK.csv', 'strain_log_aucs_concat_ACM3.csv', 'strain_roc_metrics_concat_CCR5.csv', 'strain_log_aucs_concat_ADRB2.csv', 'strain_enrichment_metrics_concat_ADRB2.csv', 'strain_roc_metrics_concat_ADRB1.csv', 'strain_log_aucs_concat_OPRM.csv', 'strain_log_aucs_concat_OPRX.csv', 'strain_enrichment_metrics_concat_ADRB1.csv', 'strain_enrichment_metrics_concat_ACM2.csv', 'strain_roc_metrics_concat_ADRB2.csv

Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,CCR5,No Cutoff,2.912621,4.368932,0.0,0.0,1.372233,0.0,0.485772,206,10381,0.0
1,CCR5,4,8.0,8.0,5.087379,3.631068,1.384735,0.012502,0.519078,25,2757,0.033306
2,CCR5,4.5,4.651163,4.651163,1.738541,0.282231,1.272428,-0.099805,0.510033,43,3456,0.024262
3,CCR5,5.0,6.666667,10.0,3.754045,5.631068,1.518871,0.146638,0.536768,60,4168,0.050996
4,CCR5,5.5,5.479452,9.589041,2.566831,5.220109,1.549993,0.17776,0.536478,73,4844,0.050706
5,CCR5,6.0,4.545455,7.954545,1.632833,3.585613,1.453773,0.08154,0.523602,88,5556,0.03783
6,CCR5,7.0,4.310345,6.034483,1.397723,1.665551,1.521585,0.149352,0.510734,116,6797,0.024962
7,CCR5,7.5,3.846154,5.384615,0.933532,1.015683,1.471464,0.099232,0.502175,130,7299,0.016404
8,CCR5,8.0,3.472222,6.25,0.559601,1.881068,1.471278,0.099045,0.502503,144,7781,0.016731
9,CCR5,Top 10 Pareto Ranks,0.0,62.5,-2.912621,58.131068,4.532142,3.159909,0.777859,8,349,0.292087


Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,CCR5,No Cutoff,2.912621,4.368932,0.000000,0.000000,1.372233,0.000000,0.485772,206,10381,0.000000
1,CCR5,4,8.000000,8.000000,5.087379,3.631068,1.384735,0.012502,0.519078,25,2757,0.033306
2,CCR5,4.5,4.651163,4.651163,1.738541,0.282231,1.272428,-0.099805,0.510033,43,3456,0.024262
3,CCR5,5.0,6.666667,10.000000,3.754045,5.631068,1.518871,0.146638,0.536768,60,4168,0.050996
4,CCR5,5.5,5.479452,9.589041,2.566831,5.220109,1.549993,0.177760,0.536478,73,4844,0.050706
...,...,...,...,...,...,...,...,...,...,...,...,...
237,P2Y12,7.0,22.018349,50.458716,12.262251,4.605057,4.617227,0.341858,0.873457,109,7948,0.000863
238,P2Y12,7.5,21.666667,50.833333,11.910569,4.979675,4.582612,0.307244,0.867979,120,8532,-0.004616
239,P2Y12,8.0,20.300752,50.375940,10.544654,4.522281,4.515195,0.239826,0.866340,133,9069,-0.006255
240,P2Y12,Top 10 Pareto Ranks,0.000000,4.166667,-9.756098,-41.686992,2.847280,-1.428088,0.836458,24,344,-0.036136


In [3]:
# list unique names in the 'Protein' column
unique_proteins = df["Protein"].unique()
# alphabetically sorted
unique_proteins.sort()
print(unique_proteins)

NameError: name 'df' is not defined

In [4]:
pd.set_option("display.max_rows", 300)

display(data)

Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,CCR5,No Cutoff,2.912621,4.368932,0.0,0.0,1.372233,0.0,0.485772,206,10381,0.0
1,CCR5,4,8.0,8.0,5.087379,3.631068,1.384735,0.012502,0.519078,25,2757,0.033306
2,CCR5,4.5,4.651163,4.651163,1.738541,0.282231,1.272428,-0.099805,0.510033,43,3456,0.024262
3,CCR5,5.0,6.666667,10.0,3.754045,5.631068,1.518871,0.146638,0.536768,60,4168,0.050996
4,CCR5,5.5,5.479452,9.589041,2.566831,5.220109,1.549993,0.17776,0.536478,73,4844,0.050706
5,CCR5,6.0,4.545455,7.954545,1.632833,3.585613,1.453773,0.08154,0.523602,88,5556,0.03783
6,CCR5,7.0,4.310345,6.034483,1.397723,1.665551,1.521585,0.149352,0.510734,116,6797,0.024962
7,CCR5,7.5,3.846154,5.384615,0.933532,1.015683,1.471464,0.099232,0.502175,130,7299,0.016404
8,CCR5,8.0,3.472222,6.25,0.559601,1.881068,1.471278,0.099045,0.502503,144,7781,0.016731
9,CCR5,Top 10 Pareto Ranks,0.0,62.5,-2.912621,58.131068,4.532142,3.159909,0.777859,8,349,0.292087
