In [1]:
import os
import pandas as pd


def merge_datasets(directory):
    # Dictionary to hold pairs of filenames: {protein_name: {metric: [cutoff_file, pareto_file]}}
    file_pairs = {}

    # List all CSV files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            # Determine if the file is a pareto file
            is_pareto = "_pareto_" in filename
            # Extract the metric and protein name from the filename
            parts = filename.replace("_pareto", "").split("_")
            protein_name = parts[-1].split(".")[0]
            metric = "_".join(parts[:-1])

            # Initialize dictionary structure
            if protein_name not in file_pairs:
                file_pairs[protein_name] = {}
            if metric not in file_pairs[protein_name]:
                file_pairs[protein_name][metric] = [
                    None,
                    None,
                ]  # [main_file, pareto_file]

            # Assign filename to the correct slot
            if is_pareto:
                file_pairs[protein_name][metric][1] = filename
            else:
                file_pairs[protein_name][metric][0] = filename

    # Define the subdirectory
    subdirectory = "concat"

    # Merge datasets
    for protein, metrics in file_pairs.items():
      for metric, files in metrics.items():
        main_file, pareto_file = files
        if main_file and pareto_file:  # Check if both files are available
          df_main = pd.read_csv(os.path.join(directory, main_file))
          df_pareto = pd.read_csv(os.path.join(directory, pareto_file))
          # Merge the main and pareto datasets
          df_merged = pd.concat([df_main, df_pareto], ignore_index=True)
          # Optional: Save the merged dataset to a new CSV file in the subdirectory
          output_filename = f"{metric}_concat_{protein}.csv"
          output_path = os.path.join(directory, subdirectory, output_filename)
          os.makedirs(os.path.dirname(output_path), exist_ok=True)
          df_merged.to_csv(output_path, index=False)
          print(f"Merged dataset saved to {output_path}")


if __name__ == "__main__":
    # Specify the directory containing your CSV files
    directory = "./"
    merge_datasets(directory)

Merged dataset saved to ./concat/strain_log_aucs_concat_PPARG-3b1m.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_PPARG-3b1m.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_PPARG-3b1m.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_ESR1ago-2qzo.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_ESR1ago-2qzo.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_ESR1ago-2qzo.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_FEN1-5fv7.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_FEN1-5fv7.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_FEN1-5fv7.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_ESR1ant-2iog.csv
Merged dataset saved to ./concat/strain_roc_metrics_concat_ESR1ant-2iog.csv
Merged dataset saved to ./concat/strain_enrichment_metrics_concat_ESR1ant-2iog.csv
Merged dataset saved to ./concat/strain_log_aucs_concat_OPRK1-6b73.csv
Merged dataset s

In [2]:
import pandas as pd
import os

pd.set_option("display.max_rows", 200)

# Get all files in the 'concat' subdirectory of the current working directory
files = os.listdir("concat")

print(files)
# Step 1: Identify Unique Proteins

proteins = set(file.split("_")[3] for file in files if len(file.split("_")) >= 4)

# Initialize an empty DataFrame for the final combined data
combined_df = pd.DataFrame()

# Step 2 & 3: Process each protein
for protein in proteins:
    # Initialize empty lists to store DataFrames for each type
    enrichment_metrics_dfs = []
    log_aucs_dfs = []
    roc_metrics_dfs = []
    
    # Filter files for the current protein and process according to type
    for file in files:
        if protein in file:
            file_path = os.path.join("concat", file)  # Include the 'concat' directory in the file path
            if "enrichment_metrics" in file:
                enrichment_metrics_dfs.append(pd.read_csv(file_path))
            elif "log_aucs" in file:
                log_aucs_dfs.append(pd.read_csv(file_path))
            elif "roc_metrics" in file:
                roc_metrics_dfs.append(pd.read_csv(file_path))

    # Concatenate DataFrames of the same type
    # Drop duplicates to avoid merge issues
    enrichment_df = pd.concat(enrichment_metrics_dfs)
    enrichment_df.drop_duplicates(inplace=True)
    log_aucs_df = pd.concat(log_aucs_dfs)
    log_aucs_df.drop_duplicates(inplace=True)
    roc_metrics_df = pd.concat(roc_metrics_dfs)
    roc_metrics_df.drop_duplicates(inplace=True)

    # Step 4: Merge the three DataFrames for each protein
    merged_df = enrichment_df.merge(
        log_aucs_df, on=["Protein", "Strain Energy Cutoff"]
    ).merge(roc_metrics_df, on=["Protein", "Strain Energy Cutoff"])

    # Append to the final DataFrame
    combined_df = pd.concat([combined_df, merged_df], ignore_index=True)

print(combined_df.shape)
print(merged_df.shape)
display(merged_df.head(10))
# Resulting combined_df contains all data
display(combined_df)

# Save the final DataFrame to a CSV file
combined_df.to_csv("combined_data.csv", index=False)

data = combined_df

['strain_roc_metrics_concat_IDH1-4umx.csv', 'strain_log_aucs_concat_KAT2A-5mlj.csv', 'strain_log_aucs_concat_ADRB2-4lde.csv', 'strain_enrichment_metrics_concat_VDR-3a2j.csv', 'strain_enrichment_metrics_concat_PKM2-3gr4.csv', 'strain_log_aucs_concat_ESR1ago-2qzo.csv', 'strain_log_aucs_concat_OPRK1-6b73.csv', 'strain_log_aucs_concat_ALDH1-5l2m.csv', 'strain_enrichment_metrics_concat_PPARG-3b1m.csv', 'strain_enrichment_metrics_concat_ESR1ant-2iog.csv', 'strain_log_aucs_concat_IDH1-4umx.csv', 'strain_log_aucs_concat_MAPK1-4zzn.csv', 'strain_enrichment_metrics_concat_FEN1-5fv7.csv', 'strain_roc_metrics_concat_PPARG-3b1m.csv', 'strain_roc_metrics_concat_ESR1ago-2qzo.csv', 'strain_log_aucs_ADRB2_concat_4lde.csv', 'strain_log_aucs_concat_VDR-3a2j.csv', 'strain_log_aucs_concat_MTORC1-4dri.csv', 'strain_roc_metrics_concat_VDR-3a2j.csv', 'strain_enrichment_metrics_concat_TP53-3zme.csv', 'strain_log_aucs_concat_FEN1-5fv7.csv', 'strain_roc_metrics_concat_KAT2A-5mlj.csv', 'strain_enrichment_metrics_

Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,VDR-3a2j,No Cutoff,0.43592,3.225806,0.0,0.0,1.171234,0.0,0.440936,1147,513419,0.0
1,VDR-3a2j,4,0.877193,4.035088,0.441273,0.809281,1.406952,0.235718,0.497745,570,168079,0.056809
2,VDR-3a2j,4.5,1.083591,4.024768,0.647672,0.798961,1.368004,0.196769,0.489612,646,197462,0.048676
3,VDR-3a2j,5.0,0.848656,3.818953,0.412737,0.593147,1.342305,0.17107,0.486711,707,226948,0.045775
4,VDR-3a2j,5.5,0.794702,3.84106,0.358782,0.615253,1.30295,0.131716,0.478476,755,255588,0.03754
5,VDR-3a2j,6.0,0.625,3.5,0.18908,0.274194,1.267673,0.096438,0.469186,800,282833,0.02825
6,VDR-3a2j,7.0,0.682594,3.299204,0.246674,0.073397,1.234507,0.063273,0.459781,879,332423,0.018845
7,VDR-3a2j,7.5,0.655738,3.497268,0.219818,0.271461,1.223051,0.051816,0.456846,915,354023,0.015911
8,VDR-3a2j,8.0,0.636267,3.605514,0.200347,0.379708,1.219942,0.048707,0.454677,943,373842,0.013741
9,VDR-3a2j,Top 10 Pareto Ranks,0.0,0.0,-0.43592,-3.225806,0.011935,-1.1593,0.00821,1,610,-0.432726


Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,ADRB2-4lde,No Cutoff,18.181818,27.272727,0.0,0.0,3.179881,0.0,0.653074,33,456283,0.0
1,ADRB2-4lde,4,7.142857,21.428571,-11.038961,-5.844156,2.709591,-0.47029,0.672234,14,128515,0.019161
2,ADRB2-4lde,4.5,6.666667,20.0,-11.515152,-7.272727,2.53001,-0.64987,0.653962,15,154406,0.000889
3,ADRB2-4lde,5.0,5.882353,29.411765,-12.299465,2.139037,2.821649,-0.358231,0.682582,17,180628,0.029508
4,ADRB2-4lde,5.5,5.882353,23.529412,-12.299465,-3.743316,2.799646,-0.380235,0.67428,17,206464,0.021207
5,ADRB2-4lde,6.0,14.285714,33.333333,-3.896104,6.060606,3.423574,0.243693,0.715477,21,231728,0.062404
6,ADRB2-4lde,7.0,14.814815,29.62963,-3.367003,2.356902,3.211432,0.031551,0.68709,27,278751,0.034016
7,ADRB2-4lde,7.5,14.285714,28.571429,-3.896104,1.298701,3.102725,-0.077155,0.679338,28,299676,0.026264
8,ADRB2-4lde,8.0,14.285714,25.0,-3.896104,-2.272727,3.06123,-0.118651,0.676062,28,319263,0.022989
9,ADRB2-4lde,Top 10 Pareto Ranks,0.0,0.0,-18.181818,-27.272727,2.877513,-0.302368,0.858559,2,557,0.205485


In [4]:
# list unique names in the 'Protein' column
unique_proteins = combined_df["Protein"].unique()
# alphabetically sorted
unique_proteins.sort()
print(unique_proteins)

['ADRB2-4lde' 'ADRB2_4lde' 'ALDH1-5l2m' 'ESR1ago-2qzo' 'ESR1ant-2iog'
 'FEN1-5fv7' 'GBA-2v3d' 'IDH1-4umx' 'KAT2A-5mlj' 'MAPK1-4zzn'
 'MTORC1-4dri' 'OPRK1-6b73' 'PKM2-3gr4' 'PPARG-3b1m' 'TP53-3zme'
 'VDR-3a2j']


In [4]:
pd.set_option("display.max_rows", 300)

display(data)

Unnamed: 0,Protein,Strain Energy Cutoff,EF1%,EF5%,deltaEF1%,deltaEF5%,Linear Log10 AUC (x10),Delta Linear Log10 AUC (x10),ROC_AUC,Actives,Total Count,deltaAUC
0,CCR5,No Cutoff,2.912621,4.368932,0.0,0.0,1.372233,0.0,0.485772,206,10381,0.0
1,CCR5,4,8.0,8.0,5.087379,3.631068,1.384735,0.012502,0.519078,25,2757,0.033306
2,CCR5,4.5,4.651163,4.651163,1.738541,0.282231,1.272428,-0.099805,0.510033,43,3456,0.024262
3,CCR5,5.0,6.666667,10.0,3.754045,5.631068,1.518871,0.146638,0.536768,60,4168,0.050996
4,CCR5,5.5,5.479452,9.589041,2.566831,5.220109,1.549993,0.17776,0.536478,73,4844,0.050706
5,CCR5,6.0,4.545455,7.954545,1.632833,3.585613,1.453773,0.08154,0.523602,88,5556,0.03783
6,CCR5,7.0,4.310345,6.034483,1.397723,1.665551,1.521585,0.149352,0.510734,116,6797,0.024962
7,CCR5,7.5,3.846154,5.384615,0.933532,1.015683,1.471464,0.099232,0.502175,130,7299,0.016404
8,CCR5,8.0,3.472222,6.25,0.559601,1.881068,1.471278,0.099045,0.502503,144,7781,0.016731
9,CCR5,Top 10 Pareto Ranks,0.0,62.5,-2.912621,58.131068,4.532142,3.159909,0.777859,8,349,0.292087
