In [1]:
import pandas as pd

In [2]:
# Prerequisite: run the ImmunoMTL evaluation scripts first:
# python3 eval_ImmunoMTL.py
# python3 eval_ImmunoMTL_shuffle.py
# python3 eval_ImmunoSTL.py

In [13]:
import os
import pandas as pd

def load_and_clean_predictions(directory=".", out="."):
    """Load every CSV in `directory`, strip ImmunoMTL-only columns and re-save it.

    The columns "ImmunoMTL_score" and "MMS_Cluster" are removed when present;
    all other columns are kept as read.

    Parameters
    ----------
    directory : str
        Folder scanned (non-recursively) for files whose name ends in ".csv".
    out : str
        Folder the cleaned files are written to, under their original file
        name and without the index column. It is created if missing. When
        `out` is the same folder as `directory`, the source files are
        overwritten by their cleaned versions.

    Returns
    -------
    dict
        Maps each file name to its cleaned DataFrame. Files that could not be
        read are skipped; a file that was read but could not be saved is still
        included. Both kinds of failure are printed, not raised.
    """
    prediction_dfs = {}

    # Without the destination folder every to_csv below would fail. An empty
    # `out` means "current directory" for os.path.join, but os.makedirs("")
    # raises, hence the guard.
    if out:
        try:
            os.makedirs(out, exist_ok=True)
        except OSError as e:
            print(f"[ERROR] Cannot create output directory {out}: {e}")

    # os.listdir returns names in arbitrary order; sorting makes the processing
    # order (and the order of the returned dict) reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        try:
            df = pd.read_csv(filepath)
        except Exception as e:
            print(f"[ERROR] Failed to load {filename}: {e}")
            continue

        # Drop columns if they exist
        df = df.drop(columns=["ImmunoMTL_score", "MMS_Cluster"], errors="ignore")
        prediction_dfs[filename] = df

        cleaned_path = os.path.join(out, filename)
        try:
            df.to_csv(cleaned_path, index=False)
        except Exception as e:
            # Reported separately from load errors so the message points at
            # the real problem (e.g. an unwritable destination).
            print(f"[ERROR] Failed to save {cleaned_path}: {e}")
    return prediction_dfs

# === Usage ===
# Read the ImmunoMTL result tables and write the cleaned copies one level up,
# where the other tools pick them up as input.
cleaned_predictions = load_and_clean_predictions(
    "../pred_results/immunomtl",
    out="../pred_results",
)

# Preview: file name, shape and first rows of every cleaned table
for name in cleaned_predictions:
    df = cleaned_predictions[name]
    print(f"\n{name}", df.shape, df.head(), sep="\n")


mRNA.csv
(222, 3)
       Peptide          MHC  Label
0  ITVNASRPQPF  HLA-A*32:01      0
1     RLAAAVRF  HLA-A*32:01      0
2  AVMHLDHSDTI  HLA-A*32:01      0
3  VSYANSCANPV  HLA-C*17:01      0
4   MSAEVNLAGL  HLA-C*17:01      1

zero1.csv
(1172, 3)
     Peptide          MHC  Label
0  FYVFDEPLL  HLA-A*24:07      0
1  RYARTIFNF  HLA-A*24:07      0
2  IFKDSTMHI  HLA-A*24:07      0
3  LFYVYYNLF  HLA-A*24:07      0
4  VYYNLFLLF  HLA-A*24:07      0

BenchmarkSet.csv
(2495, 3)
     Peptide          MHC  Label
0   FLKEKGGL  HLA-B*08:01      1
1  ELRRKMMYM  HLA-B*08:01      1
2  QIKVRVDMV  HLA-B*08:01      1
3  FLRGRAYGL  HLA-B*08:01      1
4  HSKKKCDEL  HLA-B*08:01      1

zero2.csv
(77, 3)
      Peptide          MHC  Label
0    RRFFPYYV  HLA-B*27:03      1
1  FLPSDFFPSV  HLA-A*02:17      1
2   WLSLLVPFV  HLA-A*02:17      1
3   YLPGVIAAI  HLA-A*02:17      1
4    GPISGHVL  HLA-B*81:01      1


In [14]:
# BigMHC
# conda env: pbert
# Prerequisite: BigMHC must be installed beforehand
# 2135  python3 predict.py -o ../../ImmunoMTL/pred_results/bigmhc/BenchmarkSet_bigmhc.csv -i ../../ImmunoMTL/pred_results/BenchmarkSet.csv -m IM -d all -a 1 -p 0 
# 2136  python3 predict.py -o ../../ImmunoMTL/pred_results/bigmhc/mRNA_bigmhc.csv -i ../../ImmunoMTL/pred_results/mRNA.csv -m IM -d all -a 1 -p 0 
# 2139  python3 predict.py -o ../../ImmunoMTL/pred_results/bigmhc/zero1_bigmhc.csv -i ../../ImmunoMTL/pred_results/zero1.csv -m IM -d all -a 1 -p 0 
# 2140  python3 predict.py -o ../../ImmunoMTL/pred_results/bigmhc/zero2_bigmhc.csv -i ../../ImmunoMTL/pred_results/zero2.csv -m IM -d all -a 1 -p 0


In [15]:
#for munis

# Build the munis input tables from the cleaned predictions: rename the
# peptide/MHC columns to "pep"/"mhc" and add empty "left"/"right" columns
# directly after "mhc".
munis_dir = "../pred_results/munis"
# Create the target folder up front; otherwise every to_csv below fails and
# is only reported as a per-file "Failed to process" message.
os.makedirs(munis_dir, exist_ok=True)

processed = {}
for name, df in cleaned_predictions.items():
    try:
        # Rename columns (returns a new frame, so cleaned_predictions is untouched)
        df = df.rename(columns={"Peptide": "pep", "MHC": "mhc"})

        # Create 'left' and 'right' columns
        df.insert(loc=df.columns.get_loc("mhc") + 1, column="left", value="")
        df.insert(loc=df.columns.get_loc("left") + 1, column="right", value="")

        processed[name] = df
        print(f"Processed: {name}")
        print(df.head())
        # NOTE(review): the index is written as an unnamed first column here,
        # unlike the other to_csv calls in this notebook (index=False) -
        # confirm whether the munis input is expected to carry it.
        df.to_csv(os.path.join(munis_dir, name))
    except Exception as e:
        print(f"Failed to process {name}: {e}")


Processed: mRNA.csv
           pep          mhc left right  Label
0  ITVNASRPQPF  HLA-A*32:01                 0
1     RLAAAVRF  HLA-A*32:01                 0
2  AVMHLDHSDTI  HLA-A*32:01                 0
3  VSYANSCANPV  HLA-C*17:01                 0
4   MSAEVNLAGL  HLA-C*17:01                 1
Processed: zero1.csv
         pep          mhc left right  Label
0  FYVFDEPLL  HLA-A*24:07                 0
1  RYARTIFNF  HLA-A*24:07                 0
2  IFKDSTMHI  HLA-A*24:07                 0
3  LFYVYYNLF  HLA-A*24:07                 0
4  VYYNLFLLF  HLA-A*24:07                 0
Processed: BenchmarkSet.csv
         pep          mhc left right  Label
0   FLKEKGGL  HLA-B*08:01                 1
1  ELRRKMMYM  HLA-B*08:01                 1
2  QIKVRVDMV  HLA-B*08:01                 1
3  FLRGRAYGL  HLA-B*08:01                 1
4  HSKKKCDEL  HLA-B*08:01                 1
Processed: zero2.csv
          pep          mhc left right  Label
0    RRFFPYYV  HLA-B*27:03                 1
1  FLPSDFFPSV  H

In [16]:
# munis
# conda env: munis
# Prerequisite: munis must be installed beforehand
# 2072  python predict.py --outdir ../ImmunoMTL/pred_results/munis/ --peptides ../ImmunoMTL/pred_results/munis/BenchmarkSet.csv 
# 2073  python predict.py --outdir ../ImmunoMTL/pred_results/munis/ --peptides ../ImmunoMTL/pred_results/munis/mRNA.csv 
# 2075  python predict.py --outdir ../ImmunoMTL/pred_results/munis/ --peptides ../ImmunoMTL/pred_results/munis/zero1.csv 
# 2076  python predict.py --outdir ../ImmunoMTL/pred_results/munis/ --peptides ../ImmunoMTL/pred_results/munis/zero2.csv


In [25]:
#PRIME
#cd bin 
#python3 PRIME_predict.py --input ~/bin/ImmunoMTL/pred_results/zero1.csv --hla MHC --pep Peptide --l Label --out ~/bin/ImmunoMTL/pred_results/prime2/zero1_prime.csv
#python3 PRIME_predict.py --input ~/bin/ImmunoMTL/pred_results/zero2.csv --hla MHC --pep Peptide --l Label --out ~/bin/ImmunoMTL/pred_results/prime2/zero2_prime.csv
#python3 PRIME_predict.py --input ~/bin/ImmunoMTL/pred_results/BenchmarkSet.csv --hla MHC --pep Peptide --l Label --out ~/bin/ImmunoMTL/pred_results/prime2/BenchmarkSet_prime.csv
#python3 PRIME_predict.py --input ~/bin/ImmunoMTL/pred_results/mRNA.csv --hla MHC --pep Peptide --l Label --out ~/bin/ImmunoMTL/pred_results/prime2/mRNA_prime.csv

In [26]:
#netMHCpan
# 2037  python3 BA_predict.py --input ~/bin/ImmunoMTL/pred_results/mRNA.csv --hla MHC --pep Peptide --l Label --out ~/bin/ImmunoMTL/pred_results/netMHCpan4.1b/mRNA_netMHCpan.csv

In [17]:
import os
import pandas as pd
import numpy as np

# === CONFIG ===
# Folder holding each tool's per-dataset prediction CSVs.
folders = {
    "immunostl": "../pred_results/immunostl",
    "munis": "../pred_results/munis",
    "deepimmuno": "../pred_results/deepimmuno",
    "bigmhc": "../pred_results/bigmhc",
    "prime2": "../pred_results/prime2",
    "shuffle": "../pred_results/immunomtl_shuffle",
}

# Output column name -> (key into `folders`, file name suffix after the
# dataset name, name of the score column inside that tool's CSV).
external_tools = {
    "immunostl": ("immunostl", "", "Predicted Score"),
    "BigMHC_IM": ("bigmhc", "_bigmhc", "BigMHC_IM"),
    "munis": ("munis", "_munis_predictions", "score"),
    "PRIME_score": ("prime2", "_prime", "PRIME_score"),
    "ImmunoMTL_shuffle": ("shuffle", "", "Predicted Score"),
}

datasets = ["zero1", "zero2", "BenchmarkSet"]
output_dir = "../analysis/"
os.makedirs(output_dir, exist_ok=True)

# For every dataset: start from the ImmunoMTL result table, attach one score
# column per external tool (matched on peptide+MHC), and save the combined
# table to `output_dir`.
for dataset in datasets:
    print(f"[INFO] Processing {dataset}")
    df_base = pd.read_csv(os.path.join("../pred_results/immunomtl", f"{dataset}.csv"))

    # Ensure necessary columns
    if "ImmunoMTL_score" not in df_base.columns:
        df_base["ImmunoMTL_score"] = np.nan
    if "Peptide" not in df_base.columns or "MHC" not in df_base.columns:
        raise ValueError("Input files must contain 'Peptide' and 'MHC' columns")

    # Join key: peptide sequence immediately followed by the MHC name.
    df_base["pMHC"] = df_base["Peptide"] + df_base["MHC"]

    # Add scores from external tools
    for key, (folder_key, suffix, col) in external_tools.items():
        file_path = os.path.join(folders[folder_key], f"{dataset}{suffix}.csv")
        if not os.path.exists(file_path):
            print(f"[INFO] {key} not found for {dataset}")
            df_base[key] = np.nan
            continue

        try:
            ext_df = pd.read_csv(file_path)

            # Tools name the peptide/MHC columns differently; build the same
            # join key from whichever pair is present.
            if "pep" in ext_df.columns and "mhc" in ext_df.columns:
                ext_df["pMHC"] = ext_df["pep"] + ext_df["mhc"]
            elif "Peptide" in ext_df.columns and "MHC" in ext_df.columns:
                ext_df["pMHC"] = ext_df["Peptide"] + ext_df["MHC"]
            else:
                print(f"[WARN] {key} missing peptide/MHC columns for pMHC construction")
                df_base[key] = np.nan
                continue

            # The tool's score column is always renamed to the tool key.
            if col not in ext_df.columns:
                print(f"[WARN] Cannot find suitable column to rename for {key}")
                df_base[key] = np.nan
                continue

            # Keep one score per pMHC on the right-hand side: a left merge
            # repeats base rows for every matching right row, so a repeated
            # pMHC in the tool file would inflate the row count.
            ext_scores = (
                ext_df.rename(columns={col: key})[["pMHC", key]]
                .drop_duplicates(subset="pMHC")
            )
            df_base = df_base.merge(ext_scores, on="pMHC", how="left")
        except Exception as e:
            print(f"[WARN] {key} column issue in {file_path}: {e}")
            df_base[key] = np.nan

    # Save individual file
    print(df_base.shape)
    output_path = os.path.join(output_dir, f"{dataset}_pred.csv")
    df_base.to_csv(output_path, index=False)
    print(f"[INFO] Saved to {output_path}")

[INFO] Processing zero1
[INFO] ImmunoMTL_shuffle not found for zero1
(1172, 11)
[INFO] Saved to ../analysis/zero1_pred.csv
[INFO] Processing zero2
[INFO] ImmunoMTL_shuffle not found for zero2
(77, 11)
[INFO] Saved to ../analysis/zero2_pred.csv
[INFO] Processing BenchmarkSet
(2495, 11)
[INFO] Saved to ../analysis/BenchmarkSet_pred.csv


In [11]:
#Vali pool
def _read_row_aligned(path, n_rows):
    """Read a tool's prediction CSV and confirm it has exactly `n_rows` rows.

    The score columns below are copied over by row position, so a file with a
    different number of rows cannot be lined up with the base table.

    Raises
    ------
    ValueError
        If the file's row count differs from `n_rows`.
    """
    tool_df = pd.read_csv(path)
    if len(tool_df) != n_rows:
        raise ValueError(
            f"{path} has {len(tool_df)} rows but the base table has {n_rows}; "
            "cannot copy scores by row position"
        )
    return tool_df


# Base table: ImmunoMTL predictions for the mRNA set, plus the peptide+MHC key.
mRNA_result = pd.read_csv("../pred_results/immunomtl/mRNA.csv")
mRNA_result["pMHC"] = mRNA_result["Peptide"] + mRNA_result["MHC"]
n_mRNA = len(mRNA_result)

# Every frame here comes from read_csv with the default index, so the column
# assignments below line rows up by position.
# NOTE(review): an equal row count does not prove equal row order - confirm
# that each tool writes its predictions in input order.
mRNA_result_bigmhc = _read_row_aligned("../pred_results/bigmhc/mRNA_bigmhc.csv", n_mRNA)
mRNA_result["BigMHC_IM"] = mRNA_result_bigmhc["BigMHC_IM"]

mRNA_result_prime = _read_row_aligned("../pred_results/prime2/mRNA_prime.csv", n_mRNA)
mRNA_result["PRIME_score"] = mRNA_result_prime["PRIME_score"]
mRNA_result["PRIME_rank"] = mRNA_result_prime["PRIME_rank"]

mRNA_result_munis = _read_row_aligned("../pred_results/munis/mRNA_munis_predictions.csv", n_mRNA)
mRNA_result["munis"] = mRNA_result_munis["score"]

mRNA_result_netMHCpan = _read_row_aligned("../pred_results/netMHCpan4.1b/mRNA_netMHCpan.csv", n_mRNA)
mRNA_result["netMHCpan_score"] = mRNA_result_netMHCpan["EL-score"]
mRNA_result["netMHCpan_rank"] = mRNA_result_netMHCpan["EL_Rank"]

#mRNA_result.to_csv("../analysis/mRNA_rank.csv")

In [12]:
# Attach the patient ID to every mRNA prediction row, matched on peptide+MHC,
# and save the combined table.
patient_info = pd.read_csv("../data/mRNAvaccine_pID.csv")
patient_info["pMHC"] = patient_info["Peptide"] + patient_info["MHC"]

# Drop a patientID column left over from an earlier run of this cell, so that
# re-executing it does not produce patientID_x / patientID_y.
# NOTE(review): a left merge repeats a prediction row once per matching
# patient_info row - confirm whether a pMHC can occur for several patients.
mRNA_result = pd.merge(
    mRNA_result.drop(columns=["patientID"], errors="ignore"),
    patient_info[["pMHC", "patientID"]],
    on="pMHC",
    how="left",
)
mRNA_result.to_csv("../analysis/mRNA_pred.csv", index=False)
mRNA_result

Unnamed: 0,Peptide,MHC,MMS_Cluster,Label,ImmunoMTL_score,pMHC,BigMHC_IM,PRIME_score,PRIME_rank,munis,netMHCpan_score,netMHCpan_rank,patientID
0,ITVNASRPQPF,HLA-A*32:01,0.0,0,9.296952e-01,ITVNASRPQPFHLA-A*32:01,0.000514,0.002020,19.170,0.151489,0.0126,4.6814,1
1,RLAAAVRF,HLA-A*32:01,0.0,0,2.207086e-08,RLAAAVRFHLA-A*32:01,0.006755,0.004967,8.816,0.271729,0.0734,1.5100,5
2,AVMHLDHSDTI,HLA-A*32:01,0.0,0,4.035173e-01,AVMHLDHSDTIHLA-A*32:01,0.000256,0.014185,3.145,0.095215,0.0201,3.6059,5
3,VSYANSCANPV,HLA-C*17:01,0.0,0,3.250802e-07,VSYANSCANPVHLA-C*17:01,0.028055,0.000734,37.307,0.005650,0.0019,14.5042,1
4,MSAEVNLAGL,HLA-C*17:01,0.0,1,4.604495e-02,MSAEVNLAGLHLA-C*17:01,0.100420,0.018708,2.849,0.228760,0.1320,1.5382,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,FFPYNYHSQK,HLA-C*04:01,3.0,0,1.400116e-03,FFPYNYHSQKHLA-C*04:01,0.331144,0.011502,3.973,0.008041,0.0012,12.7056,14
218,TEYKLVVVGAC,HLA-C*04:01,3.0,0,9.890347e-01,TEYKLVVVGACHLA-C*04:01,0.035441,0.000607,41.408,0.005188,0.0000,61.6667,20
219,SFASFQRDLF,HLA-C*04:01,3.0,0,7.486498e-12,SFASFQRDLFHLA-C*04:01,0.140121,0.011137,4.101,0.176514,0.0053,5.4123,20
220,SSDGNLNEL,HLA-C*08:02,3.0,0,1.820668e-01,SSDGNLNELHLA-C*08:02,0.276676,0.284666,0.003,0.997559,0.9733,0.0079,29
