In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
import bisect
#import altair as alt
pd.options.mode.chained_assignment = None
from Bio import SeqIO

In [2]:
# Scaling factor for entrapment calculations, based on the ratio between the number of entrapment peptide
# sequences and the number of human peptide sequences in the database.
# NOTE(review): the two literals below are presumably those two peptide counts; which one is which is not
# stated here -- confirm against the database before reusing this value.

scaling_factor = 1 + 3105275 / 3608159
scaling_factor

1.8606258759661092

In [3]:
# Local path to files downloaded from PRIDE
data_path = "D:\\PIP_ECHO_PRIDE\\"

# Read the entrapment peptide sequence file; every line of the file becomes one entry.
entrapment_seq_path = data_path + r"Proteomes\EntrapmentProteinPeptideSequences_50percent.txt"
with open(entrapment_seq_path) as f:
    entrapment_seqs = f.readlines()

# Remove the newline characters and collapse to a set for fast membership tests (`isin`).
entrapment_seqs_set = {line.replace("\n", "") for line in entrapment_seqs}


In [4]:
# Build a dictionary linking protein accessions to their species of origin (For MaxQuant analysis)
# The accession is the second '|'-separated field of each FASTA record id.
# Files are processed in a fixed order; an accession present in more than one file keeps the label
# of the file read last.
# Each FASTA file is opened in a `with` block so the file handle is closed as soon as its records are read.
accession_dict = dict()

with open(data_path + r"Proteomes\uniprot_SCerevisiae_6k_03_2024.fasta") as fasta_handle:
    yeast_fasta_sequences = SeqIO.parse(fasta_handle,'fasta')
    yeast_records = list(yeast_fasta_sequences)
for record in yeast_records:
    accession_dict[record.id.split('|')[1]] = 'Yeast'

with open(data_path + r"Proteomes\uniprot_HSapiens_80k_03_2024.fasta") as fasta_handle:
    human_fasta_sequences = SeqIO.parse(fasta_handle,'fasta')
    human_records = list(human_fasta_sequences)
for record in human_records:
    accession_dict[record.id.split('|')[1]] = 'Human'

# The concatenated human + entrapment database is also labelled 'Human'; both the short accession
# and the full record id are registered for these records.
with open(data_path + r"Proteomes\ConcatenatedHumanEntrapmentProteins_50percent.fasta") as fasta_handle:
    human_fasta_sequences = SeqIO.parse(fasta_handle,'fasta')
    human_records = list(human_fasta_sequences)
for record in human_records:
    accession_dict[record.id.split('|')[1]] = 'Human'
    accession_dict[record.id] = 'Human'

with open(data_path + r"Proteomes\uniprot_Ecoli_4k_03_2024.fasta") as fasta_handle:
    ecoli_fasta_sequences = SeqIO.parse(fasta_handle,'fasta')
    ecoli_records = list(ecoli_fasta_sequences)
for record in ecoli_records:
    accession_dict[record.id.split('|')[1]] = 'Ecoli'

In [5]:
def get_fdp_old_flash(o_peaks_path, c_peaks_path, censored_psm_path, human_file_pattern = "_1x02nguL_",
                   foreign_species = 'Yeast', rt_delta = 0.25):
    """Summarise MBR transfer errors for a peaks file that has no "Organism" column.

    Non-decoy peaks from files matching `human_file_pattern` are labelled with organisms via
    `get_organisms`. The MBR peaks are then counted as human, entrapment (human peaks whose
    "Full Sequence" is in `entrapment_seqs_set`) or foreign (`foreign_species` without a human
    accession, excluding sequences that were also detected by MS/MS in these files).

    Parameters
    ----------
    o_peaks_path : path to the tab-separated peaks file of the original analysis.
    c_peaks_path : path to the peaks file of the censored analysis (forwarded to `get_nper_old_flash`).
    censored_psm_path : path to the censored PSM table (forwarded to `get_nper_old_flash`).
    human_file_pattern : pattern passed to pandas `str.contains` to select the files to evaluate.
    foreign_species : organism label (as produced by `get_organisms`) treated as foreign.
    rt_delta : retention-time tolerance forwarded to `get_nper_old_flash`.

    Returns
    -------
    dict with the counts ("human", "foreign", "arabida", "total"), the scaled values
    ("scaled_arabida", "scaled_nper"), "sensitivity", the percentages "fper", "nper", "fder"
    (each relative to "total") and their sum "false_discovery_proportion".
    """
    peaks = pd.read_csv(o_peaks_path, sep = '\t')

    peaks = peaks.loc[~peaks["Protein Group"].str.contains("DECOY")]
    # .copy() so that adding the "Organism" column writes to an independent frame, not a slice
    peaks = peaks.loc[peaks["File Name"].str.contains(human_file_pattern)].copy()
    peaks["Organism"] = peaks.apply(get_organisms, axis = 1)

    # Foreign sequences that were also identified by MS/MS in the selected files
    msms_foreign_mask = ((peaks["Peak Detection Type"] == "MSMS") &
                         (peaks["Organism"].str.contains(foreign_species)))
    msms_yeast_seqs = set(peaks.loc[msms_foreign_mask, "Full Sequence"].tolist())

    peaks = peaks.loc[(peaks["Peak Detection Type"] == "MBR")]

    human_peaks = peaks.loc[peaks["Organism"].str.contains('Human')]
    # Our entrapment database has a bunch of scrambled sequences appended to the end of the DB, which is why we're checking for them
    # NOTE(review): entrapment peaks are matched on "Full Sequence" while genuine human peaks are
    # selected on "Base Sequence"; a peak whose base sequence is in the set but whose full sequence
    # is not is counted in neither group -- confirm this is intended.
    entrapment_peaks = human_peaks.loc[human_peaks["Full Sequence"].isin(entrapment_seqs_set)]
    human_peaks = human_peaks.loc[~human_peaks["Base Sequence"].isin(entrapment_seqs_set)]

    human_count = human_peaks.shape[0]
    arabida_count = entrapment_peaks.shape[0]

    yeast_peaks = peaks.loc[peaks["Organism"].str.contains(foreign_species)]
    yeast_peaks = yeast_peaks.loc[~yeast_peaks["Organism"].str.contains('Human')]
    yeast_count = yeast_peaks.shape[0]

    # Foreign MBR peaks whose sequence was also seen by MS/MS are not counted as foreign transfers
    msms_yeast_count = sum(yeast_peaks["Full Sequence"].isin(msms_yeast_seqs))
    yeast_count = yeast_count - msms_yeast_count

    scaled_arabida = arabida_count * scaling_factor
    # calculate the error rate for (presumably) native human transfers
    nper_results = get_nper_old_flash(censored_psm_path, o_peaks_path, c_peaks_path, rt_delta)
    scaled_nper = human_count * nper_results["error_rate"]

    total = human_count + yeast_count + arabida_count
    # Same guard as get_fdp_flash: avoid dividing by zero when no MBR peak falls in any category
    total = max(1, total)

    fper = 100 * yeast_count / total
    nper = 100 * scaled_nper / total
    fder = 100 * scaled_arabida / total

    false_discovery_proportion = fper + nper + fder

    results = dict(
        {
            "human": human_count,
            "foreign": yeast_count,
            "arabida": arabida_count,
            "total": total,
            "scaled_arabida": scaled_arabida,
            "scaled_nper": scaled_nper,
            "sensitivity": nper_results["sensitivity"],
            "fper": fper,
            "nper": nper,
            "fder": fder,
            "false_discovery_proportion": false_discovery_proportion
        }
    )
    return results

def get_organisms(row):
    """Translate a row's ';'-separated "Protein Group" accessions into a ';'-joined organism string.

    Accessions found in `accession_dict` contribute their mapped organism; other accessions that
    contain "ENTRAPMENT" contribute "Arabida"; everything else is skipped.
    """
    def lookup(accession):
        # Known accessions take priority over the "ENTRAPMENT" substring check
        if accession in accession_dict:
            return accession_dict[accession]
        if "ENTRAPMENT" in accession:
            return "Arabida"
        return None

    labels = (lookup(accession) for accession in row["Protein Group"].split(";"))
    return ";".join(label for label in labels if label is not None)

def get_nper_old_flash(censored_path, old_peak_path, new_peak_path, rt_delta = 0.25):
    """Estimate how often MBR re-finds censored peptides at the retention time seen in the original run.

    Parameters
    ----------
    censored_path : tab-separated table of the censored PSMs ("File Name", "Full Sequence", ...).
    old_peak_path : peaks file of the original (uncensored) analysis.
    new_peak_path : peaks file of the analysis run on the censored data.
    rt_delta : apex retention-time tolerance passed to `check_overlap_time`.

    Returns
    -------
    dict with
      "sensitivity": unique (file, sequence) pairs recovered by MBR divided by the unique censored
                     pairs that had a peak in the original analysis;
      "error_rate":  disagreeing transfers divided by (agreeing + disagreeing) transfers.
    Both values are 0 when no censored peak could be joined to an MBR peak.
    """
    # Read in the list of peptides that were censored
    censored_psms = pd.read_csv(censored_path, sep = '\t')
    # Select the ones that were quantified in the initial FlashLFQ analysis
    original_peaks = pd.read_csv(old_peak_path, sep = '\t')
    original_peaks = original_peaks[original_peaks["Peak RT Apex"] != "-"]
    censored_peaks = pd.merge(censored_psms, original_peaks, how = "inner", left_on=["File Name", "Full Sequence"], right_on=["File Name", "Full Sequence"])
    censored_peaks = censored_peaks[["File Name", "Full Sequence", "Peak RT Start", "Peak RT Apex", "Peak RT End", "Peak Charge", "Peak intensity"]]
    # This will allow for comparison later, as all the new peaks were taken from files named in this way
    censored_peaks["File Name"] = censored_peaks["File Name"].astype(str) + "-censored"

    # Merge the old and new results
    new_peaks = pd.read_csv(new_peak_path, sep = '\t')
    new_peaks = new_peaks.loc[(new_peaks["Peak Detection Type"] == "MBR")]
    new_peaks = new_peaks[["File Name", "Full Sequence", "Peak RT Start", "Peak RT Apex", "Peak RT End", "Peak Charge",  "Peak intensity", "MBR Score"]]
    peak_join = pd.merge(censored_peaks, new_peaks, how = "inner", left_on=["File Name", "Full Sequence"], right_on=["File Name", "Full Sequence"])

    # Nothing to compare. This check must come before the sensitivity division below: an empty
    # join includes the case where there are no censored peaks at all (zero denominator).
    if(peak_join.shape[0] == 0):
        return dict({
            "sensitivity": 0,
            "error_rate": 0} )

    peak_join_no_dup = peak_join.drop_duplicates(subset = ["File Name", "Full Sequence"], keep = 'first')
    censored_peaks_no_dup = censored_peaks.drop_duplicates(subset = ["File Name", "Full Sequence"], keep = 'first')
    sensitivity = peak_join_no_dup.shape[0] / censored_peaks_no_dup.shape[0]

    # compare the old and new results (1 means they match, 0 means they don't, -1 means not comparable)
    peak_join["Agreement"] = peak_join.apply(lambda x: check_overlap_time(x, rt_delta), axis = 1)

    # Running counts ordered by decreasing MBR score; the last row holds the overall error rate
    peak_join.sort_values(by = 'MBR Score', ascending=False, inplace=True)
    peak_join["good_transfers"] = (peak_join["Agreement"] == 1).cumsum()
    peak_join["bad_transfers"] = (peak_join["Agreement"] == 0).cumsum()
    peak_join["error_rate"] = peak_join.apply(calculateErrorRate, axis = 1)

    results = dict(
        {
            "sensitivity": sensitivity,
            "error_rate": peak_join["error_rate"].iloc[-1]
        }

    )

    return results

# Native Peak Error Rate functions
def check_overlap_time(table, rt_delta = 0.25):
    """Compare the two apex retention times of a joined peak row.

    Parameters
    ----------
    table : mapping-like row providing "Peak RT Apex_x" and "Peak RT Apex_y".
    rt_delta : maximum absolute apex difference (exclusive) still counted as agreement.

    Returns
    -------
    1 if the apexes differ by less than `rt_delta`, 0 if they do not, and -1 if either
    value is missing or cannot be converted to a float.
    """
    try:
        pip_apex = float(table["Peak RT Apex_y"])
        ms_apex = float(table["Peak RT Apex_x"])
    except (KeyError, TypeError, ValueError):
        # Only conversion/lookup failures mean "not comparable"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return -1
    if(abs(pip_apex-ms_apex) < rt_delta):
        return 1
    else:
        return 0
    
    
def calculateErrorRate(table):
    """Return bad transfers as a fraction of all (good + bad) transfers for one row."""
    bad = table["bad_transfers"]
    good = table["good_transfers"]
    return bad / (good + bad)

In [6]:
def get_fdp_flash(o_peaks_path, c_peaks_path, censored_psm_path, human_file_pattern = "_1x02nguL_",
                   foreign_species = 'Saccharomyces cerevisiae', q_value = 0.05, rt_delta = 0.5):
    """Summarise MBR transfer errors for a peaks file that carries "Organism", "PIP Q-Value",
    "Decoy Peptide" and "Random RT" columns.

    Only files matching `human_file_pattern` are considered. MBR peaks passing `q_value` that are
    neither decoys nor random-RT transfers are counted as human, entrapment (human peaks whose
    "Full Sequence" is in `entrapment_seqs_set`) or foreign (`foreign_species` without
    'Homo sapiens', minus sequences also detected by MS/MS). The native error rate comes from
    `get_native_peak_error_rate`.

    Returns a dict with the counts ("human", "foreign", "arabida", "total", "msms_count"), the
    scaled values ("scaled_arabida", "scaled_nper"), "sensitivity", the percentages "fper",
    "nper", "fder" and their sum "false_discovery_proportion".
    """
    all_peaks = pd.read_csv(o_peaks_path, sep = '\t')
    all_peaks = all_peaks.loc[all_peaks["File Name"].str.contains(human_file_pattern)]

    # Row masks on the file-filtered table
    is_msms = all_peaks["Peak Detection Type"] == "MSMS"
    is_target = all_peaks["Decoy Peptide"] == False
    is_foreign = all_peaks["Organism"].str.contains(foreign_species)
    is_human = all_peaks["Organism"].str.contains('Homo sapiens')
    is_arabidopsis = all_peaks["Organism"].str.contains('Arabidopsis')

    # Unique foreign sequences that were identified by MS/MS
    msms_yeast_seqs = all_peaks.loc[is_msms & is_foreign, "Full Sequence"].drop_duplicates(keep = 'first').tolist()

    # Number of non-decoy MS/MS peaks from any of the three organisms
    msms_count = all_peaks.loc[is_msms & is_target & (is_foreign | is_human | is_arabidopsis)].shape[0]

    # Accepted MBR peaks: below the q-value threshold, not decoys, not random-RT transfers
    accepted = ((all_peaks["Peak Detection Type"] == "MBR")
                & (all_peaks["PIP Q-Value"] < q_value)
                & is_target
                & (all_peaks["Random RT"] == False))
    mbr_peaks = all_peaks.loc[accepted]

    human_like = mbr_peaks.loc[mbr_peaks["Organism"].str.contains('Homo sapiens')]
    # Our entrapment database has a bunch of scrambled sequences appended to the end of the DB, which is why we're checking for them
    entrapment_peaks = human_like.loc[human_like["Full Sequence"].isin(entrapment_seqs_set)]
    human_peaks = human_like.loc[~human_like["Base Sequence"].isin(entrapment_seqs_set)]

    human_count = len(human_peaks)
    arabida_count = len(entrapment_peaks)

    yeast_peaks = mbr_peaks.loc[mbr_peaks["Organism"].str.contains(foreign_species)]
    yeast_peaks = yeast_peaks.loc[~yeast_peaks["Organism"].str.contains('Homo sapiens')]
    # Foreign transfers whose sequence was also seen by MS/MS are not counted
    yeast_count = yeast_peaks.shape[0] - sum(yeast_peaks["Full Sequence"].isin(msms_yeast_seqs))

    score_threshold = mbr_peaks["PIP PEP"].max()

    # calculate the error rate for (presumably) native human transfers
    nper_results = get_native_peak_error_rate(
        o_peaks_path = o_peaks_path,
        c_peaks_path = c_peaks_path,
        censored_psm_path = censored_psm_path,
        q_value_threshold = q_value,
        score_threshold = score_threshold,
        rt_delta = rt_delta)

    scaled_arabida = arabida_count * scaling_factor
    scaled_nper = human_count * nper_results["error_rate"]

    # Never divide by zero when no peak falls in any category
    total = max(1, human_count + yeast_count + arabida_count)

    fper = 100 * yeast_count / total
    nper = 100 * scaled_nper / total
    fder = 100 * scaled_arabida / total

    return {
        "human": human_count,
        "foreign": yeast_count,
        "arabida": arabida_count,
        "total": total,
        "msms_count" : msms_count,
        "scaled_arabida": scaled_arabida,
        "scaled_nper": scaled_nper,
        "sensitivity": nper_results["sensitivity"],
        "fper": fper,
        "nper": nper,
        "fder": fder,
        "false_discovery_proportion": fper + nper + fder,
    }

# Native Peak Error Rate functions
def check_overlap_time(table, rt_delta = 0.25):
    """Compare the two apex retention times of a joined peak row.

    NOTE: a function with this name is also defined in the previous cell; whichever cell runs
    last provides the definition in use.

    Parameters
    ----------
    table : mapping-like row providing "Peak RT Apex_x" and "Peak RT Apex_y".
    rt_delta : maximum absolute apex difference (exclusive) still counted as agreement.

    Returns
    -------
    1 if the apexes differ by less than `rt_delta`, 0 if they do not, and -1 if either
    value is missing or cannot be converted to a float.
    """
    try:
        pip_apex = float(table["Peak RT Apex_y"])
        ms_apex = float(table["Peak RT Apex_x"])
    except (KeyError, TypeError, ValueError):
        # Only conversion/lookup failures mean "not comparable"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return -1
    if(abs(pip_apex-ms_apex) < rt_delta):
        return 1
    else:
        return 0
    
def calculateErrorRate(table):
    """Return bad transfers as a fraction of all (good + bad) transfers for one row."""
    n_bad = table["bad_transfers"]
    n_all = table["good_transfers"] + n_bad
    return n_bad / n_all

def get_native_peak_error_rate(o_peaks_path, c_peaks_path, censored_psm_path, q_value_threshold, score_threshold=10, rt_delta = 0.25):
    """Estimate how often MBR re-finds censored peptides at the retention time seen in the original run.

    Parameters
    ----------
    o_peaks_path : peaks file of the original (uncensored) analysis.
    c_peaks_path : peaks file of the analysis run on the censored data.
    censored_psm_path : tab-separated table of the censored PSMs ("File Name", "Full Sequence", ...).
    q_value_threshold : MBR peaks must have "PIP Q-Value" strictly below this value.
    score_threshold : accepted for backward compatibility; no score filter is currently applied.
    rt_delta : apex retention-time tolerance passed to `check_overlap_time`.

    Returns
    -------
    dict with
      "sensitivity": unique (file, sequence) pairs recovered by MBR divided by the unique censored
                     pairs that had a peak in the original analysis;
      "error_rate":  disagreeing transfers divided by (agreeing + disagreeing) transfers.
    Both values are 0 when no censored peak could be joined to an accepted MBR peak.
    """
    # Read in the list of peptides that were censored
    censored_psms = pd.read_csv(censored_psm_path, sep = '\t')
    # Select the ones that were quantified in the initial FlashLFQ analysis
    original_peaks = pd.read_csv(o_peaks_path, sep = '\t')
    original_peaks = original_peaks[original_peaks["Peak RT Apex"] != "-"]

    censored_peaks = pd.merge(censored_psms, original_peaks, how = "inner", left_on=["File Name", "Full Sequence"], right_on=["File Name", "Full Sequence"])
    censored_peaks = censored_peaks[["File Name", "Full Sequence", "Peak RT Start", "Peak RT Apex", "Peak RT End", "Peak Charge", "Peak intensity"]]

    # This will allow for comparison later, as all the new peaks were taken from files named in this way
    censored_peaks["File Name"] = censored_peaks["File Name"].astype(str) + "-censored"

    # Merge the old and new results
    new_peaks = pd.read_csv(c_peaks_path, sep = '\t')
    new_peaks = new_peaks.loc[(new_peaks["Peak Detection Type"] == "MBR") & (new_peaks["Random RT"] == False) & (new_peaks["Decoy Peptide"] == False)]
    new_peaks = new_peaks.loc[(new_peaks["PIP Q-Value"] < q_value_threshold)]
    new_peaks = new_peaks[["File Name", "Full Sequence", "Peak RT Start", "Peak RT Apex", "Peak RT End", "Peak Charge",  "Peak intensity", "PIP PEP", 'PIP Q-Value']]
    peak_join = pd.merge(censored_peaks, new_peaks, how = "inner", left_on=["File Name", "Full Sequence"], right_on=["File Name", "Full Sequence"])

    # Nothing to compare. This check must come before the sensitivity division below: an empty
    # join includes the case where there are no censored peaks at all (zero denominator).
    if(peak_join.shape[0] == 0):
        return dict({
            "sensitivity": 0,
            "error_rate": 0} )

    # compare the old and new results (1 means they match, 0 means they don't, -1 means not comparable)
    peak_join["Agreement"] = peak_join.apply(lambda x: check_overlap_time(x, rt_delta), axis = 1)
    # Sort so that, per (file, sequence), the row kept by drop_duplicates is the best-agreeing one
    peak_join.sort_values(by = "Agreement", ascending=False, inplace=True)

    peak_join.drop_duplicates(subset = ["File Name", "Full Sequence"], keep = 'first', inplace=True)
    censored_peaks.drop_duplicates(subset = ["File Name", "Full Sequence"], keep = 'first', inplace=True)
    sensitivity = peak_join.shape[0] / censored_peaks.shape[0]

    # Running counts ordered by increasing q-value; the last row holds the overall error rate
    peak_join.sort_values(by = 'PIP Q-Value', ascending=True, inplace=True)
    peak_join["good_transfers"] = (peak_join["Agreement"] == 1).cumsum()
    peak_join["bad_transfers"] = (peak_join["Agreement"] == 0).cumsum()
    peak_join["error_rate"] = peak_join.apply(calculateErrorRate, axis = 1)

    results = dict(
        {
            "sensitivity": sensitivity,
            "error_rate": peak_join["error_rate"].iloc[-1]
        }
    )

    return results


In [7]:
def get_fdp_fragger(o_folder = r"D:\GygiTwoProteome_PXD014415\IonQuant1Percent",
                      c_folder= r"D:\GygiTwoProteome_PXD014415\IonQuant1Percent_censored",
                      censored_psm_path = r"D:\GygiTwoProteome_PXD014415\CensoredDataFiles_fragger\CensoredPsms.tsv",
                      foreign_species = "YEAST",
                      human_file_pattern = "_human_90min_",
                      rt_diff_threshold = 0.5):
    """Summarise MBR transfer errors for an IonQuant result folder and its censored counterpart.

    Ion counts come from `count_ions_by_species` (on combined_modified_peptide.tsv in `o_folder`)
    and the native error estimate from `analyze_nper`. `rt_diff_threshold` is given in minutes and
    converted to seconds before being passed on.

    Returns the count dict extended with "scaled_arabida", "scaled_nper", "specificity",
    "sensitivity", "nper_pct_diff", the percentages "fper", "nper", "fder" and their sum
    "false_discovery_proportion".
    """
    # Convert to seconds, which is how iq stores RT
    rt_threshold_seconds = rt_diff_threshold * 60

    # Positions of the files of interest in each folder's experiment annotation
    rep_indices_original = get_indices(o_folder, human_file_pattern)
    rep_indices_censored = get_indices(c_folder, human_file_pattern)

    ion_count = count_ions_by_species(o_folder + r"\combined_modified_peptide.tsv",
                                      match_indices = rep_indices_original,
                                      foreign_species = foreign_species)
    nper_results = analyze_nper(o_folder = o_folder,
                                c_folder = c_folder,
                                censored_psm_path = censored_psm_path,
                                human_rep_indices = rep_indices_original,
                                human_rep_indices_censored = rep_indices_censored,
                                rt_diff_threshold = rt_threshold_seconds)

    ion_count.update({
        "scaled_arabida": ion_count["arabida"] * scaling_factor,
        "scaled_nper": ion_count["human"] * nper_results["specificity"],
        "specificity": nper_results["specificity"],
        "sensitivity": nper_results["sensitivity"],
        "nper_pct_diff": nper_results["intensity_diff"],
    })

    # Express each error class as a percentage of all counted MBR ions
    total = ion_count["total"]
    ion_count["fper"] = 100 * ion_count["foreign"]  / total
    ion_count["nper"] = 100 * ion_count["scaled_nper"]  / total
    ion_count["fder"] = 100 * ion_count["scaled_arabida"]  / total
    ion_count["false_discovery_proportion"] = ion_count["fper"] + ion_count["nper"] + ion_count["fder"]

    return ion_count
    
def get_indices(folder_path, pattern = "_human_90min_"):
    """Return the row labels of experiment_annotation.tsv whose "file" value matches `pattern`.

    Parameters
    ----------
    folder_path : folder containing the tab-separated experiment_annotation.tsv.
    pattern : pattern passed to pandas `str.contains` (a regular expression by default).

    Returns
    -------
    pandas Index of the matching rows (0-based positions, since the file is read with the
    default index).
    """
    # os.path.join instead of a hard-coded "\" so the lookup also works outside Windows
    exp_file_path = os.path.join(folder_path, "experiment_annotation.tsv")
    exp_file = pd.read_csv(exp_file_path, sep = '\t')
    human_indices = exp_file[exp_file["file"].str.contains(pattern)].index
    return human_indices

def analyze_nper(o_folder,
                      c_folder,
                      censored_psm_path,
                      human_rep_indices,
                      human_rep_indices_censored = None,
                      rt_diff_threshold = 30):
    """Compare IonQuant MBR ions from the censored run with MS/MS ions from the original run.

    Parameters
    ----------
    o_folder : result folder of the original analysis (experiment_annotation.tsv, combined_ion.tsv).
    c_folder : result folder of the analysis run on the censored files.
    censored_psm_path : tab-separated table of censored PSMs ("Spectrum File", "Modified Peptide").
    human_rep_indices : positions of the samples to use among the per-sample columns of the
        original combined_ion.tsv.
    human_rep_indices_censored : the same for the censored combined_ion.tsv; defaults to
        `human_rep_indices` when omitted.
    rt_diff_threshold : apex RT difference above which a transfer counts as an error
        (same unit as the "Apex Retention Time" columns).

    Returns
    -------
    dict with
      "sensitivity":    censored MBR ions with an RT divided by original MS/MS ions with an RT;
      "specificity":    fraction of compared ions whose RT difference exceeds the threshold
                        (i.e. the error fraction, despite the name);
      "intensity_diff": mean percent intensity difference of those erroneous ions.

    Raises
    ------
    ZeroDivisionError if no ion can be compared between the two runs.
    """
    # Fall back to the original run's indices only when none were supplied for the censored run.
    # (The previous test, `not any(human_rep_indices)`, inspected the wrong argument: it replaced
    # supplied censored indices whenever the original indices were empty or [0], and left None
    # in place otherwise.)
    if human_rep_indices_censored is None:
        human_rep_indices_censored = human_rep_indices

    # Map bare file names (no directory, no extension) to sample names for the original run
    o_annotation_path = os.path.join(o_folder, "experiment_annotation.tsv")
    annotation_file = pd.read_csv(o_annotation_path, sep = '\t')
    file_dict_o = dict(zip(annotation_file.file, annotation_file.sample_name))
    original_file_dict = dict()
    for old_key in file_dict_o.keys():
        new_key = old_key.split("\\")[-1].split(".")[0]
        original_file_dict[new_key] = file_dict_o[old_key]

    # Same for the censored run, with the "-censored" tag removed so names match the original run
    c_annotation_path = os.path.join(c_folder, "experiment_annotation.tsv")
    annotation_file = pd.read_csv(c_annotation_path, sep = '\t')
    file_dict_c = dict(zip(annotation_file.file, annotation_file.sample_name))
    censored_file_dict = dict()
    for old_key in file_dict_c.keys():
        new_key = old_key.replace("-censored", "").split("\\")[-1].split(".")[0]
        censored_file_dict[new_key] = file_dict_c[old_key]

    # Read in the censored psms
    censored_psms = pd.read_csv(censored_psm_path, sep = '\t')
    censored_psms["File Name"] = censored_psms["Spectrum File"].apply(lambda x: x.replace("-censored", "").replace("interact-", "").split("\\")[-1].split(".")[0])
    # The mods got screwed up during writing the psms, so we fix that here
    censored_psms["Modified Peptide"] = censored_psms.apply(lambda x: x["Modified Peptide"].replace("43", "42.0106"), axis = 1)
    censored_psms["Modified Peptide"] = censored_psms.apply(lambda x: x["Modified Peptide"].replace("147", "15.9949"), axis = 1)
    censored_psms["Modified Peptide"] = censored_psms.apply(lambda x: x["Modified Peptide"].replace("C", "C[57.0215]"), axis = 1)

    # Read in the original ions
    o_ion_path = os.path.join(o_folder, "combined_ion.tsv")
    original_ions = pd.read_csv(o_ion_path, sep = '\t')

    # Descriptive (non per-sample) columns, selected by position
    info_cols = original_ions.columns[[0, 1, 7,8,11,12,13,14,15,16, 17]].to_list()

    match_cols = [col for col in original_ions.columns if "Match Type" in col]
    intensity_cols = [col for col in original_ions.columns if "Intensity" in col]
    rt_cols = [col for col in original_ions.columns if "Apex Retention Time" in col]

    match_cols = [match_cols[i] for i in human_rep_indices]
    rt_cols = [rt_cols[i] for i in human_rep_indices]
    intensity_cols = [intensity_cols[i] for i in human_rep_indices]

    original_ions = original_ions[info_cols + match_cols + rt_cols + intensity_cols]

    # One row per (ion, sample): melt on the RT columns, then pull the matching
    # match type / intensity for that sample out of the wide columns
    ion_melt = pd.melt(original_ions, id_vars = info_cols + match_cols + intensity_cols, value_vars=rt_cols, var_name="RT Column", value_name="RT")
    ion_melt["Match Type"] = ion_melt.apply(get_match_type, axis = 1)
    ion_melt["Intensity"] = ion_melt.apply(get_intensity, axis = 1)
    ion_melt.drop(match_cols + intensity_cols, axis = 1, inplace = True)
    ion_melt = ion_melt.loc[ion_melt["Match Type"] == "MS/MS"]

    original_file_rev_dict = dict((v, k) for k, v in original_file_dict.items())
    ion_melt["File Name"] = ion_melt.apply(lambda x: original_file_rev_dict[x["RT Column"].split(" ")[0]], axis = 1)
    o_ion_merge = pd.merge(left = ion_melt, right=censored_psms[["File Name", "Modified Peptide"]],
                            how = "inner", left_on = ["File Name", "Modified Sequence"], right_on = ["File Name", "Modified Peptide"])
    o_ion_merge = o_ion_merge.sort_values("Intensity", ascending=False).groupby(["Modified Sequence", "File Name"], as_index = False).first() # Keep only the highest intensity charge state


    #Read in the ions derived from the censored data
    # melt it to have one row per peak (need to group rt, intensity, etc.)
    c_ion_path = os.path.join(c_folder, "combined_ion.tsv")
    censored_ions = pd.read_csv(c_ion_path, sep = '\t')
    info_cols = censored_ions.columns[[0, 1, 7,8,11,12,13,14,15,16, 17]].to_list()

    match_cols = [col for col in censored_ions.columns if "Match Type" in col]
    intensity_cols = [col for col in censored_ions.columns if "Intensity" in col]
    rt_cols = [col for col in censored_ions.columns if "Apex Retention Time" in col]

    match_cols = [match_cols[i] for i in human_rep_indices_censored]
    rt_cols = [rt_cols[i] for i in human_rep_indices_censored]
    intensity_cols = [intensity_cols[i] for i in human_rep_indices_censored]

    censored_ions = censored_ions[info_cols + match_cols + rt_cols + intensity_cols]

    c_ion_melt = pd.melt(censored_ions, id_vars = info_cols + match_cols + intensity_cols, value_vars=rt_cols, var_name="RT Column", value_name="RT")
    c_ion_melt["Match Type"] = c_ion_melt.apply(get_match_type, axis = 1)
    c_ion_melt["Intensity"] = c_ion_melt.apply(get_intensity, axis = 1)
    c_ion_melt.drop(match_cols + intensity_cols, axis = 1, inplace = True)

    censored_file_rev_dict = dict((v, k) for k, v in censored_file_dict.items())
    c_ion_melt["File Name"] = c_ion_melt.apply(lambda x: censored_file_rev_dict[x["RT Column"].split(" ")[0]], axis = 1)
    c_ion_melt = c_ion_melt.loc[c_ion_melt["Match Type"] == "MBR"]
    c_ion_merge = pd.merge(left = c_ion_melt, right=censored_psms[["File Name", "Modified Peptide"]],
                            how = "right", left_on = ["File Name", "Modified Sequence"], right_on = ["File Name", "Modified Peptide"])
    c_ion_merge = c_ion_merge.sort_values("Intensity", ascending=False).groupby(["Modified Sequence", "File Name"], as_index = False).first() # Keep only the highest intensity charge state

    # Calculate the "specificity": the fraction of ions, quantified in both runs, whose
    # apex RT moved by more than the threshold
    ion_comp = pd.merge(o_ion_merge, c_ion_merge, on = ["Modified Peptide", "File Name"], suffixes = ("_o", "_c"), how = "inner")
    ion_comp = ion_comp[ion_comp["RT_o"].notna() & ion_comp["RT_c"].notna()]
    ion_comp["RT_diff"] = abs(ion_comp["RT_o"] - ion_comp["RT_c"])
    ion_comp.sort_values("RT_diff", ascending=True, inplace=True)
    ion_comp = ion_comp.drop_duplicates(subset = ["Modified Peptide", "File Name"], keep = 'first')
    specificity = sum(ion_comp["RT_diff"] > rt_diff_threshold) / ion_comp.shape[0]

    # Percent intensity difference (relative to the mean of both intensities) of the erroneous ions
    ion_diffs = ion_comp.loc[ion_comp["RT_diff"] > rt_diff_threshold]
    pct_diffs = ion_diffs.apply(lambda x: 100 * abs(x["Intensity_o"] - x["Intensity_c"]) /
                                             ((x["Intensity_o"] + x["Intensity_c"]) / 2.0), axis = 1)

    # Calculate the sensitivity (number of MBR ions / number of ions detected w/ MSMS)
    o_ion_merge.drop_duplicates(subset = ["Modified Peptide", "File Name"], inplace = True)
    c_ion_merge.drop_duplicates(subset = ["Modified Peptide", "File Name"], inplace = True)
    number_of_ions_original = o_ion_merge.shape[0] - o_ion_merge["RT"].isna().sum()
    number_of_ions_censored = c_ion_merge.shape[0] - c_ion_merge["RT"].isna().sum()
    sensitivity = number_of_ions_censored / number_of_ions_original

    results = dict(sensitivity = sensitivity, specificity = specificity, intensity_diff = pct_diffs.mean())
    return results


# combined_ion parsing functions
def get_match_type(row):
    """Return the "<sample> Match Type" value for the sample named at the start of row["RT Column"]."""
    # The sample name is everything before the first space of the melted RT column name
    sample_name = row["RT Column"].split(" ")[0]
    column = sample_name + " Match Type"
    return row[column]

def get_intensity(row):
    """Return the "<sample> Intensity" value for the sample named at the start of row["RT Column"]."""
    # The sample name is everything before the first space of the melted RT column name
    sample_name = row["RT Column"].split(" ")[0]
    column = sample_name + " Intensity"
    return row[column]

def count_ions_by_species(path_to_combined_ion, match_indices = [0, 1, 2, 3, 4, 5, 6 ], foreign_species = "YEAST"):
    """Count MBR-quantified ions per species in a combined IonQuant table.

    Parameters
    ----------
    path_to_combined_ion : tab-separated table with per-sample "... Match Type" columns.
    match_indices : positions (among the "Match Type" columns) of the samples to analyse.
    foreign_species : substring of "Entry Name" identifying the foreign species.

    Returns
    -------
    dict with "human", "foreign", "arabida" (entrapment), their sum "total", "msms_count"
    (MS/MS rows from human, foreign or entrapment entries) and "mbr_count" (all de-duplicated
    MBR rows).
    """
    entrapment_species = "_ENT"

    # Read in combined ions
    ions = pd.read_csv(path_to_combined_ion, sep = '\t')
    # Keep informative columns (species, peptide, etc.) and "... Match Type" columns
    all_match_cols = [col for col in ions.columns if "Match Type" in col]
    # WARNING - This line is experiment dependent and should be changed based on the samples you wish to analyze
    match_cols = [all_match_cols[i] for i in match_indices]
    info_cols = ions.columns[[0, 1, 7,8,11,12,13,14,15,16, 17]].to_list()
    # use melt to create one row per sample match type
    long_ions = ions[info_cols + match_cols].melt(info_cols)

    # Foreign sequences identified by MS/MS in any selected sample
    foreign_msms_mask = (long_ions["Entry Name"].str.contains(foreign_species)) & (long_ions["value"] == "MS/MS")
    foreign_msms_seqs = set(long_ions.loc[foreign_msms_mask, "Modified Sequence"].tolist())

    # MS/MS rows belonging to human, foreign or entrapment entries
    msms_ions = long_ions.loc[long_ions.value == "MS/MS"]
    msms_entry = msms_ions["Entry Name"]
    msms_ions = msms_ions.loc[(msms_entry.str.contains("HUMAN")) | (msms_entry.str.contains(foreign_species)) | (msms_entry.str.contains(entrapment_species))]
    msms_count = msms_ions.shape[0]

    # keep only the MBR match types, one row per (sequence, sample column)
    mbr_ions = long_ions.loc[long_ions.value == "MBR"]
    mbr_ions = mbr_ions.drop_duplicates(subset = ["Modified Sequence", "variable"])

    # Count the number of ions from each species
    human_ions = mbr_ions.loc[mbr_ions["Entry Name"].str.contains("HUMAN")]
    in_entrapment_set = human_ions["Peptide Sequence"].isin(entrapment_seqs_set)
    human_ion_count = human_ions.loc[~in_entrapment_set].shape[0]
    arath_ion_count = human_ions.loc[in_entrapment_set].shape[0]

    # Foreign ions must not map to any human protein; a missing mapping counts as "no human protein"
    foreign_ions = mbr_ions.loc[mbr_ions["Entry Name"].str.contains(foreign_species)]
    mapped_proteins = foreign_ions["Mapped Proteins"].fillna('')
    foreign_ions = foreign_ions.loc[~mapped_proteins.str.contains("HUMAN")]

    # Foreign ions whose sequence was also identified by MS/MS are not counted
    foreign_double_count = sum(foreign_ions["Modified Sequence"].isin(foreign_msms_seqs))
    foreign_ion_count = foreign_ions.shape[0] - foreign_double_count

    return {
        "human" : human_ion_count,
        "foreign" : foreign_ion_count,
        "arabida" : arath_ion_count,
        "total" : human_ion_count + foreign_ion_count + arath_ion_count,
        "msms_count" : msms_count,
        "mbr_count" : mbr_ions.shape[0],
    }

In [8]:
def get_fdp_maxquant(path, foreign_species = "Yeast", human_file_pattern = "_human_90min_"):
    """Summarise MBR transfer errors for a MaxQuant evidence file.

    Counts come from `count_ions_by_species_mq`; the returned dict is that count dict extended
    with "scaled_arabida", the percentages "fper", "nper", "fder" (relative to "total") and
    their sum "false_discovery_proportion".
    """
    counts = count_ions_by_species_mq(path, foreign_species, human_file_pattern)
    total = counts["total"]

    counts["scaled_arabida"] = counts["arabida"] * scaling_factor

    counts["fper"] = 100 * counts["foreign"]  / total
    # MaxQuant doesn't support mzML files, so no censored analysis was performed, and the number of native peak errors couldn't be evaluated
    counts["nper"] = 0
    counts["fder"] = 100 * counts["scaled_arabida"]  / total
    counts["false_discovery_proportion"] = counts["fper"] + counts["nper"] + counts["fder"]

    return counts

def count_ions_by_species_mq(evidence_path = r"D:\Gygi_TwoProteomeData\combined_MaxQuant_Gygi\txt\evidence.txt", foreign_species = "Yeast",
                             human_file_pattern = "_human_90min_"):
    """Count matched (rows with a "Match score") ions per species in a MaxQuant evidence table.

    Only non-reverse rows from raw files matching `human_file_pattern` are used. Each row's
    ';'-separated "Proteins" are translated to species with `get_species`.

    Returns a dict with "human", "foreign", "arabida" (entrapment) and their sum "total".
    """
    def organisms_of(row):
        # One species label per listed protein, joined in the original order
        return ';'.join([get_species(protein) for protein in row['Proteins'].split(';')])

    evidence = pd.read_csv(evidence_path, sep = '\t')
    keep_rows = (evidence["Reverse"] != "+") & (evidence["Raw file"].str.contains(human_file_pattern))
    evidence = evidence.loc[keep_rows]
    evidence["Proteins"] = evidence["Proteins"].astype(str)
    evidence["Organism"] = evidence.apply(organisms_of, axis = 1)

    # Foreign sequences that were identified by MS/MS
    foreign_with_msms = (evidence["MS/MS count"] > 0) & (evidence["Organism"].str.contains(foreign_species))
    msms_foreign_seq = set(evidence.loc[foreign_with_msms, "Modified sequence"].tolist())

    # Rows that carry a match score
    pip = evidence.loc[evidence["Match score"].notna()]
    pip = pip.loc[pip["Raw file"].str.contains(human_file_pattern)]

    human_ions = pip.loc[pip["Organism"].str.contains("Human")]
    in_entrapment_set = human_ions["Sequence"].isin(entrapment_seqs_set)
    human_ion_count = human_ions.loc[~in_entrapment_set].shape[0]
    arath_ion_count = human_ions.loc[in_entrapment_set].shape[0]

    # Foreign ions must not also be assigned to a human protein
    foreign_ions = pip.loc[pip["Organism"].str.contains(foreign_species)]
    foreign_ions = foreign_ions.loc[~foreign_ions["Organism"].str.contains("Human")]

    # Foreign ions whose sequence was also identified by MS/MS are not counted
    foreign_ion_double_count = sum(foreign_ions["Modified sequence"].isin(msms_foreign_seq))
    foreign_ion_count = foreign_ions.shape[0] - foreign_ion_double_count

    return {
        "human" : human_ion_count,
        "foreign" : foreign_ion_count,
        "arabida" : arath_ion_count,
        "total" : sum([human_ion_count, foreign_ion_count, arath_ion_count]),
    }

def get_species(accession):
    """Return the species label stored in ``accession_dict`` for ``accession``.

    Accessions that are not present in ``accession_dict`` yield ``'Other'``.
    """
    return accession_dict.get(accession, 'Other')

In [9]:
# RT thresholds for native peak error calculations.
# Each threshold is the per-dataset value (90, 60, 40) multiplied by 0.01.

gygi_rt, inhouse_rt, kelly_rt = (scale * 0.01 for scale in (90, 60, 40))

# Report the thresholds that the FDP cells below pass on as rt_delta / rt_diff_threshold.
print("Gygi RT Threshold: ", gygi_rt)
print("Kelly RT Threshold: ", kelly_rt)
print("Inhouse RT Threshold: ", inhouse_rt)

Gygi RT Threshold:  0.9
Kelly RT Threshold:  0.4
Inhouse RT Threshold:  0.6


In [None]:
# Root folder of each dataset, relative to data_path.
gygi_path = data_path + "LimDataset\\"
inhouse_path = data_path + "EcoliDataset\\"
kelly_path = data_path + "SingleCellDataset\\"

# FDP estimates for the FlashLFQ v1 results of each dataset.
# Every backslash in the Windows path fragments is written as "\\": sequences
# such as "\F", "\Q" and "\C" are invalid escapes that newer Python versions
# warn about. The resulting path strings are unchanged.
gygi_flashv1 = get_fdp_old_flash(o_peaks_path =  gygi_path + "Lim_QuantResults-FlashLFQv1\\FlashLFQ_CurRel_PepQ_ConcatenatedDb\\QuantifiedPeaks.tsv",
                          c_peaks_path = gygi_path + "Lim_QuantResults-FlashLFQv1\\CensoredData_FlashLFQ_CurRel_PepQ_ConcatenatedDb\\QuantifiedPeaks.tsv",
                          censored_psm_path = gygi_path + "CensoredFiles-MetaMorpheus\\CensoredPsms.psmtsv",
                          human_file_pattern="human_90", foreign_species='Yeast', rt_delta=gygi_rt)

inhouse_flashv1 = get_fdp_old_flash(o_peaks_path =  inhouse_path + "Ecoli_QuantResults-FlashLFQv1\\Human_FlashLFQ_CurRel_PepQ_NewDb\\QuantifiedPeaks.tsv",
                          c_peaks_path = inhouse_path + "Ecoli_QuantResults-FlashLFQv1\\CensoredHuman_FlashLFQ_CurRel_PepQ_NewDb\\QuantifiedPeaks.tsv",
                          censored_psm_path = inhouse_path + "Ecoli_CensoredFiles-MetaMorpheus\\CensoredPsms.psmtsv",
                          human_file_pattern="Human_C18", foreign_species='Ecoli', rt_delta=inhouse_rt)

kelly_flashv1= get_fdp_old_flash(o_peaks_path =  kelly_path + "SingleCell_QuantResults-FlashLFQv1\\FlashLFQ_CurRel_PepQ_ConcatenatedDb\\QuantifiedPeaks.tsv",
                          c_peaks_path = kelly_path + "SingleCell_QuantResults-FlashLFQv1\\CensoredData_FlashLFQ_CurRel_PepQ_ConcatenatedDb\\QuantifiedPeaks.tsv",
                          censored_psm_path = kelly_path + "CensoredFiles-MetaMorpheus\\CensoredPsms.psmtsv",
                          human_file_pattern="_1x02nguL_", foreign_species='Yeast', rt_delta=kelly_rt)

  peaks = pd.read_csv(o_peaks_path, sep = '\t')
  original_peaks = pd.read_csv(old_peak_path, sep = '\t')
  new_peaks = pd.read_csv(new_peak_path, sep = '\t')
  peaks = pd.read_csv(o_peaks_path, sep = '\t')


In [68]:
# Censored PSM file shared by the single-cell (Kelly) FlashLFQ runs.
kelly_censored_psms = kelly_path + "CensoredFiles-MetaMorpheus\\CensoredPsms.psmtsv"

# FDP for the single-cell dataset at q_value=0.01.
# NOTE(review): the same call, with identical arguments, is repeated in the following cell.
# "\\Q" replaces the invalid "\Q" escape; the resulting path string is unchanged.
kelly_dd_1 = get_fdp_flash(o_peaks_path =  kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          c_peaks_path = kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\CensoredData_FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          censored_psm_path = kelly_censored_psms,
                          human_file_pattern="_1x02nguL_", foreign_species='Saccharomyces cerevisiae', q_value=0.01, rt_delta=kelly_rt)


In [69]:
# Current FlashLFQ
# FDP estimates for three result folders (DonorPepQ_02 / _05 / _1) per dataset,
# evaluated with q_value 0.01 / 0.025 / 0.05 respectively.
# Every backslash in the path fragments is written as "\\": "\Q" is an invalid
# escape sequence that newer Python versions warn about. The resulting path
# strings are unchanged.

# --- Lim (Gygi) dataset ---
gygi_censored_psms = gygi_path + "CensoredFiles-MetaMorpheus\\CensoredPsms.psmtsv"

gygi_dd_1 = get_fdp_flash(o_peaks_path =  gygi_path + "Lim_QuantResults-FlashLFQ_PIP_ECHO\\FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          c_peaks_path = gygi_path + "Lim_QuantResults-FlashLFQ_PIP_ECHO\\CensoredData_FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          censored_psm_path = gygi_censored_psms,
                          human_file_pattern="human_90", foreign_species='Saccharomyces cerevisiae', q_value=0.01, rt_delta=gygi_rt)

gygi_dd_2p5 = get_fdp_flash(o_peaks_path =  gygi_path + "Lim_QuantResults-FlashLFQ_PIP_ECHO\\FlashLFQ_7772_DonorPepQ_05\\QuantifiedPeaks.tsv",
                            c_peaks_path = gygi_path + "Lim_QuantResults-FlashLFQ_PIP_ECHO\\CensoredData_FlashLFQ_7772_DonorPepQ_05\\QuantifiedPeaks.tsv",
                            censored_psm_path = gygi_censored_psms,
                            human_file_pattern="human_90", foreign_species='Saccharomyces cerevisiae', q_value=0.025, rt_delta=gygi_rt)

gygi_dd_5 = get_fdp_flash(o_peaks_path =  gygi_path + "Lim_QuantResults-FlashLFQ_PIP_ECHO\\FlashLFQ_7772_DonorPepQ_1\\QuantifiedPeaks.tsv",
                          c_peaks_path = gygi_path + "Lim_QuantResults-FlashLFQ_PIP_ECHO\\CensoredData_FlashLFQ_7772_DonorPepQ_1\\QuantifiedPeaks.tsv",
                          censored_psm_path = gygi_censored_psms,
                          human_file_pattern="human_90", foreign_species='Saccharomyces cerevisiae', q_value=0.05, rt_delta=gygi_rt)

# --- In-house E. coli dataset ---
inhouse_censored_psms = inhouse_path + "Ecoli_CensoredFiles-MetaMorpheus\\CensoredPsms.psmtsv"

inhouse_dd_1 = get_fdp_flash(o_peaks_path =  inhouse_path + "Ecoli_QuantResults-FlashLFQ_PIP-ECHO\\Human_FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          c_peaks_path = inhouse_path + "Ecoli_QuantResults-FlashLFQ_PIP-ECHO\\CensoredHuman_FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          censored_psm_path = inhouse_censored_psms,
                          human_file_pattern="Human_C18", foreign_species='Escherichia coli', q_value=0.01, rt_delta=inhouse_rt)

inhouse_dd_2p5 = get_fdp_flash(o_peaks_path =  inhouse_path + "Ecoli_QuantResults-FlashLFQ_PIP-ECHO\\Human_FlashLFQ_7772_DonorPepQ_05\\QuantifiedPeaks.tsv",
                          c_peaks_path = inhouse_path + "Ecoli_QuantResults-FlashLFQ_PIP-ECHO\\CensoredHuman_FlashLFQ_7772_DonorPepQ_05\\QuantifiedPeaks.tsv",
                          censored_psm_path = inhouse_censored_psms,
                          human_file_pattern="Human_C18", foreign_species='Escherichia coli', q_value=0.025, rt_delta=inhouse_rt)

inhouse_dd_5 = get_fdp_flash(o_peaks_path =  inhouse_path + "Ecoli_QuantResults-FlashLFQ_PIP-ECHO\\Human_FlashLFQ_7772_DonorPepQ_1\\QuantifiedPeaks.tsv",
                          c_peaks_path = inhouse_path + "Ecoli_QuantResults-FlashLFQ_PIP-ECHO\\CensoredHuman_FlashLFQ_7772_DonorPepQ_1\\QuantifiedPeaks.tsv",
                          censored_psm_path = inhouse_censored_psms,
                          human_file_pattern="Human_C18", foreign_species='Escherichia coli', q_value=0.05, rt_delta=inhouse_rt)

# --- Single-cell (Kelly) dataset ---
kelly_censored_psms = kelly_path + "CensoredFiles-MetaMorpheus\\CensoredPsms.psmtsv"

kelly_dd_1 = get_fdp_flash(o_peaks_path =  kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          c_peaks_path = kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\CensoredData_FlashLFQ_7772_DonorPepQ_02\\QuantifiedPeaks.tsv",
                          censored_psm_path = kelly_censored_psms,
                          human_file_pattern="_1x02nguL_", foreign_species='Saccharomyces cerevisiae', q_value=0.01, rt_delta=kelly_rt)

kelly_dd_2p5 = get_fdp_flash(o_peaks_path =  kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\FlashLFQ_7772_DonorPepQ_05\\QuantifiedPeaks.tsv",
                          c_peaks_path = kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\CensoredData_FlashLFQ_7772_DonorPepQ_05\\QuantifiedPeaks.tsv",
                          censored_psm_path = kelly_censored_psms,
                          human_file_pattern="_1x02nguL_", foreign_species='Saccharomyces cerevisiae', q_value=0.025, rt_delta=kelly_rt)


kelly_dd_5 = get_fdp_flash(o_peaks_path =  kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\FlashLFQ_7772_DonorPepQ_1\\QuantifiedPeaks.tsv",
                          c_peaks_path = kelly_path + "SingleCell_QuantResults-FlashLFQ_PIP-ECHO\\CensoredData_FlashLFQ_7772_DonorPepQ_1\\QuantifiedPeaks.tsv",
                          censored_psm_path = kelly_censored_psms,
                          human_file_pattern="_1x02nguL_", foreign_species='Saccharomyces cerevisiae', q_value=0.05, rt_delta=kelly_rt)



  peaks = pd.read_csv(o_peaks_path, sep = '\t')
  original_peaks = pd.read_csv(o_peaks_path, sep = '\t')
  new_peaks = pd.read_csv(c_peaks_path, sep = '\t')
  peaks = pd.read_csv(o_peaks_path, sep = '\t')
  original_peaks = pd.read_csv(o_peaks_path, sep = '\t')
  new_peaks = pd.read_csv(c_peaks_path, sep = '\t')
  peaks = pd.read_csv(o_peaks_path, sep = '\t')
  original_peaks = pd.read_csv(o_peaks_path, sep = '\t')
  new_peaks = pd.read_csv(c_peaks_path, sep = '\t')
  peaks = pd.read_csv(o_peaks_path, sep = '\t')
  original_peaks = pd.read_csv(o_peaks_path, sep = '\t')
  new_peaks = pd.read_csv(c_peaks_path, sep = '\t')
  peaks = pd.read_csv(o_peaks_path, sep = '\t')
  original_peaks = pd.read_csv(o_peaks_path, sep = '\t')
  new_peaks = pd.read_csv(c_peaks_path, sep = '\t')
  peaks = pd.read_csv(o_peaks_path, sep = '\t')
  original_peaks = pd.read_csv(o_peaks_path, sep = '\t')
  new_peaks = pd.read_csv(c_peaks_path, sep = '\t')


In [70]:
# FDP estimates for the IonQuant (FragPipe) results: three result folders
# (1 / 2.5 / 5 percent) per dataset. The censored-PSM file is the same for all
# runs of a dataset, so it is named once per dataset.

# --- Lim (Gygi) dataset ---
gygi_fragpipe_censored_psms = gygi_path + "CensoredFiles-FragPipe\\CensoredPsms.tsv"

gygi_iq_1 = get_fdp_fragger(
    o_folder=gygi_path + "Lim_Results-FragPipe\\IonQuant_1Percent_50PercentEntrapmentDb",
    c_folder=gygi_path + "Lim_Results-FragPipe\\CensoredData_IonQuant_1Percent_50PercentEntrapmentDb",
    censored_psm_path=gygi_fragpipe_censored_psms,
    human_file_pattern="human_90",
    rt_diff_threshold=gygi_rt,
)

gygi_iq_2p5 = get_fdp_fragger(
    o_folder=gygi_path + "Lim_Results-FragPipe\\IonQuant_2p5Percent_50PercentEntrapmentDb",
    c_folder=gygi_path + "Lim_Results-FragPipe\\CensoredData_IonQuant_2p5Percent_50PercentEntrapmentDb",
    censored_psm_path=gygi_fragpipe_censored_psms,
    human_file_pattern="human_90",
    rt_diff_threshold=gygi_rt,
)

gygi_iq_5 = get_fdp_fragger(
    o_folder=gygi_path + "Lim_Results-FragPipe\\IonQuant_5Percent_50PercentEntrapmentDb",
    c_folder=gygi_path + "Lim_Results-FragPipe\\CensoredData_IonQuant_5Percent_50PercentEntrapmentDb",
    censored_psm_path=gygi_fragpipe_censored_psms,
    human_file_pattern="human_90",
    rt_diff_threshold=gygi_rt,
)

# --- In-house E. coli dataset (foreign species passed explicitly) ---
inhouse_fragpipe_censored_psms = inhouse_path + "Ecoli_CensoredFiles-FragPipe\\CensoredPsms.tsv"

inhouse_iq_1 = get_fdp_fragger(
    o_folder=inhouse_path + "Ecoli_Results-FragPipe\\IonQuant_1Percent_50PercentEntrapmentDb",
    c_folder=inhouse_path + "Ecoli_Results-FragPipe\\CensoredData_IonQuant_1Percent_50PercentEntrapmentDb",
    censored_psm_path=inhouse_fragpipe_censored_psms,
    human_file_pattern="_Human_C18_",
    foreign_species="ECOLI",
    rt_diff_threshold=inhouse_rt,
)

inhouse_iq_2p5 = get_fdp_fragger(
    o_folder=inhouse_path + "Ecoli_Results-FragPipe\\IonQuant_2p5Percent_50PercentEntrapmentDb",
    c_folder=inhouse_path + "Ecoli_Results-FragPipe\\CensoredData_IonQuant_2p5Percent_50PercentEntrapmentDb",
    censored_psm_path=inhouse_fragpipe_censored_psms,
    human_file_pattern="_Human_C18_",
    foreign_species="ECOLI",
    rt_diff_threshold=inhouse_rt,
)

inhouse_iq_5 = get_fdp_fragger(
    o_folder=inhouse_path + "Ecoli_Results-FragPipe\\IonQuant_5Percent_50PercentEntrapmentDb",
    c_folder=inhouse_path + "Ecoli_Results-FragPipe\\CensoredData_IonQuant_5Percent_50PercentEntrapmentDb",
    censored_psm_path=inhouse_fragpipe_censored_psms,
    human_file_pattern="_Human_C18_",
    foreign_species="ECOLI",
    rt_diff_threshold=inhouse_rt,
)

# --- Single-cell (Kelly) dataset ---
kelly_fragpipe_censored_psms = kelly_path + "CensoredFiles-FragPipe\\CensoredPsms.tsv"

kelly_iq_1 = get_fdp_fragger(
    o_folder=kelly_path + "SingleCell_Results-FragPipe\\LibrarySettings_1Percent",
    c_folder=kelly_path + "SingleCell_Results-FragPipe\\LibrarySettings_CensoredData_1Percent",
    censored_psm_path=kelly_fragpipe_censored_psms,
    human_file_pattern="_1x02nguL_",
    rt_diff_threshold=kelly_rt,
)

kelly_iq_2p5 = get_fdp_fragger(
    o_folder=kelly_path + "SingleCell_Results-FragPipe\\LibrarySettings_2p5Percent_2",
    c_folder=kelly_path + "SingleCell_Results-FragPipe\\LibrarySettings_CensoredData_2p5Percent",
    censored_psm_path=kelly_fragpipe_censored_psms,
    human_file_pattern="_1x02nguL_",
    rt_diff_threshold=kelly_rt,
)

kelly_iq_5 = get_fdp_fragger(
    o_folder=kelly_path + "SingleCell_Results-FragPipe\\LibrarySettings_5Percent_2",
    c_folder=kelly_path + "SingleCell_Results-FragPipe\\LibrarySettings_CensoredData_5Percent",
    censored_psm_path=kelly_fragpipe_censored_psms,
    human_file_pattern="_1x02nguL_",
    rt_diff_threshold=kelly_rt,
)



FileNotFoundError: [Errno 2] No such file or directory: 'D:\\PIP_ECHO_PRIDE\\SingleCellDataset\\SingleCell_Results-FragPipe\\LibrarySettings_1Percent\\experiment_annotation.tsv'

In [None]:
# MaxQuant evidence table of each dataset.
gygi_mq_evidence_path = gygi_path + "Lim_Results-MaxQuant\\txt\\evidence.txt"
inhouse_mq_evidence_path = inhouse_path + "Ecoli_Results-MaxQuant\\txt\\evidence.txt"
kelly_mq_evidence_path = kelly_path + "SingleCell_Results-MaxQuant\\txt\\evidence.txt"

# FDP estimates for MaxQuant; the second argument is the foreign species label.
gygi_mq = get_fdp_maxquant(gygi_mq_evidence_path, "Yeast", human_file_pattern="human_90")
inhouse_mq = get_fdp_maxquant(inhouse_mq_evidence_path, "Ecoli", human_file_pattern="_Human_C18_")
kelly_mq = get_fdp_maxquant(kelly_mq_evidence_path, "Yeast", human_file_pattern="_1x02nguL_")

  evidence = pd.read_csv(evidence_path, sep = '\t')
  evidence = pd.read_csv(evidence_path, sep = '\t')


In [None]:
# Collect every FDP result under a "<dataset>_<software>[_<setting>]" label.
# Insertion order determines the row order of the exported table.
fdp_results_by_label = {
    "Gygi_Flash_v1" : gygi_flashv1,
    "Gygi_Flash_v2_1" : gygi_dd_1,
    "Gygi_Flash_v2_2.5" : gygi_dd_2p5,
    "Gygi_Flash_v2_5" : gygi_dd_5,
    "Gygi_IonQuant_1" : gygi_iq_1,
    "Gygi_IonQuant_2.5" : gygi_iq_2p5,
    "Gygi_IonQuant_5" : gygi_iq_5,
    "Gygi_MaxQuant" : gygi_mq,

    "Kelly_Flash_v1" : kelly_flashv1,
    "Kelly_Flash_v2_1" : kelly_dd_1,
    "Kelly_Flash_v2_2.5" : kelly_dd_2p5,
    "Kelly_Flash_v2_5" : kelly_dd_5,
    "Kelly_IonQuant_1" : kelly_iq_1,
    "Kelly_IonQuant_2.5" : kelly_iq_2p5,
    "Kelly_IonQuant_5" : kelly_iq_5,
    "Kelly_MaxQuant" : kelly_mq,

    "Inhouse_Flash_v1" : inhouse_flashv1,
    "Inhouse_Flash_v2_1" : inhouse_dd_1,
    "Inhouse_Flash_v2_2.5" : inhouse_dd_2p5,
    "Inhouse_Flash_v2_5" : inhouse_dd_5,
    "Inhouse_IonQuant_1" : inhouse_iq_1,
    "Inhouse_IonQuant_2.5" : inhouse_iq_2p5,
    "Inhouse_IonQuant_5" : inhouse_iq_5,
    "Inhouse_MaxQuant" : inhouse_mq,
}

# One column per label, then transpose so that each label becomes a row.
fdp_df = pd.DataFrame(
    {label: pd.Series(result) for label, result in fdp_results_by_label.items()}
).transpose()

# Repeat the row label as an ordinary column.
fdp_df["software"] = fdp_df.index

fdp_df.to_csv(r"C:\Users\Alex\Source\Repos\MBR_metamorpheus\R_Files\FDP_Analysis_Results.tsv", sep = '\t')