In [6]:
library(tidyverse)
source("../../evaluation_utils/preprocessing_MQ/preprocessing_report.R")

# Path and meta

In [7]:
# Root folder of the MaxQuant reports; the per-center input file paths are
# built from it with paste0().
# Active choice: the REVIEWED reports.
data_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_reviewed_MQ_report/"

# Alternative: the NOT REVIEWED reports (uncomment to switch).
# data_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/"

In [8]:
# Metadata
# Load the sample annotation table and derive short sample identifiers.

metadata <- read.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/Metadata_CosyBio.tsv",
                       header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Replace spaces with dots in the quantitative column names.
# NOTE(review): presumably this mirrors how the MaxQuant column headers look
# after being read into R - confirm against preprocess_data_mxout().
metadata <- metadata %>%
  mutate(Quantitative.column.name = gsub(" ", ".", Quantitative.column.name, fixed = TRUE))

# Keep the long dotted names as row names BEFORE shortening them: the export
# cells use rownames() of the per-center metadata to select the sample columns
# and Quantitative.column.name to rename them.
rownames(metadata) <- metadata$Quantitative.column.name

# Shorten the names: "Reporter.intensity.corrected.1.Pool1" -> "RIC_1.P_1".
# fixed = TRUE makes the patterns literal, so "." matches only a dot
# (as a regex it would match any character).
metadata <- metadata %>%
  mutate(
    Quantitative.column.name = gsub(
      ".Pool", ".P_",
      gsub("Reporter.intensity.corrected.", "RIC_", Quantitative.column.name, fixed = TRUE),
      fixed = TRUE
    )
  )

# Remove the outlier sample
metadata <- metadata %>%
  filter(Quantitative.column.name != "RIC_3.P_3")

# Put the pool before the channel: "RIC_<channel>.P_<pool>" -> "P_<pool>.RIC_<channel>"
# (e.g. RIC_1.P_1 -> P_1.RIC_1). The dot is escaped so it is matched literally.
metadata <- metadata %>%
  mutate(
    Quantitative.column.name = gsub(
      "RIC_([0-9]+)\\.P_([0-9]+)", "P_\\2.RIC_\\1",
      Quantitative.column.name
    )
  )

head(metadata, 3)
dim(metadata)

Unnamed: 0_level_0,Quantitative.column.name,Pool,Reporter.ion,Patient,Group,Center
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Reporter.intensity.corrected.1.Pool1,P_1.RIC_1,Pool1,126,Common Reference,Common Reference,Center1
Reporter.intensity.corrected.2.Pool1,P_1.RIC_2,Pool1,127N,heathy1,heathy,Center1
Reporter.intensity.corrected.3.Pool1,P_1.RIC_3,Pool1,127C,FSGS1,FSGS,Center1


# Logic

Four types of data:
- (1) Data summarized using Gene names.
- (2) Data summarized using Major.PG.IDs.
- (3) Data summarized to PG from the peptide level.
- (4) Data summarized to Gene.names from the peptide level.


For (1) and (2), the protein groups (PG) report is used.  
It is filtered, then the feature names and the sample columns (plus the razor + unique peptide counts) are extracted and stored in separate files.

For (3) and (4), the MaxQuant approach to data aggregation is used: the data are filtered at the peptide level and then summarized into protein groups (PG) using the MaxQuant logic. 

# Extract data for (1) and (2)

In [21]:
# Input files: one MaxQuant proteinGroups report per center
list_of_inputs <- list(
    "Center1" = paste0(data_path, "Center1/proteinGroups.txt"),
    "Center2" = paste0(data_path, "Center2/proteinGroups.txt"),
    "Center3" = paste0(data_path, "Center3/proteinGroups.txt")
)

# Pre-sized result lists, already named by center, so nothing has to be grown
# with c() inside the loop or named afterwards.
center_names <- names(list_of_inputs)
combined_pg_intensities <- setNames(vector("list", length(center_names)), center_names)
combined_counts <- setNames(vector("list", length(center_names)), center_names)

# Preprocess every center and collect the two returned tables
for (center in center_names) {
    # Metadata rows that belong to the current center
    center_metadata <- metadata %>%
        filter(Center == center)

    # preprocess_data_mxout() comes from the sourced preprocessing_report.R.
    # Its first element is used as the intensities table and its second as the
    # counts table.
    results_list <- preprocess_data_mxout(list_of_inputs[[center]], center_metadata, data_type = 'protein')
    pg_intensities <- results_list[[1]]
    counts_df <- results_list[[2]]

    # Single-bracket assignment with list() keeps the slot even if a result is
    # NULL (`[[<-` with NULL would delete the element).
    combined_pg_intensities[center] <- list(pg_intensities)
    combined_counts[center] <- list(counts_df)
}

Processed data count: 491 
Counts data count: 491 
Processed data count: 516 
Counts data count: 516 
Processed data count: 438 
Counts data count: 438 


### (1)

In [22]:
# Output folder for the gene-level intensities and counts
genes_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/01_PG_Genes/"

# Sort the ";"-separated identifiers inside every string so that the same set
# of names always produces the same key (e.g. "B;A" -> "A;B").
sort_semicolon_ids <- function(ids) {
  map_chr(strsplit(ids, ";"), ~paste(sort(.x), collapse = ";"))
}

# Write gene-level intensities, counts and metadata for each center
for (center in list("Center1", "Center2", "Center3")) {

  # Gene intensities: sum the sample intensities of all protein groups that
  # share the same Gene.names key.
  # The key is normalised BEFORE grouping; normalising afterwards would
  # aggregate "A;B" and "B;A" separately and then write two rows with the
  # same name.
  genes_intensities <- combined_pg_intensities[[center]] %>%
    select(-Majority.protein.IDs) %>%
    filter(Gene.names != "") %>%
    mutate(Gene.names = sort_semicolon_ids(Gene.names)) %>%
    group_by(Gene.names) %>%
    summarise(across(everything(), sum))

  # Write the gene intensities to a file
  write.table(genes_intensities, paste0(genes_output_path, center, "/genes_intensities.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Previous version of the counts (number of distinct Peptide.IDs per gene),
  # kept for reference:
  # counts_df <- combined_counts[[center]] %>%
  #   filter(Gene.names != "") %>%
  #   select(-c(Majority.protein.IDs, Razor...unique.peptides)) %>%
  #   mutate(Peptide.IDs = strsplit(Peptide.IDs, ";")) %>%
  #   unnest(Peptide.IDs) %>%  unique() %>%
  #   group_by(Gene.names) %>%  summarize(Unique.peptides = n()) %>%
  #   rowwise() %>% # Apply the following operations to each row individually
  #   mutate(Gene.names = map_chr(strsplit(Gene.names, ";"), ~paste(sort(.x), collapse = ";")))

  # Gene counts (v2): drop rows without a gene name, split the ";"-separated
  # Peptide.is.razor flags into one row per flag, keep the "True" ones and
  # count them per gene. The result column is still called Unique.peptides,
  # although it holds the number of "True" razor flags.
  # NOTE(review): a gene without any "True" flag gets no row in the counts
  # file - confirm this is intended.
  counts_df <- combined_counts[[center]] %>%
    filter(Gene.names != "") %>%
    select(-c(Majority.protein.IDs, Razor...unique.peptides)) %>%
    mutate(
      Gene.names = sort_semicolon_ids(Gene.names),
      razor_flag = strsplit(Peptide.is.razor, ";")
    ) %>%
    unnest(razor_flag) %>%
    filter(razor_flag == "True") %>%
    group_by(Gene.names) %>%
    summarise(Unique.peptides = n())

  # Write the gene counts to a file
  write.table(counts_df, paste0(genes_output_path, center, "/genes_counts_v2.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Write the metadata of the current center (row names are kept in the file)
  metadata %>%
    filter(Center == center) %>%
    write.table(paste0(genes_output_path, center, "/metadata.tsv"), sep = "\t", quote = FALSE, row.names = TRUE)

  # if(center == 'Center2'){
  #   write.table(genes_intensities, "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2/intensities_Gene.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
  #   write.table(counts_df, "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2/counts_Gene.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
  # }
}

### (2)

In [23]:
# Output folder for the protein-group-level intensities and counts
pg_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/02_PG_MajorPG/"

# Sort the ";"-separated identifiers inside every string so that the same set
# of IDs always produces the same key (e.g. "B;A" -> "A;B").
sort_semicolon_ids <- function(ids) {
  map_chr(strsplit(ids, ";"), ~paste(sort(.x), collapse = ";"))
}

# Write protein-group intensities, counts and metadata for each center.
# Nothing is aggregated here: every input row is kept and only the order of
# the IDs inside Majority.protein.IDs is normalised. The former
# group_by() + rowwise() pair had no effect on the written table and was
# removed.
for (center in list("Center1", "Center2", "Center3")) {
  # Protein-group intensities (the variable keeps its historical name)
  genes_intensities <- combined_pg_intensities[[center]] %>%
    select(-Gene.names) %>%
    mutate(Majority.protein.IDs = sort_semicolon_ids(Majority.protein.IDs))

  # Write the protein-group intensities to a file
  write.table(genes_intensities, paste0(pg_output_path, center, "/pg_intensities.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Counts: drop the gene and peptide-level columns, keep the remaining ones
  counts_df <- combined_counts[[center]] %>%
    select(-c(Gene.names, Peptide.IDs, Peptide.is.razor)) %>%
    mutate(Majority.protein.IDs = sort_semicolon_ids(Majority.protein.IDs))

  # Write counts to a file
  write.table(counts_df, paste0(pg_output_path, center, "/pg_counts.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Write the metadata of the current center (row names are kept in the file)
  metadata %>%
    filter(Center == center) %>%
    write.table(paste0(pg_output_path, center, "/metadata.tsv"), sep = "\t", quote = FALSE, row.names = TRUE)

  # if(center == 'Center2'){
  #   write.table(genes_intensities, "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2/intensities_PG.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
  #   write.table(counts_df, "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2/counts_PG.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
  #   metadata %>%
  #     filter(Center == center) %>%
  #     write.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2/metadata.tsv", sep = "\t", quote = FALSE, row.names = TRUE)
  # }
}

# Extract data for (3) and (4)

## (3) and (4) --- aggregated report

In [9]:
# Output folders for the peptide-level data, used for (3) and (4)
genes_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/03_Peptides_Genes/"
pg_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/04_Peptides_PG/"


# Despite its name this list holds the INPUT files: one MaxQuant peptides
# report per center.
list_of_outputs <- list(
    "Center1" = paste0(data_path, "Center1/peptides.txt"),
    "Center2" = paste0(data_path, "Center2/peptides.txt"),
    "Center3" = paste0(data_path, "Center3/peptides.txt")
)

# Sort the ";"-separated identifiers inside every string so that the same set
# of names always produces the same key (e.g. "B;A" -> "A;B").
sort_semicolon_ids <- function(ids) {
    map_chr(strsplit(ids, ";"), ~paste(sort(.x), collapse = ";"))
}

# Annotation columns kept in front of the sample intensity columns
id_columns <- c("Sequence", "Proteins", "Gene.names", 'Reverse', 'Potential.contaminant')

# The former `combined_pg_intensities <- list()` reset was removed: the list was
# never filled or read in this cell, and resetting it wiped the per-center
# tables that the (1) and (2) cells read.
for(center in names(list_of_outputs)) {

    # Metadata rows that belong to the current center
    center_metadata <- metadata %>%
        filter(Center == center)

    # Peptide-level table with filtering switched off (do_filter = FALSE);
    # preprocess_data_mxout() comes from the sourced preprocessing_report.R.
    not_filtered <- preprocess_data_mxout(list_of_outputs[[center]], center_metadata, data_type = 'peptide', do_filter = FALSE)
    # results_list <- preprocess_data_mxout(list_of_outputs[[center]], center_metadata, data_type = 'peptide', do_filter = TRUE)

    # pg_intensities <- results_list %>%
    #     rowwise() %>% # Apply the following operations to each row individually
    #     mutate(Gene.names = map_chr(strsplit(Gene.names, ";"), ~paste(sort(.x), collapse = ";"))) %>%
    #     mutate(Proteins = map_chr(strsplit(Proteins, ";"), ~paste(sort(.x), collapse = ";")))
    # pg_intensities <- pg_intensities[, c("Sequence", "Proteins", "Gene.names", rownames(center_metadata))]
    # colnames(pg_intensities) <- c("Sequence", "Proteins", "Gene.names", center_metadata$Quantitative.column.name)

    # write.table(pg_intensities, paste0(genes_output_path, center, "/aggregated.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
    # write.table(pg_intensities, paste0(pg_output_path, center, "/aggregated.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

    # Normalise the order of the names inside Gene.names and Proteins
    pg_intensities_NF <- not_filtered %>%
        mutate(
            Gene.names = sort_semicolon_ids(Gene.names),
            Proteins = sort_semicolon_ids(Proteins)
        )

    # Select the sample columns by the row names of the center metadata, then
    # rename them positionally to the short Quantitative.column.name values.
    pg_intensities_NF <- pg_intensities_NF[, c(id_columns, rownames(center_metadata))]
    colnames(pg_intensities_NF) <- c(id_columns, center_metadata$Quantitative.column.name)

    # The same table is written to both output folders
    write.table(pg_intensities_NF, paste0(genes_output_path, center, "/aggregated_NF.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
    write.table(pg_intensities_NF, paste0(pg_output_path, center, "/aggregated_NF.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

    # write.table(not_filtered, paste0("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/mapping_filtering_check/custom_NR/", center, "_aggregated_NF.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

    # Write the metadata of the current center to both folders (row names are
    # kept); center_metadata is already the filtered table, so it is reused.
    write.table(center_metadata, paste0(genes_output_path, center, "/metadata.tsv"), sep = "\t", quote = FALSE, row.names = TRUE)
    write.table(center_metadata, paste0(pg_output_path, center, "/metadata.tsv"), sep = "\t", quote = FALSE, row.names = TRUE)
}


Processed data count: 5987 
Processed data count: 6360 
Processed data count: 5517 
