In [1]:
library(tidyverse)
source("../../evaluation_utils/preprocessing_raw/preprocessing_report.R")

“package ‘tidyverse’ was built under R version 4.2.2”
“package ‘ggplot2’ was built under R version 4.2.3”
“package ‘tibble’ was built under R version 4.2.3”
“package ‘tidyr’ was built under R version 4.2.2”
“package ‘readr’ was built under R version 4.2.2”
“package ‘purrr’ was built under R version 4.2.3”
“package ‘dplyr’ was built under R version 4.2.3”
“package ‘stringr’ was built under R version 4.2.3”
“package ‘forcats’ was built under R version 4.2.2”
“package ‘lubridate’ was built under R version 4.2.2”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ──────────

# Path and meta

In [2]:
# Input: folder with the reviewed MaxQuant reports
data_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_reviewed_MQ_report/"

# Output: folder that receives the protein-group level results
pg_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/01_smaller_lib_balanced_PG_MajorPG/"

In [3]:
# Metadata
#
# Load the sample annotation table and derive short, consistent sample names.
# All literal replacements use `fixed = TRUE` so that "." is matched as a dot
# and not as the regex "any character" wildcard.

metadata <- read.table(
  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/Metadata_CosyBio.tsv",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Replace spaces with dots in the quantitative column names
metadata <- metadata %>%
  mutate(
    Quantitative.column.name = gsub(" ", ".", Quantitative.column.name, fixed = TRUE)
  )

# Keep the full (dot-separated) column name as the row name before shortening
rownames(metadata) <- metadata$Quantitative.column.name

# Shorten the names:
#   Reporter.intensity.corrected.1.Pool1 -> RIC_1.P_1
metadata <- metadata %>%
  mutate(
    Quantitative.column.name = gsub(
      "Reporter.intensity.corrected.", "RIC_", Quantitative.column.name,
      fixed = TRUE
    ),
    Quantitative.column.name = gsub(
      ".Pool", ".P_", Quantitative.column.name,
      fixed = TRUE
    )
  )

# Remove the outlier
metadata <- metadata %>%
  filter(Quantitative.column.name != "RIC_3.P_3")

# Put the pool part first and the reporter part second
# (e.g. RIC_1.P_1 -> P_1.RIC_1)
metadata <- metadata %>%
  mutate(
    Quantitative.column.name = gsub(
      "RIC_([0-9]+)\\.P_([0-9]+)", "P_\\2.RIC_\\1", Quantitative.column.name
    )
  )

head(metadata, 3)
dim(metadata)

Unnamed: 0_level_0,Quantitative.column.name,Pool,Reporter.ion,Patient,Group,Center
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Reporter.intensity.corrected.1.Pool1,P_1.RIC_1,Pool1,126,Common Reference,Common Reference,Center1
Reporter.intensity.corrected.2.Pool1,P_1.RIC_2,Pool1,127N,heathy1,heathy,Center1
Reporter.intensity.corrected.3.Pool1,P_1.RIC_3,Pool1,127C,FSGS1,FSGS,Center1


# Logic

Type of data:
-  Data summarized using Major.PG.IDs

The MaxQuant `proteinGroups` report is used.  
The entries are filtered; then the feature names and sample columns (plus the razor + unique peptide counts) are extracted and stored in separate files.

# Extract data

In [4]:
# MaxQuant proteinGroups reports, keyed by center name
list_of_inputs <- list(
  "Center1" = paste0(data_path, "Center1/proteinGroups.txt"),
  "Center2" = paste0(data_path, "Center2/proteinGroups.txt"),
  "Center3" = paste0(data_path, "Center3/proteinGroups.txt")
)

# Pre-sized result containers, one named slot per center
combined_pg_intensities <- setNames(
  vector("list", length(list_of_inputs)),
  names(list_of_inputs)
)
combined_counts <- combined_pg_intensities

# Preprocess each center's report with the metadata rows of that center
for (center in names(list_of_inputs)) {
  center_metadata <- metadata %>%
    filter(Center == center)

  # preprocess_data_mxout() comes from the sourced preprocessing_report.R;
  # its first element is used as the intensities, the second as the counts
  results_list <- preprocess_data_mxout(
    list_of_inputs[[center]], center_metadata,
    data_type = "protein"
  )
  pg_intensities <- results_list[[1]]
  counts_df <- results_list[[2]]

  # Store by name; `x[name] <- list(value)` keeps the slot even for NULL
  combined_pg_intensities[center] <- list(pg_intensities)
  combined_counts[center] <- list(counts_df)
}

Filtering out decoy, contaminant, and modification site-only entries...
Initial data count: 563 
Filtered data count: 491 
Processed data count: 491 
Counts data count: 491 
Filtering out decoy, contaminant, and modification site-only entries...
Initial data count: 578 
Filtered data count: 516 
Processed data count: 516 
Counts data count: 516 
Filtering out decoy, contaminant, and modification site-only entries...
Initial data count: 491 
Filtered data count: 438 
Processed data count: 438 
Counts data count: 438 


In [6]:

# Sort the ";"-separated protein IDs inside every entry so that the same
# protein group always gets the same identifier string.
# `ids` is a character vector; the result has the same length.
sort_protein_ids <- function(ids) {
  vapply(
    strsplit(ids, ";", fixed = TRUE),
    function(parts) paste(sort(parts), collapse = ";"),
    character(1),
    USE.NAMES = FALSE
  )
}

# For each center: write intensities, peptide counts, metadata and a design
# matrix into <pg_output_path>/<center>/.
for (center in c("Center1", "Center2", "Center3")) {
  # Create the sub directory for the center (and missing parents)
  sub_dir <- paste0(pg_output_path, center)
  if (!dir.exists(sub_dir)) {
    dir.create(sub_dir, recursive = TRUE)
  }

  # Intensities: dump the table before sorting the IDs (for a manual check)
  intensities <- combined_pg_intensities[[center]] %>%
    select(-Gene.names)
  write.table(intensities, paste0(sub_dir, "/for_sorting_check.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  intensities <- intensities %>%
    # sort the IDs inside each protein group
    mutate(Majority.protein.IDs = sort_protein_ids(Majority.protein.IDs)) %>%
    # replace 0 with NA in every column except the ID column
    mutate(across(-Majority.protein.IDs, ~ replace(.x, .x == 0, NA)))

  # Write intensities to a file
  write.table(intensities, paste0(sub_dir, "/pg_intensities.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Counts: drop the columns that are not needed, sort the IDs the same way
  # as for the intensities and rename the peptide count column
  counts_df <- combined_counts[[center]] %>%
    select(-c(Gene.names, Peptide.IDs, Peptide.is.razor)) %>%
    mutate(Majority.protein.IDs = sort_protein_ids(Majority.protein.IDs)) %>%
    rename("counts" = "Razor...unique.peptides")

  # Write counts to a file
  write.table(counts_df, paste0(sub_dir, "/pg_counts.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Write the metadata rows of this center to a file (row names are not written)
  center_meta <- metadata %>%
    filter(Center == center)
  write.table(center_meta, paste0(sub_dir, "/metadata.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Design file: one 0/1 indicator column per group, no intercept.
  # The space is removed from "Common Reference" before building the matrix.
  for_dummy <- center_meta %>%
    mutate(Group = ifelse(Group == "Common Reference", "CommonReference", Group))
  rownames(for_dummy) <- for_dummy$Quantitative.column.name
  dummy_df <- model.matrix(~ 0 + Group, for_dummy)
  colnames(dummy_df) <- gsub("Group", "", colnames(dummy_df))

  for_dummy <- for_dummy %>%
    rownames_to_column("filename") %>%
    select(-c(Quantitative.column.name, Reporter.ion, Group, Patient)) %>%
    cbind(dummy_df)
  write.table(for_dummy, paste0(sub_dir, "/design.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
}

# Write the full metadata to a file; the row names become the "filename" column
metadata %>%
  rownames_to_column("filename") %>%
  write.table(paste0(pg_output_path, "metadata.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

# Make data imbalanced

- Common Reference samples were removed

In [5]:
# Pick a fixed number of 'heathy' and 'FSGS' samples from one pool of one
# center (imbalanced design).
#
# Arguments:
#   center   - center name ("Center1", "Center2" or "Center3")
#   pool     - pool name; must be one of the pools listed for the center below
#   metadata - data frame with `Group` and `Pool` columns; its row names are
#              the sample names that get returned
#
# Returns a character vector of row names: the selected 'heathy' samples
# followed by the selected 'FSGS' samples.
#
# Stops with "Invalid center or pool" when the center/pool combination has
# no rule, and with an explicit message when a group has too few samples.
sample_based_on_rules <- function(center, pool, metadata) {
  # The seed is reset on every call, so each call draws from the same
  # random stream (kept as-is for reproducibility of existing outputs).
  set.seed(42)

  # Number of samples to keep per group, by center and pool
  rules <- list(
    Center1 = list(
      Pool1 = c(heathy = 2, FSGS = 2),
      Pool2 = c(heathy = 1, FSGS = 2)
    ),
    Center2 = list(
      Pool3 = c(heathy = 4, FSGS = 1),
      Pool5 = c(heathy = 3, FSGS = 2)
    ),
    Center3 = list(
      Pool4 = c(heathy = 5, FSGS = 5),
      Pool6 = c(heathy = 5, FSGS = 4)
    )
  )

  n_keep <- rules[[center]][[pool]]
  if (is.null(n_keep)) {
    stop("Invalid center or pool")
  }

  # Draw `size` row names of the given group within the pool
  draw <- function(group, size) {
    candidates <- rownames(metadata)[
      which(metadata$Group == group & metadata$Pool == pool)
    ]
    if (length(candidates) < size) {
      stop(
        "Not enough '", group, "' samples in ", pool, ": need ", size,
        ", found ", length(candidates)
      )
    }
    candidates[sample.int(length(candidates), size)]
  }

  # Order matters: 'heathy' is drawn first, then 'FSGS'
  heathy_samples <- draw("heathy", n_keep[["heathy"]])
  FSGS_samples <- draw("FSGS", n_keep[["FSGS"]])

  c(heathy_samples, FSGS_samples)
}

In [6]:
# Input: the balanced dataset written earlier; output: the imbalanced dataset
pg_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/01_smaller_lib_balanced_PG_MajorPG/"
output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/01_smaller_lib_imbalanced_PG_MajorPG/"

centers <- c("Center1", "Center2", "Center3")

# Per-center metadata of the kept samples; bound together after the loop
meta_per_center <- vector("list", length(centers))

# For each center: select samples with sample_based_on_rules() and write the
# reduced intensities, the counts, the metadata and the design matrix.
for (i in seq_along(centers)) {
  center <- centers[[i]]

  # create sub dir for each center
  sub_dir <- paste0(output_path, center)
  if (!dir.exists(sub_dir)) {
    dir.create(sub_dir, recursive = TRUE)
  }

  # Read the full metadata of the balanced dataset and keep this center.
  # A separate name is used so the global `metadata` table is not overwritten.
  center_meta <- read.table(paste0(pg_output_path, "metadata.tsv"),
                            header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
    filter(Center == center)
  rownames(center_meta) <- center_meta$Quantitative.column.name

  # Select samples pool by pool according to the rules
  keep_samples <- unlist(
    lapply(
      unique(center_meta$Pool),
      function(pool) sample_based_on_rules(center, pool, center_meta)
    ),
    use.names = FALSE
  )
  center_meta <- center_meta[center_meta$Quantitative.column.name %in% keep_samples, ]

  # Read intensities from the previous step and keep the selected samples.
  # `drop = FALSE` keeps a data frame even if only one column is selected.
  intensities <- read.table(paste0(pg_output_path, center, "/pg_intensities.tsv"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  rownames(intensities) <- intensities$Majority.protein.IDs
  intensities <- intensities[, colnames(intensities) %in% keep_samples, drop = FALSE] %>%
    rownames_to_column("Majority.protein.IDs")

  # Read counts from the previous step; they are copied unchanged
  counts <- read.table(paste0(pg_output_path, center, "/pg_counts.tsv"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

  # write intensities to a file
  write.table(intensities, paste0(sub_dir, "/pg_intensities.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
  # write counts to a file
  write.table(counts, paste0(sub_dir, "/pg_counts.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
  # write metadata to a file
  write.table(center_meta, paste0(sub_dir, "/metadata.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Design matrix: one 0/1 indicator column per group, no intercept
  for_dummy <- center_meta
  rownames(for_dummy) <- for_dummy$Quantitative.column.name
  dummy_df <- model.matrix(~ 0 + Group, for_dummy)
  colnames(dummy_df) <- gsub("Group", "", colnames(dummy_df))

  # Replace the stored `filename` column by the sample name (row name),
  # drop the annotation columns and append the indicators
  for_dummy <- for_dummy %>%
    select(-c(filename, Quantitative.column.name, Reporter.ion, Group, Patient)) %>%
    rownames_to_column("filename") %>%
    cbind(dummy_df)
  write.table(for_dummy, paste0(sub_dir, "/design.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  meta_per_center[[i]] <- center_meta
}

# Combine the metadata of all centers in one step
imbalanced_meta <- do.call(rbind, meta_per_center)

cat("Shape of imbalanced_meta: ", dim(imbalanced_meta), "\n")
# write it to a file
write.table(imbalanced_meta, paste0(output_path, "metadata.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

Shape of imbalanced_meta:  36 7 


# Downsampled

In [7]:
# Pick three 'heathy' and three 'FSGS' samples from one pool of one center
# (downsampled design).
#
# Arguments:
#   center   - center name ("Center1", "Center2" or "Center3")
#   pool     - pool name; must be one of the pools listed for the center below
#   metadata - data frame with `Group` and `Pool` columns; its row names are
#              the sample names that get returned
#
# Returns a character vector of row names: the selected 'heathy' samples
# followed by the selected 'FSGS' samples.
#
# Stops with "Invalid center or pool" when the center/pool combination is
# not listed, and with an explicit message when a group has too few samples.
sample_based_on_rules <- function(center, pool, metadata) {
  # The seed is reset on every call, so each call draws from the same
  # random stream (kept as-is for reproducibility of existing outputs).
  set.seed(42)

  # Every listed pool keeps the same number of samples per group
  n_per_group <- 3

  # Pools that belong to each center
  pools_by_center <- list(
    Center1 = c("Pool1", "Pool2"),
    Center2 = c("Pool3", "Pool5"),
    Center3 = c("Pool4", "Pool6")
  )

  if (!isTRUE(pool %in% pools_by_center[[center]])) {
    stop("Invalid center or pool")
  }

  # Draw `n_per_group` row names of the given group within the pool
  draw <- function(group) {
    candidates <- rownames(metadata)[
      which(metadata$Group == group & metadata$Pool == pool)
    ]
    if (length(candidates) < n_per_group) {
      stop(
        "Not enough '", group, "' samples in ", pool, ": need ", n_per_group,
        ", found ", length(candidates)
      )
    }
    candidates[sample.int(length(candidates), n_per_group)]
  }

  # Order matters: 'heathy' is drawn first, then 'FSGS'
  heathy_samples <- draw("heathy")
  FSGS_samples <- draw("FSGS")

  c(heathy_samples, FSGS_samples)
}

In [8]:
# Input: the balanced dataset written earlier; output: the downsampled dataset
pg_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/01_smaller_lib_balanced_PG_MajorPG/"
output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/01_smaller_lib_downsampled_PG_MajorPG/"

centers <- c("Center1", "Center2", "Center3")

# Per-center metadata of the kept samples; bound together after the loop
meta_per_center <- vector("list", length(centers))

# For each center: select samples with sample_based_on_rules() and write the
# reduced intensities, the counts, the metadata and the design matrix.
for (i in seq_along(centers)) {
  center <- centers[[i]]

  # create sub dir for each center
  sub_dir <- paste0(output_path, center)
  if (!dir.exists(sub_dir)) {
    dir.create(sub_dir, recursive = TRUE)
  }

  # Read the full metadata of the balanced dataset and keep this center.
  # A separate name is used so the global `metadata` table is not overwritten.
  center_meta <- read.table(paste0(pg_output_path, "metadata.tsv"),
                            header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
    filter(Center == center)
  rownames(center_meta) <- center_meta$Quantitative.column.name

  # Select samples pool by pool according to the rules
  keep_samples <- unlist(
    lapply(
      unique(center_meta$Pool),
      function(pool) sample_based_on_rules(center, pool, center_meta)
    ),
    use.names = FALSE
  )
  center_meta <- center_meta[center_meta$Quantitative.column.name %in% keep_samples, ]

  # Read intensities from the previous step and keep the selected samples.
  # `drop = FALSE` keeps a data frame even if only one column is selected.
  intensities <- read.table(paste0(pg_output_path, center, "/pg_intensities.tsv"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  rownames(intensities) <- intensities$Majority.protein.IDs
  intensities <- intensities[, colnames(intensities) %in% keep_samples, drop = FALSE] %>%
    rownames_to_column("Majority.protein.IDs")

  # Read counts from the previous step; they are copied unchanged
  counts <- read.table(paste0(pg_output_path, center, "/pg_counts.tsv"), header = TRUE, sep = "\t", stringsAsFactors = FALSE)

  # write intensities to a file
  write.table(intensities, paste0(sub_dir, "/pg_intensities.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
  # write counts to a file
  write.table(counts, paste0(sub_dir, "/pg_counts.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
  # write metadata to a file
  write.table(center_meta, paste0(sub_dir, "/metadata.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  # Design matrix: one 0/1 indicator column per group, no intercept
  for_dummy <- center_meta
  rownames(for_dummy) <- for_dummy$Quantitative.column.name
  dummy_df <- model.matrix(~ 0 + Group, for_dummy)
  colnames(dummy_df) <- gsub("Group", "", colnames(dummy_df))

  # Replace the stored `filename` column by the sample name (row name),
  # drop the annotation columns and append the indicators
  for_dummy <- for_dummy %>%
    select(-c(filename, Quantitative.column.name, Reporter.ion, Group, Patient)) %>%
    rownames_to_column("filename") %>%
    cbind(dummy_df)
  write.table(for_dummy, paste0(sub_dir, "/design.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

  meta_per_center[[i]] <- center_meta
}

# Combine the metadata of all centers in one step.
# The variable name and the printed label are kept from the imbalanced cell,
# although this table describes the downsampled dataset.
imbalanced_meta <- do.call(rbind, meta_per_center)

cat("Shape of imbalanced_meta: ", dim(imbalanced_meta), "\n")
# write it to a file
write.table(imbalanced_meta, paste0(output_path, "metadata.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

Shape of imbalanced_meta:  36 7 
