In [300]:
library(tidyverse)
library(cowplot)

# NORM methods contains bugs!

# Load and Filter

In [301]:
metadata <- read.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/Metadata_CosyBio.tsv",
                       header = TRUE, sep = "\t", stringsAsFactors = FALSE)
                       
metadata <- metadata %>%
            mutate(Quantitative.column.name = gsub(" ", ".", Quantitative.column.name))

rownames(metadata) <- metadata$Quantitative.column.name
metadata <- metadata %>%
            mutate(Quantitative.column.name = gsub(".Pool", ".P_", gsub("Reporter.intensity.corrected.", "RIC_", Quantitative.column.name)))


# remove the outlier
metadata <- metadata %>%
            filter(Quantitative.column.name != "RIC_3.P_3")


# rename the columns - put the .P_[1-9]+ after dot before and the rest after (e.g. RIC_1.P_1 -> P_1.RIC_1)
metadata <- metadata %>%
            mutate(Quantitative.column.name = gsub("RIC_([0-9]+).P_([0-9]+)", "P_\\2.RIC_\\1", Quantitative.column.name))


metadata <- metadata %>%
    filter(Center == "Center2")


head(metadata, 3)
dim(metadata)

Unnamed: 0_level_0,Quantitative.column.name,Pool,Reporter.ion,Patient,Group,Center
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Reporter.intensity.corrected.1.Pool3,P_3.RIC_1,Pool3,126,Common Reference,Common Reference,Center2
Reporter.intensity.corrected.2.Pool3,P_3.RIC_2,Pool3,127N,heathy11,heathy,Center2
Reporter.intensity.corrected.4.Pool3,P_3.RIC_4,Pool3,128N,heathy12,heathy,Center2


In [302]:
PG_report <- read.table(
    "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/proteinGroups_center2.txt",
    header=T,
    sep="\t",
    stringsAsFactors = F)

dim(PG_report)

In [303]:
# remove decoy matches and matches to contaminant
PG_report <- PG_report[!PG_report$Reverse=="+",]
PG_report <- PG_report[!PG_report$Potential.contaminant=="+",]
# # Only.identified.by.site - exclude proteins that were only identified by a modification site
# PG_report <- PG_report[!PG_report$Only.identified.by.site=="+",]


dim(PG_report)

#### intensities

In [304]:
# take only the columns that we need
pg_intensities <- PG_report %>%
    select(c("Protein.IDs", rownames(metadata))) 
rownames(pg_intensities) <- pg_intensities$Protein.IDs
pg_intensities$Protein.IDs <- NULL

colnames(pg_intensities) <- metadata$Quantitative.column.name
dim(pg_intensities)


# remove rows with all values are zeros
pg_intensities <- pg_intensities[!rowSums(pg_intensities) == 0,]
dim(pg_intensities)

pg_intensities[pg_intensities==0] <- NA
# count the % of missing values
round(sum(is.na(pg_intensities)) * 100 / (dim(pg_intensities)[1] * dim(pg_intensities)[2]), 3)


In [305]:
# pg_intensities %>% write.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/pg_intensities_combined.tsv",
#                                sep = "\t", quote = FALSE, row.names = TRUE)

# rownames(metadata) <- metadata$Quantitative.column.name
# metadata %>% write.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/metadata_combined.tsv",
#                             sep = "\t", quote = FALSE, row.names = FALSE)
    

In [306]:
pg_intensities %>% write.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/pg_intensities.tsv",
                               sep = "\t", quote = FALSE, row.names = TRUE)

rownames(metadata) <- metadata$Quantitative.column.name
metadata %>% write.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/metadata.tsv",
                            sep = "\t", quote = FALSE, row.names = FALSE)
    

#### counts

In [307]:
pg_counts <- PG_report %>%
    select(c("Protein.IDs", "Razor...unique.peptides")) 
rownames(pg_counts) <- pg_counts$Protein.IDs
pg_counts$Protein.IDs <- NULL

pg_counts <- pg_counts + 1

dim(pg_counts)

pg_counts %>% write.table("/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/pg_counts.tsv",
                               sep = "\t", quote = FALSE, row.names = TRUE)

# EDA combined data

## plots

In [308]:
pca_plot <- function(df, batch_info, title, path) {
  pca <- prcomp(t(na.omit(df)))
  pca_df <- pca$x %>%
    as.data.frame() %>%
    rownames_to_column("Quantitative.column.name") %>% 
    left_join(batch_info,  by = "Quantitative.column.name") 
  var_expl <- pca$sdev^2 / sum(pca$sdev^2)
  names(var_expl) <- paste0("PC", 1:length(var_expl))

  pca_plot <- pca_df %>%
    ggplot(aes(PC1, PC2)) +
    geom_point(aes(col=Group, shape=Pool), size=3)  +
    theme_classic() +
    labs(title = title,
         x = glue::glue("PC1 [{round(var_expl['PC1']*100, 2)}%]"),
         y = glue::glue("PC2 [{round(var_expl['PC2']*100, 2)}%]"))

   # Check if "RIC_3.P_3" exists in the pca_df dataframe
  if("RIC_3.P_3" %in% pca_df$Quantitative.column.name) {
    pca_plot <- pca_plot +
      geom_text(data = pca_df[pca_df$Quantitative.column.name == "RIC_3.P_3", ], aes(label = Quantitative.column.name), 
                vjust = 1, hjust = 0)
  }
  ggsave(path, pca_plot, width = 6, height = 6)
  return(pca_plot)
}


# boxplot
boxplot_pg <- function(protein_matrix, title, path) {
  # Reshape data into long format
  long_data <- tidyr::gather(protein_matrix, 
                             key = "file", value = "Intensity")
  # Log tranformed scale
  boxplot <- ggplot(long_data, aes(x = file, y = Intensity)) + 
    geom_boxplot() +
    stat_summary(fun = mean, geom = "point", shape = 4, size = 3, color = "red") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    # adjust fonsize for the x-axis
    theme(axis.text.x = element_text(size = 8)) +
    labs(title = title) 

  if(path == "") {
    return(boxplot)
  } else {
      ggsave(path, boxplot)
      return(boxplot)
  }

}


heatmap_plot <- function(pg_matrix, batch_info, name, plot_name_prefix){
    cor_matrix <- cor(na.omit(pg_matrix), use = "pairwise.complete.obs")
    pheatmap::pheatmap(cor_matrix, 
                        annotation_col = select(batch_info, c(Group, Center)),
                        treeheight_row = 0, treeheight_col = 0, 
                        main = paste0(name),
                        fontsize = 8,
                        width = 8, height = 7,
                        filename = paste0(plot_name_prefix, "_heatmap.png"))
}



In [309]:
plot_three_in_one <- function(pg_matrix, metadata, name, plot_name_prefix) {

    pca <- pca_plot(pg_matrix, metadata, paste0(name, ' pca'), paste0(plot_name_prefix, '_pca.png'))
    box <- boxplot_pg(pg_matrix, paste0(name, ' boxplot'), paste0(plot_name_prefix, '_boxplot.png'))
    heatmap_plot(pg_matrix, metadata, name, plot_name_prefix)
    
    combined_plot <- plot_grid(pca, box, ncol = 1, align = "v")

    # Save the combined plot
    ggsave(paste0(plot_name_prefix, "_combined_two.png"), combined_plot, width = 8, height = 11)
}

## results

In [310]:
metadata <- metadata %>%
    mutate(Group = as.factor(Group), Center = as.factor(Center), Reporter.ion = as.factor(Reporter.ion)) %>%
    mutate(Quantitative.column.name = gsub(" ", ".", Quantitative.column.name))

In [311]:
plot_three_in_one(log2(pg_intensities + 1),
                  metadata,
                  "TMT-data, C2, raw protein intensities, log2-transformed",
                  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/plots/01_pg_intensities_raw_log")
                  

[1m[22mSaving 6.67 x 6.67 in image


“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“Graphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.”


In [312]:
head(metadata)

Unnamed: 0_level_0,Quantitative.column.name,Pool,Reporter.ion,Patient,Group,Center
Unnamed: 0_level_1,<chr>,<chr>,<fct>,<chr>,<fct>,<fct>
P_3.RIC_1,P_3.RIC_1,Pool3,126,Common Reference,Common Reference,Center2
P_3.RIC_2,P_3.RIC_2,Pool3,127N,heathy11,heathy,Center2
P_3.RIC_4,P_3.RIC_4,Pool3,128N,heathy12,heathy,Center2
P_3.RIC_5,P_3.RIC_5,Pool3,128C,FSGS12,FSGS,Center2
P_3.RIC_6,P_3.RIC_6,Pool3,129N,heathy13,heathy,Center2
P_3.RIC_7,P_3.RIC_7,Pool3,129C,FSGS13,FSGS,Center2


In [313]:
# print(format(round(colSums(pg_intensities, na.rm = TRUE), digits = 0), big.mark = ","))

# the same using tidyverse and summarize function
pg_intensities %>%
    # summarize(across(everything(), ~sum(., na.rm = TRUE))) %>%
    pivot_longer(cols = everything(), 
                 names_to = "Quantitative.column.name", 
                 values_to = "Value") %>%
    na.omit() %>%
    left_join(select(metadata, c(Quantitative.column.name, Pool, Center)), by = 'Quantitative.column.name') %>%
    group_by(Center, Pool, Quantitative.column.name) %>%
    summarize(sum = sum(Value, na.rm = TRUE), mean = mean(Value, na.rm = TRUE)) %>%
    group_by(Center, Pool) %>%
    summarise(mean_sample_sum = mean(sum), mean_sample_mean = mean(mean))

[1m[22m`summarise()` has grouped output by 'Center', 'Pool'. You can override using
the `.groups` argument.


[1m[22m`summarise()` has grouped output by 'Center'. You can override using the
`.groups` argument.


Center,Pool,mean_sample_sum,mean_sample_mean
<fct>,<chr>,<dbl>,<dbl>
Center2,Pool3,407628918,867295.6
Center2,Pool5,384075724,757545.8


# Normalization
### Methods

In [314]:
iRS_function <- function(df, metadata, sample_column = 'Quantitative.column.name',
                         group_column = 'Group', reference_name = 'Common Reference',
                         pool_column = 'Pool', batch_column = 'Center') {
  
  df <- df %>% rownames_to_column("Protein.IDs")
  
  # Convert to long format while preserving the protein identifier
  df_long <- pivot_longer(df, cols = -Protein.IDs, names_to = sample_column, values_to = "Value")
  
  # Merge long format data with metadata
  df_long <- left_join(df_long, metadata, by = sample_column)
  
  # Isolate 'Common Reference' samples and ensure calculations are per protein within each center
  common_refs <- df_long %>% 
    filter(!!sym(group_column) == reference_name) %>%
    select(-!!sym(group_column), -!!sym(sample_column))
  
  # Compute the geometric mean of 'Common Reference' across all plexes for each protein within each center
  common_geo_mean <- common_refs %>%
    group_by(Protein.IDs, !!sym(batch_column)) %>%
    # group_by(Protein.IDs) %>%
    summarise(OverallGeoMean = exp(mean(log(Value), na.rm = TRUE)), .groups = 'drop')
    # summarise(OverallGeoMean = exp(mean(log(Value))), .groups = 'drop')


  # Calculate IRS factors for each 'Common Reference' in each plex, per protein, within each center
  irs_factors <- common_refs %>%
    left_join(common_geo_mean, by = c("Protein.IDs", batch_column)) %>%
    # left_join(common_geo_mean, by = c("Protein.IDs")) %>%
    group_by(!!sym(pool_column), Protein.IDs, !!sym(batch_column)) %>%
    # group_by(!!sym(pool_column), Protein.IDs) %>%
    summarise(IRSFactor = OverallGeoMean / mean(Value), .groups = 'drop')
  
  # Apply IRS scaling to all samples in each plex, per protein, within each center
  df_scaled <- df_long %>%
    left_join(irs_factors, by = c(pool_column, "Protein.IDs", batch_column)) %>%
    # left_join(irs_factors, by = c(pool_column, "Protein.IDs")) %>%
    mutate(ScaledValue = ifelse(is.na(IRSFactor), Value, Value * IRSFactor))
  
  # Pivot back to wide format if necessary, with proteins as rows and samples as columns
  df_final <- pivot_wider(df_scaled, names_from = sample_column, values_from = ScaledValue, id_cols = "Protein.IDs") %>%
    column_to_rownames("Protein.IDs")
  
  return(df_final)
}



median_noramlization <- function(df, log_normalized = FALSE) {
  if(!log_normalized) {
    df <- log2(df + 1)
  }
  medians <- apply(df, 2, median, na.rm = TRUE)
  normalized_df <- sweep(df, 2, medians, FUN="/")
  mean_of_medians <- mean(medians, na.rm=TRUE)
  normalized_df <- as.data.frame(normalized_df * mean_of_medians)

  return(normalized_df)
}

sample_load_norm <- function(data_raw, metadata){

  # Ensure column names from data_raw match the Quantitative.column.name names in metadata
  data_raw <- data_raw[, metadata$Quantitative.column.name]
  
  # Initialize an empty list to store split experiments
  experiments_raw <- list()
  
  # Separate data_raw based on Pool information in metadata
  unique_pools <- unique(metadata$Pool)
  for (i in 1:length(unique_pools)) {
    pool_samples <- metadata$Quantitative.column.name[metadata$Pool == unique_pools[i]]
    experiments_raw[[i]] <- data_raw[, pool_samples, drop = FALSE]
  }
  
  # Initialize an empty list for storing normalized data
  experiments_sl <- list()
  
  # Calculate the target for normalization
  target <- mean(unlist(sapply(experiments_raw, function(exp) colSums(exp, na.rm = TRUE))))
  
  # Normalize each experiment
  for (i in 1:length(experiments_raw)) {
    norm_facs <- target / colSums(experiments_raw[[i]], na.rm = TRUE)
    experiments_sl[[i]] <- sweep(experiments_raw[[i]], 2, norm_facs, FUN = "*")
  }
  
  # Combine normalized data from all experiments
  data_sl <- do.call(cbind, experiments_sl)
  
  return(as.data.frame(data_sl))

}

### iRS --- > median

In [315]:
intensities_scaled <- iRS_function(pg_intensities, metadata)

dim(intensities_scaled)
round(sum(is.na(intensities_scaled)) * 100 / (dim(intensities_scaled)[1] * dim(intensities_scaled)[2]), 3)

plot_three_in_one(log2(intensities_scaled + 1),
                  metadata,
                  "TMT, C2, iRS-scaled protein intensities, log2-transformed",
                  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/plots/02_pg_intensities_scaled_log")
                  

[1m[22mSaving 6.67 x 6.67 in image
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“Graphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.”


In [316]:
intensities_scaled %>%
    # summarize(across(everything(), ~sum(., na.rm = TRUE))) %>%
    pivot_longer(cols = everything(), 
                 names_to = "Quantitative.column.name", 
                 values_to = "Value") %>%
    na.omit() %>%
    left_join(select(metadata, c(Quantitative.column.name, Pool, Center)), by = 'Quantitative.column.name') %>%
    group_by(Center, Pool, Quantitative.column.name) %>%
    summarize(sum = sum(Value, na.rm = TRUE), mean = mean(Value, na.rm = TRUE)) %>%
    group_by(Center, Pool) %>%
    summarise(mean_sample_sum = mean(sum), mean_sample_mean = mean(mean))

[1m[22m`summarise()` has grouped output by 'Center', 'Pool'. You can override using
the `.groups` argument.


[1m[22m`summarise()` has grouped output by 'Center'. You can override using the
`.groups` argument.


Center,Pool,mean_sample_sum,mean_sample_mean
<fct>,<chr>,<dbl>,<dbl>
Center2,Pool3,398166621,847163.0
Center2,Pool5,394001005,777122.3


In [317]:
intensities_scaled_norm <- median_noramlization(intensities_scaled, log_normalized = FALSE)


dim(intensities_scaled_norm)
round(sum(is.na(intensities_scaled_norm)) * 100 / (dim(intensities_scaled_norm)[1] * dim(intensities_scaled_norm)[2]), 3)

plot_three_in_one(intensities_scaled_norm,
                  metadata,
                  "TMT, iRScaled, Median norm intensities, log2-transformed",
                  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/plots/03_pg_intensities_scaled_norm_log")

[1m[22mSaving 6.67 x 6.67 in image
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“Graphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.”


In [318]:
intensities_scaled_norm %>%
    # summarize(across(everything(), ~sum(., na.rm = TRUE))) %>%
    pivot_longer(cols = everything(), 
                 names_to = "Quantitative.column.name", 
                 values_to = "Value") %>%
    na.omit() %>%
    left_join(select(metadata, c(Quantitative.column.name, Pool, Center)), by = 'Quantitative.column.name') %>%
    group_by(Center, Pool, Quantitative.column.name) %>%
    summarize(sum = sum(Value, na.rm = TRUE), mean = mean(Value, na.rm = TRUE)) %>%
    group_by(Center, Pool) %>%
    summarise(mean_sample_sum = mean(sum), mean_sample_mean = mean(mean))

[1m[22m`summarise()` has grouped output by 'Center', 'Pool'. You can override using
the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'Center'. You can override using the
`.groups` argument.


Center,Pool,mean_sample_sum,mean_sample_mean
<fct>,<chr>,<dbl>,<dbl>
Center2,Pool3,7916.622,16.84388
Center2,Pool5,8507.892,16.78085


In [319]:
# bacth effects correction

design <- model.matrix(~0 + metadata$Group)
corrected_sl_irs <- limma::removeBatchEffect(intensities_scaled_norm, batch = metadata$Pool, design = design)



dim(corrected_sl_irs)
round(sum(is.na(corrected_sl_irs)) * 100 / (dim(corrected_sl_irs)[1] * dim(corrected_sl_irs)[2]), 3)

plot_three_in_one(as.data.frame(corrected_sl_irs),
                  metadata,
                  "TMT, C2, iRScaled, Median norm, BEC intensities, log2-transformed",
                  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/plots/04_pg_intensities_scaled_norm_log_BEC")

“Partial NA coefficients for 63 probe(s)”


[1m[22mSaving 6.67 x 6.67 in image
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“Graphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.”


### SL ---> iRS

In [320]:
sl_intesities <- sample_load_norm(pg_intensities, metadata)

dim(sl_intesities)
round(sum(is.na(sl_intesities)) * 100 / (dim(sl_intesities)[1] * dim(sl_intesities)[2]), 3)

plot_three_in_one(log2(sl_intesities + 1),
                  metadata,
                  "TMT, C2, SL-scaled protein intensities, log2-transformed",
                  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/plots/02_pg_intensities_SL_log")
                  

[1m[22mSaving 6.67 x 6.67 in image
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“Graphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.”


In [321]:
sl_intesities %>%
    # summarize(across(everything(), ~sum(., na.rm = TRUE))) %>%
    pivot_longer(cols = everything(), 
                 names_to = "Quantitative.column.name", 
                 values_to = "Value") %>%
    na.omit() %>%
    left_join(select(metadata, c(Quantitative.column.name, Pool, Center)), by = 'Quantitative.column.name') %>%
    group_by(Center, Pool, Quantitative.column.name) %>%
    summarize(sum = sum(Value, na.rm = TRUE), mean = mean(Value, na.rm = TRUE)) %>%
    group_by(Center, Pool) %>%
    summarise(mean_sample_sum = mean(sum), mean_sample_mean = mean(mean))

[1m[22m`summarise()` has grouped output by 'Center', 'Pool'. You can override using
the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'Center'. You can override using the
`.groups` argument.


Center,Pool,mean_sample_sum,mean_sample_mean
<fct>,<chr>,<dbl>,<dbl>
Center2,Pool3,395291531,841045.8
Center2,Pool5,395291531,779667.7


In [322]:
intensities_scaled <- iRS_function(sl_intesities, metadata)

dim(intensities_scaled)
round(sum(is.na(intensities_scaled)) * 100 / (dim(intensities_scaled)[1] * dim(intensities_scaled)[2]), 3)

plot_three_in_one(log2(intensities_scaled + 1),
                  metadata,
                  "TMT, C2, SL, iRS-scaled protein intensities, log2-transformed",
                  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/plots/03_pg_intensities_SL_iRS_log")
                  

[1m[22mSaving 6.67 x 6.67 in image
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“Graphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.”


In [323]:
intensities_scaled %>%
    # summarize(across(everything(), ~sum(., na.rm = TRUE))) %>%
    pivot_longer(cols = everything(), 
                 names_to = "Quantitative.column.name", 
                 values_to = "Value") %>%
    na.omit() %>%
    left_join(select(metadata, c(Quantitative.column.name, Pool, Center)), by = 'Quantitative.column.name') %>%
    group_by(Center, Pool, Quantitative.column.name) %>%
    summarize(sum = sum(Value, na.rm = TRUE), mean = mean(Value, na.rm = TRUE)) %>%
    group_by(Center, Pool) %>%
    summarise(mean_sample_sum = mean(sum), mean_sample_mean = mean(mean))

[1m[22m`summarise()` has grouped output by 'Center', 'Pool'. You can override using
the `.groups` argument.


[1m[22m`summarise()` has grouped output by 'Center'. You can override using the
`.groups` argument.


Center,Pool,mean_sample_sum,mean_sample_mean
<fct>,<chr>,<dbl>,<dbl>
Center2,Pool3,396310495,843213.8
Center2,Pool5,395246120,779578.1


In [324]:
# bacth effects correction

design <- model.matrix(~0 + metadata$Group)
corrected_sl_irs <- limma::removeBatchEffect(log2(intensities_scaled + 1), batch = metadata$Pool, design = design)



dim(corrected_sl_irs)
round(sum(is.na(corrected_sl_irs)) * 100 / (dim(corrected_sl_irs)[1] * dim(corrected_sl_irs)[2]), 3)

plot_three_in_one(as.data.frame(corrected_sl_irs),
                  metadata,
                  "TMT, C2, SL, iRS-scaled, BEC protein intensities, log2-transformed",
                  "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/center_two/plots/04_pg_intensities_SL_iRS_log_BEC")

“Partial NA coefficients for 63 probe(s)”


[1m[22mSaving 6.67 x 6.67 in image
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 643 rows containing non-finite values (`stat_summary()`).”
“Graphs cannot be vertically aligned unless the axis parameter is set. Placing graphs unaligned.”


In [325]:
corrected_sl_irs %>%
    as.data.frame() %>%
    # summarize(across(everything(), ~sum(., na.rm = TRUE))) %>%
    pivot_longer(cols = everything(), 
                 names_to = "Quantitative.column.name", 
                 values_to = "Value") %>%
    na.omit() %>%
    left_join(select(metadata, c(Quantitative.column.name, Pool, Center)), by = 'Quantitative.column.name') %>%
    group_by(Center, Pool, Quantitative.column.name) %>%
    summarize(sum = sum(Value, na.rm = TRUE), mean = mean(Value, na.rm = TRUE)) %>%
    group_by(Center, Pool) %>%
    summarise(mean_sample_sum = mean(sum), mean_sample_mean = mean(mean))

[1m[22m`summarise()` has grouped output by 'Center', 'Pool'. You can override using
the `.groups` argument.


[1m[22m`summarise()` has grouped output by 'Center'. You can override using the
`.groups` argument.


Center,Pool,mean_sample_sum,mean_sample_mean
<fct>,<chr>,<dbl>,<dbl>
Center2,Pool3,7957.495,16.93084
Center2,Pool5,8470.847,16.70778
