In [1]:
library(tidyverse)
library(grid)
library(gridExtra)

source("../../evaluation_utils/plots_eda.R")
source("../../evaluation_utils/filtering.R")

“package ‘tidyverse’ was built under R version 4.2.2”
“package ‘ggplot2’ was built under R version 4.2.3”
“package ‘tibble’ was built under R version 4.2.3”
“package ‘tidyr’ was built under R version 4.2.2”
“package ‘readr’ was built under R version 4.2.2”
“package ‘purrr’ was built under R version 4.2.3”
“package ‘dplyr’ was built under R version 4.2.3”
“package ‘stringr’ was built under R version 4.2.3”
“package ‘forcats’ was built under R version 4.2.2”
“package ‘lubridate’ was built under R version 4.2.2”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ──────────

# MODE

In [2]:
# Dataset variant to process; it selects the sub-directory read and written below.
MODE <- "balanced"
# Alternative variant:
# MODE <- 'imbalanced'

## Load data

In [3]:
# Directory holding the uncorrected ("before") data for the selected MODE.
path_to_data <- paste0("/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/before/", MODE, "/")

# Sample annotation table; the `rowname` column is consumed as row names
# and the rows are then re-keyed by the `file` column.
metadata <- read.csv(paste0(path_to_data, "central_batch_info.tsv"), sep = "\t") %>%
    column_to_rownames("rowname") %>%
    as.data.frame() #%>%
    # mutate(file = gsub("X", "", file))
dim(metadata)
rownames(metadata) <- metadata$file

# Intensity table: first column becomes the row names, remaining columns are samples.
intensities <- read.csv(paste0(path_to_data, "central_intensities.tsv"), sep = "\t", row.names = 1)
# Undo the column-name mangling done by read.csv (check.names = TRUE):
# strip the leading "X" and turn "." back into "-".
colnames(intensities) <- gsub("^X", "", colnames(intensities))
colnames(intensities) <- gsub("\\.", "-", colnames(intensities))

dim(intensities)

# Fail early with a readable message if the repaired column names do not
# cover every sample listed in the metadata.
if (!all(metadata$file %in% colnames(intensities))) {
    stop(
        "Samples listed in metadata are missing from the intensity table: ",
        paste(setdiff(metadata$file, colnames(intensities)), collapse = ", "),
        call. = FALSE
    )
}

# Keep only the annotated samples, in metadata order; drop = FALSE keeps a
# data frame even if the metadata lists a single sample.
intensities_raw <- intensities[, metadata$file, drop = FALSE]
intensities <- log2(intensities_raw)
dim(intensities)

# Save the log2-transformed table with the row names as a "Protein" column.
write.table(intensities %>% rownames_to_column("Protein"), file = paste0(path_to_data, "central_intensities_log_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE)

In [5]:
# Number of samples contributed by each lab.
summarise(
    group_by(select(metadata, lab, condition), lab),
    n = n()
)

lab,n
<chr>,<int>
lab_A,16
lab_B,13
lab_C,14
lab_D,15
lab_E,15


In [4]:
# Number of samples for every lab / condition combination.
summarise(
    group_by(select(metadata, lab, condition), lab, condition),
    n = n()
)

[1m[22m`summarise()` has grouped output by 'lab'. You can override using the `.groups`
argument.


lab,condition,n
<chr>,<chr>,<int>
lab_A,Glu,12
lab_A,Pyr,4
lab_B,Glu,4
lab_B,Pyr,9
lab_C,Glu,3
lab_C,Pyr,11
lab_D,Glu,10
lab_D,Pyr,5
lab_E,Glu,5
lab_E,Pyr,10


In [15]:
# intensities_filteres <- filter_per_center(intensities, metadata, 'file', unique(metadata$lab), 'lab')

# # save before
# write.table(intensities_filteres %>% rownames_to_column("Protein"), 
#     file = paste0(path_to_data, "central_intensities_log_filtered.tsv"), sep = "\t", quote = T, row.names = F, col.names = T)


In [16]:
# Split the central tables by lab and write each lab's metadata and log2
# intensities into its own sub-directory of path_to_data.
for (center in unique(metadata$lab)) {
    center_metadata <- metadata[metadata$lab == center, ]
    # NOTE: despite the "_filtered" name, no per-center filtering is applied
    # here (the filter_per_center step is commented out earlier in the notebook).
    # drop = FALSE keeps a data frame even when a lab has a single sample,
    # which rownames_to_column() below requires.
    # center_intensities_filtered <- intensities_filteres[, center_metadata$file]
    center_intensities_filtered <- intensities[, center_metadata$file, drop = FALSE]

    path_before <- paste0(path_to_data, center, "/")

    if (!dir.exists(path_before)) {
        dir.create(path_before, recursive = TRUE)
    }
    # center_intensities <- intensities_raw[, center_metadata$file, drop = FALSE]
    # write.table(center_intensities %>% rownames_to_column('rowname'),
    #     file = paste0(path_before, "intensities_raw.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
    write.table(center_metadata,
        file = paste0(path_before, "metadata.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
    write.table(center_intensities_filtered %>% rownames_to_column("rowname"),
        file = paste0(path_before, "intensities_log_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
}



In [17]:
# pca_plot_study <- pca_plot(
#     intensities, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     col_col = "lab", shape_col = "condition")

# pca_plot_class <- pca_plot(
#     intensities, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     shape_col = "lab", col_col = "condition")

# layout <- (pca_plot_class | pca_plot_study)
# options(repr.plot.width=11, repr.plot.height=6)
# layout

In [18]:
# Build the design matrix: condition as a two-level factor with "Glu" as the
# reference level, so the second column is the "Pyr" indicator.
metadata$condition <- factor(metadata$condition, levels = c("Glu", "Pyr"))
design <- model.matrix(~condition, data = metadata)
colnames(design) <- c("Intercept", "Pyr")

# Remove the lab (batch) effect centrally, passing the condition design
# to limma, and store the result as a data frame.
intensities_corrected <- as.data.frame(
    limma::removeBatchEffect(intensities, batch = metadata$lab, design = design)
)


“Partial NA coefficients for 785 probe(s)”


In [19]:
# pca_plot_study <- pca_plot(
#     intensities_corrected, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     col_col = "lab", shape_col = "condition")

# pca_plot_class <- pca_plot(
#     intensities_corrected, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     shape_col = "lab", col_col = "condition")

# layout <- (pca_plot_class | pca_plot_study)
# options(repr.plot.width=11, repr.plot.height=6)
# layout

In [20]:
# Export the per-lab design column and the centrally corrected tables.

# Convert the design to a data frame so rows can be selected by name and
# passed to rownames_to_column().
design <- design %>% as.data.frame()
path_to_after_data <- paste0("/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/after/", MODE, "/")

# write.table() does not create directories, so make sure the output
# directory for the corrected data exists before writing into it.
if (!dir.exists(path_to_after_data)) {
    dir.create(path_to_after_data, recursive = TRUE)
}

for (center in unique(metadata$lab)) {
    path_to_before <- paste0(path_to_data, center, "/")

    # Rows of `design` are looked up by the row names of this lab's metadata
    # rows; only the "Pyr" column is exported. drop = FALSE keeps a one-column
    # data frame.
    center_samples <- rownames(metadata)[metadata$lab == center]
    design_center <- design[center_samples, "Pyr", drop = FALSE]

    # Per-lab "after" directories are intentionally not created here:
    # path_to_after <- paste0(path_to_after_data, center, "/")
    # if(!dir.exists(path_to_after)){
    #     dir.create(path_to_after)
    # }

    write.table(design_center %>% rownames_to_column("file"),
        file = paste0(path_to_before, "design.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
}

# Centrally corrected intensities and the matching sample annotation.
write.table(intensities_corrected %>% rownames_to_column("rowname"),
    file = paste0(path_to_after_data, "central_intensities_log_corrected_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
write.table(metadata,
    file = paste0(path_to_after_data, "central_batch_info_corrected_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)