In [1]:
library(tidyverse)
library(grid)
library(gridExtra)

source("../../evaluation_utils/plots_eda.R")
source("../../evaluation_utils/filtering.R")

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine


Loading required package: viridisLite


Attaching package: ‘data.ta

In [2]:
# Assemble a four-panel diagnostic figure for one intensity table.
#
# Arguments:
#   intensities  intensity table, forwarded unchanged to the plotting helpers
#   metadata     sample annotation, forwarded unchanged; the helpers are told
#                to use its 'file', 'lab' and 'condition' columns
#   name         title used for every panel
#
# Returns the combined layout: the two PCA panels side by side, with the
# boxplot and the density plot stacked underneath.
# NOTE(review): `|` and `/` on plot objects are presumably the patchwork
# composition operators made available by the sourced plotting utilities --
# confirm, since patchwork is not attached in this notebook directly.
plots_multiple <- function(intensities, metadata, name){
    # Both PCA panels differ only in which column drives colour vs. shape.
    draw_pca <- function(colour_by, shape_by) {
        pca_plot(
            intensities, metadata,
            title = name,
            quantitative_col_name = 'file',
            col_col = colour_by, shape_col = shape_by)
    }

    # Coloured by lab, shaped by condition.
    pca_by_lab <- draw_pca("lab", "condition")
    # Coloured by condition, shaped by lab.
    pca_by_condition <- draw_pca("condition", "lab")

    box_panel <- boxplot_plot(
        intensities, metadata,
        title = name,
        color_col = 'lab', quantitativeColumnName = 'file',
        path = '')

    density_panel <- plotIntensityDensity(
        intensities, metadata,
        quantitativeColumnName = 'file',
        colorColumnName = 'lab',
        title = name)

    top_row <- (pca_by_condition | pca_by_lab)
    top_row / box_panel / density_panel
}


## Load data

In [3]:
# Root folder of the uncorrected ("before") data; always ends with "/".
path_to_data <- "before/"

# Sample annotation. The 'rowname' column becomes the row names first and is
# then replaced by the 'file' column so rows can be addressed by sample file.
metadata <- read.csv(
    paste0(path_to_data, "initial_data/central_batch_info.tsv"),
    sep = "\t"
) %>%
    column_to_rownames("rowname") %>%
    as.data.frame()
dim(metadata)
rownames(metadata) <- metadata$file

# Intensity table: first column holds the row identifiers.
intensities <- read.csv(
    paste0(path_to_data, "initial_data/central_intensities.tsv"),
    sep = "\t", row.names = 1
)
# read.csv() rewrites column names into syntactic R names; strip the leading
# "X" and turn "." back into "-" so they can be matched against metadata$file.
colnames(intensities) <- gsub("^X", "", colnames(intensities))
colnames(intensities) <- gsub("\\.", "-", colnames(intensities))
intensities <- intensities %>% as.data.frame()

dim(intensities)

# Fail with a readable message instead of "undefined columns selected" when
# the annotation lists samples that are absent from the intensity table.
missing_files <- setdiff(metadata$file, colnames(intensities))
if (length(missing_files) > 0) {
    stop(
        "Samples listed in metadata but missing from intensities: ",
        paste(missing_files, collapse = ", "),
        call. = FALSE
    )
}

# Keep only annotated samples, in metadata order, then log2-transform.
intensities_raw <- intensities[, metadata$file]
intensities <- log2(intensities_raw)
dim(intensities)

# path_to_data already ends with "/", so no extra separator is added.
write.table(
    intensities %>% rownames_to_column("Protein"),
    file = paste0(path_to_data, "central_intensities_log_UNION.tsv"),
    sep = "\t", quote = TRUE, row.names = FALSE
)

In [8]:
# Group the labs into centers: A+B -> center1, C+D -> center2,
# every other lab -> center3.
metadata <- metadata %>%
    mutate(center = case_when(
        lab %in% c("lab_A", "lab_B") ~ "center1",
        lab %in% c("lab_C", "lab_D") ~ "center2",
        TRUE ~ "center3"
    ))

# Number of samples per center and lab. `.groups = "drop_last"` is what
# summarise() does by default here; stating it only silences the
# "has grouped output" message.
metadata %>%
    group_by(center, lab) %>%
    summarise(n = n(), .groups = "drop_last")


[1m[22m`summarise()` has grouped output by 'center'. You can override using the
`.groups` argument.


center,lab,n
<chr>,<chr>,<int>
center1,lab_A,24
center1,lab_B,23
center2,lab_C,23
center2,lab_D,24
center3,lab_E,24


In [9]:
# Number of samples per center, lab and condition. `.groups = "drop_last"`
# matches the default grouping of the result and only silences the
# "has grouped output" message.
metadata %>%
    group_by(center, lab, condition) %>%
    summarise(n = n(), .groups = "drop_last")

[1m[22m`summarise()` has grouped output by 'center', 'lab'. You can override using the
`.groups` argument.


center,lab,condition,n
<chr>,<chr>,<chr>,<int>
center1,lab_A,Glu,12
center1,lab_A,Pyr,12
center1,lab_B,Glu,11
center1,lab_B,Pyr,12
center2,lab_C,Glu,12
center2,lab_C,Pyr,11
center2,lab_D,Glu,12
center2,lab_D,Pyr,12
center3,lab_E,Glu,12
center3,lab_E,Pyr,12


In [11]:
# Per-center preprocessing.
# For every center: take its samples, drop rows that are NA in all of them,
# apply the project filters, write the center's metadata and filtered
# intensities to disk, and full-join the filtered table into
# `joint_intensities` (union of rows across centers).
joint_intensities <- NULL

for (center in unique(metadata$center)) {
    center_metadata <- metadata[metadata$center == center, ]
    center_intensities <- intensities[, center_metadata$file, drop = FALSE]

    # remove rows with all values NA
    has_value <- rowSums(!is.na(center_intensities)) > 0
    intens_filtered <- center_intensities[has_value, , drop = FALSE]
    cat("Center: ", center, " removed ", nrow(center_intensities) - nrow(intens_filtered), " rows with all NA values\n")
    cat("Before: ", nrow(center_intensities), " After: ", nrow(intens_filtered), "\n")

    # filter_per_center() comes from the sourced filtering utilities; it is
    # applied once over the center and once over the conditions.
    # NOTE(review): judging by its log output it keeps rows with at least two
    # non-NA values per group -- confirm against filtering.R.
    intens_filtered <- filter_per_center(intens_filtered, center_metadata, 'file', unique(center_metadata$center), 'center')
    intens_filtered <- filter_per_center(intens_filtered, center_metadata, 'file', unique(center_metadata$condition), 'condition')
    center_intensities_filtered <- intens_filtered[, center_metadata$file, drop = FALSE]

    path_before <- paste0(path_to_data, center, "/")

    # The metadata file goes into "<center>/intermediate/", so that folder
    # (and the center folder above it) has to exist before writing.
    intermediate_dir <- paste0(path_before, "intermediate")
    if (!dir.exists(intermediate_dir)) {
        dir.create(intermediate_dir, recursive = TRUE)
    }

    write.table(center_metadata,
        file = paste0(path_before, "intermediate/metadata.tsv"),
        sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)

    # Row names become an explicit 'rowname' column: it is both the id column
    # of the exported file and the join key below.
    center_table <- center_intensities_filtered %>% rownames_to_column("rowname")
    write.table(center_table,
        file = paste0(path_before, "intensities_log_UNION.tsv"),
        sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)

    if (is.null(joint_intensities)) {
        joint_intensities <- center_table
    } else {
        # Union or full join by rownames (index)
        joint_intensities <- merge(joint_intensities, center_table,
                                   by = "rowname", all = TRUE)
    }
}
cat("Joint intensities: ", dim(joint_intensities), "\n")
intensities <- joint_intensities %>%
    column_to_rownames("rowname")

# The join concatenates samples center by center. Put the columns back into
# metadata order so positional pairing with metadata columns stays valid.
stopifnot(setequal(colnames(intensities), metadata$file))
intensities <- intensities[, metadata$file, drop = FALSE]

Center:  center1  removed  135  rows with all NA values
Before:  3059  After:  2924 
Filtering by center - two not-NA per center
	Before filtering: 2924 47 
	After filtering: 2924 47 
Filtering by center - two not-NA per center
	Before filtering: 2924 47 
	After filtering: 2910 47 
Center:  center2  removed  118  rows with all NA values
Before:  3059  After:  2941 
Filtering by center - two not-NA per center
	Before filtering: 2941 47 
	After filtering: 2940 47 
Filtering by center - two not-NA per center
	Before filtering: 2940 47 
	After filtering: 2900 47 
Center:  center3  removed  658  rows with all NA values
Before:  3059  After:  2401 
Filtering by center - two not-NA per center
	Before filtering: 2401 24 
	After filtering: 2401 24 
Filtering by center - two not-NA per center
	Before filtering: 2401 24 
	After filtering: 2363 24 
Joint intensities:  3034 119 


In [12]:
# Overview figure for the uncorrected data.
layout <- plots_multiple(intensities, metadata, "Bacterial dataset, uncorrected")

# Make sure the output folder exists before saving; ggsave() is not asked to
# create it here.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)
ggsave("plots/data_plot.png", plot = layout, width = 12, height = 15)

“[1m[22m`aes_string()` was deprecated in ggplot2 3.0.0.
[36mℹ[39m Please use tidy evaluation idioms with `aes()`.
[36mℹ[39m See also `vignette("ggplot2-in-packages")` for more information.”
No id variables; using all as measure variables

“[1m[22mRemoved 49366 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 49366 rows containing non-finite outside the scale range
(`stat_summary()`).”
“[1m[22mRemoved 49366 rows containing non-finite outside the scale range
(`stat_density()`).”


# Correction

In [13]:
# create design
metadata <- metadata %>%
    mutate(condition = factor(condition, levels = c("Glu", "Pyr")))
design= model.matrix(~condition, data = metadata)
colnames(design) <- c("Intercept", "Pyr")
print(dim(intensities))
intensities_corrected <- limma::removeBatchEffect(intensities, metadata$lab, design = design) %>% as.data.frame()


[1] 3034  118


“Partial NA coefficients for 784 probe(s)”


In [14]:
# Overview figure for the limma-corrected data.
layout <- plots_multiple(intensities_corrected, metadata, "Bacterial dataset, R corrected")

# Make sure the output folder exists before saving; ggsave() is not asked to
# create it here.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)
ggsave("plots/data_plot_Rcorrected.png", plot = layout, width = 12, height = 15)


No id variables; using all as measure variables

“[1m[22mRemoved 49366 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 49366 rows containing non-finite outside the scale range
(`stat_summary()`).”
“[1m[22mRemoved 49366 rows containing non-finite outside the scale range
(`stat_density()`).”


In [24]:
# Export one design table per center plus the centrally corrected intensities.
design <- design %>% as.data.frame()
path_to_after_data <- "after/"

for (center in unique(metadata$center)) {
    path_to_before <- paste0(path_to_data, center, "/")
    # Per-center output folder; not used in this cell (see the commented-out
    # dir.create below).
    path_to_after <- paste0(path_to_after_data, center, "/")

    # Rows of this center, addressed through the metadata row names, which
    # are also the row names of `design`.
    in_center <- metadata$center == center
    center_files <- rownames(metadata)[in_center]

    # Design for the center: lab as 'batch' plus the 'Pyr' indicator.
    design_center <- design[center_files, "Pyr", drop = FALSE]
    design_center$batch <- metadata$lab[in_center]
    design_center <- design_center[, c("batch", "Pyr")]

    # if(!dir.exists(path_to_after)){
    #     dir.create(path_to_after)
    # }

    # The design file is written next to the center's input data.
    write.table(design_center %>% rownames_to_column("file"),
        file = paste0(path_to_before, "design.tsv"),
        sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
}

# The corrected table goes to the top-level "after/" folder, which nothing
# above creates.
if (!dir.exists(path_to_after_data)) {
    dir.create(path_to_after_data, recursive = TRUE)
}
write.table(intensities_corrected %>% rownames_to_column("rowname"),
    file = paste0(path_to_after_data, "intensities_log_Rcorrected_UNION.tsv"),
    sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)

In [25]:
# Display the design table left over from the last iteration of the loop above
# (i.e. the last center in unique(metadata$center)).
design_center

Unnamed: 0_level_0,batch,Pyr
Unnamed: 0_level_1,<chr>,<dbl>
CVT09_QC1_LabE_X023,lab_E,0
CVT09_QC2_LabE_X024,lab_E,1
CVT09_QC3_LabE_X025,lab_E,1
CVT09_QC4_LabE_X026,lab_E,0
CVT09_s3_X002,lab_E,1
CVT09_s10_X003,lab_E,1
CVT09_s13_X004,lab_E,1
CVT09_s16_X005,lab_E,1
CVT09_s25_X006,lab_E,1
CVT09_s28_X007,lab_E,1
