In [12]:
library(tidyverse)
library(grid)
library(gridExtra)

source("../../evaluation_utils/plots_eda.R")
source("../../evaluation_utils/filtering.R")

# MODE

In [13]:
# Dataset variant to process. The value is pasted into the input and output
# directory paths, so switching variants only requires changing this line.
MODE <- "balanced"
# MODE <- "imbalanced"

## Load data

In [14]:
# Load the sample annotation and the protein intensity table for the selected
# MODE, restrict/align the intensity columns to the annotated samples,
# log2-transform them and save the transformed table next to the inputs.
path_to_data <- paste0("/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/before/", MODE, "/")

# Sample annotation. The 'rowname' column is consumed as row names first; the
# row names are then replaced by the file identifiers so that samples can be
# looked up by file name.
metadata <- read.csv(paste0(path_to_data, "central_batch_info.tsv"), sep = "\t") %>%
    column_to_rownames("rowname") %>%
    as.data.frame()
dim(metadata)
rownames(metadata) <- metadata$file

intensities <- read.csv(paste0(path_to_data, "central_intensities.tsv"), sep = "\t", row.names = 1)
# Undo the column-name changes (presumably introduced by read.csv's name
# checking): strip a leading "X" and turn "." back into "-" so the column
# names can be matched against metadata$file.
colnames(intensities) <- gsub("^X", "", colnames(intensities))
colnames(intensities) <- gsub("\\.", "-", colnames(intensities))
intensities <- intensities %>% as.data.frame()

# Fail with a readable message instead of "undefined columns selected" when
# the annotation lists a sample that the intensity table does not contain.
missing_files <- setdiff(metadata$file, colnames(intensities))
if (length(missing_files) > 0) {
    stop("Samples listed in the metadata but absent from the intensity table: ",
         paste(missing_files, collapse = ", "), call. = FALSE)
}

dim(intensities)
# Keep only annotated samples, in metadata row order; drop = FALSE keeps a
# data frame even if only one sample is annotated.
intensities_raw <- intensities[, metadata$file, drop = FALSE]
# NOTE(review): an intensity of exactly 0 becomes -Inf here, which is.na()-based
# filtering does not catch -- confirm that missing values are encoded as NA.
intensities <- log2(intensities_raw)
dim(intensities)

write.table(intensities %>% rownames_to_column("Protein"), file = paste0(path_to_data, "central_intensities_log_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE)

In [15]:
# Number of samples contributed by each lab.
metadata %>%
    group_by(lab) %>%
    tally(name = "n")

lab,n
<chr>,<int>
lab_A,24
lab_B,23
lab_C,23
lab_D,24
lab_E,24


In [16]:
# Number of samples per lab and condition.
# `.groups = "drop"` returns an ungrouped table and avoids the
# "`summarise()` has grouped output by 'lab'" message.
metadata %>%
    select(lab, condition) %>%
    group_by(lab, condition) %>%
    summarise(n = n(), .groups = "drop")

[1m[22m`summarise()` has grouped output by 'lab'. You can override using the `.groups`
argument.


lab,condition,n
<chr>,<chr>,<int>
lab_A,Glu,12
lab_A,Pyr,12
lab_B,Glu,11
lab_B,Pyr,12
lab_C,Glu,12
lab_C,Pyr,11
lab_D,Glu,12
lab_D,Pyr,12
lab_E,Glu,12
lab_E,Pyr,12


In [17]:
# intensities_filteres <- filter_per_center(intensities, metadata, 'file', unique(metadata$lab), 'lab')
# dim(intensities_filteres)
# # save before
# write.table(intensities_filteres %>% rownames_to_column("Protein"), 
#     file = paste0(path_to_data, "central_intensities_log_filtered.tsv"), sep = "\t", quote = T, row.names = F, col.names = T)


In [18]:
# Per-center filtering and export.
# For every lab: take that lab's samples, drop proteins that are NA in all of
# them, apply filter_per_center() grouped by lab and then by condition, and
# write the lab's metadata and filtered intensities to <path_to_data>/<lab>/.
# The filtered per-lab tables are then full-joined on the protein id (union of
# proteins; a protein missing from a lab's table is NA for that lab's samples)
# and the result replaces `intensities`.
labs <- unique(metadata$lab)

# One filtered table per lab, collected first and joined once after the loop.
per_center <- vector("list", length(labs))
names(per_center) <- labs

for (center in labs) {
    center_metadata <- metadata[metadata$lab == center, ]
    center_intensities <- intensities[, center_metadata$file, drop = FALSE]

    # remove rows with all values NA
    has_value <- rowSums(!is.na(center_intensities)) > 0
    intens_filtered <- center_intensities[has_value, , drop = FALSE]
    cat("Center: ", center, " removed ", nrow(center_intensities) - nrow(intens_filtered), " rows with all NA values\n")
    cat("Before: ", nrow(center_intensities), " After: ", nrow(intens_filtered), "\n")

    # Same helper applied twice: once with the lab as grouping column, once
    # with the conditions present in this lab.
    intens_filtered <- filter_per_center(intens_filtered, center_metadata, 'file', unique(center_metadata$lab), 'lab')
    intens_filtered <- filter_per_center(intens_filtered, center_metadata, 'file', unique(center_metadata$condition), 'condition')
    center_intensities_filtered <- intens_filtered[, center_metadata$file, drop = FALSE]

    path_before <- paste0(path_to_data, center, "/")
    if (!dir.exists(path_before)) {
        dir.create(path_before, recursive = TRUE)
    }

    # The protein ids are moved into a 'rowname' column; the same table is
    # written to disk and used as the join input.
    center_table <- center_intensities_filtered %>% rownames_to_column("rowname")

    write.table(center_metadata,
        file = paste0(path_before, "metadata.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
    write.table(center_table,
        file = paste0(path_before, "intensities_log_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)

    per_center[[center]] <- center_table
}

# Union or full join by rownames (index), folding the labs in loop order.
joint_intensities <- Reduce(
    function(joined, next_table) {
        merge(joined, next_table, by = "rowname", all = TRUE)
    },
    per_center
)

# Note: the reported dimensions still include the 'rowname' key column.
cat("Joint intensities: ", dim(joint_intensities), "\n")
intensities <- joint_intensities %>%
    column_to_rownames("rowname")

Center:  lab_A  removed  510  rows with all NA values
Before:  3059  After:  2549 
Filtering by center - two not-NA per center
	Before filtering: 2549 24 
	After filtering: 2548 24 
Filtering by center - two not-NA per center
	Before filtering: 2548 24 
	After filtering: 2525 24 
Center:  lab_B  removed  213  rows with all NA values
Before:  3059  After:  2846 
Filtering by center - two not-NA per center
	Before filtering: 2846 23 
	After filtering: 2846 23 
Filtering by center - two not-NA per center
	Before filtering: 2846 23 
	After filtering: 2828 23 
Center:  lab_C  removed  239  rows with all NA values
Before:  3059  After:  2820 
Filtering by center - two not-NA per center
	Before filtering: 2820 23 
	After filtering: 2819 23 
Filtering by center - two not-NA per center
	Before filtering: 2819 23 
	After filtering: 2775 23 
Center:  lab_D  removed  246  rows with all NA values
Before:  3059  After:  2813 
Filtering by center - two not-NA per center
	Before filtering: 2813 24 
	A

In [19]:
# pca_plot_study <- pca_plot(
#     intensities, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     col_col = "lab", shape_col = "condition")

# pca_plot_class <- pca_plot(
#     intensities, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     shape_col = "lab", col_col = "condition")

# layout <- (pca_plot_class | pca_plot_study)
# options(repr.plot.width=11, repr.plot.height=6)
# layout

In [20]:
# create design
# Model: ~condition with "Glu" as the reference level, so the second design
# column is the indicator for "Pyr".
metadata <- metadata %>%
    mutate(condition = factor(condition, levels = c("Glu", "Pyr")))
design <- model.matrix(~condition, data = metadata)
colnames(design) <- c("Intercept", "Pyr")

# removeBatchEffect() pairs the columns of the data with the entries of
# `batch` and the rows of `design` by position, not by name. After the per-lab
# join the intensity columns are ordered lab by lab, which need not be the
# metadata row order, so bring the columns into metadata order first.
sample_ids <- as.character(metadata$file)
stopifnot(all(sample_ids %in% colnames(intensities)))
intensities <- intensities[, sample_ids, drop = FALSE]

print(dim(intensities))
intensities_corrected <- limma::removeBatchEffect(intensities, batch = metadata$lab, design = design) %>% as.data.frame()


[1] 3034  118


“Partial NA coefficients for 792 probe(s)”


In [21]:
# pca_plot_study <- pca_plot(
#     intensities_corrected, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     col_col = "lab", shape_col = "condition")

# pca_plot_class <- pca_plot(
#     intensities_corrected, metadata, 
#     title = "PCA plot BB",
#     quantitative_col_name = 'file',
#     shape_col = "lab", col_col = "condition")

# layout <- (pca_plot_class | pca_plot_study)
# options(repr.plot.width=11, repr.plot.height=6)
# layout

In [22]:
# Export step.
#  * per lab: the "Pyr" design column of that lab's samples, written as
#    design.tsv into the lab's folder under `path_to_data`;
#  * centrally: the batch-corrected intensities and the metadata, written into
#    the "after" folder for the current MODE.
design <- design %>% as.data.frame()
path_to_after_data <- paste0("/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/after/", MODE, "/")

# write.table() does not create directories, so make sure the target exists.
if (!dir.exists(path_to_after_data)) {
    dir.create(path_to_after_data, recursive = TRUE)
}

for (center in unique(metadata$lab)) {
    path_to_before <- paste0(path_to_data, center, "/")

    # Rows of the design are named after the metadata rows; keep the single
    # "Pyr" column as a one-column data frame so the row names survive.
    center_samples <- rownames(metadata)[metadata$lab == center]
    design_center <- design[center_samples, "Pyr", drop = FALSE]

    write.table(design_center %>% rownames_to_column('file'),
        file = paste0(path_to_before, "design.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
}

write.table(intensities_corrected %>% rownames_to_column('rowname'),
    file = paste0(path_to_after_data, "central_intensities_log_corrected_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)
write.table(metadata,
    file = paste0(path_to_after_data, "central_batch_info_corrected_UNION.tsv"), sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE)