In [114]:
library(tidyverse)
library(grid)
library(gridExtra)

source("../../evaluation_utils/plots_eda.R")
source("../../evaluation_utils/filtering.R")

In [115]:
#' Build the four-panel overview figure for one intensity table.
#'
#' Draws two PCA plots (one coloured by lab, one coloured by condition), a
#' boxplot and an intensity density plot, all titled with `name`, and
#' stacks them into a single layout.
#'
#' @param intensities Intensity table handed unchanged to the plotting helpers.
#' @param metadata Sample annotation; the helpers are asked for its `file`,
#'   `lab` and `condition` columns.
#' @param name Title used for every panel.
#' @return The composed layout: both PCA panels side by side on top, the
#'   boxplot below them and the density plot at the bottom.
plots_multiple <- function(intensities, metadata, name) {
    # The two PCA panels differ only in which column drives colour vs. shape.
    draw_pca <- function(colour_by, shape_by) {
        pca_plot(
            intensities, metadata,
            title = name,
            quantitative_col_name = "file",
            col_col = colour_by, shape_col = shape_by
        )
    }

    pca_by_lab <- draw_pca(colour_by = "lab", shape_by = "condition")
    pca_by_condition <- draw_pca(colour_by = "condition", shape_by = "lab")

    sample_boxplot <- boxplot_plot(
        intensities, metadata,
        title = name,
        color_col = "lab", quantitativeColumnName = "file",
        path = ""
    )

    sample_density <- plotIntensityDensity(
        intensities, metadata,
        quantitativeColumnName = "file",
        colorColumnName = "lab",
        title = name
    )

    # NOTE(review): `|` and `/` on plots presumably come from a plot-composition
    # package loaded by the sourced helper scripts - confirm.
    (pca_by_condition | pca_by_lab) /
        sample_boxplot /
        sample_density
}


## Load data

In [116]:
# Load the central metadata and intensity tables, align the intensity columns
# to the metadata sample order, log2-transform and save the result.
path_to_data <- "before/"

metadata <- read.csv(
    paste0(path_to_data, "initial_data/central_batch_info.tsv"),
    sep = "\t"
) %>%
    column_to_rownames("rowname") %>%
    as.data.frame()
dim(metadata)
# Index the annotation by file name so samples can be looked up by row name.
rownames(metadata) <- metadata$file

intensities <- read.csv(
    paste0(path_to_data, "initial_data/central_intensities.tsv"),
    sep = "\t", row.names = 1
)
# Strip a leading "X" and turn "." back into "-" in the column names
# (presumably undoing the name mangling done by read.csv - confirm).
colnames(intensities) <- gsub("^X", "", colnames(intensities))
colnames(intensities) <- gsub("\\.", "-", colnames(intensities))
intensities <- intensities %>% as.data.frame()

dim(intensities)

# Fail with a readable message instead of "undefined columns selected" when
# the annotation lists samples that have no intensity column.
missing_files <- setdiff(metadata$file, colnames(intensities))
if (length(missing_files) > 0) {
    stop(
        "Samples listed in metadata but missing from intensities: ",
        paste(missing_files, collapse = ", "),
        call. = FALSE
    )
}

# Keep only annotated samples, in metadata order, then move to log2 scale.
intensities_raw <- intensities[, metadata$file]
intensities <- log2(intensities_raw)
dim(intensities)

write.table(
    intensities %>% rownames_to_column("Protein"),
    file = paste0(path_to_data, "central_intensities_log_UNION.tsv"),
    sep = "\t", quote = TRUE, row.names = FALSE
)

In [117]:
# Number of samples contributed by each lab.
metadata %>%
    select(lab, condition) %>%
    group_by(lab) %>%
    tally()

lab,n
<chr>,<int>
lab_A,24
lab_B,23
lab_C,23
lab_D,24
lab_E,24


In [118]:
# Number of samples per lab and condition.
# `.groups = "drop"` returns an ungrouped table and avoids the
# "summarise() has grouped output by 'lab'" message.
metadata %>%
    select(lab, condition) %>%
    group_by(lab, condition) %>%
    summarise(n = n(), .groups = "drop")

[1m[22m`summarise()` has grouped output by 'lab'. You can override using the `.groups`
argument.


lab,condition,n
<chr>,<chr>,<int>
lab_A,Glu,12
lab_A,Pyr,12
lab_B,Glu,11
lab_B,Pyr,12
lab_C,Glu,12
lab_C,Pyr,11
lab_D,Glu,12
lab_D,Pyr,12
lab_E,Glu,12
lab_E,Pyr,12


In [119]:
# Split the log-intensities by lab, write each lab's metadata and intensities
# into its own folder, and rebuild the pooled table as the union (full join on
# feature name) of the per-lab tables.
joint_intensities <- NULL

for (center in unique(metadata$lab)) {
    center_metadata <- metadata[metadata$lab == center, ]
    # drop = FALSE keeps a data frame even if a lab has a single sample
    center_intensities <- intensities[, center_metadata$file, drop = FALSE]

    # remove rows with all values NA
    all_missing <- rowSums(!is.na(center_intensities)) == 0
    intens_filtered <- center_intensities[!all_missing, , drop = FALSE]
    cat("Center: ", center, " removed ", nrow(center_intensities) - nrow(intens_filtered), " rows with all NA values\n")
    cat("Before: ", nrow(center_intensities), " After: ", nrow(intens_filtered), "\n")

    center_intensities_filtered <- intens_filtered

    path_before <- paste0(path_to_data, center, "/")
    path_intermediate <- paste0(path_before, "intermediate/")

    # metadata.tsv lives in the "intermediate" sub-folder, so the whole path
    # has to exist - creating only the lab folder makes write.table() fail.
    if (!dir.exists(path_intermediate)) {
        dir.create(path_intermediate, recursive = TRUE)
    }
    write.table(
        center_metadata,
        file = paste0(path_intermediate, "metadata.tsv"),
        sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE
    )

    # Feature names travel as an explicit "rowname" column, both on disk and
    # as the join key below.
    center_intensities_filtered <- center_intensities_filtered %>%
        rownames_to_column("rowname")
    write.table(
        center_intensities_filtered,
        file = paste0(path_before, "intensities_log_UNION.tsv"),
        sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE
    )

    if (is.null(joint_intensities)) {
        joint_intensities <- center_intensities_filtered
    } else {
        # Union or full join by rownames (index)
        joint_intensities <- merge(
            joint_intensities, center_intensities_filtered,
            by = "rowname", all = TRUE
        )
    }
}
# The reported column count still includes the "rowname" key column.
cat("Joint intensities: ", dim(joint_intensities), "\n")
intensities <- joint_intensities %>%
    column_to_rownames("rowname")

Center:  lab_A  removed  510  rows with all NA values
Before:  3059  After:  2549 
Center:  lab_B  removed  213  rows with all NA values
Before:  3059  After:  2846 
Center:  lab_C  removed  239  rows with all NA values
Before:  3059  After:  2820 
Center:  lab_D  removed  246  rows with all NA values
Before:  3059  After:  2813 
Center:  lab_E  removed  658  rows with all NA values
Before:  3059  After:  2401 
Joint intensities:  3059 119 


In [120]:
# intensities_filteres <- filter_per_center(intensities, metadata, 'file', unique(metadata$lab), 'lab')
# dim(intensities_filteres)
# # save before
# write.table(intensities_filteres %>% rownames_to_column("Protein"), 
#     file = paste0(path_to_data, "central_intensities_log_filtered.tsv"), sep = "\t", quote = T, row.names = F, col.names = T)


In [121]:
# Count the number of labs where each feature is not NA
min_labs <- 3

feature_counts_per_lab <- intensities %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("file") %>%
    pivot_longer(-file, names_to = "feature", values_to = "value") %>%
    filter(!is.na(value)) %>%
    inner_join(metadata %>% select(file, lab), by = "file") %>%
    filter(!is.na(lab)) %>%
    # one row per (feature, lab) pair, whatever other columns metadata has
    distinct(feature, lab) %>%
    count(feature, name = "n")

# Remove features seen in fewer than `min_labs` labs. Working from the keep
# list also catches features with no measured value at all, which never show
# up in the count table.
features_to_keep <- feature_counts_per_lab %>%
    filter(n >= min_labs) %>%
    pull(feature)
features_to_remove <- setdiff(rownames(intensities), features_to_keep)

# Drop rows (features), not columns. A logical mask is used because
# `x[-which(cond), ]` would delete every row when nothing matches.
intensities <- intensities[!(rownames(intensities) %in% features_to_remove), , drop = FALSE]
print(paste("Number of features removed:", length(features_to_remove)))

[1] "Number of features removed: 350"


In [122]:
# For every lab: find the features that do not survive the per-lab filter and
# blank them (set to NA) in that lab's columns of the pooled table.
for (center in unique(metadata$lab)) {
    center_metadata <- metadata[metadata$lab == center, ]
    center_intensities <- intensities[, center_metadata$file]

    # Features with at least one measured value in this lab.
    has_value <- rowSums(!is.na(center_intensities)) > 0
    intens_filtered <- center_intensities[has_value, ]

    # Project helper; its log line reads "two not-NA per lab" - see
    # filtering.R for the exact rule.
    intens_filtered <- filter_per_center(
        intens_filtered, center_metadata, "file",
        unique(center_metadata$lab), "lab"
    )

    # Anything missing from the filtered table becomes all-NA for this lab;
    # this includes features that were already all-NA before filtering.
    features_removed <- setdiff(
        rownames(center_intensities),
        rownames(intens_filtered)
    )
    center_intensities[features_removed, ] <- NA

    # Write the lab's block back into the pooled table by row and column name.
    intensities[rownames(center_intensities), center_metadata$file] <-
        center_intensities[, center_metadata$file]

    cat("Center: ", center, " replaced ", nrow(center_intensities) - nrow(intens_filtered), " features with NAs\n")
}
    

Filtering by  lab  - two not-NA per lab 
	Before filtering: 2499 24 
	After filtering: 2498 24 
Center:  lab_A  replaced  211  features with NAs
Filtering by  lab  - two not-NA per lab 
	Before filtering: 2670 23 
	After filtering: 2670 23 
Center:  lab_B  replaced  39  features with NAs
Filtering by  lab  - two not-NA per lab 
	Before filtering: 2684 23 
	After filtering: 2684 23 
Center:  lab_C  replaced  25  features with NAs
Filtering by  lab  - two not-NA per lab 
	Before filtering: 2690 24 
	After filtering: 2690 24 
Center:  lab_D  replaced  19  features with NAs
Filtering by  lab  - two not-NA per lab 
	Before filtering: 2378 24 
	After filtering: 2378 24 
Center:  lab_E  replaced  331  features with NAs


In [123]:
# Run the per-center filter over all labs at once with min_samples = 6
# (6 is presumably the number of design columns - confirm against
# filter_per_center), and keep only the surviving features.
labs <- unique(metadata$lab)
intensities_design_filtered <- filter_per_center(
    intensities, metadata, "file", labs, "lab",
    min_samples = 6
)

n_removed <- nrow(intensities) - nrow(intensities_design_filtered)
print(paste("Number of features removed:", n_removed))

intensities <- intensities_design_filtered
dim(intensities)


Filtering by  lab  - two not-NA per lab 
	Before filtering: 2709 118 
	After filtering: 2272 118 
[1] "Number of features removed: 437"


In [124]:
# remove rows with all values NA
# (the original referenced a misspelled, undefined `intersities` object, so
# this only worked when a stale variable was left over in the session)
all_missing <- rowSums(!is.na(intensities)) == 0
intensities_design_filtered <- intensities[!all_missing, , drop = FALSE]
print(paste("Number of rows removed:", nrow(intensities) - nrow(intensities_design_filtered)))
# Apply the result so the following steps really work on the cleaned table.
intensities <- intensities_design_filtered

[1] "Number of rows removed: 0"


In [125]:
# Overview figure for the data before batch correction, saved as PNG.
layout <- plots_multiple(
    intensities, metadata,
    "Bacterial dataset, uncorrected"
)
ggsave(
    filename = "plots/data_plot.png",
    plot = layout,
    width = 12, height = 15
)

No id variables; using all as measure variables

“[1m[22mRemoved 1719 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 1719 rows containing non-finite outside the scale range
(`stat_summary()`).”
“[1m[22mRemoved 1719 rows containing non-finite outside the scale range
(`stat_density()`).”


# Correction

In [126]:
# Build the design matrix and remove the lab effect with limma.
# "Glu" is listed first so it is the reference level of the condition factor.
metadata$condition <- factor(metadata$condition, levels = c("Glu", "Pyr"))

design <- model.matrix(~condition, data = metadata)
colnames(design) <- c("Intercept", "Pyr")

# Put the samples in the same order as the metadata rows (and design rows).
intensities <- intensities[, metadata$file]
print(dim(intensities))

# `design` is passed so that the condition effect is kept while the lab
# (batch) effect is removed.
intensities_corrected <- as.data.frame(
    limma::removeBatchEffect(
        intensities,
        batch = metadata$lab,
        design = design
    )
)


[1] 2272  118


In [127]:
# Overview figure for the limma-corrected data, saved as PNG.
layout <- plots_multiple(
    intensities_corrected, metadata,
    "Bacterial dataset, R corrected"
)
ggsave(
    filename = "plots/data_plot_Rcorrected.png",
    plot = layout,
    width = 12, height = 15
)


No id variables; using all as measure variables

“[1m[22mRemoved 1719 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 1719 rows containing non-finite outside the scale range
(`stat_summary()`).”
“[1m[22mRemoved 1719 rows containing non-finite outside the scale range
(`stat_density()`).”


In [129]:
# Export each lab's design column (the "Pyr" indicator) next to its input
# data, and the centrally corrected intensities into the "after" folder.
design <- design %>% as.data.frame()
path_to_after_data <- "after/"

# Make sure the output folder exists before writing into it.
if (!dir.exists(path_to_after_data)) {
    dir.create(path_to_after_data, recursive = TRUE)
}

for (center in unique(metadata$lab)) {
    path_to_before <- paste0(path_to_data, center, "/")
    if (!dir.exists(path_to_before)) {
        dir.create(path_to_before, recursive = TRUE)
    }

    # Rows of `design` are looked up by the metadata row names of this lab;
    # drop = FALSE keeps a one-column data frame with those row names.
    center_rows <- rownames(metadata[metadata$lab == center, ])
    design_center <- design[center_rows, "Pyr", drop = FALSE]

    write.table(
        design_center %>% rownames_to_column("file"),
        file = paste0(path_to_before, "design.tsv"),
        sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE
    )
}

write.table(
    intensities_corrected %>% rownames_to_column("rowname"),
    file = paste0(path_to_after_data, "intensities_log_Rcorrected_UNION.tsv"),
    sep = "\t", quote = TRUE, row.names = FALSE, col.names = TRUE
)

In [130]:
# Show the dimensions (rows, columns) of the batch-corrected intensity table.
dim(intensities_corrected)