In [49]:
library(tidyverse)
source("../../utils/plots_eda.R")

# Load data

In [50]:
# Identifiers of the per-lab datasets; each one names a sub-directory of "before/".
datasets <- paste0("lab_", c("A", "B", "C", "D", "E"))

In [51]:
# Accumulators for the combined metadata table and the combined expression
# table (features x samples, keyed by a "row_ID" column while labs are joined).
all_expression <- NULL
all_metadata <- NULL

for (dataset in datasets) {
    dataset_dir <- file.path("before", dataset)

    metadata <- read.table(file.path(dataset_dir, "metadata.tsv"),
                           header = TRUE, sep = "\t")
    expr_data <- read.table(file.path(dataset_dir, "protein_groups_matrix.tsv"),
                            header = TRUE, sep = "\t", row.names = 1)

    # read.table() passes the header through make.names(): characters that are
    # not valid in an R name become "." and names starting with a digit get an
    # "X" prefix. Undo both so the columns can be matched against the metadata:
    # every "." becomes "-" and a leading "X" is removed.
    # NOTE(review): this assumes the original sample names use "-" (never ".")
    # and never start with a literal "X" - confirm against the input files.
    colnames(expr_data) <- gsub("\\.", "-", colnames(expr_data))
    colnames(expr_data) <- gsub("^X", "", colnames(expr_data))

    print(paste0("Processing dataset: ", dataset, "; platform: ", metadata$platform_id[1]))
    print(paste0("Samples: ", nrow(metadata), "; Features: ", nrow(expr_data)))

    # Tag every sample with its lab and switch to the column names used
    # downstream (sample_id, Status).
    metadata <- metadata %>%
        mutate(Dataset = dataset) %>%
        rename("sample_id" = "file", "Status" = "condition")

    # Index by name, never by position: as.character() guards against sample
    # IDs that were parsed as numbers.
    sample_ids <- as.character(metadata$sample_id)
    missing_samples <- setdiff(sample_ids, colnames(expr_data))
    if (length(missing_samples) > 0) {
        stop("Dataset ", dataset, ": samples listed in metadata.tsv are missing ",
             "from the expression matrix: ",
             paste(missing_samples, collapse = ", "), call. = FALSE)
    }

    # Keep only the samples described in the metadata, in metadata order.
    # drop = FALSE keeps a data frame even when there is a single sample.
    expr_data <- expr_data[, sample_ids, drop = FALSE]
    # log2 transform (pseudo-count of 1 keeps zero intensities finite)
    expr_data <- log2(expr_data + 1)
    print(paste0("Samples: ", nrow(metadata), "; Features: ", nrow(expr_data)))

    #########################################################################################################################
    #  Per-dataset diagnostic plot: panels 1 and 2 side by side, panel 3 below.
    # NOTE(review): expr_data is already log2-transformed at this point while
    # log_transform = TRUE is passed - confirm plot_diagnostic() does not
    # transform the values a second time.
    plot_res <- plot_diagnostic(expr_data, metadata, dataset,
                                log_transform = TRUE, with_rowname = TRUE)
    layout <- (plot_res[[1]] + plot_res[[2]]) /
              (plot_res[[3]])
    ggsave(file.path(dataset_dir, "diagnostic_plot.png"),
           plot = layout, width = 12, height = 12)

    # Expose the feature IDs as a regular column so they can serve as join key.
    expr_data <- rownames_to_column(as.data.frame(expr_data), "row_ID")

    # Append this dataset to the combined tables. full_join() keeps the union
    # of features; features absent from a lab are NA for that lab's samples.
    if (is.null(all_metadata)) {
        all_metadata <- metadata
        all_expression <- expr_data
    } else {
        # A sample ID present in two labs would make full_join() silently
        # create ".x"/".y" columns that no longer match the metadata.
        shared_samples <- setdiff(
            intersect(colnames(all_expression), colnames(expr_data)),
            "row_ID"
        )
        if (length(shared_samples) > 0) {
            stop("Dataset ", dataset, ": sample IDs already present in a ",
                 "previously loaded dataset: ",
                 paste(shared_samples, collapse = ", "), call. = FALSE)
        }
        all_metadata <- rbind(all_metadata, metadata)
        all_expression <- full_join(all_expression, expr_data, by = "row_ID")
    }
    print(paste0("Combined Samples: ", nrow(all_metadata), "; Features: ", nrow(all_expression)))
    print(" ")
}

# Move the join key back into the row names and remove it as a column, so that
# only the sample columns remain.
rownames(all_expression) <- all_expression[["row_ID"]]
all_expression <- all_expression[, -match("row_ID", colnames(all_expression))]

# Diagnostic plot for all labs together: panels 1 and 2 on top, panel 3 below.
print("Plotting combined data")
plot_res <- plot_diagnostic(
    all_expression, all_metadata, "Combined",
    log_transform = TRUE, with_rowname = TRUE
)
top_row <- plot_res[[1]] + plot_res[[2]]
layout <- top_row / plot_res[[3]]
ggsave("before/diagnostic_plot.png", plot = layout, width = 12, height = 12)

[1] "Processing dataset: lab_A; platform: "
[1] "Samples: 20; Features: 2549"
[1] "Samples: 20; Features: 2549"
[1] "..plotting.."


No id variables; using all as measure variables

“[1m[22mRemoved 1273 rows containing non-finite outside the scale range
(`stat_density()`).”
“[1m[22mRemoved 1273 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 1273 rows containing non-finite outside the scale range
(`stat_summary()`).”


[1] "Combined Samples: 20; Features: 2549"
[1] " "
[1] "Processing dataset: lab_B; platform: "
[1] "Samples: 19; Features: 2846"
[1] "Samples: 19; Features: 2846"
[1] "..plotting.."


No id variables; using all as measure variables

“[1m[22mRemoved 1029 rows containing non-finite outside the scale range
(`stat_density()`).”
“[1m[22mRemoved 1029 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 1029 rows containing non-finite outside the scale range
(`stat_summary()`).”


[1] "Combined Samples: 39; Features: 2924"
[1] " "
[1] "Processing dataset: lab_C; platform: "
[1] "Samples: 19; Features: 2820"
[1] "Samples: 19; Features: 2820"
[1] "..plotting.."


No id variables; using all as measure variables

“[1m[22mRemoved 1722 rows containing non-finite outside the scale range
(`stat_density()`).”
“[1m[22mRemoved 1722 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 1722 rows containing non-finite outside the scale range
(`stat_summary()`).”


[1] "Combined Samples: 58; Features: 2997"
[1] " "
[1] "Processing dataset: lab_D; platform: "
[1] "Samples: 20; Features: 2813"
[1] "Samples: 20; Features: 2813"
[1] "..plotting.."


No id variables; using all as measure variables

“[1m[22mRemoved 857 rows containing non-finite outside the scale range
(`stat_density()`).”
“[1m[22mRemoved 857 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 857 rows containing non-finite outside the scale range
(`stat_summary()`).”


[1] "Combined Samples: 78; Features: 3048"
[1] " "
[1] "Processing dataset: lab_E; platform: "
[1] "Samples: 20; Features: 2401"
[1] "Samples: 20; Features: 2401"
[1] "..plotting.."


No id variables; using all as measure variables

“[1m[22mRemoved 960 rows containing non-finite outside the scale range
(`stat_density()`).”
“[1m[22mRemoved 960 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 960 rows containing non-finite outside the scale range
(`stat_summary()`).”


[1] "Combined Samples: 98; Features: 3059"
[1] " "
[1] "Plotting combined data"
[1] "..plotting.."


No id variables; using all as measure variables

“[1m[22mRemoved 42709 rows containing non-finite outside the scale range
(`stat_density()`).”
“[1m[22mRemoved 42709 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 42709 rows containing non-finite outside the scale range
(`stat_summary()`).”


# Save data for correction and after correction

In [52]:
# Turn sample identifiers into the form used in the exported files:
#   1. identifiers that start with a digit get the prefix "S";
#   2. every "-" is replaced with "_".
# The same transformation is applied to the expression column names and to the
# metadata sample IDs so that the two keep matching.
make_safe_id <- function(ids) {
  prefixed <- ifelse(grepl("^[0-9]", ids), paste0("S", ids), ids)
  gsub("-", "_", prefixed)
}

all_expression <- rename_with(all_expression, make_safe_id)
all_metadata$sample_id <- make_safe_id(all_metadata$sample_id)

In [53]:
# Encode batch (lab) and condition as 0-based integer codes. as.factor() sorts
# the unique values to build its levels, so code 0 is the value that sorts first.
all_metadata$batch <- as.numeric(as.factor(all_metadata$Dataset)) - 1
# all_metadata$batch <- 0
all_metadata$Status <- as.numeric(as.factor(all_metadata$Status)) - 1

# Keep only features that have a value in every sample (no NA) and whose
# variance across those samples is strictly positive.
keep_informative_features <- function(expr) {
    expr <- na.omit(expr)
    feature_var <- apply(as.matrix(expr), 1, var)
    # which() skips NA variances (e.g. a single-sample table) instead of
    # turning them into all-NA rows; drop = FALSE keeps a data frame.
    expr[which(feature_var > 0), , drop = FALSE]
}

# Per-lab export: design table plus the filtered expression table. Filtering is
# done per lab, so the retained feature set differs between labs.
for (dataset in unique(all_metadata$Dataset)) {
    print(paste0("Save data prior to batch correction for ", dataset))
    dataset_dir <- file.path("before", dataset)

    dataset_metadata <- all_metadata[all_metadata$Dataset == dataset, ]
    dataset_metadata <- dataset_metadata %>% select(sample_id, Status, batch)

    dataset_expression <- all_expression[, dataset_metadata$sample_id, drop = FALSE]
    dataset_expression <- keep_informative_features(dataset_expression)

    # Feature IDs go into an explicit "gene_id" column because the tables are
    # written without row names.
    dataset_expression <- dataset_expression %>% rownames_to_column("gene_id")
    print(paste0("Samples: ", nrow(dataset_metadata), "; Features: ", nrow(dataset_expression)))

    write.table(dataset_metadata,
        file = file.path(dataset_dir, "design.tsv"),
        sep = "\t", quote = FALSE, row.names = FALSE)
    write.table(dataset_expression,
        file = file.path(dataset_dir, "expr_for_correction.tsv"),
        sep = "\t", quote = FALSE, row.names = FALSE)
}

# Combined export: only features that are complete and variable across ALL
# samples of all labs, with columns in metadata order.
all_expression <- keep_informative_features(all_expression)
all_expression <- all_expression[, all_metadata$sample_id, drop = FALSE]

write.table(all_metadata %>% select(sample_id, Status, batch),
    file = "before/all_design.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(all_expression %>% rownames_to_column("gene_id"),
    file = "before/all_expr_for_correction.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

[1] "Save data prior to batch correction for lab_A"
[1] "Samples: 20; Features: 2283"
[1] "Save data prior to batch correction for lab_B"
[1] "Samples: 19; Features: 2592"
[1] "Save data prior to batch correction for lab_C"
[1] "Samples: 19; Features: 2523"
[1] "Save data prior to batch correction for lab_D"
[1] "Samples: 20; Features: 2643"
[1] "Save data prior to batch correction for lab_E"
[1] "Samples: 20; Features: 2229"
