In [1]:
library(tidyverse)
source("../../utils/plots_eda.R")

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.4     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘cowplot’


The following object is masked from

# Load data

In [2]:
datasets <- c("GSE6008", "GSE14407", "GSE26712",  "GSE40595", "GSE36668", "GSE69428")

In [3]:
all_expression <- NULL
all_metadata <- NULL

for(dataset in datasets){
    metadata <- read.table(paste0("before/", dataset, "/1_metadata_short.tsv"), header = TRUE, sep = "\t")
    expr_data <- read.table(paste0("before/", dataset, "/1_expr_GBacc_MaxVar.tsv"), header = TRUE, sep = "\t", row.names = 1)

    print(paste0("Processing dataset: ", dataset, "; platform: ", metadata$platform_id[1]))
    print(paste0("Samples: ", nrow(metadata), "; Features: ", nrow(expr_data)))

    metadata <- metadata %>% mutate(Dataset = dataset) %>%
        select(Sample_geo_ID, Status, HistSubtypes, Stage2, Grade2, ClinicopathologicSubtypes, Dataset) %>%
        # outliers
        filter(!(Sample_geo_ID %in% c("GSM1701038", "GSM898305", "GSM997639"))) %>%
        rename("sample_id" = "Sample_geo_ID")

    expr_data <- expr_data[, metadata$sample_id]
    print(paste0("Samples: ", nrow(metadata), "; Features: ", nrow(expr_data)))

    #########################################################################################################################
    #  plot
    plot_res <- plot_diagnostic(expr_data, metadata, dataset,
                                log_transform = TRUE, with_rowname = TRUE)
    layout <- (plot_res[[1]] + plot_res[[2]] ) / 
              (plot_res[[3]] )
    ggsave(paste0("before/", dataset, "/diagnostic_plot.png"), 
                plot = layout, width = 12, height = 12)


    expr_data <- rownames_to_column(as.data.frame(expr_data), "row_ID")
    # save data
    if(is.null(all_metadata)){
        all_metadata <- metadata
        all_expression <- expr_data
    } else {        
        all_metadata <- rbind(all_metadata, metadata)
        all_expression <- full_join(all_expression, expr_data, by = "row_ID")
    }
    print(paste0("Combined Samples: ", nrow(all_metadata), "; Features: ", nrow(all_expression)))
    print(" ")
}

rownames(all_expression) <- all_expression$row_ID
all_expression <- all_expression[, -1]

# # plot the combined data
print("Plotting combined data")
plot_res <- plot_diagnostic(all_expression, all_metadata, "Combined",
                                log_transform = TRUE, with_rowname = TRUE)
layout <- (plot_res[[1]] + plot_res[[2]] ) / 
          (plot_res[[3]] )
ggsave("before/diagnostic_plot.png", 
            plot = layout, width = 12, height = 12)

[1] "Processing dataset: GSE6008; platform: GPL96"
[1] "Samples: 27; Features: 13237"
[1] "Samples: 27; Features: 13237"
[1] "..plotting.."


“[1m[22m`aes_string()` was deprecated in ggplot2 3.0.0.
[36mℹ[39m Please use tidy evaluation idioms with `aes()`.
[36mℹ[39m See also `vignette("ggplot2-in-packages")` for more information.”
No id variables; using all as measure variables



[1] "Combined Samples: 27; Features: 13237"
[1] " "
[1] "Processing dataset: GSE14407; platform: GPL570"
[1] "Samples: 24; Features: 22880"
[1] "Samples: 24; Features: 22880"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Combined Samples: 51; Features: 22880"
[1] " "
[1] "Processing dataset: GSE26712; platform: GPL96"
[1] "Samples: 195; Features: 13237"
[1] "Samples: 195; Features: 13237"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Combined Samples: 246; Features: 22880"
[1] " "
[1] "Processing dataset: GSE40595; platform: GPL570"
[1] "Samples: 38; Features: 22880"
[1] "Samples: 37; Features: 22880"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Combined Samples: 283; Features: 22880"
[1] " "
[1] "Processing dataset: GSE36668; platform: GPL570"
[1] "Samples: 8; Features: 22880"
[1] "Samples: 7; Features: 22880"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Combined Samples: 290; Features: 22880"
[1] " "
[1] "Processing dataset: GSE69428; platform: GPL570"
[1] "Samples: 20; Features: 22880"
[1] "Samples: 19; Features: 22880"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Combined Samples: 309; Features: 22880"
[1] " "
[1] "Plotting combined data"
[1] "..plotting.."


No id variables; using all as measure variables

“[1m[22mRemoved 931512 rows containing non-finite outside the scale range
(`stat_density()`).”
“[1m[22mRemoved 931512 rows containing non-finite outside the scale range
(`stat_boxplot()`).”
“[1m[22mRemoved 931512 rows containing non-finite outside the scale range
(`stat_summary()`).”


# Save data for correction and after correction

In [4]:
all_metadata$batch <- as.numeric(as.factor(all_metadata$Dataset)) - 1
# all_metadata$batch <- 0
all_metadata$Status <- as.numeric(as.factor(all_metadata$Status))
all_metadata$Status = all_metadata$Status - 1


for (dataset in unique(all_metadata$Dataset)) {
    print(paste0("Save data prior to batch correction for ", dataset))
    dataset_metadata <- all_metadata[all_metadata$Dataset == dataset,]
    dataset_metadata <- dataset_metadata %>% select(sample_id, Status, batch)
    
    dataset_expression <- all_expression[, dataset_metadata$sample_id]
    dataset_expression <- na.omit(dataset_expression)
    dataset_expression <- dataset_expression[apply(dataset_expression, 1, var) > 0, ]

    dataset_expression <- dataset_expression %>% rownames_to_column("gene_id")
    print(paste0("Samples: ", nrow(dataset_metadata), "; Features: ", nrow(dataset_expression)))
    
    write.table(dataset_metadata, 
        file = paste0("before/", dataset, "/design.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
    write.table(dataset_expression, 
        file = paste0("before/", dataset, "/expr_for_correction.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

}

all_expression <- na.omit(all_expression)
all_expression <- all_expression[apply(all_expression, 1, var) > 0, ]
all_expression <- all_expression[, all_metadata$sample_id]

write.table(all_metadata %>% select(sample_id, Status, batch), 
    file = "before/all_design.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(all_expression %>% rownames_to_column("gene_id"),
    file = "before/all_expr_for_correction.tsv", sep = "\t", quote = FALSE, row.names = FALSE)

[1] "Save data prior to batch correction for GSE6008"
[1] "Samples: 27; Features: 13237"
[1] "Save data prior to batch correction for GSE14407"
[1] "Samples: 24; Features: 22880"
[1] "Save data prior to batch correction for GSE26712"
[1] "Samples: 195; Features: 13237"
[1] "Save data prior to batch correction for GSE40595"
[1] "Samples: 37; Features: 22880"
[1] "Save data prior to batch correction for GSE36668"
[1] "Samples: 7; Features: 22880"
[1] "Save data prior to batch correction for GSE69428"
[1] "Samples: 19; Features: 22880"


In [12]:
for (dataset in unique(all_metadata$Dataset)) {

    dataset_metadata <- all_metadata[all_metadata$Dataset == dataset,]
    print(paste0("Save data prior to batch correction for ", dataset))
    dataset_expression <- all_expression[, dataset_metadata$sample_id]
    dataset_expression <- dataset_expression %>% t() %>% as.data.frame() %>% 
    rownames_to_column("sample_id")
    print(paste0("Samples: ", nrow(dataset_expression), "; Features: ", ncol(dataset_expression)))
    
    write.table(dataset_expression, 
        file = paste0("before/", dataset, "/kmeans/expr_for_kmeans_before.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

}

[1] "Save data prior to batch correction for GSE6008"
[1] "Samples: 27; Features: 13238"
[1] "Save data prior to batch correction for GSE14407"
[1] "Samples: 24; Features: 13238"
[1] "Save data prior to batch correction for GSE26712"
[1] "Samples: 195; Features: 13238"
[1] "Save data prior to batch correction for GSE40595"
[1] "Samples: 37; Features: 13238"
[1] "Save data prior to batch correction for GSE36668"
[1] "Samples: 7; Features: 13238"
[1] "Save data prior to batch correction for GSE69428"
[1] "Samples: 19; Features: 13238"
