In [10]:
library(tidyverse)

source("../../evaluation_utils/plots_eda.R")

In [11]:
plots_normtumor <- function(expr, metadata, i, dataset){
    pca_plot <- pca_plot(expr, metadata, 
        paste0(dataset , i, ", norm_vs_tumor"),
        col_col = 'Status', shape_col = "Dataset", quantitative_col_name = 'Sample_geo_ID')

    boxplot <- boxplot_plot(expr, metadata, 
        title = paste0(dataset , i, ", norm_vs_tumor"),
        color_col = 'Status', quantitativeColumnName = 'Sample_geo_ID', 
        path = '')

    density_plot <- plotIntensityDensity(expr, metadata, 
        quantitativeColumnName = 'Sample_geo_ID', 
        colorColumnName = 'Status',
        title = paste0(dataset , i, ", norm_vs_tumor"))

    return(list(pca_plot, density_plot, boxplot))
}


# Load data

In [12]:
norm_vs_HGSC = c("GSE6008", "GSE26712", "GSE40595", "GSE69428", "GSE38666", "GSE14407")

In [13]:
all_expression <- NULL
all_metadata <- NULL

i <- 1
for(dataset in norm_vs_HGSC){
    metadata <- read.table(paste0("before/", dataset, "/intermediate/", i, "_metadata_short.tsv"), header = TRUE, sep = "\t")
    expr_data <- read.table(paste0("before/", dataset, "/intermediate/expr_GBacc_MaxVar.tsv"), header = TRUE, sep = "\t") 
    print(paste0("Processing dataset: ", dataset, "platform: ", metadata$platform_id[1]))
    metadata <- metadata %>% mutate(Dataset = dataset) %>%
        select(Sample_geo_ID, Status, HistSubtypes, Stage2, Grade2, ClinicopathologicSubtypes, Dataset) %>%

        # outliers
        filter(!(Sample_geo_ID %in% c("GSM1701038", "GSM898305", "GSM997639")))

    # for plots
    plt_meta <- metadata #%>%
        # filter((Status == "ovarian tumour" & HistSubtypes == "high-grade serous carcinoma" | 
                # Status == "normal"))

    plt_expression <- expr_data %>% column_to_rownames("row_ID")
    plt_meta <- plt_meta[plt_meta$Sample_geo_ID %in% colnames(expr_data),]
    plt_expression <- expr_data[,plt_meta$Sample_geo_ID]

    # plots
    print("..plotting..")
    #  plot
    plot_res <- plots_normtumor(plt_expression[sample(1:nrow(plt_expression), 10000),], plt_meta, dataset, ', norm_vs_tumor')
    layout <- (plot_res[[1]] + plot_res[[2]] ) / 
            (plot_res[[3]] )
    ggsave(paste0("before/", dataset, "/intermediate/norm_vs_tumor_plot.png"), 
                plot = layout, width = 12, height = 12)

    if(is.null(all_metadata)){
        all_metadata <- metadata
        all_expression <- expr_data
    } else {
        
        all_metadata <- rbind(all_metadata, metadata)

        # shared features analysis:
        # all_expression <- inner_join(all_expression, expr_data, by = "row_ID")

        # all features analysis (union):
        all_expression <- full_join(all_expression, expr_data, by = "row_ID")
    }
}

[1] "Processing dataset: GSE6008platform: GPL96"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Processing dataset: GSE26712platform: GPL96"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Processing dataset: GSE40595platform: GPL570"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Processing dataset: GSE69428platform: GPL570"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Processing dataset: GSE38666platform: GPL570"
[1] "..plotting.."


No id variables; using all as measure variables



[1] "Processing dataset: GSE14407platform: GPL570"
[1] "..plotting.."


No id variables; using all as measure variables



In [14]:
all_metadata %>%
    group_by(Dataset, Status) %>% summarise(n())

[1m[22m`summarise()` has grouped output by 'Dataset'. You can override using the
`.groups` argument.


Dataset,Status,n()
<chr>,<chr>,<int>
GSE14407,normal,12
GSE14407,ovarian tumour,12
GSE26712,normal,10
GSE26712,ovarian tumour,185
GSE38666,normal,12
GSE38666,ovarian tumour,18
GSE40595,normal,6
GSE40595,ovarian tumour,31
GSE6008,normal,4
GSE6008,ovarian tumour,99


In [15]:
all_metadata %>%
    group_by(Dataset, Status, HistSubtypes) %>% summarise(n())

[1m[22m`summarise()` has grouped output by 'Dataset', 'Status'. You can override using
the `.groups` argument.


Dataset,Status,HistSubtypes,n()
<chr>,<chr>,<chr>,<int>
GSE14407,normal,,12
GSE14407,ovarian tumour,high-grade serous carcinoma,12
GSE26712,normal,,10
GSE26712,ovarian tumour,high-grade serous carcinoma,185
GSE38666,normal,,12
GSE38666,ovarian tumour,high-grade serous carcinoma,18
GSE40595,normal,,6
GSE40595,ovarian tumour,high-grade serous carcinoma,31
GSE6008,normal,,4
GSE6008,ovarian tumour,clear cell carcinoma,8


In [16]:
all_metadata <- all_metadata %>%
    mutate(Status = as.factor(Status),
             Stage2 = as.factor(Stage2),
             Grade2 = as.factor(Grade2),
             ClinicopathologicSubtypes = as.factor(ClinicopathologicSubtypes),
             Dataset = as.factor(Dataset))

all_expression <- all_expression %>% column_to_rownames("row_ID")
all_metadata <- all_metadata[all_metadata$Sample_geo_ID %in% colnames(all_expression),]
all_expression <- all_expression[,all_metadata$Sample_geo_ID]

print(paste0("Number of samples in type_1_early_vs_late: ", ncol(all_expression)))
print(paste0("Number of genes in type_1_early_vs_late: ", nrow(all_expression)))

[1] "Number of samples in type_1_early_vs_late: 408"
[1] "Number of genes in type_1_early_vs_late: 51276"


In [17]:
#  plot
plot_res <- plots_normtumor(all_expression[sample(1:nrow(all_expression), 10000),], 
                            all_metadata, "", 'norm_vs_tumor')
layout <- (plot_res[[1]] + plot_res[[2]] ) / 
         (plot_res[[3]] )
# ggsave("before/plots/norm_vs_HGSC_plot.png", 
#              plot = layout, width = 14, height = 12)
ggsave("before/plots/norm_vs_tumor_plot_UNION.png", 
             plot = layout, width = 14, height = 12)



No id variables; using all as measure variables

“[1m[22mRemoved 1767140 rows containing non-finite values (`stat_density()`).”
“[1m[22mRemoved 1767140 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 1767140 rows containing non-finite values (`stat_summary()`).”


In [18]:
# remove batch effect from GSE68928
design <- model.matrix(~all_metadata$Status)
colnames(design) <- c("Intercept", "HGSC")
corrected_expr <- limma::removeBatchEffect(all_expression, batch = all_metadata$Dataset, design = design) %>% as.data.frame()

# save corrected data to after
write.table(corrected_expr %>% rownames_to_column("Gene"), "after/central_corrected_UNION.tsv", sep = "\t", quote = FALSE, row.names = FALSE)


# # plot plots for norm_vs_tumor
plot_res <- plots_normtumor(
    # take random 10000 rows 
    corrected_expr[sample(1:nrow(corrected_expr), 10000),], all_metadata, " ", "norm_vs_tumor")
layout <- (plot_res[[1]] + plot_res[[2]] ) / 
    (plot_res[[3]] )
ggsave("before/plots/norm_vs_tumor_plot_Rcorrected_UNION.png", 
             plot = layout, width = 14, height = 12)


“Partial NA coefficients for 30148 probe(s)”
No id variables; using all as measure variables

“[1m[22mRemoved 1744194 rows containing non-finite values (`stat_density()`).”
“[1m[22mRemoved 1744194 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 1744194 rows containing non-finite values (`stat_summary()`).”


In [19]:
# save expression and metadata in folder
# all in one file
write.table(all_metadata, "before/all_metadata.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
# write.table(all_expression %>% rownames_to_column("Gene"), "before/all_expression.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(all_expression %>% rownames_to_column("Gene"), "before/all_expression_UNION.tsv", sep = "\t", quote = FALSE, row.names = FALSE)


In [21]:
path_to_after_data <- "after"

design <- design %>% as.data.frame()
rownames(design) <- all_metadata$Sample_geo_ID

for(center in unique(all_metadata$Dataset)){
    print(center)
    path_to_before <- paste0("before/", center, "/")

    design_center = design[all_metadata[all_metadata$Dataset == center,]$Sample_geo_ID,]['HGSC']

    # if(!dir.exists(path_to_after)){
    #     dir.create(path_to_after)
    # }

    write.table(design_center %>% rownames_to_column('file'),
        file = paste0(path_to_before, "design.tsv"), sep = "\t", quote = F, row.names = F, col.names = T)

    # take also the expression data
    expr_center <- all_expression[,all_metadata$Sample_geo_ID[all_metadata$Dataset == center]]
    print(dim(expr_center))
    write.table(expr_center %>% rownames_to_column("Gene"), file = paste0(path_to_before, "expr_for_correction_UNION.tsv"), sep = "\t", quote = F, row.names = F, col.names = T)
}

[1] "GSE6008"
[1] 51276   103
[1] "GSE26712"
[1] 51276   195
[1] "GSE40595"
[1] 51276    37
[1] "GSE69428"
[1] 51276    19
[1] "GSE38666"
[1] 51276    30
[1] "GSE14407"
[1] 51276    24
