In [1]:
suppressPackageStartupMessages({
library(DESeq2)
library(ggplot2)
library(pheatmap)
library(tidyverse)
library(here)
    })

In [2]:
setwd('..')

In [3]:
getwd()

In [4]:
utils <- new.env()

sys.source(here::here("scripts", "utils.r"), envir = utils)

In [5]:
data_path <- "./Data/Bulk-seq_data/GSE252692_feature_counts/"
files <- list.files(data_path, pattern = "\\.txt$", full.names = TRUE)

In [6]:
files

In [7]:
merged_df <- files %>%
  lapply(function(file) {
    data <- read.delim(file, skip = 1)
    sample_name <- gsub("_count_matrix.txt", "", basename(file))
    selected_data <- data %>% dplyr::select(Geneid = 1, last_col())
    colnames(selected_data)[2] <- sample_name
    return(selected_data)
  }) %>%
  purrr::reduce(full_join, by = "Geneid")

In [8]:
write_csv(merged_df,'./CSV/MRC-5_raw_counts.csv')

In [9]:
sample_names <- colnames(merged_df)[-1]
condition <- rep(x = c("Control","3hpi","6hpi","9hpi","12hpi","18hpi","24hpi","30hpi"),times = 3)

In [10]:
count_data <- merged_df[2:25]
rownames(count_data) <- merged_df$Geneid

In [11]:
count_data

Unnamed: 0_level_0,01_uninfected_rep1,02_OC43_3hpi_rep1,03_OC43_6hpi_rep1,04_OC43_9hpi_rep1,05_OC43_12hpi_rep1,06_OC43_18hpi_rep1,07_OC43_24hpi_rep1,08_OC43_30hpi_rep1,17_uninfected_rep3,19_OC43_3hpi_rep3,⋯,24_OC43_24hpi_rep3,25_OC43_30hpi_rep3,29_uninfected_rep4,31_OC43_3hpi_rep4,32_OC43_6hpi_rep4,33_OC43_9hpi_rep4,34_OC43_12hpi_rep4,35_OC43_18hpi_rep4,36_OC43_24hpi_rep4,37_OC43_30hpi_rep4
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
ENSG00000279928,25,32,29,19,15,5,7,15,21,16,⋯,8,2,19,14,24,13,15,10,4,10
ENSG00000228037,1,0,0,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,1
ENSG00000142611,386,953,869,609,258,157,128,116,262,360,⋯,49,56,165,268,307,202,99,105,40,45
ENSG00000284616,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000157911,1490,1638,1287,1063,812,491,498,533,1012,770,⋯,500,596,1057,834,1173,820,586,643,369,711
ENSG00000269896,34,38,22,9,10,7,10,20,24,10,⋯,16,27,19,11,9,13,8,20,14,25
ENSG00000228463,905,1027,948,825,525,379,494,678,784,450,⋯,320,675,842,476,717,522,360,380,262,749
ENSG00000260972,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000224340,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000226374,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [12]:
condition

In [13]:
sample_names

In [14]:
coldata <- data.frame(row.names = sample_names, condition = factor(condition))

In [15]:
dds <- DESeqDataSetFromMatrix(countData = count_data, colData = coldata, design = ~ condition)
dds <- DESeq(dds)

  it appears that the last variable in the design formula, 'condition',
  has a factor level, 'Control', which is not the reference level. we recommend
  to use factor(...,levels=...) or relevel() to set this as the reference level
  before proceeding. for more information, please see the 'Note on factor levels'
  in vignette('DESeq2').

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [16]:
resultsNames(dds)

In [17]:
target_timepoints <- c("3hpi", "6hpi", "9hpi", "12hpi", "18hpi", "24hpi", "30hpi")

lapply(target_timepoints, function(tp) {
    utils$analyze_save(dds, 'condition',coefficient = tp, "Control", './CSV/Bulk-seq/MRC-5/', lfc_cutoff = 0.58)
})

Processing: 3hpi versus Control ...

"attempt to set 'col.names' ignored"
"attempt to set 'col.names' ignored"
Done: 3hpi | Found 2831 DEGs

Processing: 6hpi versus Control ...

"attempt to set 'col.names' ignored"
"attempt to set 'col.names' ignored"
Done: 6hpi | Found 1849 DEGs

Processing: 9hpi versus Control ...

"attempt to set 'col.names' ignored"
"attempt to set 'col.names' ignored"
Done: 9hpi | Found 2217 DEGs

Processing: 12hpi versus Control ...

"attempt to set 'col.names' ignored"
"attempt to set 'col.names' ignored"
Done: 12hpi | Found 3459 DEGs

Processing: 18hpi versus Control ...

"attempt to set 'col.names' ignored"
"attempt to set 'col.names' ignored"
Done: 18hpi | Found 6553 DEGs

Processing: 24hpi versus Control ...

"attempt to set 'col.names' ignored"
"attempt to set 'col.names' ignored"
Done: 24hpi | Found 9503 DEGs

Processing: 30hpi versus Control ...

"attempt to set 'col.names' ignored"
"attempt to set 'col.names' ignored"
Done: 30hpi | Found 12033 DEGs



In [18]:
sessionInfo()

R version 4.2.3 (2023-03-15 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 26200)

Matrix products: default

locale:
[1] LC_COLLATE=Korean_Korea.utf8  LC_CTYPE=Korean_Korea.utf8   
[3] LC_MONETARY=Korean_Korea.utf8 LC_NUMERIC=C                 
[5] LC_TIME=Korean_Korea.utf8    

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] here_1.0.2                  lubridate_1.9.3            
 [3] forcats_1.0.1               stringr_1.5.2              
 [5] dplyr_1.1.4                 purrr_1.0.2                
 [7] readr_2.1.5                 tidyr_1.3.1                
 [9] tibble_3.2.1                tidyverse_2.0.0            
[11] pheatmap_1.0.13             ggplot2_3.5.1              
[13] DESeq2_1.38.3               SummarizedExperiment_1.28.0
[15] Biobase_2.58.0              MatrixGenerics_1.10.0      
[17] matrixStats_1.3.0           GenomicRanges_1.50.2   