### 读取不同的Bulk数据

In [16]:
# Directory holding the raw bulk expression tables
data_dir <- "/home/guoliming/Brown/ALI_Gaoji/sc_bulk_analysis/1_bulk_data"

# Main table, and the merged output this cell writes at the end
main_file <- file.path(data_dir, "GSE193876_gene_counts_FPKM_TPM_matrix.txt")
output_file <- file.path(data_dir, "merged_intersect.txt")

# Read the main table
main_df <- read.table(main_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Keep gene_name plus every column whose name contains fpkm_CTRL or fpkm_LPS
# (all other columns of the main table are dropped)
cols_to_keep <- c("gene_name", grep("fpkm_(CTRL|LPS)", colnames(main_df), value = TRUE))
main_df <- main_df[, cols_to_keep, drop = FALSE]

# Every other .txt file in the directory is a candidate for merging.
# The output file lives in the same directory, so it must be excluded;
# otherwise re-running this cell would merge the previous result back in.
all_files <- list.files(data_dir, pattern = "\\.txt$", full.names = TRUE)
other_files <- setdiff(all_files, c(main_file, output_file))

# Merge the remaining tables one by one; merge() defaults to an inner join,
# so only genes present in every table survive (intersection)
for (f in other_files) {
  temp_df <- read.table(f, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

  # Files without the join key cannot be merged; report and skip them
  # instead of failing inside merge()
  if (!"gene_name" %in% colnames(temp_df)) {
    warning("Skipping ", f, ": no gene_name column", call. = FALSE)
    next
  }

  # Drop the id column; drop = FALSE keeps a data.frame even if one column remains
  temp_df <- temp_df[, !(colnames(temp_df) %in% "id"), drop = FALSE]
  main_df <- merge(main_df, temp_df, by = "gene_name")
}

# Inspect the merged result
head(main_df)

# Save the merged table
write.table(main_df, file = output_file,
            sep = "\t", row.names = FALSE, quote = FALSE)

Unnamed: 0_level_0,gene_name,fpkm_CTRL_1,fpkm_CTRL_2,fpkm_CTRL_3,fpkm_CTRL_4,fpkm_LPS_12h_1,fpkm_LPS_12h_2,fpkm_LPS_12h_3,fpkm_LPS_12h_4,fpkm_LPS_24h_1,⋯,control2.count,control2.FPKM,control3.count,control3.FPKM,LPS1.count,LPS1.FPKM,LPS2.count,LPS2.FPKM,LPS3.count,LPS3.FPKM
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0610009B22Rik,16.82,16.33,16.65,18.28,16.21,15.19,14.28,13.91,15.44,⋯,115,11.98,95,9.78,20,2.08,80,8.2,73,7.4
2,0610010F05Rik,5.96,7.0,6.11,5.85,4.3,5.18,4.22,4.65,6.53,⋯,121,1.69,169,2.45,251,4.35,169,2.52,115,1.63
3,0610010K14Rik,40.64,40.94,33.15,39.02,35.14,35.58,38.09,35.75,37.13,⋯,1045,109.93,1671,172.15,719,77.28,1427,152.6,1382,142.58
4,0610030E20Rik,13.23,14.48,13.86,12.9,11.92,11.24,9.67,12.19,13.99,⋯,254,3.34,342,4.41,119,1.61,282,3.71,310,3.97
5,0610040J01Rik,9.03,9.58,9.25,11.77,13.37,11.66,13.21,12.3,6.33,⋯,247,8.7,157,5.47,18,0.65,115,4.08,164,5.66
6,1110002E22Rik,0.03,0.08,0.15,0.29,0.04,0.05,0.01,0.01,0.08,⋯,2,0.01,0,0.0,3,0.02,3,0.02,3,0.02


In [17]:
# Row count of the merged table
nrow(main_df)

### 每次只需要改这个筛选规则就可以了

In [2]:
# Folder that holds the bulk expression tables
data_dir <- "/home/guoliming/Brown/ALI_Gaoji/sc_bulk_analysis/1_bulk_data"

# Load the merged tab-separated table, keeping strings as character
bulk_file <- file.path(data_dir, "merged_intersect.txt")
bulk_df <- read.table(
  file = bulk_file,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

In [6]:

# Filter rules: columns to include, and columns to drop again afterwards
sample_pattern <- "(CTRL|LPS[0-9]+|48h|control)"
exclude_pattern <- "average|count"

# gene_name first, followed by every column whose name matches the include rule
matched_cols <- colnames(bulk_df)[grepl(sample_pattern, colnames(bulk_df))]
keep_cols <- c("gene_name", matched_cols)

# Remove any selected column whose name matches the exclude rule
is_excluded <- grepl(exclude_pattern, keep_cols)
keep_cols <- keep_cols[!is_excluded]

# Build the subset; drop = FALSE guarantees a data.frame
df_sub <- bulk_df[, keep_cols, drop = FALSE]

# Preview the result
head(df_sub)


Unnamed: 0_level_0,gene_name,fpkm_CTRL_1,fpkm_CTRL_2,fpkm_CTRL_3,fpkm_CTRL_4,fpkm_LPS_48h_1,fpkm_LPS_48h_2,fpkm_LPS_48h_3,fpkm_LPS_48h_4,control1.FPKM,control2.FPKM,control3.FPKM,LPS1.FPKM,LPS2.FPKM,LPS3.FPKM
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0610009B22Rik,16.82,16.33,16.65,18.28,15.49,16.27,16.01,14.3,7.69,11.98,9.78,2.08,8.2,7.4
2,0610010F05Rik,5.96,7.0,6.11,5.85,6.88,7.72,7.27,6.91,2.55,1.69,2.45,4.35,2.52,1.63
3,0610010K14Rik,40.64,40.94,33.15,39.02,57.35,44.37,46.53,42.54,185.63,109.93,172.15,77.28,152.6,142.58
4,0610030E20Rik,13.23,14.48,13.86,12.9,11.52,13.15,13.31,11.82,3.22,3.34,4.41,1.61,3.71,3.97
5,0610040J01Rik,9.03,9.58,9.25,11.77,5.53,5.38,6.5,6.07,2.15,8.7,5.47,0.65,4.08,5.66
6,1110002E22Rik,0.03,0.08,0.15,0.29,0.03,0.03,0.06,0.03,0.01,0.01,0.0,0.02,0.02,0.02


In [23]:
#  bulk_df$gene_name <- toupper(bulk_df$gene_name)   # 转大写

In [11]:
# Rename the gene_name column to GeneSymbol
names(df_sub)[names(df_sub) == "gene_name"] <- "GeneSymbol"


In [12]:
# Preview df_sub after the column rename
head(df_sub)

Unnamed: 0_level_0,GeneSymbol,fpkm_CTRL_1,fpkm_CTRL_2,fpkm_CTRL_3,fpkm_CTRL_4,fpkm_LPS_48h_1,fpkm_LPS_48h_2,fpkm_LPS_48h_3,fpkm_LPS_48h_4,control1.FPKM,control2.FPKM,control3.FPKM,LPS1.FPKM,LPS2.FPKM,LPS3.FPKM
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0610009B22Rik,16.82,16.33,16.65,18.28,15.49,16.27,16.01,14.3,7.69,11.98,9.78,2.08,8.2,7.4
2,0610010F05Rik,5.96,7.0,6.11,5.85,6.88,7.72,7.27,6.91,2.55,1.69,2.45,4.35,2.52,1.63
3,0610010K14Rik,40.64,40.94,33.15,39.02,57.35,44.37,46.53,42.54,185.63,109.93,172.15,77.28,152.6,142.58
4,0610030E20Rik,13.23,14.48,13.86,12.9,11.52,13.15,13.31,11.82,3.22,3.34,4.41,1.61,3.71,3.97
5,0610040J01Rik,9.03,9.58,9.25,11.77,5.53,5.38,6.5,6.07,2.15,8.7,5.47,0.65,4.08,5.66
6,1110002E22Rik,0.03,0.08,0.15,0.29,0.03,0.03,0.06,0.03,0.01,0.01,0.0,0.02,0.02,0.02


In [13]:
library(data.table)

In [19]:
# Write the subset as a tab-separated file (relative path, so it lands in the
# current working directory); row names are not written
fwrite(df_sub, file = "Bulk_data_need_edit.txt", sep = "\t", row.names = FALSE)

#### 手动修改一下列名（样本名），在样本前面加上GSE号，规范化；修改后的文件另存为 1_bulk_data 目录下的 Bulk_data.txt，供下一步读取

In [26]:
# Load the hand-edited table to check the renamed sample columns
check_df <- read.table(
  file = "/home/guoliming/Brown/ALI_Gaoji/sc_bulk_analysis/1_bulk_data/Bulk_data.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

In [27]:
# Preview the first rows of check_df
head(check_df)

Unnamed: 0_level_0,GeneSymbol,GSE193876_CTRL_1,GSE193876_CTRL_2,GSE193876_CTRL_3,GSE193876_CTRL_4,GSE193876_LPS_48h_1,GSE193876_LPS_48h_2,GSE193876_LPS_48h_3,GSE193876_LPS_48h_4,GSE247266_CTRL_1,GSE247266_CTRL_2,GSE247266_CTRL_3,GSE247266_LPS_1,GSE247266_LPS_2,GSE247266_LPS_3
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0610009B22Rik,16.82,16.33,16.65,18.28,15.49,16.27,16.01,14.3,7.69,11.98,9.78,2.08,8.2,7.4
2,0610010F05Rik,5.96,7.0,6.11,5.85,6.88,7.72,7.27,6.91,2.55,1.69,2.45,4.35,2.52,1.63
3,0610010K14Rik,40.64,40.94,33.15,39.02,57.35,44.37,46.53,42.54,185.63,109.93,172.15,77.28,152.6,142.58
4,0610030E20Rik,13.23,14.48,13.86,12.9,11.52,13.15,13.31,11.82,3.22,3.34,4.41,1.61,3.71,3.97
5,0610040J01Rik,9.03,9.58,9.25,11.77,5.53,5.38,6.5,6.07,2.15,8.7,5.47,0.65,4.08,5.66
6,1110002E22Rik,0.03,0.08,0.15,0.29,0.03,0.03,0.06,0.03,0.01,0.01,0.0,0.02,0.02,0.02


### 输出表型

In [33]:
# Sample names are all columns of check_df except the gene symbol column
samples <- setdiff(colnames(check_df), "GeneSymbol")

# status: 1 when the sample name contains "LPS", 0 when it contains "CTRL",
# NA otherwise (a name containing both is treated as LPS)
status <- ifelse(grepl("LPS", samples), 1,
                 ifelse(grepl("CTRL", samples), 0, NA))

# Surface samples that matched neither label instead of silently writing NA
unclassified <- samples[is.na(status)]
if (length(unclassified) > 0) {
  warning("Samples matching neither LPS nor CTRL (status/time set to NA): ",
          paste(unclassified, collapse = ", "), call. = FALSE)
}

# time: 0 for status == 1, 1 for status == 0 (NA stays NA).
# Rows are named by sample; columns are ordered time, status.
phenotype_df <- data.frame(
  time = ifelse(status == 1, 0, 1),
  status = status,
  row.names = samples
)

# Preview the result
head(phenotype_df)



Unnamed: 0_level_0,time,status
Unnamed: 0_level_1,<dbl>,<dbl>
GSE193876_CTRL_1,1,0
GSE193876_CTRL_2,1,0
GSE193876_CTRL_3,1,0
GSE193876_CTRL_4,1,0
GSE193876_LPS_48h_1,0,1
GSE193876_LPS_48h_2,0,1


In [34]:
# Write the phenotype table as a tab-separated file in the current working
# directory; row.names = TRUE keeps the sample names in the output
fwrite(phenotype_df, file = "Phenotype.txt", sep = "\t", row.names = TRUE)