### 读取不同的Bulk数据

In [None]:
# Directory holding the bulk expression tables.
data_dir <- "/home/user/sc_bulk_analysis/1_bulk_data"

# Path of the primary table; every other table is merged into this one.
main_file <- file.path(data_dir, "GSE193876_gene_counts_FPKM_TPM_matrix.txt")

# Path of the merged result written at the end of this cell.
output_file <- file.path(data_dir, "merged_intersect.txt")

# Read the primary table.
main_df <- read.table(main_file, header = TRUE, sep = "\t",
                      stringsAsFactors = FALSE)

# Keep gene_name plus the columns whose name contains fpkm_CTRL or fpkm_LPS.
# drop = FALSE keeps a data frame even if no sample column matches.
cols_to_keep <- c("gene_name",
                  grep("fpkm_(CTRL|LPS)", colnames(main_df), value = TRUE))
main_df <- main_df[, cols_to_keep, drop = FALSE]

# List every .txt file in the directory, then exclude the primary file and the
# files this notebook itself produces. Without this, re-running the cell would
# merge the previous merged_intersect.txt (and any other derived table saved in
# this directory) back into the result.
all_files <- list.files(data_dir, pattern = "\\.txt$", full.names = TRUE)
derived_names <- c(
  basename(output_file),
  "Bulk_data_need_edit.txt",
  "Bulk_data.txt",
  "Phenotype.txt"
)
other_files <- all_files[
  !(basename(all_files) %in% c(basename(main_file), derived_names))
]

# Merge the remaining tables one by one; merge() keeps only the gene_name
# values present in both inputs (intersection).
# NOTE(review): duplicated gene_name values in any table would multiply rows
# in merge() -- confirm gene names are unique per file.
for (f in other_files) {
  temp_df <- read.table(f, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
  # Remove the id column; drop = FALSE keeps a data frame even if only one
  # column remains.
  temp_df <- temp_df[, !(colnames(temp_df) %in% "id"), drop = FALSE]
  main_df <- merge(main_df, temp_df, by = "gene_name")
}

# Inspect the merged result.
head(main_df)

# Save the merged table (tab-separated, unquoted, no row names).
write.table(main_df, file = output_file,
            sep = "\t", row.names = FALSE, quote = FALSE)

In [None]:
# Number of rows in the merged table main_df.
nrow(main_df)

### 每次只需要改这个筛选规则就可以了

In [None]:
# Directory holding the bulk expression tables.
data_dir <- "/home/user/sc_bulk_analysis/1_bulk_data"

# Path of the merged table produced by the merge step.
bulk_file <- file.path(data_dir, "merged_intersect.txt")

# merged_intersect.txt is written with quote = FALSE, so quote and comment
# parsing are disabled on read. With the read.table() defaults, a gene name
# containing ', " or # would be treated as a quote/comment and corrupt the
# parsed rows.
bulk_df <- read.table(
  bulk_file,
  header = TRUE,
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

In [None]:

# Selection rule for sample columns: names containing CTRL, LPS followed by
# digits, 48h, or control. This pattern is the only thing to edit when a
# different subset is needed.
sample_pattern <- "(CTRL|LPS[0-9]+|48h|control)"
all_cols <- colnames(bulk_df)

# Gene identifier column first, followed by every matching sample column.
keep_cols <- c("gene_name", all_cols[grepl(sample_pattern, all_cols)])

# Discard any selected column whose name contains "average" or "count".
keep_cols <- grep("average|count", keep_cols, invert = TRUE, value = TRUE)

# Build the reduced data frame from the selected columns.
df_sub <- bulk_df[keep_cols]

# Inspect the result.
head(df_sub)


In [None]:
# Rename the gene identifier column from gene_name to GeneSymbol.
is_gene_col <- names(df_sub) == "gene_name"
names(df_sub)[is_gene_col] <- "GeneSymbol"


In [None]:
# Show the first rows of df_sub to confirm the column rename.
head(df_sub)

In [None]:
library(data.table)

In [None]:
# Write the selected expression table as a tab-separated file in the current
# working directory. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
fwrite(df_sub, file = "Bulk_data_need_edit.txt", sep = "\t", row.names = FALSE)

#### 手动修改 Bulk_data_need_edit.txt 的列名（样本名）：在样本名前面加上 GSE 号以规范化，并另存为 1_bulk_data 目录下的 Bulk_data.txt

In [None]:
# Load Bulk_data.txt (presumably the hand-edited version of the exported
# table -- confirm) so that its column names can be checked.
check_df <- read.table(
  "/home/guoliming/Brown/ALI_Gaoji/sc_bulk_analysis/1_bulk_data/Bulk_data.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

In [None]:
# Show the first rows of check_df to verify the loaded table.
head(check_df)

### 输出表型

In [None]:
# Sample names are all columns of check_df except the gene identifier column.
samples <- setdiff(colnames(check_df), "GeneSymbol")

# status is 1 for sample names containing "LPS", 0 for names containing
# "CTRL", and NA for anything else ("LPS" is tested first).
status <- ifelse(grepl("LPS", samples), 1,
                 ifelse(grepl("CTRL", samples), 0, NA))

# Samples that match neither label would otherwise end up with NA status and
# NA time without any notice; report them so the labels can be fixed.
unclassified <- samples[is.na(status)]
if (length(unclassified) > 0) {
  warning(
    "Samples matching neither \"LPS\" nor \"CTRL\" get NA status/time: ",
    paste(unclassified, collapse = ", "),
    call. = FALSE
  )
}

# Phenotype table: one row per sample (row names are the sample names).
phenotype_df <- data.frame(status = status, row.names = samples)

# time is 0 where status is 1 and 1 where status is 0 (NA stays NA).
phenotype_df$time <- ifelse(phenotype_df$status == 1, 0, 1)

# Put the time column before status.
phenotype_df <- phenotype_df[, c("time", "status")]

# Inspect the result.
head(phenotype_df)



In [None]:
# Write the phenotype table as a tab-separated file in the current working
# directory, keeping the sample names (row names) as the first column.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
fwrite(phenotype_df, file = "Phenotype.txt", sep = "\t", row.names = TRUE)