In [2]:
library(ClassifyR)
library(tidyverse)
library(survival)
library(survminer)
library(janitor)
library(dplyr)
library(survival)
library(survcomp)
library(ggplot2)
library(RColorBrewer)
library(caret)  
library(boot)

#### Subtype of TCGA samples

In [3]:
################################### Metadata informtaion of subtype ###################################
brca_meta <- read.table("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/BRCA.clin.merged.txt", fill = TRUE, header = FALSE)
brca_meta <- t(brca_meta)
colnames(brca_meta) <- brca_meta[1, ]
brca_meta <- brca_meta[-1, ]
brca_meta <- as.data.frame(brca_meta)
brca_meta$her2 <- ifelse(brca_meta$`patient.lab_proc_her2_neu_immunohistochemistry_receptor_status`=="positive","Her2+","Others")
Her2_samples <- na.omit(toupper(unique(brca_meta[brca_meta$her2=="Her2+",]$patient.bcr_patient_barcode)))

brca_meta$subtype <- ifelse((brca_meta$`patient.lab_proc_her2_neu_immunohistochemistry_receptor_status`=="positive"),"Her2",
                     ifelse((brca_meta$`patient.lab_proc_her2_neu_immunohistochemistry_receptor_status`=="negative" &
                            brca_meta$patient.breast_carcinoma_estrogen_receptor_status=="negative" &
                            brca_meta$patient.breast_carcinoma_progesterone_receptor_status=="negative"),"TNBC",
                     ifelse((brca_meta$patient.breast_carcinoma_estrogen_receptor_status=="positive" &
                             brca_meta$patient.breast_carcinoma_progesterone_receptor_status=="positive") |  
                             (brca_meta$patient.breast_carcinoma_estrogen_receptor_status=="positive" &
                             brca_meta$patient.breast_carcinoma_progesterone_receptor_status=="negative"),"Luminal","Other")))
brca_meta <- brca_meta[c("patient.bcr_patient_barcode","subtype")]
brca_meta$patient.bcr_patient_barcode <- toupper(brca_meta$patient.bcr_patient_barcode)
brca_meta <- brca_meta %>% filter(grepl("^TCGA-", patient.bcr_patient_barcode))
brca_meta <- na.omit(brca_meta)
brca_meta <- subset(brca_meta,subtype!="Other")

In [4]:
list_of_HistoQC_filtered_patients = read.csv("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Outputs/group_scratch_Outputs/HistoQC_filtered.csv")["Patient"]

#### Predicted Pseudobulk gene expression

In [5]:
survival_info = read.csv("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Updated3_survival_info.csv")
survival_info <- unique(survival_info[c("case_id","case_submitter_id","time","vital_status","ajcc_pathologic_stage")])

final_gene_mean_counts = read.csv("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Updated3_final_gene_mean_counts_names.csv",header=FALSE)
final_gene_mean_counts = janitor::row_to_names(final_gene_mean_counts,1)
colnames(final_gene_mean_counts)[1] <- "X"
final_gene_mean_counts_HQC_filt = final_gene_mean_counts[final_gene_mean_counts$X %in% list_of_HistoQC_filtered_patients$Patient,]

final_gene_mean_counts_HQC_filt = t(final_gene_mean_counts_HQC_filt)
final_gene_mean_counts_HQC_filt <- janitor::row_to_names(final_gene_mean_counts_HQC_filt,1)
genes <- rownames(final_gene_mean_counts_HQC_filt)
final_gene_mean_counts_HQC_filt <- as.data.frame(final_gene_mean_counts_HQC_filt)
final_gene_mean_counts_HQC_filt <- apply(final_gene_mean_counts_HQC_filt, 2, function(x) as.numeric(as.character(x)))
rownames(final_gene_mean_counts_HQC_filt) <- genes
final_gene_mean_counts_HQC_filt <- t(final_gene_mean_counts_HQC_filt)
final_gene_mean_counts_HQC_filt <- as.data.frame(final_gene_mean_counts_HQC_filt)
                                         
                                         
rownames(final_gene_mean_counts_HQC_filt) <- gsub("\\.", "-", rownames(final_gene_mean_counts_HQC_filt))
final_gene_mean_counts_HQC_filt$pateint <- sub("\\-1$", "", rownames(final_gene_mean_counts_HQC_filt))
final_gene_mean_counts_HQC_filt$pateint_surv <- substring(final_gene_mean_counts_HQC_filt$pateint, 1, 
                                              nchar(final_gene_mean_counts_HQC_filt$pateint) - 4)
                                
##############################################################################################################################
                                
survival_info_2 <- data.frame(col1 = character(0))
df_counts <- table(final_gene_mean_counts_HQC_filt$pateint_surv)
for (value in unique(final_gene_mean_counts_HQC_filt$pateint_surv)) {
  rows_df2 <- survival_info[survival_info$case_submitter_id == value, , drop = FALSE]
  replicated_rows <- rows_df2[rep(seq_len(nrow(rows_df2)), df_counts[value]), , drop = FALSE]
  survival_info_2 <- rbind(survival_info_2, replicated_rows)}
rownames(survival_info_2) <- NULL
survival_info_2 = survival_info_2[order(survival_info_2$case_submitter_id), ]

final_gene_mean_counts_HQC_filt = final_gene_mean_counts_HQC_filt[order(final_gene_mean_counts_HQC_filt$pateint_surv), ]
final_gene_mean_counts_HQC_filt = final_gene_mean_counts_HQC_filt[final_gene_mean_counts_HQC_filt$pateint_surv %in% survival_info_2$case_submitter_id,]

data = merge(survival_info[c("case_submitter_id","time","vital_status")], 
                   final_gene_mean_counts_HQC_filt, by.x = "case_submitter_id", by.y = "pateint_surv")

unique_pseudobulk_data <- data[!duplicated(data), ]
colnames(unique_pseudobulk_data)[3] <- "status" 
unique_pseudobulk_data$status <- ifelse(unique_pseudobulk_data$status == "Dead", 1, 0)
colnames(unique_pseudobulk_data) <- gsub("[[:space:]-]+", "_", colnames(unique_pseudobulk_data))   

# Add Subtype
unique_pseudobulk_data = merge(unique_pseudobulk_data,brca_meta,by.x="case_submitter_id",by.y="patient.bcr_patient_barcode")

“Row 1 does not provide unique names. Consider running clean_names() after row_to_names().”


#### True bulk gene expression

In [6]:
all_bulk_ge = read.csv("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Updated3_bluk_ge.csv")
all_bulk_ge <- all_bulk_ge %>%
               group_by(gene_name) %>%
               summarize(across(where(is.numeric), mean))
all_bulk_ge <- t(all_bulk_ge)
all_bulk_ge <- as.data.frame(all_bulk_ge)
all_bulk_ge <- janitor::row_to_names(all_bulk_ge,1)
rownames(all_bulk_ge) <- gsub("\\.", "-", rownames(all_bulk_ge))
patients_rownames <- rownames(all_bulk_ge)
all_bulk_ge <- apply(all_bulk_ge, 2, function(x) as.numeric(as.character(x)))
rownames(all_bulk_ge) <- patients_rownames
all_bulk_ge <- as.data.frame(all_bulk_ge)
all_bulk_ge$pateint <- sub("\\-1$", "", rownames(all_bulk_ge))
all_bulk_ge$pateint <- substring(rownames(all_bulk_ge), 1, nchar(rownames(all_bulk_ge)) - 4)
all_bulk_ge <- all_bulk_ge %>%
               group_by(pateint) %>%
               summarize(across(where(is.numeric), mean))
all_bulk_ge <- as.data.frame(all_bulk_ge)
all_bulk_ge <- all_bulk_ge %>% column_to_rownames(var = "pateint")            
                     
                     
data = merge(survival_info[c("case_submitter_id","time","vital_status")], 
                   all_bulk_ge, by.x = "case_submitter_id", by.y = "row.names")

unique_bulk_data <- data[!duplicated(data), ]
unique_bulk_data <- unique_bulk_data %>% remove_rownames %>% column_to_rownames(var="case_submitter_id")
colnames(unique_bulk_data)[2] <- "status" 
unique_bulk_data$status <- ifelse(unique_bulk_data$status == "Dead", 1, 0)
colnames(unique_bulk_data) <- gsub("[[:space:]-]+", "_", colnames(unique_bulk_data))
                     
# Add subtypes                                          
unique_bulk_data = merge(unique_bulk_data,brca_meta,by.x="row.names",by.y="patient.bcr_patient_barcode")
rownames(unique_bulk_data) <- as.character(unique_bulk_data$Row.names)
unique_bulk_data <- unique_bulk_data[, !colnames(unique_bulk_data) %in% "Row.names"]

#### Matched Survival, Subtype, Bulk&Pseudobulk for 580 patients

In [7]:
common_patients <- intersect(rownames(unique_bulk_data),unique_pseudobulk_data$case_submitter_id)

In [8]:
unique_bulk_data <- unique_bulk_data[rownames(unique_bulk_data) %in% common_patients,]
unique_bulk_data$case_submitter_id <- rownames(unique_bulk_data)
rownames(unique_bulk_data) <- NULL
unique_pseudobulk_data <- unique_pseudobulk_data[unique_pseudobulk_data$case_submitter_id %in% common_patients,]
unique_pseudobulk_data$pateint <- NULL

In [9]:
unique_pseudobulk_data <- unique_pseudobulk_data %>% arrange(case_submitter_id)

# Step 2: Count occurrences in df1
unique_pseudobulk_data_counts <- unique_pseudobulk_data %>% count(case_submitter_id)

# Step 3: Expand df2 to match counts in df1
unique_bulk_data_expanded <- unique_pseudobulk_data_counts %>%
  left_join(unique_bulk_data, by = "case_submitter_id") %>%
  group_by(case_submitter_id) %>%
  slice(rep(1, n)) %>%  # Duplicate rows to match df1 counts
  select(-n) %>%
  ungroup()

# Step 4: Arrange df2_expanded in the same order as df1
unique_bulk_data_expanded <- unique_bulk_data_expanded[match(unique_pseudobulk_data$case_submitter_id, 
                                                             unique_bulk_data_expanded$case_submitter_id), ]

# Cleaning the data based on time
unique_bulk_data_expanded$time <- as.numeric(unique_bulk_data_expanded$time)
unique_pseudobulk_data$time <- as.numeric(unique_pseudobulk_data$time)
# unique_pseudobulk_data <- subset(unique_pseudobulk_data,time>0)
# unique_bulk_data_expanded <- subset(unique_bulk_data_expanded,time>0)

# Convert to dataframe
unique_bulk_data_expanded <- as.data.frame(unique_bulk_data_expanded)
unique_bulk_data_expanded <- unique(unique_bulk_data_expanded)
unique_pseudobulk_data <- as.data.frame(unique_pseudobulk_data)
unique_pseudobulk_data <- unique(unique_pseudobulk_data)

#### Bulk survival analysis

In [10]:
top_predict_genes <- read.csv("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/stimage_LOOCV_9visium_top300gene.csv")
top_predict_genes$Gene <- gsub("-","_",top_predict_genes$Gene)

In [11]:
unique_bulk_data_expanded$time <- as.integer(unique_bulk_data_expanded$time)
true_subtype_results <- list()

for (subtype_name in unique(unique_bulk_data_expanded$subtype)) {
  
  # Subset the data for the current subtype
  subtype_data <- subset(unique_bulk_data_expanded, subtype == subtype_name)  
  # Identify the genes (covariates) to use in the models
  covariates <- intersect(colnames(subtype_data), top_predict_genes$Gene)
  # Create a formula for each gene (e.g., Surv(time, status) ~ gene)
  univ_formulas <- sapply(covariates,
                          function(gene) as.formula(paste("Surv(time, status) ~", gene)))
  
  # Fit a Cox model for each gene using the corresponding formula
  univ_models <- lapply(univ_formulas, function(formula) {
    coxph(formula, data = subtype_data)})
  
  # For each model, compute the predicted risk scores and then the c-index
  true_c_index_list <- list()
  for (i in seq_along(univ_models)) {
    model <- univ_models[[i]]
    gene_name <- covariates[i]
    
    # Obtain the predicted risk scores (linear predictors)
    true_survival_scores <- predict(model, newdata = subtype_data, type = "lp")
    
    # Calculate the c-index for the current model
    true_c_index_result <- concordance.index(x = true_survival_scores,
                                        surv.time = subtype_data$time,
                                        surv.event = subtype_data$status)
    
      
  # Store the c-index (usually in c_index_result$c.index) with the gene name
  true_c_index_list[[gene_name]] <- true_c_index_result$c.index}
  
  # Save the c-index list for the current subtype
  true_subtype_results[[subtype_name]] <- true_c_index_list}
      
      
#     # Extract the p-value for the Wald test from the model summary
#     wald_p_value <- summary(model)$coef[1, "Pr(>|z|)"]    
#     true_c_index_list[[gene_name]] <- list(
#     concordance = true_c_index_result$c.index,
#     wald_p_value = wald_p_value)}
#     true_subtype_results[[subtype_name]] <- true_c_index_list}

In [12]:
covariates <- intersect(colnames(unique_bulk_data_expanded), top_predict_genes$Gene)
univ_formulas <- sapply(covariates,
                      function(gene) as.formula(paste("Surv(time, status) ~", gene)))

# Fit a Cox model for each gene using the corresponding formula
univ_models <- lapply(univ_formulas, function(formula) {coxph(formula, data = unique_bulk_data_expanded)})

true_c_index_all <- list()
for (i in seq_along(univ_models)) {
    model <- univ_models[[i]]
    gene_name <- covariates[i]

    true_survival_scores_all <- predict(model, newdata = unique_bulk_data_expanded, type = "lp")

    # Calculate the c-index for the current model
    true_c_index_all_result <- concordance.index(x = true_survival_scores_all,
                                        surv.time = unique_bulk_data_expanded$time,
                                        surv.event = unique_bulk_data_expanded$status)

    # Store the c-index (usually in c_index_result$c.index) with the gene name
    true_c_index_all[[gene_name]] <- true_c_index_all_result$c.index
}
true_c.index_scores = as.data.frame(t(as.data.frame(true_c_index_all)))
true_c.index_scores = subset(true_c.index_scores,V1>0.5)
true_c.index_scores <- true_c.index_scores %>% arrange(desc(V1))
colnames(true_c.index_scores)[1] <- "score"
true_c.index_scores["subtype"]="zALL"

In [13]:
true_subtype_results_sorted <- lapply(true_subtype_results, function(inner_list) {
  sorted_vec <- sort(unlist(inner_list), decreasing = TRUE)
  as.list(sorted_vec)})

true_subtype_results_long <- do.call(rbind, lapply(names(true_subtype_results_sorted), function(subtype) {
    scores <- unlist(true_subtype_results_sorted[[subtype]])
    data.frame(
    subtype = subtype,
    score   = scores,
    row.names = names(scores),
    stringsAsFactors = FALSE)}))

# df_pred <- rbind(true_c.index_scores,true_subtype_results_long)
df_pred <- true_subtype_results_long
options(repr.plot.width=7,repr.plot.height=7)
pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_Cindex_true_boxplot.pdf",width=5,height=6)
ggplot(df_pred, aes(x = subtype, y = score, fill = subtype)) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Dark2") +
  labs(title = "Boxplot by Group",
       x = "Group",
       y = "Value") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 25, face = "bold"),  # Title size
    axis.title = element_text(size = 25),  
    axis.text = element_text(size = 25, angle = 90, hjust = 1),    
    legend.text = element_text(size = 25), 
    legend.title = element_text(size = 25))
dev.off()

In [14]:
unique_bulk_data_expanded_cpy <- unique_bulk_data_expanded

res_sig <- rownames(true_c.index_scores)[1:5] 
formula_string <- paste("Surv(time, status) ~", paste(res_sig, collapse = " + "))
res.cox <- coxph(as.formula(formula_string), data = unique_bulk_data_expanded_cpy)

# Assuming you have already fitted a multivariate Cox regression model 'res.cox'
# 1. Calculate survival scores
survival_scores <- -predict(res.cox, type = "lp")
# 2. Determine the median survival score
median_score <- median(survival_scores)
# 3. Categorize patients into high and low groups
patient_groups <- ifelse(survival_scores > median_score, "High", "Low")
# Add the patient groups to your original data (assuming your data is in 'lung' data frame)
unique_bulk_data_expanded_cpy$SurvivalGroup <- patient_groups
# Display the result
table(unique_bulk_data_expanded_cpy$SurvivalGroup)

####################################################################################################################

# Fit survival curves
surv_curve <- survfit(Surv(time, status) ~ SurvivalGroup, data = unique_bulk_data_expanded_cpy)
# Create a ggsurvplot
options(repr.plot.width=5, repr.plot.height=7.5)
pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_KM_true_all.pdf",width=7.5,height=9)
print(ggsurvplot(surv_curve, 
           pval = TRUE, 
           conf.int = TRUE, 
           risk.table.y.text.col = FALSE,
           risk.table.y.text = FALSE,
           risk.table = "abs_pct",
           ggtheme = theme_light(),
           legend.labs = c("Low Risk", "High Risk"),
           title = "Survival Curve by Survival Score",
           palette = c("#E7B800", "#2E9FDF"),
           ncensor.plot = FALSE, 
           font.tickslab = c(15, "plain", "black"),
           xlab = "Time", ylab = "Survival Probability"))
dev.off()


High  Low 
 269  269 

In [15]:
unique_bulk_data_expanded_cpy <- unique_bulk_data_expanded
pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_KM_true_subtype.pdf",width=7.5,height=9)

for (subtype_name in unique(unique_bulk_data_expanded_cpy$subtype)){
    subtype_data <- subset(unique_bulk_data_expanded_cpy, subtype == subtype_name)
    top_pre_genes <- names(true_subtype_results_sorted[[subtype_name]][1:5])
    
####################################################################################################################                        
                        
    res_sig <- top_pre_genes
    subtype_data_pat <- subtype_data[colnames(subtype_data) %in% c("time","status",top_pre_genes)]
    formula_string <- paste("Surv(time, status) ~", paste(res_sig, collapse = " + "))
    res.cox <- coxph(as.formula(formula_string), data = subtype_data_pat)

    # 1. Calculate survival scores
    survival_scores <- -predict(res.cox, type = "lp")
    # 2. Determine the median survival score
    median_score <- median(survival_scores)
    # 3. Categorize patients into high and low groups
    patient_groups <- ifelse(survival_scores > median_score, "High", "Low")
    # Add the patient groups to your original data (assuming your data is in 'lung' data frame)
    subtype_data_pat$SurvivalGroup <- patient_groups
    # Display the result
    table(subtype_data_pat$SurvivalGroup)

    ####################################################################################################################

    # Fit survival curves
    surv_curve <- survfit(Surv(time, status) ~ SurvivalGroup, data = subtype_data_pat)
    options(repr.plot.width=5, repr.plot.height=7.5)
    print(ggsurvplot(surv_curve, 
               pval = TRUE, 
               conf.int = TRUE, 
               risk.table.y.text.col = FALSE,
               risk.table.y.text = FALSE,
               risk.table = "abs_pct",
               ggtheme = theme_light(),
               legend.labs = c("Low Risk", "High Risk"),
               title = "Survival Curve by Survival Score",
               palette = c("#E7B800", "#2E9FDF"),
               ncensor.plot = FALSE, 
               font.tickslab = c(26, "plain", "black"),
               xlab = "Time", ylab = "Survival Probability"))
    print(top_pre_genes)}
dev.off()

[1] "CD52"  "CD8A"  "CD3D"  "TRBC2" "GZMA" 
[1] "DDX41"   "HDAC5"   "HLA_DMB" "SDHB"    "MAP3K7" 
[1] "KIF5B"  "PAK1"   "FUBP1"  "NOTCH1" "COMMD3"


In [16]:
cindex_subtypes <- list()
filtered_data <- subset(unique_bulk_data_expanded_cpy, subtype != "Other")  
for (subtype_name in unique(filtered_data$subtype)) {  
    
    subtype_data <- subset(filtered_data, subtype == subtype_name)
    top_pre_genes <- names(true_subtype_results_sorted[[subtype_name]][1:5])
    survCrossValidated <- crossValidate(subtype_data[c(top_pre_genes,"time","status")], 
                                        c("time", "status"),
                                        nFeatures = c(5),
                                        nFolds = 3, nRepeats = 100,
                                        classifier="CoxPH",
                                        selectionMethod = "CoxPH",
                                        nCores = 2)

    pred_df <- as.data.frame(survCrossValidated@predictions)
    actual_df <- data.frame(sample = survCrossValidated@originalNames,
                            truth = survCrossValidated@actualOutcome)
    eval_df <- left_join(pred_df, actual_df, by = "sample")
    results <- eval_df %>%
      group_by(permutation, fold) %>%
      group_modify(~{
        concordance(
          truth ~ risk,
          data = .x,
          reverse = TRUE
        )$concordance %>%
          as.data.frame() %>%
          rename("cindex" = ".")
      })
    cindex_subtypes[[subtype_name]] <- results}


cindex_subtypes$Her2 <- cindex_subtypes$Her2[c("cindex")]
cindex_subtypes$Luminal <- cindex_subtypes$Luminal[c("cindex")]
cindex_subtypes$TNBC <- cindex_subtypes$TNBC[c("cindex")]

pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_Cindex_true_boxplot_multivariate.pdf",width=5,height=6)
ggplot(data.frame(Subtype = names(cindex_subtypes), Values = unlist(cindex_subtypes)), aes(x = Subtype, y = Values, fill = Subtype)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, size = 1.5, alpha = 0.6) +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 25, face = "bold"),  # Title size
    axis.title = element_text(size = 25),  
    axis.text = element_text(size = 25, angle = 90, hjust = 1),    
    legend.text = element_text(size = 25), 
    legend.title = element_text(size = 25))
dev.off()

“'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.”
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call


R_zmq_msg_send errno: 4 strer

#### Predicted data survival analysis

In [17]:
unique_pseudobulk_data$time <- as.integer(unique_pseudobulk_data$time)
pred_subtype_results <- list()

for (subtype_name in unique(unique_pseudobulk_data$subtype)) {
  
  # Subset the data for the current subtype
  subtype_data <- subset(unique_pseudobulk_data, subtype == subtype_name)  
  # Identify the genes (covariates) to use in the models
  covariates <- intersect(colnames(subtype_data), top_predict_genes$Gene)
  # Create a formula for each gene (e.g., Surv(time, status) ~ gene)
  univ_formulas <- sapply(covariates,
                          function(gene) as.formula(paste("Surv(time, status) ~", gene)))
  
  # Fit a Cox model for each gene using the corresponding formula
  univ_models <- lapply(univ_formulas, function(formula) {
    coxph(formula, data = subtype_data)})
  
  # For each model, compute the predicted risk scores and then the c-index
  pred_c_index_list <- list()
  for (i in seq_along(univ_models)) {
    model <- univ_models[[i]]
    gene_name <- covariates[i]
    
    # Obtain the predicted risk scores (linear predictors)
    pred_survival_scores <- predict(model, newdata = subtype_data, type = "lp")
    
    # Calculate the c-index for the current model
    pred_c_index_result <- concordance.index(x = pred_survival_scores,
                                        surv.time = subtype_data$time,
                                        surv.event = subtype_data$status)
    
    # Store the c-index (usually in c_index_result$c.index) with the gene name
    pred_c_index_list[[gene_name]] <- pred_c_index_result$c.index}
  
  # Save the c-index list for the current subtype
  pred_subtype_results[[subtype_name]] <- pred_c_index_list}

In [18]:
covariates <- intersect(colnames(unique_pseudobulk_data), top_predict_genes$Gene)
univ_formulas <- sapply(covariates,
                      function(gene) as.formula(paste("Surv(time, status) ~", gene)))

# Fit a Cox model for each gene using the corresponding formula
univ_models <- lapply(univ_formulas, function(formula) {coxph(formula, data = unique_pseudobulk_data)})

pred_c_index_all <- list()
for (i in seq_along(univ_models)) {
    model <- univ_models[[i]]
    gene_name <- covariates[i]

    pred_survival_scores_all <- predict(model, newdata = unique_pseudobulk_data, type = "lp")

    # Calculate the c-index for the current model
    pred_c_index_all_result <- concordance.index(x = pred_survival_scores_all,
                                        surv.time = unique_pseudobulk_data$time,
                                        surv.event = unique_pseudobulk_data$status)

    # Store the c-index (usually in c_index_result$c.index) with the gene name
    pred_c_index_all[[gene_name]] <- pred_c_index_all_result$c.index
}
pred_c.index_scores = as.data.frame(t(as.data.frame(pred_c_index_all)))
pred_c.index_scores = subset(pred_c.index_scores,V1>0.5)
pred_c.index_scores <- pred_c.index_scores %>% arrange(desc(V1))
colnames(pred_c.index_scores)[1] <- "score"
pred_c.index_scores["subtype"]="zALL"

“Loglik converged before variable  1 ; coefficient may be infinite. ”


In [19]:
pred_subtype_results_sorted <- lapply(pred_subtype_results, function(inner_list) {
  sorted_vec <- sort(unlist(inner_list), decreasing = TRUE)
  as.list(sorted_vec)})

pred_subtype_results_long <- do.call(rbind, lapply(names(pred_subtype_results_sorted), function(subtype) {
    scores <- unlist(pred_subtype_results_sorted[[subtype]])
    data.frame(
    subtype = subtype,
    score   = scores,
    row.names = names(scores),
    stringsAsFactors = FALSE)}))

# df_pred <- rbind(pred_c.index_scores,pred_subtype_results_long)
df_pred <- pred_subtype_results_long
options(repr.plot.width=7,repr.plot.height=7)
pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_Cindex_pred_boxplot.pdf",width=5,height=6)
ggplot(df_pred, aes(x = subtype, y = score, fill = subtype)) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Dark2") +
  labs(title = "Boxplot by Group",
       x = "Group",
       y = "Value") +
  theme_minimal()+
  theme(
    plot.title = element_text(size = 25, face = "bold"),  # Title size
    axis.title = element_text(size = 25),  
    axis.text = element_text(size = 25, angle = 90, hjust = 1),    
    legend.text = element_text(size = 25), 
    legend.title = element_text(size = 25))
dev.off()

In [20]:
unique_pseudobulk_data_cpy <- unique_pseudobulk_data

res_sig <- rownames(pred_c.index_scores)[1:5] 
formula_string <- paste("Surv(time, status) ~", paste(res_sig, collapse = " + "))
res.cox <- coxph(as.formula(formula_string), data = unique_pseudobulk_data_cpy)

# Assuming you have already fitted a multivariate Cox regression model 'res.cox'
# 1. Calculate survival scores
survival_scores <- -predict(res.cox, type = "lp")
# 2. Determine the median survival score
median_score <- median(survival_scores)
# 3. Categorize patients into high and low groups
patient_groups <- ifelse(survival_scores > median_score, "High", "Low")
# Add the patient groups to your original data (assuming your data is in 'lung' data frame)
unique_pseudobulk_data_cpy$SurvivalGroup <- patient_groups
# Display the result
table(unique_pseudobulk_data_cpy$SurvivalGroup)

####################################################################################################################

# Fit survival curves
surv_curve <- survfit(Surv(time, status) ~ SurvivalGroup, data = unique_pseudobulk_data_cpy)
# Create a ggsurvplot
options(repr.plot.width=5, repr.plot.height=7.5)
pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_KM_pred_all.pdf",width=7.5,height=9)
print(ggsurvplot(surv_curve, 
           pval = TRUE, 
           conf.int = TRUE, 
           risk.table.y.text.col = FALSE,
           risk.table.y.text = FALSE,
           risk.table = "abs_pct",
           ggtheme = theme_light(),
           legend.labs = c("Low Risk", "High Risk"),
           title = "Survival Curve by Survival Score",
           palette = c("#E7B800", "#2E9FDF"),
           ncensor.plot = FALSE, 
           font.tickslab = c(15, "plain", "black"),
           xlab = "Time", ylab = "Survival Probability"))
dev.off()


High  Low 
 410  410 

In [21]:
unique_pseudobulk_data_cpy <- unique_pseudobulk_data
pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_KM_pred_subtype.pdf",width=7.5,height=9)

for (subtype_name in unique(unique_pseudobulk_data_cpy$subtype)){
    subtype_data <- subset(unique_pseudobulk_data_cpy, subtype == subtype_name)
    top_pre_genes <- names(pred_subtype_results_sorted[[subtype_name]][1:5])
    
####################################################################################################################                        
                        
    res_sig <- top_pre_genes
    subtype_data_pat <- subtype_data[colnames(subtype_data) %in% c("time","status",top_pre_genes)]
    formula_string <- paste("Surv(time, status) ~", paste(res_sig, collapse = " + "))
    res.cox <- coxph(as.formula(formula_string), data = subtype_data_pat)

    # 1. Calculate survival scores
    survival_scores <- -predict(res.cox, type = "lp")
    # 2. Determine the median survival score
    median_score <- median(survival_scores)
    # 3. Categorize patients into high and low groups
    patient_groups <- ifelse(survival_scores > median_score, "High", "Low")
    # Add the patient groups to your original data (assuming your data is in 'lung' data frame)
    subtype_data_pat$SurvivalGroup <- patient_groups
    # Display the result
    table(subtype_data_pat$SurvivalGroup)

    ####################################################################################################################

    # Fit survival curves
    surv_curve <- survfit(Surv(time, status) ~ SurvivalGroup, data = subtype_data_pat)
    options(repr.plot.width=5, repr.plot.height=7.5)
    print(ggsurvplot(surv_curve, 
               pval = TRUE, 
               conf.int = TRUE, 
               risk.table.y.text.col = FALSE,
               risk.table.y.text = FALSE,
               risk.table = "abs_pct",
               ggtheme = theme_light(),
               legend.labs = c("Low Risk", "High Risk"),
               title = "Survival Curve by Survival Score",
               palette = c("#E7B800", "#2E9FDF"),
               ncensor.plot = FALSE, 
               font.tickslab = c(26, "plain", "black"),
               xlab = "Time", ylab = "Survival Probability"))
    print(top_pre_genes)}
dev.off()

[1] "SFRP4" "IL2RG" "FYB1"  "C1QA"  "ITGB2"
[1] "IGHA1" "FYB1"  "ANXA2" "CD9"   "MYH9" 
[1] "CD46"  "MIF"   "EIF4E" "ERBB3" "MAPK9"


In [22]:
cindex_subtypes <- list()
filtered_data <- subset(unique_pseudobulk_data_cpy, subtype != "Other")  
for (subtype_name in unique(filtered_data$subtype)) {  
    
    subtype_data <- subset(filtered_data, subtype == subtype_name)
    top_pre_genes <- names(pred_subtype_results_sorted[[subtype_name]][1:5])
    survCrossValidated <- crossValidate(subtype_data[c(top_pre_genes,"time","status")], 
                                        c("time", "status"),
                                        nFeatures = c(5),
                                        nFolds = 3, nRepeats = 100,
                                        classifier="CoxPH",
                                        selectionMethod = "CoxPH",
                                        nCores = 2)

    pred_df <- as.data.frame(survCrossValidated@predictions)
    actual_df <- data.frame(sample = survCrossValidated@originalNames,
                            truth = survCrossValidated@actualOutcome)
    eval_df <- left_join(pred_df, actual_df, by = "sample")
    results <- eval_df %>%
      group_by(permutation, fold) %>%
      group_modify(~{
        concordance(
          truth ~ risk,
          data = .x,
          reverse = TRUE
        )$concordance %>%
          as.data.frame() %>%
          rename("cindex" = ".")
      })
    cindex_subtypes[[subtype_name]] <- results}


cindex_subtypes$Her2 <- cindex_subtypes$Her2[c("cindex")]
cindex_subtypes$Luminal <- cindex_subtypes$Luminal[c("cindex")]
cindex_subtypes$TNBC <- cindex_subtypes$TNBC[c("cindex")]

pdf("/QRISdata/Q2051/Onkar/STimage/project_scratch_STimage/STimage_v1/Survival/Survival_Cindex_pred_boxplot_multivariate.pdf",width=5,height=6)
ggplot(data.frame(Subtype = names(cindex_subtypes), Values = unlist(cindex_subtypes)), aes(x = Subtype, y = Values, fill = Subtype)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, size = 1.5, alpha = 0.6) +
  scale_fill_brewer(palette = "Dark2") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 25, face = "bold"),  # Title size
    axis.title = element_text(size = 25),  
    axis.text = element_text(size = 25, angle = 90, hjust = 1),    
    legend.text = element_text(size = 25), 
    legend.title = element_text(size = 25))
dev.off()

“'measurements' DataFrame must have sample identifiers as its row names. Generating generic ones.”
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call
R_zmq_msg_send errno: 4 strerror: Interrupted system call


R_zmq_msg_send errno: 4 strer