-
Notifications
You must be signed in to change notification settings - Fork 1
/
seer_sigmoidoscopy.qmd
647 lines (497 loc) · 24.4 KB
/
seer_sigmoidoscopy.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
---
title: "SEER Sigmoidoscopy Project"
author: "david.hein@utsouthwestern.edu"
date: "`r Sys.Date()`"
format:
html:
toc: true
number_sections: true
fig-width: 6
fig-height: 3
gfm:
toc: true
number_sections: true
---
```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Load packages and raw data
```{r, results = 'hide', message = FALSE, warning = FALSE}
library(broom)
library(MASS)
library(survival)
library(survminer)
library(tidyverse)
library(survRM2)
library(km.ci)
library(readr)
library(table1)
library(epitools)
library(riskRegression)
library(adjustedCurves)
set.seed(2023)
seer_data <- read_delim("SEERdata42623.txt", delim = "\t", escape_double = FALSE, trim_ws = TRUE)
```
# Data cleaning
```{r}
# Get only variables we want
new_data <- seer_data %>% dplyr::select(Sex, `Year of diagnosis`,
`Race and origin recode (NHW, NHB, NHAIAN, NHAPI, Hispanic) - no total`,
`Primary Site`,`Grade Recode (thru 2017)`,
`Grade Pathological (2018+)`,
`Combined Summary Stage (2004+)`,
`Summary stage 2000 (1998-2017)`,
`SEER cause-specific death classification`,
`Survival months`,
`Age recode with single ages and 90+`,
`Survival months flag`,
`Vital status recode (study cutoff used)`,
`Grade Clinical (2018+)`,
`Histologic Type ICD-O-3`,
`COD to site recode ICD-O-3 2023 Revision`,
`SEER other cause of death classification`,
`Site recode ICD-O-3/WHO 2008`)
# need to take out 181 appendix, 260, 188, and 189 because location unknown
new_data <- new_data %>%
filter(!`Primary Site` %in% c(260, 188, 189, 181))
# only adenocarcinoma
new_data <- new_data %>%
filter(`Histologic Type ICD-O-3`== 8140)
# make a new variable for site being seen on colonsocopy or sigmoid, should include 186 descending, 187 sigmoid, 199 rectosigmoid junction, 209 rectum NOS
new_data <- new_data %>%
mutate(can_see_sigmoid = ifelse( `Primary Site` %in% c(186, 187, 199, 209), 1, 0))
# make new variable for 3 age groups
new_data <- new_data %>%
mutate(age_numeric = as.numeric(str_extract(`Age recode with single ages and 90+`, "[:digit:]+")))
new_data <- new_data %>%
mutate(age_group_final = ifelse(age_numeric <44,'under45', 'over50'))%>%
mutate(age_group_final = ifelse(age_numeric <53 & age_numeric >= 44, '45-50', age_group_final))
# Combine the two stages
new_data <- new_data %>%
mutate(new_stage = ifelse(`Summary stage 2000 (1998-2017)` == "Blank(s)",
`Combined Summary Stage (2004+)`,
`Summary stage 2000 (1998-2017)`))
# Filter out unknown stage an in situ
new_data2 <- new_data %>%
filter( !new_stage %in%c("Unknown/unstaged", "In situ"))
# Make nice grade variable from different grade styles
new_data2 <- new_data2 %>%
mutate(final_grade = `Grade Pathological (2018+)`)
new_data2 <- new_data2 %>%
mutate(final_grade = ifelse(`Grade Recode (thru 2017)` %in% "Poorly differentiated; Grade III", "3", final_grade))
new_data2 <- new_data2 %>%
mutate(final_grade = ifelse(`Grade Recode (thru 2017)` %in% "Undifferentiated; anaplastic; Grade IV", "4", final_grade))
new_data2 <- new_data2 %>%
mutate(final_grade = ifelse(`Grade Recode (thru 2017)` %in% "Moderately differentiated; Grade II", "2", final_grade))
new_data2 <- new_data2 %>%
mutate(final_grade = ifelse(`Grade Recode (thru 2017)` %in% "Well differentiated; Grade I", "1", final_grade))
# toss em in an unknown/other
new_data2 <- new_data2 %>%
mutate(final_grade = ifelse(final_grade %in% c("A", "B", "Blank(s)", "L","C", "H", "9", "D"), "unk_other", final_grade))
# make smaller year of diagnosis groups
new_data2 <- new_data2 %>%
mutate(year_of_diag = case_when(
`Year of diagnosis`%in%c('2000', '2001', '2002' ,'2003' ,'2004') ~ "yod1",
`Year of diagnosis`%in%c('2005','2006', '2007', '2008' ,'2009') ~ "yod2",
`Year of diagnosis`%in%c('2010', '2011', '2012' ,'2013' ,'2014') ~ "yod3",
`Year of diagnosis`%in%c('2015', '2016', '2017' ,'2018' ,'2019','2020') ~ "yod4"))
# make cleaner race/eth column name
new_data2 <- new_data2 %>%
rename(race_eth=`Race and origin recode (NHW, NHB, NHAIAN, NHAPI, Hispanic) - no total`)
# Survival
# CSS
new_data3 <- new_data2 %>%
mutate(cancer_specific_status = case_when(
`COD to site recode ICD-O-3 2023 Revision`=="Alive"~0,
`SEER cause-specific death classification`=="Dead (attributable to this cancer dx)"~1,
`SEER other cause of death classification`=="Dead (attributable to causes other than this cancer dx)"~2,
`SEER cause-specific death classification`=="Dead (missing/unknown COD)"~2))
# OS
new_data3 <- new_data3 %>%
mutate(overall_status = ifelse(`Vital status recode (study cutoff used)`=="Alive", 0, 1 ))
new_data4 <- new_data3 %>%
filter(`Survival months flag`=='Complete dates are available and there are more than 0 days of survival')
# Remove old data sets to save space
rm(seer_data)
rm(new_data)
rm(new_data2)
rm(new_data3)
#######################################################################
#table(new_data4$cancer_specific_status, useNA = "always")
```
# Setting reference groups
Makes this easier for filling out tables and for regressions
```{r}
# first select only variables we know we will be using to save space
new_data5<-new_data4 %>% dplyr::select(Sex,
race_eth,
can_see_sigmoid,
age_group_final,
new_stage,
final_grade,
year_of_diag,
cancer_specific_status,
overall_status,
`Survival months`,
`Primary Site`,
age_numeric,
`Site recode ICD-O-3/WHO 2008`)%>%
rename(survival_months=`Survival months`, primary_site = `Primary Site`)
# set new factor levels, first level is the reference group
new_data5$new_stage <- factor(new_data5$new_stage, levels = c('Localized', 'Regional', 'Distant'))
new_data5$age_group_final <- factor(new_data5$age_group_final, levels = c('over50', 'under45', '45-50'))
new_data5$year_of_diag <- factor(new_data5$year_of_diag, levels = c('yod1', 'yod2', 'yod3', 'yod4'))
# we dont need to specify all levels for race that would take too much typing, we just want to set the largest group as the reference
new_data5 <- within(new_data5, race_eth <- relevel(factor(race_eth), ref = 'Non-Hispanic White'))
# The grade variable appears to be unreliable
#table(new_data5$final_grade, new_data5$year_of_diag)
# now using new_data5
rm(new_data4)
# make sure surv months is numeric
new_data5 <- new_data5 %>% mutate(survival_months = as.numeric(`survival_months`))
# Print out table 1
#table(new_data5$primary_site,new_data5$age_group_final,new_data5$new_stage)
# Make a nice neat table 1
tbl1 <- new_data5 %>% dplyr::select(`Site recode ICD-O-3/WHO 2008`, new_stage, age_group_final)
tbl1$age_group_final <- factor(tbl1$age_group_final, levels = c('under45','45-50','over50'))
tbl1$`Site recode ICD-O-3/WHO 2008` <- factor(tbl1$`Site recode ICD-O-3/WHO 2008`,
levels = c("Rectum",
"Rectosigmoid Junction",
"Sigmoid Colon",
"Descending Colon",
"Splenic Flexure",
"Transverse Colon",
"Hepatic Flexure",
"Ascending Colon",
"Cecum"))
#table1(~ `Site recode ICD-O-3/WHO 2008` | new_stage*age_group_final, data = tbl1) this prints a pretty html version, to output in github doc we need to use kable
#knitr::kable(table1(~ `Site recode ICD-O-3/WHO 2008` | new_stage*age_group_final, data = tbl1))
table1(~ `Site recode ICD-O-3/WHO 2008` | new_stage*age_group_final, data = tbl1)
```
# Stacked Bar graph figure
```{r, warning=FALSE, message=FALSE}
# Group the 'new_data5' dataframe by 'age_numeric' and 'new_stage' and calculate the proportion
# of cases that can see a sigmoid.
sum_data <- new_data5 %>%
group_by(age_numeric, new_stage) %>%
summarize(sigcan = sum(can_see_sigmoid)/n())
# Calculate the proportion of cases that cannot see a sigmoid and store in a new column 'nocan'.
sum_data$nocan <- 1-sum_data$sigcan
# Reshape the dataframe from wide to long format, converting 'sigcan' and 'nocan' columns to
# 'proptype' and 'proportion' columns.
sum_data <- pivot_longer(sum_data, cols = c(sigcan,nocan), names_to = "proptype", values_to = "proportion")
# Plot the data using ggplot2.
ggplot(sum_data, aes(x = age_numeric, y = proportion, fill = proptype)) +
# Use bars to represent the data. The height of the bars indicates the proportion.
geom_bar(stat = "identity", position="fill", width = 1.1) +
# Split the plot into multiple panels based on the 'new_stage' column.
facet_wrap(~ new_stage, ncol = 1) +
# Use the 'test' theme for the plot. This theme provides a clean and minimalistic look.
theme_test() +
# Label the fill colors to indicate whether the sigmoid can or cannot be visualized.
labs(fill = "Visualization on Sigmoidoscopy") +
# Manually set the fill colors and their corresponding labels.
scale_fill_manual(labels = c("Can NOT be visualized", "Can be visualized"),
values = c("#f4a582", "#92c5de")) +
# Label the y-axis and x-axis.
ylab("Proportion of Total Cases") + xlab("Age at diagnosis") +
# Set the limits and breaks for the x-axis.
scale_x_continuous(limits = c(18,90), breaks = c(20, 30, 40, 45, 50, 60, 70, 80)) +
# Add two vertical dashed lines at x = 44.5 and x = 49.5.
geom_vline(aes(xintercept = 44.5), color = "black", linetype="dashed", size = 0.4) +
geom_vline(aes(xintercept = 49.5), color = "black", linetype="dashed", size = 0.4)
#ggsave(plot=last_plot(),file="bar_graph.png", heigh=4, width=6,dpi = "retina")
#ggsave(plot=last_plot(),file="bar_graph.tiff", heigh=4, width=6)
```
# Logistic Regression
Calculates odds of tumor in location seen on sigmoidoscopy, first univariate then multivariate
```{r}
# Code below calculates, exps, and does CI, then only saves the output to save space
sex_log<-tidy(glm(can_see_sigmoid~Sex,data=new_data5,family="binomial"),conf.int=TRUE)%>%
mutate(estimate=exp(estimate), conf.low=exp(conf.low), conf.high=exp(conf.high))
sex_log<-glm(can_see_sigmoid~Sex,data=new_data5,family="binomial")
age_log<-tidy(glm(can_see_sigmoid~age_group_final,data=new_data5,family="binomial"),conf.int=TRUE)%>%
mutate(estimate=exp(estimate), conf.low=exp(conf.low), conf.high=exp(conf.high))
stage_log<-tidy(glm(can_see_sigmoid~new_stage,data=new_data5,family="binomial"),conf.int=TRUE)%>%
mutate(estimate=exp(estimate), conf.low=exp(conf.low), conf.high=exp(conf.high))
year_log<-tidy(glm(can_see_sigmoid~year_of_diag,data=new_data5,family="binomial"),conf.int=TRUE)%>%
mutate(estimate=exp(estimate), conf.low=exp(conf.low), conf.high=exp(conf.high))
race_log<-tidy(glm(can_see_sigmoid~race_eth,data=new_data5,family="binomial"),conf.int=TRUE)%>%
mutate(estimate=exp(estimate), conf.low=exp(conf.low), conf.high=exp(conf.high))
# Multivariate, Check out interactions
multi_log<-tidy(glm(can_see_sigmoid~race_eth+Sex+age_group_final+new_stage+year_of_diag,data=new_data5,family="binomial"),conf.int=TRUE)%>%
mutate(estimate=exp(estimate),conf.low=exp(conf.low),conf.high=exp(conf.high))
propensity_log <- glm(can_see_sigmoid~race_eth+Sex,data=new_data5,family="binomial")
summary(propensity_log)
```
## Logistic results: sex
```{r}
knitr::kable(sex_log,digits=2)
```
## Logistic results: age
```{r}
knitr::kable(age_log,digits=2)
```
## Logistic results: stage
```{r}
knitr::kable(stage_log,digits=2)
```
## Logistic results: year
```{r}
knitr::kable(year_log,digits=2)
```
## Logistic results: race
```{r}
knitr::kable(race_log,digits=2)
```
## Logistic results: multivariate
```{r}
knitr::kable(multi_log,digits=2)
```
# Survival exploration w/OS
Makes KM curves using OS, checks proportional hazards assumptions
## KM plots: Can see on sigmoidoscopy
```{r}
# CAN SEE
km_cansee<-survfit(Surv(survival_months, overall_status) ~ can_see_sigmoid, data = new_data5)
plot(km_cansee,fun="cloglog")
ggsurvplot(km_cansee,censor=FALSE) # BIGGEST TIME DEPENDANT!!!
```
## KM plots: Stage
```{r}
# STAGE
km_new_stage<-survfit(Surv(survival_months, overall_status) ~new_stage, data = new_data5)
plot(km_new_stage,fun="cloglog")
ggsurvplot(km_new_stage,censor=FALSE)
```
## KM plots: year of diag
```{r}
# YEAR OF DIAG
km_year_of_diag<-survfit(Surv(survival_months, overall_status) ~ year_of_diag, data = new_data5)
ggsurvplot(km_year_of_diag,censor=FALSE)
plot(km_year_of_diag,fun="cloglog")
```
## KM plots: sex
```{r}
# SEX
km_Sex<-survfit(Surv(survival_months, overall_status) ~ Sex, data = new_data5)
ggsurvplot(km_Sex,censor=FALSE)
plot(km_Sex,fun="cloglog") # lines cross but tiny
```
## KM plots: age group
```{r}
# AGE GROUP
km_age_group_final<-survfit(Surv(survival_months, overall_status) ~ age_group_final, data = new_data5)
ggsurvplot(km_age_group_final,censor=FALSE) # lines cross but tiny
plot(km_age_group_final,fun="cloglog")
```
## KM plots: race_eth
```{r}
# RACE ETH
km_race_eth<-survfit(Surv(survival_months, overall_status) ~ race_eth, data = new_data6)
ggsurvplot(km_race_eth,censor=FALSE) # lines cross but insignificant
plot(km_race_eth,fun="cloglog")
table(new_data5$race_eth)
```
## KM plots: Site
```{r}
# SITE (this is just can see sigmoid with more definition)
km_site<-survfit(Surv(survival_months, overall_status) ~ primary_site, data = new_data5)
ggsurvplot(km_site,censor=FALSE,ylim=c(0.5,1),xlim=c(0,100))
#table(new_data5$primary_site)
```
# New way of calculating RMST
```{r}
# Define a function to compute the adjusted survival curve for a specific age group and stage.
get_adjusted_surv <- function(age_group, stage, osdata) {
# Filter the input dataset based on the given age group and stage.
# Select the relevant columns and recode the 'can_see_sigmoid' variable.
os_surv_data <- osdata %>%
filter(age_group_final == age_group, new_stage == stage) %>%
select(can_see_sigmoid, survival_months, overall_status, race_eth, Sex) %>%
mutate(can_see_sigmoid2 = as.factor(ifelse(can_see_sigmoid == 0, "not_see", "can_see")))
# Fit a logistic regression model to estimate the propensity scores.
# The propensity score model predicts the probability of seeing sigmoid based on race and sex.
propensity_log <- glm(can_see_sigmoid ~ race_eth + Sex, data = os_surv_data, family = "binomial")
# Compute the adjusted survival curves using the inverse probability of treatment weighting (IPTW) method.
# The adjustment is done based on the propensity scores derived from the logistic regression model.
surv_adjusted <- adjustedsurv(os_surv_data,
variable = "can_see_sigmoid2",
ev_time = "survival_months",
event = "overall_status",
method = "iptw_km",
treatment_model = propensity_log,
conf_int = TRUE,
clean_data = TRUE,
bootstrap = TRUE,
n_boot = 2000,
conf_level = 0.95,
# May need to delete the n_cores line if computer crashes!
n_cores = 4)
# Return the computed adjusted survival curve.
return(surv_adjusted)
}
# View the documentation for the 'adjustedsurv' function.
?adjustedsurv
# Define a function to compute the restricted mean survival time (RMST) based on the adjusted survival curve.
get_rmst <- function(adj_surv, age_group, stage, months) {
# Correct the significance level for multiple testing (27 tests).
alpha_corrected <- 1-(0.05/27)
# Compute the RMST for two groups ('not_see' and 'can_see') based on the adjusted survival curve.
# CHANGE DIFFERENCE TO TRUE TO COMPUTE DIFFERENCE IN RMST
rmst_results <- adjusted_rmst(adj_surv,
to = months,
difference = FALSE,
conf_int = TRUE,
group_1 = "not_see",
group_2 = "can_see",
conf_level = alpha_corrected)
# Convert the RMST results to a data frame and add additional columns (age group, stage, and time point).
rmst_tidy <- rmst_results %>%
as.data.frame() %>%
mutate(age_group = age_group, stage = stage, time_p = months)
# Return the tidied RMST results.
return(rmst_tidy)
}
# Get all adjusted curves, set seed for reproducibility
ts <- Sys.time()
set.seed(2023)
surv_local_young <- get_adjusted_surv("under45","Localized", new_data6)
te <- Sys.time()
print(te-ts)
set.seed(2023)
surv_local_mid <- get_adjusted_surv("45-50","Localized", new_data6)
set.seed(2023)
surv_local_old <- get_adjusted_surv("over50","Localized", new_data6)
set.seed(2023)
surv_reg_young <- get_adjusted_surv("under45","Regional", new_data6)
set.seed(2023)
surv_reg_mid <- get_adjusted_surv("45-50","Regional", new_data6)
set.seed(2023)
surv_reg_old <- get_adjusted_surv("over50","Regional", new_data6)
set.seed(2023)
surv_dist_young <- get_adjusted_surv("under45","Distant", new_data6)
set.seed(2023)
surv_dist_mid <- get_adjusted_surv("45-50","Distant", new_data6)
set.seed(2023)
surv_dist_old <- get_adjusted_surv("over50","Distant", new_data6)
# Create a list of adjusted survival curve objects.
# These objects likely represent survival curves for different combinations of age groups and disease stages.
survs <- list(surv_local_young, surv_local_mid, surv_local_old,
surv_reg_young, surv_reg_mid, surv_reg_old,
surv_dist_young, surv_dist_mid, surv_dist_old)
# Initialize an empty data frame to store the results from the RMST calculations.
all_rmsts <- data.frame()
# Define the disease stages and age groups as vectors.
stages <- c("Localized","Regional","Distant")
ages <- c("under45", "45-50", "over50")
# Initialize a counter 'i' to keep track of the current iteration.
i <- 0
# Loop through each adjusted survival curve object in the 'survs' list.
for (surv in survs) {
# For each survival curve object, calculate the RMST at 3 specific time points: 24, 60, and 120 months.
for (m in c(24,60,120)) {
# Call the 'get_rmst' function to compute the RMST for the current survival curve object,
# corresponding age group, and disease stage at the time point 'm'.
# Using modulo arithmetic to cyclically select age groups in the order "under45", "45-50", "over50".
# for i=0,1,2,3,4,5,…, the expression i %% 3 will yield 0, 1, 2, 0, 1, 2 etc.
# Using ceiling to round up values to switch disease states every 3 iterations
# for i=0,1,2,3,4,5,…, the expression ceiling((i+1)/3) will yield 0.33,0.67,1,1.33,1.67,2 etc.
# Append the resulting data frame to the 'all_rmsts ' data frame.
all_rmsts <- rbind(all_rmsts , get_rmst(surv, ages[1 + i %% 3], stages[ceiling((i+1)/3)], m))
}
# Increment the counter 'i' for the next iteration.
i <- i + 1
}
#write.table(deltas, "all_RMST.txt", sep="\t", row.names = FALSE, quote = FALSE)
```
# Survival Analysis
```{r}
# Organize and recode the 'all_rmst' dataframe for plotting.
all_rmst2 <- all_rmst %>%
# Create a new variable 'color_var' combining 'group' and 'time_p' to determine colors in the plot.
mutate(color_var = factor(paste0(group, as.character(time_p)),
levels=c('not_see120','not_see60','not_see24','can_see120','can_see60','can_see24'))) %>%
# Ensure proper ordering of the 'age_group' and 'stage' variables for the plot.
mutate(age_group = factor(age_group, levels=c('under45','45-50','over50')),
stage = factor(stage, levels=c('Distant','Regional','Localized'))) %>%
# Filter out specific combinations of 'age_group', 'time_p', and 'stage'.
# Filtered out because there are not enough data points to accuratly asses confidence interval
filter(!(age_group =="45-50" & time_p == 120 & stage=="Distant"))
# Recode age groups for better readability in the plot.
all_rmst3 <- all_rmst2 %>%
mutate(group2 = case_when(
age_group == "45-50" ~ "45-49",
age_group == "under45" ~ "Under 45",
age_group == "over50" ~ "Over 50"
)) %>%
mutate(group2 = factor(group2, levels=c('Under 45','45-49','Over 50')))
# Plotting the RMST data.
ggplot(data = all_rmst3, aes(y = rmst, x = stage, color = color_var,
ymin = rmst-2.58*se, ymax = rmst+2.58*se,
group = group)) +
# this is the specific geom that makes the graph
geom_pointrange(position = position_dodge(width=0.5)) +
facet_wrap(~group2, ncol=1) +
scale_color_manual(
labels = c("Requires Colonoscopy: 10 years", "Requires Colonoscopy: 5 years", "Requires Colonoscopy: 2 years",
"Sigmoidoscopy: 10 years", "Sigmoidoscopy: 5 years", "Sigmoidoscopy: 2 years"),
values = c("#b2182b","#d6604d","#f4a582","#2166ac","#4393c3","#92c5de"),
name = "Tumor location \nand RMST follow up time"
) +
coord_flip() +
scale_y_continuous(limits=c(0,122), breaks=c(0,24,60,120), labels=c("0","2","5","10")) +
theme_bw() +
theme(panel.grid.minor = element_blank(), panel.grid.major.x = element_line(linetype = 2)) +
ylab("Years of Follow Up") +
xlab("")
# Save the generated plot as a TIFF image.
ggsave(plot = last_plot(), filename = "rmst_fig.tiff", width = 8, height = 6)
```
# KM Plots with better 95% CIs
```{r}
# takes in plot data generated from make_plot_dat plus a string as the fig title, returns ggplot object of KM plot
make_plot <- function(plot_dat,fig_title){
plot_dat2 <- as.data.frame(plot_dat$adjsurv)
p <- ggplot(plot_dat2 ,aes(x=time,y=surv,color=group))+
geom_step()+
# set custom y axis limits, might actually want to just have the at 0,1 for all graphs
ylim(if(str_detect(fig_title,"Distant")) c(0,1) else c(0,1))+
geom_ribbon(aes(ymin=ci_lower,ymax=ci_upper,x=time,fill=group),alpha=.2,size=0.1)+
ggtitle(fig_title)+theme_test()+
theme(
plot.title = element_text(size=12),
axis.title = element_blank(),
panel.grid.major.x = element_line(linewidth=0.2,linetype = 3)
)+
# x axis breaks at 2,5,10, and 15 years
scale_x_continuous(limits=c(0,240),breaks=c(0,24,60,120,180),labels=c("0","2","5","10","15"))+
labs(fill="Tumor Location",color="Tumor Location")+
scale_fill_manual(labels= c("Can be seen on sigmoidoscopy", "Can NOT be seen on sigmoidoscopy"),values=c("#92c5de","#f4a582")) +
scale_color_manual(labels= c("Can be seen on sigmoidoscopy", "Can NOT be seen on sigmoidoscopy"),values=c("#0571b0","#ca0020"))
return(p)
}
#text = element_text(family = "Calibri")
```
## OS plots NEW
```{r,fig.width=9,fig.height=6}
# Local
a_os<-make_plot(surv_local_young,"A: <45 Local")
b_os<-make_plot(surv_local_mid,"B: 45-49 Local")
c_os<-make_plot(surv_local_old ,"C: 50+ Local")
# Regional
d_os<-make_plot(surv_reg_young,"D: <45 Regional")
e_os<-make_plot(surv_reg_mid,"E: 45-49 Regional")
f_os<-make_plot(surv_reg_old,"F: 50+ Regional")
# Distant
g_os<-make_plot(surv_dist_young,"G:<45 Distant")
h_os<-make_plot(surv_dist_mid ,"H: 45-49 Distant")
i_os<-make_plot(surv_dist_old ,"I: 50+ Distant")
# combine and arrage all plots in a single fig
ggarrange(a_os,b_os,c_os,d_os,e_os,f_os,g_os,h_os,i_os,common.legend = TRUE, legend = "bottom")
ggsave(plot=last_plot(),file="os_km2.tiff",height=8, width=11.5)
```
# Session Info
```{r}
sessionInfo()
```