# Main Results, Model Fit, PH Evaluation

### Sources:




## Main Results

We run the central analyses in this file. First, we subset to only individuals of White British ancestry and then test for model fit. We find that the linear model fits better than either the restricted cubic or the restricted quadratic spline, and so adopt this functional form moving forward. We also test the proportional hazards assumption with Schoenfeld residuals; only biological sex is shown to violate it. We therefore stratify by biological sex and then apply this linear model. We then use glht from the **multcomp** package in R to predict the value of the model at different percentiles of genetic and physical activity risk. We only show results for the 20th and 40th percentiles of genetic risk and for physical activity volume, but the same process was followed for physical activity intensity.

In [None]:
# Bash kernel
# Fetch the analysis dataset into the working directory so the R cells below
# can read it with read.csv().
# NOTE(review): `dx` is presumably the DNAnexus command-line client; this cell
# must be run with a Bash kernel (not R) and an authenticated session - confirm.
dx download FINALANALYSISDATAPAPER3.csv

In [None]:
# Reading in needed packages
# Install only the packages that are not already available, so re-running the
# cell does not re-download and re-install everything each time.
needed.pkgs <- c("survival", "ggplot2", "multcomp")
missing.pkgs <- needed.pkgs[
  !vapply(needed.pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing.pkgs) > 0) {
  install.packages(missing.pkgs)
}

library(survival)
library(ggplot2)
library(multcomp)

# Analysis dataset (downloaded in the Bash cell above)
data <- read.csv("FINALANALYSISDATAPAPER3.csv")

# RESTRICT to white dataset based on genetics
whitedata <- subset(data, Genetic.Ethnic.Grouping == "Caucasian")
dim(whitedata)
# 66,180 x 74

In [None]:
# -------
# Restricted Cubic Spline Code
# PA Volume
# -------
# Fits three Cox models for PA volume (PAEEPOS) that differ only in functional
# form: restricted cubic spline (fit.rcs), restricted quadratic spline
# (fit.rqs) and linear (fit.lin). All three use age as the time scale
# (entry = AgeBaseline, exit = AgeBaseline + TimeYear) and the same covariates.

# First step is checking the knot locations for the spline
knots <- quantile(whitedata$PAEEPOS, probs = c(0.10, 0.50, 0.90))
knots
# 10% 25.9491318778412
# 50% 38.7595302926387
# 90% 54.5613992585524

# ----
# Splines placed at 10th/50th/90th percentiles
# ----

# Linear spline terms (v - t_i)+ = max(v - t_i, 0).
# The knots are taken from `knots` rather than re-typed, so every spline term
# below uses exactly the same knot values.
whitedata$ls.1 <- pmax(whitedata$PAEEPOS - knots[[1]], 0)
whitedata$ls.2 <- pmax(whitedata$PAEEPOS - knots[[2]], 0)
whitedata$ls.3 <- pmax(whitedata$PAEEPOS - knots[[3]], 0)

# Converting linear spline terms to unrestricted cubic spline
whitedata$cs.1 <- whitedata$ls.1^3
whitedata$cs.2 <- whitedata$ls.2^3
whitedata$cs.3 <- whitedata$ls.3^3

# Restricted cubic spline basis with K knots t_1..t_K:
#   term 0 is v itself (the linear PAEEPOS term in the model);
#   for i = 1..K-2:
#   (v-t_i)+^3 - (t_K - t_i)/(t_K - t_{K-1}) * (v - t_{K-1})+^3
#              + (t_{K-1} - t_i)/(t_K - t_{K-1}) * (v - t_K)+^3
# With K = 3 there is a single nonlinear term (i = 1), where cs.2 is the
# t_{K-1} cubic and cs.3 is the t_K cubic.
# The last-knot cubic must be ADDED: only then do the cubic and quadratic
# parts cancel beyond the last knot, leaving the fit linear in the tail.
whitedata$rcs.1 <- whitedata$cs.1 -
  ((knots[[3]] - knots[[1]]) / (knots[[3]] - knots[[2]])) * whitedata$cs.2 +
  ((knots[[2]] - knots[[1]]) / (knots[[3]] - knots[[2]])) * whitedata$cs.3

# Fitting model with linear term and restricted cubic term
fit.rcs <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + rcs.1 + StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + Biological.Sex + as.factor(Salt_InstChosen) +
    AlcIntake_Weekly + OilyFish_InstChosen + FnVScore +
    ProcMeat_InstChosen + ParentHist + MobilityDichot + NewEmploy +
    Townsend + as.factor(NewEduc) + as.factor(SmokStat_InstChosen),
  data = whitedata
)

summary(fit.rcs)


# ---------
# Restricted Quadratic Spline
# ---------

# Unrestricted quadratic spline (using same linear knot terms as cubic)
whitedata$qs.1 <- whitedata$ls.1^2
whitedata$qs.2 <- whitedata$ls.2^2
whitedata$qs.3 <- whitedata$ls.3^2

# Doing the restricting: subtracting the last-knot quadratic removes the
# squared term beyond the last knot
whitedata$rqs.1 <- whitedata$qs.1 - whitedata$qs.3
whitedata$rqs.2 <- whitedata$qs.2 - whitedata$qs.3

# Covariate is Biological.Sex (the previous spelling "Biological.Sexx" does
# not match the column used by the other models)
fit.rqs <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + rqs.1 + rqs.2 + StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    OilyFish_InstChosen + FnVScore + ProcMeat_InstChosen + ParentHist +
    MobilityDichot + NewEmploy + Townsend + as.factor(NewEduc) +
    as.factor(SmokStat_InstChosen) + Biological.Sex,
  data = whitedata
)

summary(fit.rqs)

# -----
# Linear model
# ------

fit.lin <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    OilyFish_InstChosen + FnVScore + ProcMeat_InstChosen + ParentHist +
    MobilityDichot + NewEmploy + Townsend + as.factor(NewEduc) +
    as.factor(SmokStat_InstChosen) + Biological.Sex,
  data = whitedata
)

summary(fit.lin)

In [None]:
# Checking model fit
# Compares the linear, restricted-quadratic and restricted-cubic PA volume
# models with information criteria (lower = preferred).
# The commented rows are output recorded from an earlier run, in the order
# printed by R: model, df, criterion value. Re-run the cell to refresh them
# if any of the three models has been changed.
BIC(fit.lin, fit.rqs, fit.rcs)
# fit.lin 31 28092.32
# fit.rqs 33 28106.49
# fit.rcs 32 28099.27

# Linear is actually PREFERRED here

# Same comparison with AIC, which penalises extra parameters less than BIC
AIC(fit.lin, fit.rqs, fit.rcs)
# fit.lin 31 27925.87
# fit.rqs 33 27929.30
# fit.rcs 32 27927.44

In [None]:
# Schoenfeld resids
# Proportional-hazards check on the linear PA volume model. The model object
# is fit.lin (no object called fit.ls is created anywhere in this file).
ph <- cox.zph(fit.lin, transform = "km", global = TRUE)
# Per-term and global test table
print(ph)
# Smoothed scaled Schoenfeld residuals for the first term, with the fitted
# (time-constant) coefficient of that term as a reference line
plot(ph, var = 1)
abline(h = coef(fit.lin)[1], col = "red", lwd = 2)
# No PH violations except biological sex

In [None]:
# Repeating this process for PA Intensity

# -------
# Restricted Cubic Spline Code
# PA Intensity
# -------
# Fits three Cox models for PA intensity (PercentMVPA), adjusted for PA volume,
# that differ only in functional form: restricted cubic spline (fit.rcs),
# restricted quadratic spline (fit.rqs) and linear (fit.lin). All three use
# the same age time scale and covariates so they can be compared by BIC/AIC.

# Knot locations for the PercentMVPA spline (10th/50th/90th percentiles)
knots <- quantile(whitedata$PercentMVPA, probs = c(0.10, 0.50, 0.90))
knots
# 10% 0.213109755634595
# 50% 0.35725720776084
# 90% 0.507174876692717

# Linear spline terms (v - t_i)+ = max(v - t_i, 0), using the knots computed
# above so every spline term uses exactly the same knot values
whitedata$ls.1MVPA <- pmax(whitedata$PercentMVPA - knots[[1]], 0)
whitedata$ls.2MVPA <- pmax(whitedata$PercentMVPA - knots[[2]], 0)
whitedata$ls.3MVPA <- pmax(whitedata$PercentMVPA - knots[[3]], 0)

# Unrestricted cubic spline
whitedata$cs.1MVPA <- whitedata$ls.1MVPA^3
whitedata$cs.2MVPA <- whitedata$ls.2MVPA^3
whitedata$cs.3MVPA <- whitedata$ls.3MVPA^3

# Restricted cubic spline basis with K knots t_1..t_K:
#   term 0 is v itself (the linear PercentMVPA term in the model);
#   for i = 1..K-2:
#   (v-t_i)+^3 - (t_K - t_i)/(t_K - t_{K-1}) * (v - t_{K-1})+^3
#              + (t_{K-1} - t_i)/(t_K - t_{K-1}) * (v - t_K)+^3
# With K = 3 there is only one nonlinear term (i = 1).
# The last-knot cubic must be ADDED: only then do the cubic and quadratic
# parts cancel beyond the last knot, leaving the fit linear in the tail.
whitedata$rcs.1MVPA <- whitedata$cs.1MVPA -
  ((knots[[3]] - knots[[1]]) / (knots[[3]] - knots[[2]])) * whitedata$cs.2MVPA +
  ((knots[[2]] - knots[[1]]) / (knots[[3]] - knots[[2]])) * whitedata$cs.3MVPA

# Same Surv() specification as fit.rqs / fit.lin below; information criteria
# are only comparable between models fitted to the same outcome definition.
fit.rcs <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + PercentMVPA + rcs.1MVPA + StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    OilyFish_InstChosen + FnVScore + ProcMeat_InstChosen + ParentHist +
    MobilityDichot + NewEmploy + Townsend + as.factor(NewEduc) +
    as.factor(SmokStat_InstChosen) + Biological.Sex,
  data = whitedata
)

summary(fit.rcs)


# ---------
# Restricted Quadratic Spline
# ---------

# Unrestricted quadratic spline, built from the PercentMVPA linear spline
# terms (not the PA volume ones)
whitedata$qs.1MVPA <- whitedata$ls.1MVPA^2
whitedata$qs.2MVPA <- whitedata$ls.2MVPA^2
whitedata$qs.3MVPA <- whitedata$ls.3MVPA^2

# Doing the restricting: subtracting the last-knot quadratic removes the
# squared term beyond the last knot
whitedata$rqs.1MVPA <- whitedata$qs.1MVPA - whitedata$qs.3MVPA
whitedata$rqs.2MVPA <- whitedata$qs.2MVPA - whitedata$qs.3MVPA

fit.rqs <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + PercentMVPA + rqs.1MVPA + rqs.2MVPA + StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    OilyFish_InstChosen + FnVScore + ProcMeat_InstChosen + ParentHist +
    MobilityDichot + NewEmploy + Townsend + as.factor(NewEduc) +
    as.factor(SmokStat_InstChosen) + Biological.Sex,
  data = whitedata
)

summary(fit.rqs)

# -----
# Linear model
# ------

fit.lin <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + PercentMVPA + StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    OilyFish_InstChosen + FnVScore + ProcMeat_InstChosen + ParentHist +
    MobilityDichot + NewEmploy + Townsend + as.factor(NewEduc) +
    as.factor(SmokStat_InstChosen) + Biological.Sex,
  data = whitedata
)

summary(fit.lin)

In [None]:
# Model fit
# Compares the linear, restricted-quadratic and restricted-cubic PA intensity
# models by BIC (lower = preferred). The commented rows are output recorded
# from an earlier run, in the order printed by R: model, df, BIC.
# NOTE(review): the recorded fit.rcs BIC is roughly 1900 higher than the other
# two. Information criteria are only comparable when all models are fitted to
# the same outcome definition and rows - confirm, and re-run to refresh values.
BIC(fit.lin, fit.rqs, fit.rcs)
# fit.lin 32 28059.79
# fit.rqs 34 28072.93
# fit.rcs 34 29975.62

In [None]:
# Schoenfeld resids
# Proportional-hazards check on the current linear model. The model object is
# fit.lin (no object called fit.ls is created anywhere in this file).
ph <- cox.zph(fit.lin, transform = "km", global = TRUE)
# Per-term and global test table
print(ph)
# Smoothed scaled Schoenfeld residuals for the first term, with the fitted
# (time-constant) coefficient of that term as a reference line.
# NOTE(review): var = 1 is the first model term; if the plot is meant to show
# the PA intensity term rather than PA volume, select that term instead.
plot(ph, var = 1)
abline(h = coef(fit.lin)[1], col = "red", lwd = 2)
# No PH violations except biological sex

In [None]:
# Fitting the models
# Three sex-stratified Cox models on the age time scale are fitted in turn.
# Each one is stored in fit.lin, so only the LAST model fitted in this cell
# (Model 1) is left in fit.lin for the cells that follow.

# Model 0
# Exposures only: PA volume, polygenic score and PA intensity
fit.lin <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + StandPGS + PercentMVPA + strata(Biological.Sex),
  data = whitedata
)

summary(fit.lin)



# Model 2
# Sleep dur, BMI, Meds
# NOTE(review): this model uses AlcIntake_InstChosen and an unfactored
# OilyFish_InstChosen, whereas Model 1 below uses AlcIntake_Weekly and
# as.factor(OilyFish_InstChosen) - confirm the difference is intended.
fit.lin <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + StandPGS + PercentMVPA +
    BMI_InstChosen + SleepDur_InstChosen + Meds +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_InstChosen +
    OilyFish_InstChosen + FnVScore + ProcMeat_InstChosen + ParentHist +
    MobilityDichot + NewEmploy + Townsend + as.factor(NewEduc) +
    as.factor(SmokStat_InstChosen) + strata(Biological.Sex) + REGION,
  data = whitedata
)

summary(fit.lin)


# Model 1
# Full set of main analysis covariates
# (PAEEPOS and StandPGS are the first two terms of this formula)
fit.lin <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    as.factor(OilyFish_InstChosen) + FnVScore + ProcMeat_InstChosen +
    ParentHist + MobilityDichot + NewEmploy + Townsend +
    as.factor(NewEduc) + as.factor(SmokStat_InstChosen) +
    strata(Biological.Sex) + REGION,
  data = whitedata
)

summary(fit.lin)

In [None]:
# Adding quintiles for PA exposures and genetic risk
# Quintile cut points (20th/40th/60th/80th percentiles) are taken from
# whitedata, the data frame the models are fitted on; no object named
# whitedatasub is created anywhere in this file.
# NOTE(review): if the percentiles were meant to come from a different
# analytic subset, build that subset explicitly before this cell.
quintile.probs <- c(0.20, 0.40, 0.60, 0.80)

PAIntensity <- quantile(whitedata$PercentMVPA, probs = quintile.probs)
PAIntensity

PAVolume <- quantile(whitedata$PAEEPOS, probs = quintile.probs)
PAVolume

StandPGS <- quantile(whitedata$StandPGS, probs = quintile.probs)
StandPGS

In [None]:
# --------
# Showing change in hazard from model from %tile changes in PA volume and genetic risk
# Only for genetic risk 20th and 40th as illustration
# --------
# Each contrast is a linear combination of the fit.lin coefficients:
#   (change in PAEEPOS) * beta_PAEEPOS + (change in StandPGS) * beta_StandPGS
# glht() estimates it, and exp() of the estimate and confidence limits gives
# the hazard ratio with its interval.

# Build the 1-row contrast matrix for glht().
#   fit       fitted model whose coefficients the contrast refers to
#   pa.delta  change in PAEEPOS (multiplies the PAEEPOS coefficient)
#   pgs.delta change in StandPGS (multiplies the StandPGS coefficient)
# The row length is taken from the model itself, because glht() requires
# ncol(linfct) to equal length(coef(fit)); a hand-typed vector of zeros only
# works if it has exactly the right number of entries.
build_contrast <- function(fit, pa.delta = 0, pgs.delta = 0) {
  beta.names <- names(coef(fit))
  stopifnot(all(c("PAEEPOS", "StandPGS") %in% beta.names))
  k <- matrix(0, nrow = 1, ncol = length(beta.names))
  k[1, match("PAEEPOS", beta.names)] <- pa.delta
  k[1, match("StandPGS", beta.names)] <- pgs.delta
  k
}

# Change PA from 20th to 40th percentile w/ genetic risk at 20th
48.6690567680804 - 41.5283167027557
k1 <- build_contrast(fit.lin, pa.delta = -7.11)

delta.eta <- glht(fit.lin, linfct = k1)
exp(confint(delta.eta)$confint)[, 1:3]
# Estimate = 1.11140408168133
# Lower = 1.07120635703763
# Upper = 1.153110252439



# Change PA from 20th to 60th percentile w/ genetic risk at 20th
48.50 - 35.91
k1 <- build_contrast(fit.lin, pa.delta = -12.59)

delta.eta <- glht(fit.lin, linfct = k1)
exp(confint(delta.eta)$confint)[, 1:3]
# Estimate = 1.20566766052
# Lower = 1.12952989151296
# Upper = 1.28693761762842



# Change PA from 20th to 80th percentile w/ genetic risk at 20th
48.50 - 30.07
k1 <- build_contrast(fit.lin, pa.delta = -18.43)

delta.eta <- glht(fit.lin, linfct = k1)
exp(confint(delta.eta)$confint)[, 1:3]
# Estimate = 1.31493979495329
# Lower = 1.19518430021643
# Upper = 1.44669459265712


# --------
# REPEATING COMPARISON WITH 40th percentile genetic risk
# --------

# 20th PA/40th genetic risk
-0.83 - -0.26
k1 <- build_contrast(fit.lin, pgs.delta = 0.57)

delta.eta <- glht(fit.lin, linfct = k1)
exp(confint(delta.eta)$confint)[, 1:3]
# Estimate = 1.25517486527203
# Lower = 1.21628777457933
# Upper = 1.29530525204494

# 40th PA/40th genetic risk
-0.846868164911578 - -0.266874315609926
k1 <- build_contrast(fit.lin, pa.delta = -7.11, pgs.delta = 0.57)

delta.eta <- glht(fit.lin, linfct = k1)
exp(confint(delta.eta)$confint)[, 1:3]
# Estimate = 1.39500646848715
# Lower = 1.32968073623136
# Upper = 1.46354158114416


# 60th PA/40th genetic risk
-0.846868164911578 - -0.266874315609926
k1 <- build_contrast(fit.lin, pa.delta = -12.59, pgs.delta = 0.57)

delta.eta <- glht(fit.lin, linfct = k1)
exp(confint(delta.eta)$confint)[, 1:3]
# Estimate = 1.51332374335604
# Lower = 1.40841191770701
# Upper = 1.62605039293735



# 80th PA/40th genetic risk
-0.846868164911578 - -0.266874315609926
k1 <- build_contrast(fit.lin, pa.delta = -18.43, pgs.delta = 0.57)

delta.eta <- glht(fit.lin, linfct = k1)
exp(confint(delta.eta)$confint)[, 1:3]
# Estimate = 1.65047937997132
# Lower = 1.49351925479814
# Upper = 1.82393509488346
# Lower = 1.49351925479814
# Upper = 1.82393509488346

In [None]:
# Interaction Analyses - neither were statistically significant
# Two sex-stratified Cox models on the age time scale, each adding a product
# term between the polygenic score and one PA exposure.
# Note that the first model is stored in fit.lin, replacing whatever model was
# held there before this cell ran.

# PA volume x genetic risk
fit.lin <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + PAEEPOS*StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    as.factor(OilyFish_InstChosen) + FnVScore + ProcMeat_InstChosen +
    ParentHist + MobilityDichot + NewEmploy + Townsend +
    as.factor(NewEduc) + as.factor(SmokStat_InstChosen) +
    strata(Biological.Sex) + REGION,
  data = whitedata
)

summary(fit.lin)

# PA intensity x genetic risk (adjusted for PA volume)
fit.linMVPA <- coxph(
  Surv(AgeBaseline, AgeBaseline + TimeYear, Status) ~
    PAEEPOS + PercentMVPA + PercentMVPA*StandPGS +
    p22009_a1 + p22009_a2 + p22009_a3 + p22009_a4 + p22009_a5 +
    p22009_a6 + p22009_a7 + p22009_a8 + p22009_a9 + p22009_a10 +
    SeasonWear + as.factor(Salt_InstChosen) + AlcIntake_Weekly +
    as.factor(OilyFish_InstChosen) + FnVScore + ProcMeat_InstChosen +
    ParentHist + MobilityDichot + NewEmploy + Townsend +
    as.factor(NewEduc) + as.factor(SmokStat_InstChosen) +
    strata(Biological.Sex) + REGION,
  data = whitedata
)

summary(fit.linMVPA)