# Processing Covariates, Incident CAD for Final Dataset

### Sources:




## Covariate Processing

This section processes all of the covariates: it determines which assessment instance each covariate value is drawn from and saves the final dataset. For our replication of Dempsey *et al.*, the covariates are defined in line with the description in that article. The data processing for that replication is not shown here, to avoid confusion between the replication and the central analyses.

In [None]:
# Bash cell: fetches FINALIZEDPADATASET.csv with the `dx` command-line tool
# (presumably the DNAnexus CLI - confirm) so that the read.csv() call in the
# following R cell can open it by the same file name.
dx download FINALIZEDPADATASET.csv

In [None]:
# Loading necessary packages.
# Install only when a package is missing, so re-running this cell does not
# re-download and re-install packages that are already available.
for (pkg in c("lubridate", "data.table")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(lubridate)
library(data.table)

# Reading in this full dataset
Dataset <- read.csv("FINALIZEDPADATASET.csv")


# FIRST need to standardize the polygenic score (mean 0, SD 1).
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() stores a plain numeric vector so the
# data frame column behaves like every other numeric covariate.
Dataset$StandPGS <- as.numeric(scale(Dataset$PGS))


In [None]:
# First recode variable that shows first date started wearing accelerometer
Dataset$AccelDate <- Dataset$p90010


# Age should be RELATIVE TO ACCELEROMETER DATE AND DOB... (DOB and AccelDate)
# Take the difference in days as a plain number: dividing a difftime object
# would keep its "days" unit label even though the values are then in years.
age_days <- as.numeric(
  difftime(as.Date(Dataset$AccelDate), as.Date(Dataset$DOB), units = "days")
)


# Convert age in days to age in years
Dataset$AgeBaseline <- age_days / 365.25

In [None]:
# Logical flags marking participants whose REGION is Wales or Scotland
ind_wales <- is.element(Dataset$REGION, "Wales")
ind_scotland <- is.element(Dataset$REGION, "Scotland")


# Censoring date per participant: one default date, overridden for Wales and
# for Scotland. Where hospital and death records have different censoring
# dates, the earlier one is used.
cens_date <- rep("2021-09-30", nrow(Dataset))
cens_date[ind_wales] <- "2016-03-31"
cens_date[ind_scotland] <- "2021-07-31"
Dataset$date_cens <- as.Date(cens_date)

In [None]:
# Now have all necessary variables for Time (CADDate, AccelDate, date_cens, date_of_death_nocad)
# Convert to Date variables (done once; the differences below reuse them)
Dataset$CADDate <- as.Date(Dataset$CADDate)
Dataset$AccelDate <- as.Date(Dataset$AccelDate)
Dataset$date_cens <- as.Date(Dataset$date_cens)
Dataset$date_of_death_nocad <- as.Date(Dataset$date_of_death_nocad)


# Days from accelerometer start to each candidate end of follow-up,
# as plain numbers rather than difftime objects
days_to_cad <- as.numeric(Dataset$CADDate - Dataset$AccelDate)
days_to_death <- as.numeric(Dataset$date_of_death_nocad - Dataset$AccelDate)
days_to_censor <- as.numeric(Dataset$date_cens - Dataset$AccelDate)

# Follow-up ends at the CAD date when one exists, otherwise at the date of
# death, otherwise at the censoring date.
# NOTE(review): CAD or death dates later than date_cens are not truncated at
# the censoring date here - confirm that this is intended.
Dataset$Time <- ifelse(!is.na(Dataset$CADDate), days_to_cad,
                       ifelse(!is.na(Dataset$date_of_death_nocad),
                              days_to_death,
                              days_to_censor))


# Converting Time from days to years, using 365.25 days per year so that the
# conversion matches the one applied to AgeBaseline
Dataset$TimeYear <- Dataset$Time / 365.25

In [None]:
# Keep ONLY incident CAD: participants whose event or censoring time falls
# before (or on) the accelerometer start have Time <= 0 and are dropped, as
# are rows with a missing Time. Note this changes the Status counts below.
rows_after_wear <- which(Dataset$Time > 0)
datsub <- Dataset[rows_after_wear, , drop = FALSE]

dim(datsub)
# 77474 x 909

summary(as.factor(datsub$Status))
# Censored = 75845
# CAD cases = 1629

In [None]:

# --------
# Make sure covariates are from closest time to accelerometer start date BEFORE wear time started
# Use Date Attending Assess Center Inst 0 to 3 to make this distinction
# Must do for all except time-invariant
# --------


# Days from an assessment-centre visit to the accelerometer start date.
# Visits on or after the start date (difference <= 0) become NA, so that an
# instance occurring after wear time started can never be selected. The same
# rule is applied to every instance to keep the temporal ordering right.
days_before_wear <- function(visit_date) {
  gap <- as.numeric(as.Date(datsub$AccelDate) - as.Date(visit_date))
  gap[!is.na(gap) & gap <= 0] <- NA
  gap
}

datsub$Inst0Time <- days_before_wear(datsub$Date.Attending.Assess.Center.Inst.0)
datsub$Inst1Time <- days_before_wear(datsub$Date.Attending.Assess.Center.Inst.1)
datsub$Inst2Time <- days_before_wear(datsub$Date.Attending.Assess.Center.Inst.2)
datsub$Inst3Time <- days_before_wear(datsub$Date.Attending.Assess.Center.Inst.3)


# ------
# Looks like instances 2 and 3 are almost entirely AFTER accelerometer wear time started (and largely missing)
# NONE of instance 3 occurs before accelerometer wear, so it is not a candidate below
# ------


# Now calculating nearest prior instance to accelerometer start date.
# pmin() works row-wise on the three vectors and, with na.rm = TRUE, ignores
# missing instances; a row with no prior instance at all gets NA.
datsub$NearestInstance <- pmin(datsub$Inst0Time, datsub$Inst1Time,
                               datsub$Inst2Time, na.rm = TRUE)


summary(datsub$NearestInstance)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 1    1660    1987    1918    2296    3134 


# TRUE where an instance is present and is the nearest one. The explicit NA
# checks stop a missing instance from turning the comparison into NA.
is_nearest <- function(days) {
  !is.na(days) & !is.na(datsub$NearestInstance) & days == datsub$NearestInstance
}

# Creating an index variable to know which instance to choose
# (ties go to the earlier instance)
datsub$InstanceChoice <- ifelse(is_nearest(datsub$Inst0Time), 0,
                                ifelse(is_nearest(datsub$Inst1Time), 1,
                                       ifelse(is_nearest(datsub$Inst2Time), 2, NA)))

# Any missingness can go to instance 2
# (only rows with no instance before wear time are still NA at this point)
datsub$InstanceChoice <- ifelse(is.na(datsub$InstanceChoice), 2, datsub$InstanceChoice)
#Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.0000  0.0000  0.0000  0.1105  0.0000  2.0000 


summary(as.factor(datsub$InstanceChoice))
# 0 70063
# 1 6264
# 2 1147

In [None]:
# ----------
# Now using chosen instance to select applicable variable
# Technically instance 3 won't be chosen ever, but it is handled for completeness
# ----------

# For every row of `data`, returns the value of the column named
# sprintf(column_template, k), where k is that row's InstanceChoice (0 to 3).
# Rows whose InstanceChoice is NA or not in 0:3 get NA.
#   data            : data frame containing InstanceChoice and the
#                     per-instance columns
#   column_template : sprintf() template with one %d for the instance number,
#                     e.g. "Salt_Inst%d"
# Stops with an error if an instance that is actually chosen has no column.
select_chosen_instance <- function(data, column_template) {
  chosen <- rep(NA, nrow(data))
  for (inst in 0:3) {
    rows <- which(data$InstanceChoice == inst)
    if (length(rows) == 0) {
      # Nobody uses this instance, so its column is never read
      next
    }
    column_name <- sprintf(column_template, inst)
    if (!(column_name %in% names(data))) {
      stop("Column '", column_name, "' not found in data", call. = FALSE)
    }
    chosen[rows] <- data[[column_name]][rows]
  }
  chosen
}

# Covariates stored as <name>_Inst0 ... <name>_Inst3; each one produces a
# <name>_InstChosen column:
# employment status, cooked/raw vegetables, fresh/dried fruit, processed meat,
# salt intake, oily fish intake, alcohol intake, sleep duration, smoking
# status, pack years (has missingness - makes sense for non-smokers),
# father/mother illness and medications.
instance_covariates <- c(
  "EmploymentStatus", "CookedVeggie", "RawVeggie", "FreshFruit", "DriedFruit",
  "ProcMeat", "Salt", "OilyFish", "AlcIntake", "SleepDur", "SmokStat",
  "PackYears", "FatherIll", "MotherIll", "Meds"
)

for (covariate in instance_covariates) {
  datsub[[paste0(covariate, "_InstChosen")]] <-
    select_chosen_instance(datsub, paste0(covariate, "_Inst%d"))
}

# Body Mass Index - its source columns follow a different naming pattern
datsub$BMI_InstChosen <- select_chosen_instance(datsub, "Measured.BMI.Inst.%d")
# 148 had missing values


In [None]:
# ---------
# F & V should be merged into two variables
# Meds should be DISAGGREGATED BY CONDITION
# Father Illness and Mother Illness should isolate CVD history
# Smoking Status simply is never/current/ever (confirm this works)
# ---------

# Fruit & Vegetable intake - AS TWO SEPARATE VARS

# Converts a servings answer to numeric: "Less than one" is recoded as 0.5,
# "Do not know" and "Prefer not to answer" become NA, and everything else is
# passed to as.numeric() (which warns and returns NA for other text).
servings_to_numeric <- function(answer) {
  answer <- as.character(answer)
  answer[answer %in% "Less than one"] <- "0.5"
  answer[answer %in% c("Do not know", "Prefer not to answer")] <- NA
  as.numeric(answer)
}

# Fresh fruit consumption
datsub$FreshFruit_InstChosen <- servings_to_numeric(datsub$FreshFruit_InstChosen)
# Now have 153 NAs

# Same logic for Dried Fruit consumption
datsub$DriedFruit_InstChosen <- servings_to_numeric(datsub$DriedFruit_InstChosen)
# Far more - 490 NAs here

# Fresh Fruit and Dried Fruit questions are easily combined - both ask how many pieces of fruit eaten per day
datsub$Fruit <- datsub$FreshFruit_InstChosen + datsub$DriedFruit_InstChosen

summary(datsub$Fruit)
# 549 NAs from this combination

# Same process for Vegetable consumption
summary(as.factor(datsub$CookedVeggie_InstChosen))
summary(as.factor(datsub$RawVeggie_InstChosen))

# NOTE(review): the two NA counts recorded below repeat the fruit counts
# verbatim - confirm them against the vegetable columns.
datsub$CookedVeggie_InstChosen <- servings_to_numeric(datsub$CookedVeggie_InstChosen)
# Now have 153 NAs

datsub$RawVeggie_InstChosen <- servings_to_numeric(datsub$RawVeggie_InstChosen)
# Far more - 490 NAs here

# Combining Vegetable variables - heaped tablespoons of each in past day
datsub$Veggie <- datsub$CookedVeggie_InstChosen + datsub$RawVeggie_InstChosen

summary(datsub$Veggie)
# 665 NAs from this combination (note would be NA if missing EITHER category)


# -----
# F&V Intake - split into five groups at the 20/40/60/80th percentiles...
# LESS influence of outliers
# -----

# Stored on datsub (the data frame that is scored below and saved at the end)
datsub$FruitnVeg <- datsub$Fruit + datsub$Veggie

quantile(datsub$FruitnVeg, probs = c(0.20, 0.4, 0.6, 0.8), na.rm = TRUE)
# 20% 5
# 40% 6.5
# 60% 8
# 80% 10.5

# Score 0-4 from the cut-points recorded above:
# <5 -> 0, [5, 6.5) -> 1, [6.5, 8) -> 2, [8, 10.5) -> 3, >=10.5 -> 4.
# findInterval() uses left-closed intervals and returns NA for NA input.
fnv_cutpoints <- c(5, 6.5, 8, 10.5)
datsub$FnVScore <- as.numeric(findInterval(datsub$FruitnVeg, fnv_cutpoints))


# -----
# Separating out medications
# -----

# 1 if the chosen-instance medication text mentions cholesterol lowering
# medication (IRRESPECTIVE of other meds), 0 otherwise.
# grepl() returns FALSE for NA, so a missing medication answer is coded 0.
datsub$CholMeds <- ifelse(grepl("Cholesterol lowering medication", datsub$Meds_InstChosen), 1, 0)

# Same for blood pressure medication (IRRESPECTIVE of other meds)
datsub$BPMeds <- ifelse(grepl("Blood pressure medication", datsub$Meds_InstChosen), 1, 0)

summary(datsub$CholMeds)
summary(datsub$BPMeds)
#Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.0000  0.0000  0.0000  0.0813  0.0000  1.0000 

#Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.0000  0.0000  0.0000  0.0876  0.0000  1.0000 

# Converting this into a single Meds variable: 1 if either medication is taken.
# Written to datsub, the data frame that holds CholMeds/BPMeds and is saved
# at the end of the script.
datsub$Meds <- ifelse(datsub$CholMeds == 1 | datsub$BPMeds == 1, 1, 0)

# -----
# Parental history of Heart Disease
# -----

# Father history
summary(as.factor(datsub$FatherIll_InstChosen))

# Indicator is 1 when "Heart disease" appears anywhere in the reported set of
# illnesses and 0 otherwise (including when the answer is missing)
father_has_hd <- grepl("Heart disease", datsub$FatherIll_InstChosen)
datsub$FatherHeartDisease <- as.numeric(father_has_hd)

# Mother history - same rule
mother_has_hd <- grepl("Heart disease", datsub$MotherIll_InstChosen)
datsub$MotherHeartDisease <- as.numeric(mother_has_hd)

# Single ParentHist variable: 1 if either parent has heart disease reported
datsub$ParentHist <- as.numeric(mother_has_hd | father_has_hd)

In [1]:
# -------
# Coding in missingness for reporting of do not know or prefer not to answer in the rest of the vars
# NO such  category for:
# Mobility Problems, Biological Sex, BMI, Pack Years
# -------

# Set employment to NA if the answer contains EITHER "None of the above" or
# "Prefer not to answer" (even if other options are included)
employment <- datsub$EmploymentStatus_InstChosen
no_answer <- grepl("None of the above", employment) |
  grepl("Prefer not to answer", employment)
employment[no_answer] <- NA
# 504 NAs now


# Collapse to four categories. An answer may list several options, so the
# first match in this order wins: Unemployed, then in paid employment or
# self-employed, then Retired.
# Everything else is OTHER - INCLUDES students, looking after home/family,
# volunteer work, disability. Missing answers stay NA.
employment_group <- ifelse(
  is.na(employment), NA,
  ifelse(grepl("Unemployed", employment), "Unemployed",
         ifelse(grepl("In paid employment or self-employed", employment),
                "In paid employment or self-employed",
                ifelse(grepl("Retired", employment), "Retired", "Other")))
)
# 46830 employed, 25096 retired, 1007 unemployed

# Keeping as categorical factor
datsub$EmploymentStatus_InstChosen <- as.factor(employment_group)

# ---
# Recoding Employment Status
# code as 1 if 'in paid employment or self-employed' and 0 o.w.
# ---

summary(as.factor(datsub$EmploymentStatus_InstChosen))

# Written to datsub so the variable is part of the dataset saved at the end.
# NOTE(review): grepl() is FALSE for NA, so a missing employment status is
# coded 0 here rather than NA - confirm this is intended.
datsub$NewEmploy <- ifelse(grepl("In paid employment or self-employed", datsub$EmploymentStatus_InstChosen), 1, 0)
summary(as.factor(datsub$NewEmploy))




# -------
# Weekly consumption of processed meat
# --------

# Frequency answers mapped to times PER WEEK, using the MIDPOINT of each
# category. Answers not listed here (and missing answers) become NA.
times_per_week <- c(
  "Never" = 0,
  "Less than once a week" = 0.5,
  "Once a week" = 1,
  "2-4 times a week" = 3,
  "5-6 times a week" = 5.5,
  "Once or more daily" = 7
)

# Looking up by name returns NA for any answer that is not a table entry
datsub$ProcMeat_InstChosen <- unname(times_per_week[as.character(datsub$ProcMeat_InstChosen)])


summary(as.factor(datsub$ProcMeat_InstChosen))
# 0 7934
# 0.5 25386
# 1 21896
# 3 19350
# 5.5 2288
# 7 513
# NA's 107




# ------
# Frequency of Added Salt Consumption - Factor
# ------

# Ordered codes 0-3; any other answer becomes NA
salt_code <- c("Never/rarely" = 0, "Sometimes" = 1, "Usually" = 2, "Always" = 3)
datsub$Salt_InstChosen <- as.factor(unname(salt_code[as.character(datsub$Salt_InstChosen)]))


# -------
# Weekly consumption of Oily Fish
# -------

# Same frequency categories and midpoints as processed meat
datsub$OilyFish_InstChosen <- unname(times_per_week[as.character(datsub$OilyFish_InstChosen)])


summary(as.factor(datsub$OilyFish_InstChosen))
# 0 7161
# 0.5 26446
# 1 30221
# 3 12804
# 5.5 509
# 7 129
# NA's 204


# ------
# Recoding as yearly consumption of alcohol
# ------

# Frequency answers mapped to drinking occasions per year. Answers not listed
# here (and missing answers) become NA.
# Will make VERY LITTLE difference but special occasion coding is arbitrary
alcohol_per_year <- c(
  "Daily or almost daily" = 365,
  "Three or four times a week" = 182,
  "Once or twice a week" = 78,
  "One to three times a month" = 24,
  "Special occasions only" = 12,
  "Never" = 0
)
alcohol_yearly <- unname(alcohol_per_year[as.character(datsub$AlcIntake_InstChosen)])

# Converting to Alcohol Intake Weekly from Yearly.
# This must use the numeric values: arithmetic on a factor is not meaningful
# in R and returns NA for every row (with a warning).
datsub$AlcIntake_Weekly <- alcohol_yearly / 52

# The yearly variable itself is stored as a factor
datsub$AlcIntake_InstChosen <- as.factor(alcohol_yearly)


# -------
# Sleep Duration: "Do not know" and "Prefer not to answer" become NA,
# the remaining answers are converted to a number of hours
# -------

sleep_answer <- datsub$SleepDur_InstChosen
sleep_answer[sleep_answer %in% c("Do not know", "Prefer not to answer")] <- NA
datsub$SleepDur_InstChosen <- as.numeric(sleep_answer)


# -----
# Smoking status: keep Current / Never / Previous, everything else is NA
# -----

smoking_answer <- as.character(datsub$SmokStat_InstChosen)
smoking_answer[!(smoking_answer %in% c("Current", "Never", "Previous"))] <- NA
datsub$SmokStat_InstChosen <- as.factor(smoking_answer)

SyntaxError: invalid syntax (<ipython-input-1-e8e991a85272>, line 7)

In [None]:
# Creating Mobility limitation variable
# Dichotomous for whether or not they indicate any issues
summary(as.factor(datsub$MobilProbs))

# 1 if the answer contains any statement of a walking problem (slight,
# moderate, severe, or unable to walk about), 0 otherwise.
# NOTE(review): grepl() is FALSE for NA, so a missing answer is coded 0.
mobility_pattern <- paste(
  c("I am unable to walk about",
    "I have moderate problems in walking about",
    "I have severe problems in walking about",
    "I have slight problems in walking about"),
  collapse = "|"
)
datsub$MobilityDichot <- ifelse(grepl(mobility_pattern, datsub$MobilProbs), 1, 0)


# --------
# Education level - also time invariant
# --------

# treat this as none/uni/other
# Written to datsub so the variable is part of the dataset saved at the end.
# "None of the above" is checked first; any answer matching neither pattern
# (including a missing answer) falls into "Other".
datsub$NewEduc <- ifelse(grepl("None of the above", datsub$EA.Inst.0), "None",
                         ifelse(grepl("College or University degree", datsub$EA.Inst.0), "Uni", "Other"))

summary(as.factor(datsub$NewEduc))
# Matches up well
# Matches up well

In [None]:
# -------
# CREATE Season of Wear Covariate (based on AccelDate)
# -------

# Put every accelerometer start date into the same year (year 0) so that only
# month and day remain to tell the dates apart
datsub$YearZero <- datsub$AccelDate
year(datsub$YearZero) <- 0

datsub$YearZero <- as.Date(datsub$YearZero)


# Making season variable from the calendar month of the normalised date:
# Dec-Feb = Winter, Mar-May = Spring, Jun-Aug = Summer, Sep-Nov = Fall.
# A missing date gives a missing season.
wear_month <- as.integer(format(datsub$YearZero, "%m"))
season_by_month <- c("Winter", "Winter",
                     "Spring", "Spring", "Spring",
                     "Summer", "Summer", "Summer",
                     "Fall", "Fall", "Fall",
                     "Winter")
datsub$SeasonWear <- season_by_month[wear_month]

summary(as.factor(datsub$SeasonWear))
# Fall 23014 Spring 17725 Summer 20322 Winter 16413


# Save the final analysis dataset (write.csv)
write.csv(x = datsub, file = "FINALANALYSISDATAPAPER3.csv")

In [None]:
# IN BASH - MAKE SURE IT'S ON BRAND NEW CELL WHEN YOU SWITCH!!!
# Uploads FINALANALYSISDATAPAPER3.csv, the file written by write.csv() in the
# preceding R cell, using the `dx` command-line tool.
dx upload FINALANALYSISDATAPAPER3.csv