# sMRI Kernel Preprocessing

In [None]:
# Set working directory
# NOTE(review): every relative path used below resolves against this directory.
setwd("./WorkDir")

# Create holding directory for preprocessed datasets.
# recursive = TRUE also creates the parent "./CSI" directory when it is absent;
# without it dir.create() only emits a warning and the directory is not made.
if (!dir.exists("./CSI/Preprocessed")) {
  dir.create("./CSI/Preprocessed", recursive = TRUE)
}

# Load necessary libraries
suppressPackageStartupMessages(library(tidyverse))

### ABCD sMRI Destrieux Parcellation: _mrisdp_

In [None]:
# Import mrisdp data
#
# Every column is read as character via an explicit default column type; the
# compact string "c" is positional and names a type for one column only.
# Numeric conversion happens later, in the preprocessing cells.
# The trailing [-1, ] discards the first data row of each file
# (presumably a descriptive label row below the header - TODO confirm).

abcd_mrisdp10201 <- read_delim("./ABCD_Data/abcd_mrisdp10201.txt",
                               delim = "\t",
                               escape_double = FALSE,
                               col_types = cols(.default = col_character()),
                               trim_ws = TRUE,
                               na = c("", "NA"))[-1, ]

abcd_mrisdp20201 <- read_delim("./ABCD_Data/abcd_mrisdp20201.txt",
                               delim = "\t",
                               escape_double = FALSE,
                               col_types = cols(.default = col_character()),
                               trim_ws = TRUE,
                               na = c("", "NA"))[-1, ]

abcd_mrisdp30201 <- read_delim("./ABCD_Data/abcd_mrisdp30201.txt",
                               delim = "\t",
                               escape_double = FALSE,
                               col_types = cols(.default = col_character()),
                               trim_ws = TRUE,
                               na = c("", "NA"))[-1, ]

In [None]:
# Preprocess abcd_mrisdp10201: keep the three retained events, convert every
# mrisdp_ measure to numeric and average it within subject, ignoring NAs
# (one output row per subjectkey)
mrisdp10201 <- abcd_mrisdp10201 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("mrisdp_")) %>%
  mutate_at(vars(starts_with("mrisdp_")), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("mrisdp_")), ~ mean(., na.rm = TRUE))

dim(mrisdp10201)
head(mrisdp10201)

In [None]:
# Preprocess abcd_mrisdp20201: keep the three retained events, convert every
# mrisdp_ measure to numeric and average it within subject, ignoring NAs
# (one output row per subjectkey)
mrisdp20201 <- abcd_mrisdp20201 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("mrisdp_")) %>%
  mutate_at(vars(starts_with("mrisdp_")), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("mrisdp_")), ~ mean(., na.rm = TRUE))

dim(mrisdp20201)
head(mrisdp20201)

In [None]:
# Preprocess abcd_mrisdp30201: keep the three retained events, convert every
# mrisdp_ measure to numeric and average it within subject, ignoring NAs
# (one output row per subjectkey)
mrisdp30201 <- abcd_mrisdp30201 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("mrisdp_")) %>%
  mutate_at(vars(starts_with("mrisdp_")), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("mrisdp_")), ~ mean(., na.rm = TRUE))

dim(mrisdp30201)
head(mrisdp30201)

In [None]:
# Combine all mrisdp frames into a single frame: full joins on subjectkey keep
# every subject that appears in at least one of the three tables
mrisdp <- full_join(
  full_join(mrisdp10201, mrisdp20201, by = "subjectkey"),
  mrisdp30201,
  by = "subjectkey"
)

dim(mrisdp)
head(mrisdp)

In [None]:
# Save mrisdp, then drop the raw imports and per-table intermediates
# (mrisdp itself stays in the workspace)
saveRDS(object = mrisdp, file = "./CSI/Preprocessed/mrisdp.rds")
rm(list = c(
  "mrisdp10201", "mrisdp20201", "mrisdp30201",
  "abcd_mrisdp10201", "abcd_mrisdp20201", "abcd_mrisdp30201"
))

### ABCD sMRI: _smrip_

In [None]:
# Import smrip data
#
# Every column is read as character via an explicit default column type; the
# compact string "c" is positional and names a type for one column only.
# Numeric conversion happens later, in the preprocessing cells.
# The trailing [-1, ] discards the first data row of each file
# (presumably a descriptive label row below the header - TODO confirm).
abcd_smrip10201 <- read_delim("./ABCD_Data/abcd_smrip10201.txt",
                              delim = "\t",
                              escape_double = FALSE,
                              col_types = cols(.default = col_character()),
                              trim_ws = TRUE,
                              na = c("", "NA"))[-1, ]

abcd_smrip20201 <- read_delim("./ABCD_Data/abcd_smrip20201.txt",
                              delim = "\t",
                              escape_double = FALSE,
                              col_types = cols(.default = col_character()),
                              trim_ws = TRUE,
                              na = c("", "NA"))[-1, ]

abcd_smrip30201 <- read_delim("./ABCD_Data/abcd_smrip30201.txt",
                              delim = "\t",
                              escape_double = FALSE,
                              col_types = cols(.default = col_character()),
                              trim_ws = TRUE,
                              na = c("", "NA"))[-1, ]

In [None]:
# Preprocess abcd_smrip10201: keep the three retained events, drop the
# smri_visitid column, convert the remaining smri_ measures to numeric and
# average them within subject, ignoring NAs (one output row per subjectkey)
smrip10201 <- abcd_smrip10201 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("smri_"), -smri_visitid) %>%
  mutate_at(vars(starts_with("smri_")), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("smri_")), ~ mean(., na.rm = TRUE))

dim(smrip10201)
head(smrip10201)

In [None]:
# Preprocess abcd_smrip20201: keep the three retained events, convert every
# smri_ measure to numeric and average it within subject, ignoring NAs
# (one output row per subjectkey)
smrip20201 <- abcd_smrip20201 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("smri_")) %>%
  mutate_at(vars(starts_with("smri_")), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("smri_")), ~ mean(., na.rm = TRUE))

dim(smrip20201)
head(smrip20201)

In [None]:
# Preprocess abcd_smrip30201: keep the three retained events, convert every
# smri_ measure to numeric and average it within subject, ignoring NAs
# (one output row per subjectkey)
smrip30201 <- abcd_smrip30201 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("smri_")) %>%
  mutate_at(vars(starts_with("smri_")), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("smri_")), ~ mean(., na.rm = TRUE))

dim(smrip30201)
head(smrip30201)

In [None]:
# Combine all smrip frames into a single frame: full joins on subjectkey keep
# every subject that appears in at least one of the three tables
smrip <- full_join(
  full_join(smrip10201, smrip20201, by = "subjectkey"),
  smrip30201,
  by = "subjectkey"
)

dim(smrip)
head(smrip)

In [None]:
# Save smrip, then drop the raw imports and per-table intermediates
# (smrip itself stays in the workspace)
saveRDS(object = smrip, file = "./CSI/Preprocessed/smrip.rds")
rm(list = c(
  "smrip10201", "smrip20201", "smrip30201",
  "abcd_smrip10201", "abcd_smrip20201", "abcd_smrip30201"
))

### MRI QC Raw: _mriqcrp_

In [None]:
# Import partial mriqcrp data
#
# Every column is read as character via an explicit default column type; the
# compact string "c" is positional and names a type for one column only.
# The trailing [-1, ] discards the first data row of each file
# (presumably a descriptive label row below the header - TODO confirm).
mriqcrp10301 <- read_delim("./ABCD_Data/mriqcrp10301.txt",
                           delim = "\t",
                           escape_double = FALSE,
                           col_types = cols(.default = col_character()),
                           trim_ws = TRUE,
                           na = c("", "NA"))[-1, ]

mriqcrp20301 <- read_delim("./ABCD_Data/mriqcrp20301.txt",
                           delim = "\t",
                           escape_double = FALSE,
                           col_types = cols(.default = col_character()),
                           trim_ws = TRUE,
                           na = c("", "NA"))[-1, ]

mriqcrp302 <- read_delim("./ABCD_Data/mriqcrp302.txt",
                         delim = "\t",
                         escape_double = FALSE,
                         col_types = cols(.default = col_character()),
                         trim_ws = TRUE,
                         na = c("", "NA"))[-1, ]

In [None]:
# Filter the partial mriqcrp tables to the three retained events and join them
#
# eventname is kept and used as a second join key. Each table can hold up to
# three rows per subject after the filter, so joining on subjectkey alone pairs
# every row of a subject with every row of that subject in the other tables
# (many-to-many fan-out). Keying on subjectkey + eventname lines rows up per
# visit instead.
# NOTE(review): assumes at most one row per subjectkey/eventname in each table;
# under that assumption later per-subject means are unchanged - TODO confirm.

mriqcrp10301 <- mriqcrp10301 %>%
  filter(eventname %in% c("baseline_year_1_arm_1", "1_year_follow_up_y_arm_1", "2_year_follow_up_y_arm_1")) %>%
  select(subjectkey, eventname, starts_with("iqc"))

mriqcrp20301 <- mriqcrp20301 %>%
  filter(eventname %in% c("baseline_year_1_arm_1", "1_year_follow_up_y_arm_1", "2_year_follow_up_y_arm_1")) %>%
  select(subjectkey, eventname, starts_with("iqc"))

mriqcrp302 <- mriqcrp302 %>%
  filter(eventname %in% c("baseline_year_1_arm_1", "1_year_follow_up_y_arm_1", "2_year_follow_up_y_arm_1")) %>%
  select(subjectkey, eventname, starts_with("iqc"))

mriqcrp <- mriqcrp10301 %>%
  full_join(mriqcrp20301, by = c("subjectkey", "eventname")) %>%
  full_join(mriqcrp302, by = c("subjectkey", "eventname"))

In [None]:
# Create a sMRI_mriqcrp subset: keep the iqc_t1_/iqc_t2_ columns, discard the
# identifier/date style columns and the columns matched by the regex below,
# then convert to numeric and average within subject, ignoring NAs
sMRI_mriqcrp <- mriqcrp %>%
  select(subjectkey, starts_with("iqc_t1_"), starts_with("iqc_t2_")) %>%
  select(
    -ends_with("sub_02"),
    -ends_with("seriestime"),
    -ends_with("studydate"),
    -ends_with("seuid"),
    -ends_with("suid")
  ) %>%
  select(-matches("iqc_t1_\\d_.{0,}|iqc_t2_\\d_.{0,}")) %>%
  mutate_at(vars(starts_with("iqc_t")), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("iqc_t")), ~ mean(., na.rm = TRUE))

dim(sMRI_mriqcrp)
head(sMRI_mriqcrp)

In [None]:
# Save sMRI_mriqcrp, then drop the joined frame and the three partial tables
saveRDS(object = sMRI_mriqcrp, file = "./CSI/Preprocessed/sMRI_mriqcrp.rds")
rm(list = c("mriqcrp", "mriqcrp10301", "mriqcrp20301", "mriqcrp302"))

### Automated Post-Processing QC Metrics: _postqc_

In [None]:
# Import partial postqc data
#
# Every column is read as character via an explicit default column type; the
# compact string "c" is positional and names a type for one column only.
# The trailing [-1, ] discards the first data row of the file
# (presumably a descriptive label row below the header - TODO confirm).
postqc01 <- read_delim("./ABCD_Data/abcd_auto_postqc01.txt",
                       delim = "\t",
                       escape_double = FALSE,
                       col_types = cols(.default = col_character()),
                       trim_ws = TRUE,
                       na = c("", "NA"))[-1, ]

In [None]:
# Create a sMRI_postqc subset: keep the three retained events, convert every
# apqc_smri_ metric to numeric and average it within subject, ignoring NAs
sMRI_postqc <- postqc01 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("apqc_smri_")) %>%
  mutate_at(vars(-subjectkey), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("apqc_smri_")), ~ mean(., na.rm = TRUE))

dim(sMRI_postqc)
head(sMRI_postqc)

In [None]:
# Save sMRI_postqc, then drop the raw import
saveRDS(object = sMRI_postqc, file = "./CSI/Preprocessed/sMRI_postqc.rds")
rm(list = "postqc01")

### ABCD sMRI T2w Post Processing QC: _t2wqc_

In [None]:
# Import partial t2wqc data
#
# Every column is read as character via an explicit default column type; the
# compact string "c" is positional and names a type for one column only.
# The trailing [-1, ] discards the first data row of the file
# (presumably a descriptive label row below the header - TODO confirm).
abcd_t2wqc01 <- read_delim("./ABCD_Data/abcd_t2wqc01.txt",
                           delim = "\t",
                           escape_double = FALSE,
                           col_types = cols(.default = col_character()),
                           trim_ws = TRUE,
                           na = c("", "NA"))[-1, ]

In [None]:
# Create a t2wqc subset: keep the three retained events, convert every
# t2w_postqc_ metric to numeric and average it within subject, ignoring NAs

t2wqc <- abcd_t2wqc01 %>%
  filter(
    eventname %in% c(
      "baseline_year_1_arm_1",
      "1_year_follow_up_y_arm_1",
      "2_year_follow_up_y_arm_1"
    )
  ) %>%
  select(subjectkey, starts_with("t2w_postqc_")) %>%
  mutate_at(vars(-subjectkey), as.numeric) %>%
  group_by(subjectkey) %>%
  summarise_at(vars(starts_with("t2w_postqc_")), ~ mean(., na.rm = TRUE))

dim(t2wqc)
head(t2wqc)

In [None]:
# Save t2wqc, then drop the raw import
saveRDS(object = t2wqc, file = "./CSI/Preprocessed/t2wqc.rds")
rm(list = "abcd_t2wqc01")

### Remove high missingness

In [None]:
# Count Infinities, NAs, and NaNs per variable
#
# Each NAs_* / INFs_* object is a named numeric vector with one entry per
# column: the share of rows that are missing (NA or NaN) or infinite.
# Rates are computed column by column on the data frame itself. apply() would
# first coerce the whole frame to a matrix; with a non-numeric subjectkey
# column that matrix is character, and is.infinite() is always FALSE there.

# Share of missing values per column (is.na() is TRUE for NaN as well).
na_rate <- function(df) {
  vapply(df, function(x) mean(is.na(x)), numeric(1))
}

# Share of infinite values per column.
inf_rate <- function(df) {
  vapply(df, function(x) mean(is.infinite(x)), numeric(1))
}

NAs_mrisdp <- na_rate(mrisdp)
INFs_mrisdp <- inf_rate(mrisdp)

NAs_smrip <- na_rate(smrip)
INFs_smrip <- inf_rate(smrip)

NAs_sMRI_mriqcrp <- na_rate(sMRI_mriqcrp)
INFs_sMRI_mriqcrp <- inf_rate(sMRI_mriqcrp)

NAs_sMRI_postqc <- na_rate(sMRI_postqc)
INFs_sMRI_postqc <- inf_rate(sMRI_postqc)

NAs_t2wqc <- na_rate(t2wqc)
INFs_t2wqc <- inf_rate(t2wqc)

In [None]:
# Max missingness: print, per dataset, the largest per-column NA rate and the
# largest per-column Inf value, each rounded to three decimals
invisible(local({
  peak <- function(rates) round(max(rates), 3)

  cat(
    "NAs_mrisdp: ", peak(NAs_mrisdp),
    "\nINFs_mrisdp: ", peak(INFs_mrisdp),
    "\n\nNAs_smrip: ", peak(NAs_smrip),
    "\nINFs_smrip: ", peak(INFs_smrip),
    "\n\nNAs_sMRI_mriqcrp: ", peak(NAs_sMRI_mriqcrp),
    "\nINFs_sMRI_mriqcrp: ", peak(INFs_sMRI_mriqcrp),
    "\n\nNAs_sMRI_postqc: ", peak(NAs_sMRI_postqc),
    "\nINFs_sMRI_postqc: ", peak(INFs_sMRI_postqc),
    "\n\nNAs_t2wqc: ", peak(NAs_t2wqc),
    "\nINFs_t2wqc: ", peak(INFs_t2wqc)
  )
}))

In [None]:
# Remove columns with high rates of NAs
#
# A column is kept when at most 10% of its rows are missing. colMeans() of the
# is.na() mask gives that proportion directly; colSums() gives a count, and
# comparing a count with 0.1 would discard every column holding a single NA.

sMRI_mriqcrp <- sMRI_mriqcrp[colMeans(is.na(sMRI_mriqcrp)) <= 0.1]
t2wqc <- t2wqc[colMeans(is.na(t2wqc)) <= 0.1]

# Recompute the per-column missing rates on the reduced frames
# (is.na() is TRUE for NaN as well).
NAs_sMRI_mriqcrp <- vapply(sMRI_mriqcrp, function(x) mean(is.na(x)), numeric(1))
NAs_t2wqc <- vapply(t2wqc, function(x) mean(is.na(x)), numeric(1))

cat("NAs_sMRI_mriqcrp: ", round(max(NAs_sMRI_mriqcrp), 3),
    "\nNAs_t2wqc: ", round(max(NAs_t2wqc), 3))

### Combine and process datasets

In [None]:
# Calculate number of predictors for all datasets
# (column count minus one, i.e. everything except subjectkey)

cat(
  "mrisdp:", ncol(mrisdp) - 1,
  "\nsmrip:", ncol(smrip) - 1,
  "\nsMRI_mriqcrp:", ncol(sMRI_mriqcrp) - 1,
  "\nsMRI_postqc:", ncol(sMRI_postqc) - 1,
  "\nt2wqc:", ncol(t2wqc) - 1
)

In [None]:
# Combine non-qc and qc frames
# sk is the union of subject keys across the five frames; it seeds a plain
# data.frame onto which each frame is full-joined in turn.
sk <- purrr::reduce(
  list(
    mrisdp$subjectkey,
    smrip$subjectkey,
    sMRI_mriqcrp$subjectkey,
    sMRI_postqc$subjectkey,
    t2wqc$subjectkey
  ),
  union
)

sMRI <- purrr::reduce(
  list(mrisdp, smrip, sMRI_mriqcrp, sMRI_postqc, t2wqc),
  function(joined, part) full_join(joined, part, by = "subjectkey"),
  .init = data.frame(subjectkey = sk)
)

# Total number of missing cells after the join
sum(is.na(sMRI))

In [None]:
# Mean impute sMRI: replace each missing value with the mean of the observed
# values in the same column (the first column, subjectkey, is skipped).
# seq_len(...)[-1] is empty when there is no column after subjectkey, whereas
# 2:ncol(sMRI) would count down to c(2, 1). [[i]] always yields a plain vector,
# so the loop also works if sMRI is a tibble.
# A column with no observed values has mean NaN and therefore stays missing.
for (i in seq_len(ncol(sMRI))[-1]) {
  missing_rows <- is.na(sMRI[[i]])
  sMRI[missing_rows, i] <- mean(sMRI[[i]], na.rm = TRUE)
}

sum(is.na(sMRI))

In [None]:
# Remove variables with range of 0
#
# Range holds max - min for every column after subjectkey. Columns are dropped
# by name with base subsetting rather than by passing a bare character vector
# to select(). A column whose range is NA (it still contains missing values)
# is dropped as well: it cannot be range-normalized, and an NA entry would
# otherwise put an NA name into the selection.
Range <- vapply(sMRI[-1], function(x) diff(range(x)), numeric(1))
constant_cols <- names(Range)[is.na(Range) | Range == 0]
sMRI <- sMRI[, !(names(sMRI) %in% constant_cols), drop = FALSE]

dim(sMRI)
head(sMRI)

In [None]:
# Range-normalize data: rescale every column after subjectkey to
# (x - min) / (max - min)
sMRI[, -1] <- apply(sMRI[, -1], 2, function(v) {
  bounds <- range(v)
  (v - bounds[1]) / (bounds[2] - bounds[1])
})
sum(is.na(sMRI))

dim(sMRI)
head(sMRI)

In [None]:
# Remove variables with low variances
#
# Vars holds the sample variance of every column after subjectkey; columns
# below 0.001 are dropped by name with base subsetting rather than by passing
# a bare character vector to select(). A column whose variance is NA (it
# contains missing values) is dropped too, since an NA entry would otherwise
# put an NA name into the selection.
Vars <- vapply(sMRI[-1], var, numeric(1))
low_var_cols <- names(Vars)[is.na(Vars) | Vars < 0.001]
sMRI <- sMRI[, !(names(sMRI) %in% low_var_cols), drop = FALSE]

dim(sMRI)
head(sMRI)

In [None]:
# Save data: write the final sMRI frame to the preprocessed directory
saveRDS(object = sMRI, file = "./CSI/Preprocessed/kernel_sMRI.rds")