# Final Batching

In [None]:
# Set working directory
setwd("./WorkDir")

# Create holding directory for preprocessed datasets.
# recursive = TRUE also creates the parent "./CSI" when it is missing; without
# it dir.create() only emits a warning and returns FALSE in that situation,
# and the first saveRDS() into the directory would fail later on.
if (!dir.exists("./CSI/Preprocessed")) {
    dir.create("./CSI/Preprocessed", recursive = TRUE)
}

# Load necessary libraries
suppressPackageStartupMessages(library(tidyverse))

### Import Kernels

In [None]:
# Genetic kernel: loaded as stored (no ID column is renamed for this one)
Gene <- readRDS("./CSI/Preprocessed/kernel_Genetic.rds")
dim(Gene)

# Phenotype kernel: expose the `subjectkey` column under the name `SampleID`
Phen <- rename(readRDS("./CSI/Preprocessed/kernel_Pheno.rds"), SampleID = subjectkey)
dim(Phen)

# Imaging kernels: same `subjectkey` -> `SampleID` rename for each modality
sMRI <- rename(readRDS("./CSI/Preprocessed/kernel_sMRI.rds"), SampleID = subjectkey)
dim(sMRI)

dMRI <- rename(readRDS("./CSI/Preprocessed/kernel_dMRI.rds"), SampleID = subjectkey)
dim(dMRI)

tsfMRI <- rename(readRDS("./CSI/Preprocessed/kernel_tsfMRI.rds"), SampleID = subjectkey)
dim(tsfMRI)

rsfMRI <- rename(readRDS("./CSI/Preprocessed/kernel_rsfMRI.rds"), SampleID = subjectkey)
dim(rsfMRI)

In [None]:
### Get a census of all sample IDs: union and intersection across the kernels,
### for comparison against the OCD diagnosis table

# Tab-separated table without a header row (columns are named further down).
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
OCD <- read.table("./OCD.pheno", sep = "\t", header = FALSE)
dim(OCD)

# Sample IDs of every kernel, collected once so that the union and the
# intersection are guaranteed to be built from exactly the same inputs.
ID_SETS <- list(
    Gene$SampleID,
    Phen$SampleID,
    sMRI$SampleID,
    dMRI$SampleID,
    rsfMRI$SampleID,
    tsfMRI$SampleID
)

# IDs present in at least one kernel
UNION <- purrr::reduce(ID_SETS, union)
length(UNION)

# IDs present in every kernel
INTERSECTION <- purrr::reduce(ID_SETS, intersect)
length(INTERSECTION)



### Batch samples

In [None]:
# Update OCD table: name its two columns, then recode the diagnosis column
names(OCD) <- c("SampleID", "OCD")

# Recode in two ordered steps: code 1 becomes NA first, then code 2 becomes 1.
# The order matters: doing it the other way round would blank the new 1s.
OCD$OCD <- OCD$OCD %>%
    replace(. == 1, NA) %>%
    replace(. == 2, 1)

# Persist the recoded table
saveRDS(OCD, file = "./CSI/OCD.rds")

In [None]:
# Start the batch table from the OCD table and flag every sample whose
# diagnosis is NA (the ambiguous cases) as Experimental = 1, all others as 0
BAT <- mutate(OCD, Experimental = as.numeric(is.na(OCD)))

In [None]:
# Sample IDs of cases (OCD == 1) and controls (OCD == 0), restricted to
# samples present in every kernel (INTERSECTION).
# which() discards rows whose diagnosis is NA, exactly as filter() would.
BAT_Case <- with(OCD, SampleID[which(OCD == 1 & SampleID %in% INTERSECTION)])

BAT_Cont <- with(OCD, SampleID[which(OCD == 0 & SampleID %in% INTERSECTION)])

In [None]:
# Create Test set: 50 cases and 50 controls drawn from the current pools
set.seed(1)

# Cases are drawn before controls so the random number stream is consumed
# in a fixed order.
picked_cases <- sample(BAT_Case, 50)
picked_conts <- sample(BAT_Cont, 50)
SEL <- c(picked_cases, picked_conts)

# Flag the selected samples in the batch table (1 = in test set, 0 = not)
BAT$Test <- as.numeric(BAT$SampleID %in% SEL)

# Take the selected samples out of the pools so they cannot be drawn again
BAT_Case <- BAT_Case[is.na(match(BAT_Case, SEL))]
BAT_Cont <- BAT_Cont[is.na(match(BAT_Cont, SEL))]

# Create Validation Set: 50 cases and 50 controls drawn from what is
# currently left in the case/control pools
set.seed(1)

# Same draw order as for any other split: cases first, then controls
valid_cases <- sample(BAT_Case, 50)
valid_conts <- sample(BAT_Cont, 50)
SEL <- c(valid_cases, valid_conts)

# Flag the selected samples in the batch table (1 = in validation set, 0 = not)
BAT$Valid <- as.numeric(BAT$SampleID %in% SEL)

# Shrink the pools by the samples that were just assigned
BAT_Case <- BAT_Case[is.na(match(BAT_Case, SEL))]
BAT_Cont <- BAT_Cont[is.na(match(BAT_Cont, SEL))]

# Collect every sample that is already assigned to the Experimental, Test or
# Validation batch (in that order) ...
REM <- unlist(
    lapply(
        c("Experimental", "Test", "Valid"),
        function(flag) BAT$SampleID[BAT[[flag]] == 1]
    ),
    use.names = FALSE
)

# ... and drop those samples from UNION, leaving the IDs that are still
# available for the training batches
UNION <- UNION[is.na(match(UNION, REM))]

In [None]:
# Create batches for genetic set
#
# Each pass of the loop adds one 0/1 column Train_<i> that marks a batch of
# `n_case` cases and `n_cont` controls. Controls are removed from the pool
# after they are used; cases are re-drawn from the full case pool every time.
i <- 1
set.seed(1)

n_case <- 100  # cases drawn per training batch
n_cont <- 94   # controls drawn per training batch

# Remaining (UNION) cases and controls that have genetic data
BAT_Case <- OCD %>%
    filter((OCD == 1) & (SampleID %in% UNION) & (SampleID %in% Gene$SampleID)) %>%
    pull(SampleID)

BAT_Cont <- OCD %>%
    filter((OCD == 0) & (SampleID %in% UNION) & (SampleID %in% Gene$SampleID)) %>%
    pull(SampleID)

# Batch table restricted to samples with genetic data
BAT_GENE <- BAT %>% filter(SampleID %in% Gene$SampleID)

while (length(BAT_Cont) >= n_cont) {
    name <- paste0("Train_", i)
    # Cases are sampled before controls to keep the random stream order fixed
    SEL <- c(sample(BAT_Case, n_case), sample(BAT_Cont, n_cont))
    BAT_GENE[[name]] <- as.numeric(BAT_GENE$SampleID %in% SEL)
    BAT_Cont <- BAT_Cont[!BAT_Cont %in% SEL]
    i <- i + 1
    cat(name, "-", length(BAT_Cont), "\n")
}

# Derive the batch columns from the table itself instead of hard-coding
# "Train_77" and the positions 3:81 / 6:81: the number of Train_ columns
# depends on how many controls were available, and fixed positions silently
# drop the last batches (or fail when there are fewer columns).
train_cols <- grep("^Train_[0-9]+$", names(BAT_GENE), value = TRUE)
flag_cols <- c("Experimental", "Test", "Valid", train_cols)

# ALL   = number of batches (experimental, test, validation, training) a sample is in
# TRAIN = number of training batches a sample is in
# rowSums() is vectorised, so the result is an ordinary (non-rowwise) data frame.
BAT_GENE <- BAT_GENE %>%
    mutate(across(all_of(flag_cols), as.numeric)) %>%
    mutate(
        ALL = rowSums(across(all_of(flag_cols))),
        TRAIN = rowSums(across(all_of(train_cols)))
    )

saveRDS(BAT_GENE, file = "./CSI/Preprocessed/Batch_Gene.rds")

In [None]:
# Create batches for pheno set
#
# Each pass of the loop adds one 0/1 column Train_<i> that marks a batch of
# `n_case` cases and `n_cont` controls. Controls are removed from the pool
# after they are used; cases are re-drawn from the full case pool every time.
i <- 1
set.seed(1)

n_case <- 100  # cases drawn per training batch
n_cont <- 124  # controls drawn per training batch

# Remaining (UNION) cases and controls that have phenotype data
BAT_Case <- OCD %>%
    filter((OCD == 1) & (SampleID %in% UNION) & (SampleID %in% Phen$SampleID)) %>%
    pull(SampleID)

BAT_Cont <- OCD %>%
    filter((OCD == 0) & (SampleID %in% UNION) & (SampleID %in% Phen$SampleID)) %>%
    pull(SampleID)

# Batch table restricted to samples with phenotype data
BAT_PHEN <- BAT %>% filter(SampleID %in% Phen$SampleID)

while (length(BAT_Cont) >= n_cont) {
    name <- paste0("Train_", i)
    # Cases are sampled before controls to keep the random stream order fixed
    SEL <- c(sample(BAT_Case, n_case), sample(BAT_Cont, n_cont))
    BAT_PHEN[[name]] <- as.numeric(BAT_PHEN$SampleID %in% SEL)
    BAT_Cont <- BAT_Cont[!BAT_Cont %in% SEL]
    i <- i + 1
    cat(name, "-", length(BAT_Cont), "\n")
}

# Derive the batch columns from the table itself instead of hard-coding
# "Train_82" and the positions 3:81 / 6:81: the number of Train_ columns
# depends on how many controls were available, and fixed positions silently
# drop the last batches (or fail when there are fewer columns).
train_cols <- grep("^Train_[0-9]+$", names(BAT_PHEN), value = TRUE)
flag_cols <- c("Experimental", "Test", "Valid", train_cols)

# ALL   = number of batches (experimental, test, validation, training) a sample is in
# TRAIN = number of training batches a sample is in
# rowSums() is vectorised, so the result is an ordinary (non-rowwise) data frame.
BAT_PHEN <- BAT_PHEN %>%
    mutate(across(all_of(flag_cols), as.numeric)) %>%
    mutate(
        ALL = rowSums(across(all_of(flag_cols))),
        TRAIN = rowSums(across(all_of(train_cols)))
    )

saveRDS(BAT_PHEN, file = "./CSI/Preprocessed/Batch_Phen.rds")

In [None]:
# Create batches for sMRI set
#
# Each pass of the loop adds one 0/1 column Train_<i> that marks a batch of
# `n_case` cases and `n_cont` controls. Controls are removed from the pool
# after they are used; cases are re-drawn from the full case pool every time.
i <- 1
set.seed(1)

n_case <- 100  # cases drawn per training batch
n_cont <- 109  # controls drawn per training batch

# Remaining (UNION) cases and controls that have sMRI data
BAT_Case <- OCD %>%
    filter((OCD == 1) & (SampleID %in% UNION) & (SampleID %in% sMRI$SampleID)) %>%
    pull(SampleID)

BAT_Cont <- OCD %>%
    filter((OCD == 0) & (SampleID %in% UNION) & (SampleID %in% sMRI$SampleID)) %>%
    pull(SampleID)

# Batch table restricted to samples with sMRI data
BAT_sMRI <- BAT %>% filter(SampleID %in% sMRI$SampleID)

while (length(BAT_Cont) >= n_cont) {
    name <- paste0("Train_", i)
    # Cases are sampled before controls to keep the random stream order fixed
    SEL <- c(sample(BAT_Case, n_case), sample(BAT_Cont, n_cont))
    BAT_sMRI[[name]] <- as.numeric(BAT_sMRI$SampleID %in% SEL)
    BAT_Cont <- BAT_Cont[!BAT_Cont %in% SEL]
    i <- i + 1
    cat(name, "-", length(BAT_Cont), "\n")
}

# Derive the batch columns from the table itself instead of hard-coding
# "Train_93" and the positions 3:81 / 6:81: the number of Train_ columns
# depends on how many controls were available, and fixed positions silently
# drop the last batches (or fail when there are fewer columns).
train_cols <- grep("^Train_[0-9]+$", names(BAT_sMRI), value = TRUE)
flag_cols <- c("Experimental", "Test", "Valid", train_cols)

# ALL   = number of batches (experimental, test, validation, training) a sample is in
# TRAIN = number of training batches a sample is in
# rowSums() is vectorised, so the result is an ordinary (non-rowwise) data frame.
BAT_sMRI <- BAT_sMRI %>%
    mutate(across(all_of(flag_cols), as.numeric)) %>%
    mutate(
        ALL = rowSums(across(all_of(flag_cols))),
        TRAIN = rowSums(across(all_of(train_cols)))
    )

saveRDS(BAT_sMRI, file = "./CSI/Preprocessed/Batch_sMRI.rds")

In [None]:
# Create batches for dMRI set
#
# Each pass of the loop adds one 0/1 column Train_<i> that marks a batch of
# `n_case` cases and `n_cont` controls. Controls are removed from the pool
# after they are used; cases are re-drawn from the full case pool every time.
i <- 1
set.seed(1)

n_case <- 100  # cases drawn per training batch
n_cont <- 109  # controls drawn per training batch

# Remaining (UNION) cases and controls that have dMRI data
BAT_Case <- OCD %>%
    filter((OCD == 1) & (SampleID %in% UNION) & (SampleID %in% dMRI$SampleID)) %>%
    pull(SampleID)

BAT_Cont <- OCD %>%
    filter((OCD == 0) & (SampleID %in% UNION) & (SampleID %in% dMRI$SampleID)) %>%
    pull(SampleID)

# Batch table restricted to samples with dMRI data
BAT_dMRI <- BAT %>% filter(SampleID %in% dMRI$SampleID)

while (length(BAT_Cont) >= n_cont) {
    name <- paste0("Train_", i)
    # Cases are sampled before controls to keep the random stream order fixed
    SEL <- c(sample(BAT_Case, n_case), sample(BAT_Cont, n_cont))
    BAT_dMRI[[name]] <- as.numeric(BAT_dMRI$SampleID %in% SEL)
    BAT_Cont <- BAT_Cont[!BAT_Cont %in% SEL]
    i <- i + 1
    cat(name, "-", length(BAT_Cont), "\n")
}

# Derive the batch columns from the table itself instead of hard-coding
# "Train_93" and the positions 3:81 / 6:81: the number of Train_ columns
# depends on how many controls were available, and fixed positions silently
# drop the last batches (or fail when there are fewer columns).
train_cols <- grep("^Train_[0-9]+$", names(BAT_dMRI), value = TRUE)
flag_cols <- c("Experimental", "Test", "Valid", train_cols)

# ALL   = number of batches (experimental, test, validation, training) a sample is in
# TRAIN = number of training batches a sample is in
# rowSums() is vectorised, so the result is an ordinary (non-rowwise) data frame.
BAT_dMRI <- BAT_dMRI %>%
    mutate(across(all_of(flag_cols), as.numeric)) %>%
    mutate(
        ALL = rowSums(across(all_of(flag_cols))),
        TRAIN = rowSums(across(all_of(train_cols)))
    )

saveRDS(BAT_dMRI, file = "./CSI/Preprocessed/Batch_dMRI.rds")

In [None]:
# Create batches for rsfMRI set
#
# Each pass of the loop adds one 0/1 column Train_<i> that marks a batch of
# `n_case` cases and `n_cont` controls. Controls are removed from the pool
# after they are used; cases are re-drawn from the full case pool every time.
i <- 1
set.seed(1)

n_case <- 100  # cases drawn per training batch
n_cont <- 109  # controls drawn per training batch

# Remaining (UNION) cases and controls that have rsfMRI data
BAT_Case <- OCD %>%
    filter((OCD == 1) & (SampleID %in% UNION) & (SampleID %in% rsfMRI$SampleID)) %>%
    pull(SampleID)

BAT_Cont <- OCD %>%
    filter((OCD == 0) & (SampleID %in% UNION) & (SampleID %in% rsfMRI$SampleID)) %>%
    pull(SampleID)

# Batch table restricted to samples with rsfMRI data
BAT_rsfMRI <- BAT %>% filter(SampleID %in% rsfMRI$SampleID)

while (length(BAT_Cont) >= n_cont) {
    name <- paste0("Train_", i)
    # Cases are sampled before controls to keep the random stream order fixed
    SEL <- c(sample(BAT_Case, n_case), sample(BAT_Cont, n_cont))
    BAT_rsfMRI[[name]] <- as.numeric(BAT_rsfMRI$SampleID %in% SEL)
    BAT_Cont <- BAT_Cont[!BAT_Cont %in% SEL]
    i <- i + 1
    cat(name, "-", length(BAT_Cont), "\n")
}

# Derive the batch columns from the table itself instead of hard-coding
# "Train_23" and the positions 3:81 / 6:81: the number of Train_ columns
# depends on how many controls were available, and positions past the last
# column yield NA names that make the column selection fail.
train_cols <- grep("^Train_[0-9]+$", names(BAT_rsfMRI), value = TRUE)
flag_cols <- c("Experimental", "Test", "Valid", train_cols)

# ALL   = number of batches (experimental, test, validation, training) a sample is in
# TRAIN = number of training batches a sample is in
# rowSums() is vectorised, so the result is an ordinary (non-rowwise) data frame.
BAT_rsfMRI <- BAT_rsfMRI %>%
    mutate(across(all_of(flag_cols), as.numeric)) %>%
    mutate(
        ALL = rowSums(across(all_of(flag_cols))),
        TRAIN = rowSums(across(all_of(train_cols)))
    )

saveRDS(BAT_rsfMRI, file = "./CSI/Preprocessed/Batch_rsfMRI.rds")

In [None]:
# Create batches for tsfMRI set
#
# Each pass of the loop adds one 0/1 column Train_<i> that marks a batch of
# `n_case` cases and `n_cont` controls. Controls are removed from the pool
# after they are used; cases are re-drawn from the full case pool every time.
i <- 1
set.seed(1)

n_case <- 100  # cases drawn per training batch
n_cont <- 109  # controls drawn per training batch

# Remaining (UNION) cases and controls that have tsfMRI data
BAT_Case <- OCD %>%
    filter((OCD == 1) & (SampleID %in% UNION) & (SampleID %in% tsfMRI$SampleID)) %>%
    pull(SampleID)

BAT_Cont <- OCD %>%
    filter((OCD == 0) & (SampleID %in% UNION) & (SampleID %in% tsfMRI$SampleID)) %>%
    pull(SampleID)

# Batch table restricted to samples with tsfMRI data
BAT_tsfMRI <- BAT %>% filter(SampleID %in% tsfMRI$SampleID)

while (length(BAT_Cont) >= n_cont) {
    name <- paste0("Train_", i)
    # Cases are sampled before controls to keep the random stream order fixed
    SEL <- c(sample(BAT_Case, n_case), sample(BAT_Cont, n_cont))
    BAT_tsfMRI[[name]] <- as.numeric(BAT_tsfMRI$SampleID %in% SEL)
    BAT_Cont <- BAT_Cont[!BAT_Cont %in% SEL]
    i <- i + 1
    cat(name, "-", length(BAT_Cont), "\n")
}

# Derive the batch columns from the table itself instead of hard-coding
# "Train_23" and the positions 3:81 / 6:81: the number of Train_ columns
# depends on how many controls were available, and positions past the last
# column yield NA names that make the column selection fail.
train_cols <- grep("^Train_[0-9]+$", names(BAT_tsfMRI), value = TRUE)
flag_cols <- c("Experimental", "Test", "Valid", train_cols)

# ALL   = number of batches (experimental, test, validation, training) a sample is in
# TRAIN = number of training batches a sample is in
# rowSums() is vectorised, so the result is an ordinary (non-rowwise) data frame.
BAT_tsfMRI <- BAT_tsfMRI %>%
    mutate(across(all_of(flag_cols), as.numeric)) %>%
    mutate(
        ALL = rowSums(across(all_of(flag_cols))),
        TRAIN = rowSums(across(all_of(train_cols)))
    )

saveRDS(BAT_tsfMRI, file = "./CSI/Preprocessed/Batch_tsfMRI.rds")

In [None]:
# Save Kernels: write each in-memory kernel to its own Kernel_<name>.rds file
Gene %>% saveRDS(file = "./CSI/Preprocessed/Kernel_Gene.rds")
Phen %>% saveRDS(file = "./CSI/Preprocessed/Kernel_Phen.rds")

# Imaging modalities
sMRI %>% saveRDS(file = "./CSI/Preprocessed/Kernel_sMRI.rds")
dMRI %>% saveRDS(file = "./CSI/Preprocessed/Kernel_dMRI.rds")
rsfMRI %>% saveRDS(file = "./CSI/Preprocessed/Kernel_rsfMRI.rds")
tsfMRI %>% saveRDS(file = "./CSI/Preprocessed/Kernel_tsfMRI.rds")

### Specify Experimental Data

In [None]:
# Ambiguous (Experimental == 1) samples that appear in every batch table;
# they will be used for the ML-PRS part.
# The tables are visited in the order Phen, sMRI, dMRI, rsfMRI, tsfMRI, Gene.
Intersection <- Reduce(
    intersect,
    lapply(
        list(BAT_PHEN, BAT_sMRI, BAT_dMRI, BAT_rsfMRI, BAT_tsfMRI, BAT_GENE),
        function(batch_table) filter(batch_table, Experimental == 1)$SampleID
    )
)

length(Intersection)

In [None]:
# Keep only the intersected experimental samples from the phenotype batch
# table, reduced to the ID and the Experimental flag, and save that table
BAT_EXP <- select(
    filter(BAT_PHEN, SampleID %in% Intersection),
    SampleID, Experimental
)

dim(BAT_EXP)
saveRDS(BAT_EXP, file = "./CSI/Preprocessed/Batch_Experimental.rds")