# Load libraries

In [1]:
#rhumba::set_channels(c("conda-forge", "default")) # Rhumba makes it easier to install packages in conda; set channels for rhumba

# Packages this notebook attaches, in the order they must be attached (the
# order determines which package wins when function names are masked).
# Each one is installed through rhumba under its conda name ("r-<pkg>") when it
# is missing, and is then ALWAYS attached with library(). The previous
# `if (!require(pkg)) install(pkg)` form installed a missing package but never
# attached it, so the first run on a fresh environment failed further down.
required_pkgs <- c("psych", "dplyr", "ggplot2", "car", "janitor", "readxl")
for (pkg in required_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    rhumba::install(paste0("r-", pkg))
  }
  library(pkg, character.only = TRUE)
}
#if (!require("easystats")) install.packages("easystats")
library(matrixStats)
#if (!require("remotes")) rhumba::install("r-remotes")
#if (!require("ggseg")) install.packages("ggseg")
#library(ggsegDKT)

Loading required package: psych

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: ggplot2


Attaching package: ‘ggplot2’


The following objects are masked from ‘package:psych’:

    %+%, alpha


Loading required package: car

Loading required package: carData


Attaching package: ‘car’


The following object is masked from ‘package:dplyr’:

    recode


The following object is masked from ‘package:psych’:

    logit


Loading required package: janitor


Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test


Loading required package: readxl


Attaching package: ‘matrixStats’


The following object is masked from ‘package:dplyr’:

    count




# Read data and exclude individuals with neurological disorders

In [2]:
# Read the phenotype extract and the ICD-10 diagnosis-date table, then join
# them into one data frame.
# NOTE(review): merge() without `by` joins on every column name the two tables
# share -- confirm the participant id is the only shared column.
data <- read.table(
  "/dagher/dagher11/filip/MAPT_OB/data/extracted_data_updated_2024.csv",
  header = TRUE, na.strings = "", sep = ",", quote = "\""
)
icddate <- read.table(
  "/dagher/dagher11/filip/UPF/data/icd10date.csv",
  header = TRUE, na.strings = "", sep = ",", quote = "\""
)
data <- merge(data, icddate)

# ICD-10 code prefixes (regular expressions anchored at the start of the code)
# that flag a participant for exclusion.
# G - nervous system; E0 - thyroid; E10 - diabetes; E2-3 - endocrine glands;
# A8 - viral infections of the CNS; C70-72 - malignant neoplasms of brain,
# meninges and spinal cord; I6 - cerebrovascular.
# NOTE(review): an earlier version of this comment also listed
# "F - mental and behavioural" and "E11-14 - diabetes", but neither is matched
# by the patterns below (only E10 is). Confirm which set is intended.
exclusions_neuro <- c(
  "^G", "^E0", "^E10", "^E2", "^E3", "^A8", "^C70", "^C71", "^C72", "^I6"
)

# Diagnosis columns (icd10_41270*) and their date columns (icd10_41280*) are
# paired by position: the k-th diagnosis column goes with the k-th date column.
icd_diag_cols <- grep("icd10_41270", colnames(data))
icd_dates_cols <- grep("icd10_41280", colnames(data))
if (length(icd_dates_cols) < length(icd_diag_cols)) {
  stop(
    "Found fewer 'icd10_41280' date columns than 'icd10_41270' diagnosis ",
    "columns; cannot pair each diagnosis with its date.",
    call. = FALSE
  )
}
if (length(icd_dates_cols) != length(icd_diag_cols)) {
  warning(
    "Number of 'icd10_41280' date columns differs from the number of ",
    "'icd10_41270' diagnosis columns; positional pairing may be misaligned.",
    call. = FALSE
  )
}

# One alternation regex is equivalent to testing each prefix in turn.
neuro_pattern <- paste(exclusions_neuro, collapse = "|")
# Dates are compared exactly as read from the CSVs.
# NOTE(review): presumably ISO "yyyy-mm-dd" strings, for which lexical order
# equals chronological order -- confirm the format in both files.
assessment_date <- data$date_of_attending_assessment_centre_53.2.0

# neurological_disorder is 1 when any diagnosis column holds an excluded code
# whose date is earlier than the assessment-centre date, and NA otherwise.
data$neurological_disorder <- NA
for (k in seq_along(icd_diag_cols)) {
  has_code <- grepl(neuro_pattern, data[[icd_diag_cols[k]]])
  before_assessment <- data[[icd_dates_cols[k]]] < assessment_date
  # which() drops rows where the date comparison is NA (missing date).
  data$neurological_disorder[which(has_code & before_assessment)] <- 1
}

data$excluded <- NA
data$excluded[which(data$neurological_disorder == 1)] <- 1

# Keep only participants with a value for this cortical-thickness measure.
data <- data[
  !is.na(data$mean_thickness_of_caudalanteriorcingulate_left_hemisphere_27174.2.0),
]
data$included <- car::recode(data$excluded, "1='excluded'; else='included'")
nrow(data)
# Select subjects if included.
# Despite its name, data_excluded holds the participants that REMAIN after
# the exclusions have been applied.
data_excluded <- subset(data, included == "included")


ukbb_all <- data_excluded

# Derived covariates: waist-to-hip ratio and squared age.
ukbb_all$WHR <- ukbb_all$waist_circumference_48.0.0 / ukbb_all$hip_circumference_49.0.0
ukbb_all$agesq <- ukbb_all$age_when_attended_assessment_centre_21003.2.0^2

nrow(ukbb_all)

In [None]:
46810
41175

# Remove outliers

In [3]:
# Replace outlying values with NA, column by column, in every numeric column
# of ukbb_all. A value is an outlier when it lies more than `iqr_multiplier`
# interquartile ranges below the first quartile or above the third quartile.
# The NA count is printed before and after so the number of removed values
# can be read off.
sum(is.na(ukbb_all))

iqr_multiplier <- 2.2
# Identifier columns are numeric but are not measurements; "eid" is the join
# key used when merging the genotype tables and must never be blanked.
id_cols <- "eid"

for (j in seq_len(ncol(ukbb_all))) {
  values <- ukbb_all[[j]]
  if (!is.numeric(values) || colnames(ukbb_all)[j] %in% id_cols) {
    next
  }
  quartiles <- quantile(values, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  # Skip all-NA columns (quartiles are NA) and columns whose IQR is exactly 0
  # (e.g. a flag that is constant for >= 75% of rows): with a zero IQR both
  # fences collapse onto the quartile and every other value would be erased.
  if (is.na(iqr) || iqr == 0) {
    next
  }
  lower <- quartiles[1] - iqr_multiplier * iqr
  upper <- quartiles[2] + iqr_multiplier * iqr
  # which() ignores rows that are already NA.
  ukbb_all[[j]][which(values < lower | values > upper)] <- NA
}

sum(is.na(ukbb_all))

# Merge with genotype data

In [4]:
# Read the UKBB_CA_CT_CV_SCV table. skip = 1 discards the first line of the
# file, so the header is taken from the second line.
# NOTE(review): data_conf is only referenced by the commented-out merge in the
# genotype-merge cell; confirm whether it is still needed.
data_conf <- read.table(
  "/dagher/dagher11/filip/MAPT_OB/data/UKBB_CA_CT_CV_SCV.csv",
  sep = ",", header = TRUE, skip = 1
)
#data_conf=clean_names(data_conf)

In [5]:
# Read the APOE and MAPT genotype tables and give the LAST column of each a
# fixed name (haplotype_APOE / haplotype_MAPT); all other column names are
# kept as read.
# The last column is renamed directly by position. The previous form,
# `colnames(x)[1:length(colnames(x))-1]`, parsed as `(1:n) - 1`, i.e. `0:(n-1)`,
# and only produced the intended names because R silently drops index 0.
data_APOE <- read_excel("/dagher/dagher11/filip/MAPT_OB/data/APOE_annotated.xlsx")
colnames(data_APOE)[ncol(data_APOE)] <- "haplotype_APOE"

data_MAPT <- read.table(
  "/dagher/dagher11/filip/MAPT_OB/data/MAPT_haplotype_UKBB.csv",
  sep = ",", header = TRUE
)
colnames(data_MAPT)[ncol(data_MAPT)] <- "haplotype_MAPT"

In [6]:
# Left-join the MAPT and then the APOE genotype tables onto the phenotype
# data, matching ukbb_all$eid to the `fid` column of each genotype table.
# all.x = TRUE keeps participants without genotype data (their genotype
# columns become NA).
n_participants <- nrow(ukbb_all)
data_all <- merge(ukbb_all, data_MAPT, by.x = "eid", by.y = "fid", all.x = TRUE)
data_all <- merge(data_all, data_APOE, by.x = "eid", by.y = "fid", all.x = TRUE)
# A left join adds rows only when a key is duplicated in a right-hand table,
# which would silently duplicate participants in the saved dataset.
if (nrow(data_all) != n_participants) {
  warning(
    "Merging genotype data changed the row count from ", n_participants,
    " to ", nrow(data_all), "; check for duplicated 'fid' values.",
    call. = FALSE
  )
}
#data_all=merge(data_all, dplyr::select(data_conf, encoded_anonymised_participant_id_eid,
#                                      genotype_measurement_batch_uses_data_coding_22000_22000_0_0),
#              by.x='eid', by.y='encoded_anonymised_participant_id_eid', all.x=T)

# Save

In [7]:
# Write the merged dataset to CSV, quoting character/factor fields and
# omitting row names.
write.csv(
  data_all,
  "/dagher/dagher11/filip/MAPT_OB/data/dataset_excluded_new.csv",
  quote = TRUE, row.names = FALSE
)