# Code for creating phenotype file for running GWAS in Regenie

The phenotype file needs to include the following in its header:

* FID - family identifier (in this case since the participants are unrelated we set the FID to 0 for all rows so that Regenie accepts the file as input)
* IID - the individual identifier number (i.e. subjectID in the provided LASI-DAD folder)
* Phenotypes (can have multiple columns for separate phenotypes that you want to run simultaneous GWAS on) - we are using Neurocognitive Disorder (ncd); high blood pressure (hypertension); type 2 diabetes (T2DM); coronary heart disease (CHD); stroke
    * Because NCD (coded 0, 1, 2) is treated as a continuous phenotype, it is written to the continuous phenotype TSV; all the remaining binary phenotypes are written to a separate file. This is because Regenie needs to be run separately for linear regression (continuous phenotypes) and logistic regression (binary phenotypes).
    
**Note to NEUROHACK facilitators: This code is written for the full LASI-DAD dataset and should be run on the full LASI-DAD data.**

In [1]:
library(dplyr)
library(data.table)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




In [7]:
# Read the two provided LASI-DAD tables: the main phenotype file and the
# health/comorbidity phenotype file.
pheno_main <- fread("LASI_DAD_Neurohack_Phenotype_full.csv")
pheno_comorbid <- fread("LASI_DAD_Neurohack_health_pheno_full.csv")

# Regenie expects a family ID column; the participants are unrelated, so
# every row gets the same constant FID of 0.
pheno_main$FID <- 0

# Preview the first rows of each table.
head(pheno_main)
head(pheno_comorbid)

subjectID,r1agey,ragender,raeducl,ncd,fm_ad_dad,fm_ad_mom,rs429358,rs7412,unrelated,FID
<chr>,<int>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>
L0001,70,1.male,1.less than lower secondary,0,0,0,0,0,1,0
L0002,84,1.male,2.upper secondary & vocational training,0,0,0,0,0,1,0
L0003,62,1.male,2.upper secondary & vocational training,0,0,0,0,0,1,0
L0004,62,2.female,2.upper secondary & vocational training,0,0,0,0,0,1,0
L0005,68,2.female,1.less than lower secondary,0,0,0,0,0,1,0
L0006,66,2.female,1.less than lower secondary,1,0,0,0,0,1,0


subjectID,r1hibpe,r1diabe,r1hearte,r1stroke,r1psyche,r1alzdeme,r1conhrtfe,r1hrtatte,r1hrtrhme,r1hrtatt,r1mbmi,r1mbmicat
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>
L0001,1,1,0,0,0,0,0,0,0,0.0,24.86436,2
L0002,0,1,0,0,0,0,0,0,0,0.0,18.67785,2
L0003,1,0,1,0,0,0,0,1,0,,20.25111,2
L0004,0,0,0,0,0,0,0,0,0,0.0,19.51196,2
L0005,1,0,0,0,0,0,0,0,0,0.0,32.00309,4
L0006,0,0,0,0,0,0,0,0,0,0.0,15.13764,1


In [8]:
# Join the main and comorbidity tables on the participant identifier.
# `all = FALSE` (the default) keeps only participants present in both tables.
full_pheno <- merge(
  x = pheno_main,
  y = pheno_comorbid,
  by = "subjectID",
  all = FALSE
)
head(full_pheno)


subjectID,r1agey,ragender,raeducl,ncd,fm_ad_dad,fm_ad_mom,rs429358,rs7412,unrelated,⋯,r1hearte,r1stroke,r1psyche,r1alzdeme,r1conhrtfe,r1hrtatte,r1hrtrhme,r1hrtatt,r1mbmi,r1mbmicat
<chr>,<int>,<chr>,<chr>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>
L0001,70,1.male,1.less than lower secondary,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0.0,24.86436,2
L0002,84,1.male,2.upper secondary & vocational training,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0.0,18.67785,2
L0003,62,1.male,2.upper secondary & vocational training,0,0,0,0,0,1,⋯,1,0,0,0,0,1,0,,20.25111,2
L0004,62,2.female,2.upper secondary & vocational training,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0.0,19.51196,2
L0005,68,2.female,1.less than lower secondary,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0.0,32.00309,4
L0006,66,2.female,1.less than lower secondary,1,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0.0,15.13764,1


In [12]:
# Keep the Regenie ID columns plus the phenotypes of interest, renaming the
# LASI-DAD variable names to readable phenotype names in the same step.
pheno_regenie <- full_pheno %>%
  select(
    FID,
    IID = subjectID,
    ncd,
    hypertension = r1hibpe,
    T2DM = r1diabe,
    CHD = r1hearte,
    stroke = r1stroke
  )
head(pheno_regenie)

# Continuous phenotype table: IDs plus ncd only.
pheno_cont <- select(pheno_regenie, FID, IID, ncd)
head(pheno_cont)

# Binary phenotype table: IDs plus the four binary disease indicators.
pheno_bin <- select(pheno_regenie, FID, IID, hypertension, T2DM, CHD, stroke)
head(pheno_bin)

FID,IID,ncd,hypertension,T2DM,CHD,stroke
<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>
0,L0001,0,1,1,0,0
0,L0002,0,0,1,0,0
0,L0003,0,1,0,1,0
0,L0004,0,0,0,0,0
0,L0005,0,1,0,0,0
0,L0006,1,0,0,0,0


FID,IID,ncd
<dbl>,<chr>,<int>
0,L0001,0
0,L0002,0
0,L0003,0
0,L0004,0
0,L0005,0
0,L0006,1


FID,IID,hypertension,T2DM,CHD,stroke
<dbl>,<chr>,<int>,<int>,<int>,<int>
0,L0001,1,1,0,0
0,L0002,0,1,0,0
0,L0003,1,0,1,0
0,L0004,0,0,0,0
0,L0005,1,0,0,0
0,L0006,0,0,0,0


In [13]:
# Write the three phenotype tables as tab-separated files for Regenie:
# a header row, no row names, no quoting, and missing values written as "NA".
# TRUE/FALSE are spelled out because the shorthands T/F are ordinary variables
# that can be reassigned elsewhere in a session.
write.table(
  pheno_regenie,
  file = "phenotype_regenie.tsv",
  sep = "\t", na = "NA",
  col.names = TRUE, row.names = FALSE, quote = FALSE
)
write.table(
  pheno_cont,
  file = "continuous_phenotype_regenie.tsv",
  sep = "\t", na = "NA",
  col.names = TRUE, row.names = FALSE, quote = FALSE
)
write.table(
  pheno_bin,
  file = "binary_phenotypes_regenie.tsv",
  sep = "\t", na = "NA",
  col.names = TRUE, row.names = FALSE, quote = FALSE
)