# Calculating the PCA for the GMMAT and SMMAT Analyses 
This notebook documents the recalculation of principal components (PCs) for the population-specific GMMAT and SMMAT analyses.  

Pheno data
 > /mnt/mfs/statgen/alzheimers-family/pheno/pheno_updated_20221121/
 
Geno data: jointly called WGS data from EFIGA and NIALOAD
 > /mnt/mfs/statgen/alzheimers-family/normalized_bed/normalized_merged_autosome.*  


# Common Variants analyses: PCA

In [1]:
# Split the jointly called genotype file per population: run step qc:1 of
# GWAS_QC.ipynb once for each population, passing that population's .id file
# as --keep_samples and the population label as --name.
ml Singularity
ad_root=/mnt/mfs/statgen/alzheimers-family
for pop in African European Hispanic; do
  sos run ~/project2022/notebook/AD/xqtl-pipeline/pipeline/GWAS_QC.ipynb qc:1 \
    --cwd "${ad_root}/AD_common_variants/PCA" \
    --genoFile "${ad_root}/normalized_bed/normalized_merged_autosome.bed" \
    --keep_samples "${ad_root}/pheno/pheno_updated_20221121/${pop}.id" \
    --name "${pop}" \
    --container /mnt/vast/hpc/csg/containers/lmm.sif
done

INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mbasic QC filters[0m is [32mcompleted[0m.
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.bed[0m
INFO: Workflow qc (ID=w40505755e477e8fe) is executed successfully with 1 completed step.
INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mbasic QC filters[0m is [32mcompleted[0m.
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.European.filtered.bed[0m
INFO: Workflow qc (ID=w806094c862aae420) is executed successfully with 1 completed step.
INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mbasic QC filters[0m is [32mcompleted[0m.
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalize

In [2]:
# Run the `king` workflow of GWAS_QC.ipynb on each per-population fileset;
# results are written under PCA/King.
pca_dir=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA
for pop in African European Hispanic; do
  sos run ~/project2022/notebook/AD/xqtl-pipeline/pipeline/GWAS_QC.ipynb king \
    --cwd "${pca_dir}/King" \
    --container /mnt/vast/hpc/csg/containers/lmm.sif \
    --genoFile "${pca_dir}/normalized_merged_autosome.${pop}.filtered.bed"
done

INFO: Running [32mking_1[0m: Inference of relationships in the sample to identify closely related individuals
INFO: [32mking_1[0m is [32mcompleted[0m.
INFO: [32mking_1[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/King/normalized_merged_autosome.African.filtered.kin0[0m
INFO: Running [32mking_2[0m: Select a list of unrelated individual with an attempt to maximize the unrelated individuals selected from the data
INFO: [32mking_2[0m is [32mcompleted[0m.
INFO: [32mking_2[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/King/normalized_merged_autosome.African.filtered.related_id[0m
INFO: Running [32mking_3[0m: Split genotype data into related and unrelated samples, if related individuals are detected
INFO: [32mking_3[0m is [32mcompleted[0m.
INFO: [32mking_3[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/King/normalized_merged_autosome.African.filtered.unrelated.bed /mnt/mfs/statg

In [9]:
# QC + LD pruning on the unrelated individuals of each population.
# The samples listed in the King *.related_id file are passed as
# --remove_samples, so only the remaining samples go through the `qc` workflow.
#
# The module load does not depend on the population, so it is done once
# before the loop instead of on every iteration.
ml Singularity

pca_dir=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA
for pop in African European Hispanic; do
  sos run ~/project2022/notebook/AD/xqtl-pipeline/pipeline/GWAS_QC.ipynb qc \
    --cwd "${pca_dir}" \
    --genoFile "${pca_dir}/normalized_merged_autosome.${pop}.filtered.bed" \
    --remove_samples "${pca_dir}/King/normalized_merged_autosome.${pop}.filtered.related_id" \
    --maf_filter 0.01 \
    --geno_filter 0.1 \
    --mind_filter 0.1 \
    --hwe_filter 5e-08 \
    --name unrelated \
    --window 50 \
    --shift 10 \
    --r2 0.2 \
    --container /mnt/mfs/statgen/containers/lmm.sif
done

INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mqc_1[0m (index=0) is [32mignored[0m due to saved signature
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.unrelated.filtered.bed[0m
INFO: Running [32mLD pruning[0m: LD prunning and remove related individuals (both ind of a pair)
INFO: [32mLD pruning[0m is [32mcompleted[0m.
INFO: [32mLD pruning[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.unrelated.filtered.prune.bed /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.unrelated.filtered.prune.in[0m
INFO: Workflow qc (ID=wb8444235f09d5938) is executed successfully with 1 completed step and 1 ignored step.
INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mqc_1[0m (index=0) is [32

In [10]:
# Line counts of the African prune.in and prune.out variant lists.
prune_prefix=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.unrelated.filtered.prune
wc -l < "${prune_prefix}.in"
wc -l < "${prune_prefix}.out"

3799108
16137551


In [11]:
# Pull the sample-count line from the African unrelated QC log and the
# "remain" lines from the corresponding pruning log.
unrel_prefix=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.unrelated.filtered
grep 'samples (' "${unrel_prefix}.log"
grep 'remain' "${unrel_prefix}.prune.log"

45 samples (0 females, 0 males, 45 ambiguous; 45 founders) loaded from
--extract: 3812519 variants remaining.
3812519 variants remaining after main filters.


In [12]:
# Number of lines in the African pruned .bim file.
wc -l \
  /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.unrelated.filtered.prune.bim

3812519 /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.unrelated.filtered.prune.bim


In [15]:
# Related individuals, same set of variants: run qc:1 with the King
# *.related_id file as --keep_samples and the unrelated prune.in list as
# --keep_variants.
pca_dir=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA
for pop in African European Hispanic; do
  geno_prefix="${pca_dir}/normalized_merged_autosome.${pop}.filtered"
  sos run ~/project2022/notebook/AD/xqtl-pipeline/pipeline/GWAS_QC.ipynb qc:1 \
    --cwd "${pca_dir}" \
    --genoFile "${geno_prefix}.bed" \
    --keep_samples "${pca_dir}/King/normalized_merged_autosome.${pop}.filtered.related_id" \
    --keep_variants "${geno_prefix}.unrelated.filtered.prune.in" \
    --maf_filter 0 \
    --geno_filter 1 \
    --mind_filter 0.1 \
    --hwe_filter 0 \
    --name related \
    --container /mnt/mfs/statgen/containers/lmm.sif
done

INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mbasic QC filters[0m is [32mcompleted[0m.
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered.related.filtered.extracted.bed[0m
INFO: Workflow qc (ID=w91cfa771347598a6) is executed successfully with 1 completed step.
INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mbasic QC filters[0m is [32mcompleted[0m.
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.European.filtered.related.filtered.extracted.bed[0m
INFO: Workflow qc (ID=w83ef60972346f2b2) is executed successfully with 1 completed step.
INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mbasic QC filters[0m is [32mcompleted[0m.
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/stat

In [16]:
# "remain" lines from the African related-sample extraction log, followed by
# the sample-count line from the African unrelated QC log.
african_prefix=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/normalized_merged_autosome.African.filtered
grep 'remain' "${african_prefix}.related.filtered.extracted.log"
grep 'samples (' "${african_prefix}.unrelated.filtered.log"

--extract: 3812887 variants remaining.
--keep: 68 samples remaining.
68 samples (0 females, 0 males, 68 ambiguous; 68 founders) remaining after main
3812887 variants remaining after main filters.
45 samples (0 females, 0 males, 45 ambiguous; 45 founders) loaded from


In [17]:
# Run the `flashpca` workflow of PCA.ipynb on each population's pruned
# unrelated fileset; outputs are written under PCA/plots.
ad_root=/mnt/mfs/statgen/alzheimers-family
pca_dir="${ad_root}/AD_common_variants/PCA"
for pop in African European Hispanic; do
  sos run ~/project2022/notebook/AD/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb flashpca \
    --cwd "${pca_dir}/plots" \
    --genoFile "${pca_dir}/normalized_merged_autosome.${pop}.filtered.unrelated.filtered.prune.bed" \
    --phenoFile "${ad_root}/pheno/pheno_updated_20221121/${pop}.txt" \
    --k 10 \
    --label_col pop \
    --pop_col pop \
    --container /mnt/mfs/statgen/containers/lmm.sif
done

INFO: Running [32mflashpca_1[0m: Run PCA analysis using flashpca
INFO: [32mflashpca_1[0m is [32mcompleted[0m.
INFO: [32mflashpca_1[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.rds[0m
INFO: Running [32mflashpca_2[0m: 
INFO: [32mflashpca_2[0m is [32mcompleted[0m (pending nested workflow).
INFO: Running [32mdetect_outliers[0m: Calculate Mahalanobis distance per population and report outliers
INFO: [32mdetect_outliers[0m is [32mcompleted[0m.
INFO: [32mdetect_outliers[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.mahalanobis.rds /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.outliers... (5 items)[0m
INFO: [32mflashpca_2[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.mahalanobis.rds /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.outliers... (5 items)[0m
INFO: Running

In [None]:
# Step 5: project the related samples back onto each population's PCA model
# (<pop>.pca.rds), then detect and report outlier samples.
ad_root=/mnt/mfs/statgen/alzheimers-family
pca_dir="${ad_root}/AD_common_variants/PCA"
for pop in African European Hispanic; do
  sos run ~/project2022/notebook/AD/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb project_samples \
    --cwd "${pca_dir}/plots" \
    --genoFile "${pca_dir}/normalized_merged_autosome.${pop}.filtered.related.filtered.extracted.bed" \
    --phenoFile "${ad_root}/pheno/pheno_updated_20221121/${pop}.txt" \
    --pca_model "${pca_dir}/plots/${pop}.pca.rds" \
    --k 10 \
    --label_col pop \
    --pop_col pop \
    --container /mnt/vast/hpc/csg/containers/flashpcaR.sif
done

INFO: Running [32mproject_samples_1[0m: Project back to PCA model additional samples
INFO: [32mproject_samples_1[0m is [32mcompleted[0m.
INFO: [32mproject_samples_1[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.projected.rds[0m
INFO: Running [32mproject_samples_2[0m: 
INFO: [32mproject_samples_2[0m is [32mcompleted[0m (pending nested workflow).
INFO: Running [32mdetect_outliers[0m: Calculate Mahalanobis distance per population and report outliers
INFO: [32mdetect_outliers[0m is [32mcompleted[0m.
INFO: [32mdetect_outliers[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.projected.mahalanobis.rds /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.projected.outliers... (5 items)[0m
INFO: [32mproject_samples_2[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.projected.mahalanobis.rds /mnt/mfs/statgen/alz

In [None]:
# Step 5, part 2: plot the projected PCA results (<pop>.pca.projected.rds)
# for each population with the `plot_pca` workflow.
ad_root=/mnt/mfs/statgen/alzheimers-family
pca_dir="${ad_root}/AD_common_variants/PCA"
for pop in African European Hispanic; do
  sos run ~/project2022/notebook/AD/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb plot_pca \
    --cwd "${pca_dir}/plots" \
    --genoFile "${pca_dir}/normalized_merged_autosome.${pop}.filtered.bed" \
    --phenoFile "${ad_root}/pheno/pheno_updated_20221121/${pop}.txt" \
    --pop_col pop \
    --label_col pop \
    --k 10 \
    --plot_data "${pca_dir}/plots/${pop}.pca.projected.rds" \
    --container /mnt/vast/hpc/csg/containers/flashpcaR.sif
done

INFO: Running [32mplot_pca[0m: Plot PCA results. Can be used independently as "plot_pca" or combined with other workflow as eg "flashpca+plot_pca"
INFO: [32mplot_pca[0m is [32mcompleted[0m.
INFO: [32mplot_pca[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.projected.pc.png /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/African.pca.projected.scree.png... (3 items)[0m
INFO: Workflow plot_pca (ID=w11ec71d77a952316) is executed successfully with 1 completed step.
INFO: Running [32mplot_pca[0m: Plot PCA results. Can be used independently as "plot_pca" or combined with other workflow as eg "flashpca+plot_pca"
INFO: [32mplot_pca[0m is [32mcompleted[0m.
INFO: [32mplot_pca[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/European.pca.projected.pc.png /mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA/plots/European.pca.projected.scree.png... (3 items)[0m
INFO: Workflow plot_p

# Generate QCed genoFile without LD pruning to use in the GMMAT analysis


In [3]:
# Build QCed (not LD-pruned) filesets per population under geno_qced/:
#   1. unrelated individuals: `qc_no_prune` with the MAF/geno/mind/HWE filters,
#      removing the samples listed in the King *.related_id file;
#   2. related individuals: `qc:1`, keeping only those samples and using the
#      .bim written by step 1 as the --keep_variants list.
#
# Step 2 consumes the output of step 1, so it is skipped for a population
# when step 1 fails; otherwise it could silently pick up a stale or missing
# .bim from an earlier run.
pca_dir=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/PCA
qced_dir=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced
for pop in African European Hispanic; do
  geno_bed="${pca_dir}/normalized_merged_autosome.${pop}.filtered.bed"
  related_ids="${pca_dir}/King/normalized_merged_autosome.${pop}.filtered.related_id"

  # unrelated individuals
  if ! sos run ~/project2022/notebook/AD/xqtl-pipeline/pipeline/GWAS_QC.ipynb qc_no_prune \
    --cwd "${qced_dir}/" \
    --genoFile "${geno_bed}" \
    --remove_samples "${related_ids}" \
    --maf_filter 0.01 \
    --geno_filter 0.1 \
    --mind_filter 0.1 \
    --hwe_filter 5e-08 \
    --name unrelated \
    --container /mnt/mfs/statgen/containers/lmm.sif; then
    printf 'qc_no_prune failed for %s; skipping the related-sample step\n' "${pop}" >&2
    continue
  fi

  # related individuals same set of variants
  # NOTE(review): --geno_filter is 0.1 here but 1 in the PCA related-sample
  # step; if it drops variants, the related set will not match the unrelated
  # set exactly — confirm this is intended.
  sos run ~/project2022/notebook/AD/xqtl-pipeline/pipeline/GWAS_QC.ipynb qc:1 \
    --cwd "${qced_dir}/" \
    --genoFile "${geno_bed}" \
    --keep_samples "${related_ids}" \
    --keep_variants "${qced_dir}/normalized_merged_autosome.${pop}.filtered.unrelated.filtered.bim" \
    --maf_filter 0 \
    --geno_filter 0.1 \
    --mind_filter 0.1 \
    --hwe_filter 0 \
    --name related \
    --container /mnt/mfs/statgen/containers/lmm.sif
done

INFO: Running [32mqc_no_prune[0m: Filter SNPs and select individuals
INFO: [32mqc_no_prune[0m is [32mcompleted[0m.
INFO: [32mqc_no_prune[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/normalized_merged_autosome.African.filtered.unrelated.filtered.bed[0m
INFO: Workflow qc_no_prune (ID=w214389013524cfa6) is executed successfully with 1 completed step.
INFO: Running [32mbasic QC filters[0m: Filter SNPs and select individuals
INFO: [32mbasic QC filters[0m is [32mcompleted[0m.
INFO: [32mbasic QC filters[0m output:   [32m/mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/normalized_merged_autosome.African.filtered.related.filtered.extracted.bed[0m
INFO: Workflow qc (ID=w55663afe177e45bb) is executed successfully with 1 completed step.
INFO: Running [32mqc_no_prune[0m: Filter SNPs and select individuals
INFO: [32mqc_no_prune[0m is [32mcompleted[0m.
INFO: [32mqc_no_prune[0m output:   [32m/mnt/mfs/statgen/alzheimers-fam

In [4]:
# Merge the two data sets (related + unrelated QCed filesets) into a single
# PLINK fileset per population, written to geno_qced/<pop>.{bed,bim,fam}.
#
# The former first line, `bash: container= '...lmm.sif'`, was not valid shell
# (bash reported "bash:: command not found") and the variable was never used,
# so it has been dropped. plink is taken from PATH here.
# NOTE(review): if plink is meant to run inside lmm.sif, wrap the call in
# `singularity exec` — confirm.
qced_dir=/mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced
for pop in African European Hispanic; do
  related_prefix="${qced_dir}/normalized_merged_autosome.${pop}.filtered.related.filtered.extracted"
  unrelated_prefix="${qced_dir}/normalized_merged_autosome.${pop}.filtered.unrelated.filtered"
  plink --bfile "${related_prefix}" \
    --bmerge "${unrelated_prefix}.bed" \
             "${unrelated_prefix}.bim" \
             "${unrelated_prefix}.fam" \
    --make-bed --keep-allele-order --out "${qced_dir}/${pop}"
done

bash: bash:: command not found
PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/African.log.
Options in effect:
  --bfile /mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/normalized_merged_autosome.African.filtered.related.filtered.extracted
  --bmerge /mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/normalized_merged_autosome.African.filtered.unrelated.filtered.bed /mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/normalized_merged_autosome.African.filtered.unrelated.filtered.bim /mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/normalized_merged_autosome.African.filtered.unrelated.filtered.fam
  --keep-allele-order
  --make-bed
  --out /mnt/mfs/statgen/alzheimers-family/AD_common_variants/geno_qced/African

515677 MB RAM detected; reserving 25783