## Notebook to do any additional setup on the instance and pull the input data for analysis

In [1]:
# record when this setup run happened (the timestamp is kept in the cell output)
!date

Thu Feb 18 17:21:23 UTC 2021


In [5]:
import os

In [8]:
# ---- run parameters ----
cohort = 'foundin'
version = 'amppdv1'
cohort_version = '.'.join((cohort, version))
quant_type = 'vst'

# ---- local directory layout on the instance ----
home_dir = '/home/jupyter'
nbs_dir = f'{home_dir}/notebooks'
wrk_dir = f'{home_dir}/{cohort}/eqtl'
geno_dir = wrk_dir + '/genotypes'
expr_dir = wrk_dir + '/expression'
info_dir = wrk_dir + '/sample_info'
public_dir = wrk_dir + '/public'

# ---- input data locations (GCS) ----
data_bucket = 'gs://foundin-processed-assay'
eqtl_bucket_path = data_bucket + '/analysis/eqtl'
nbs_bucket_path = eqtl_bucket_path + '/notebooks'
# wildcard: every per-chromosome file of this cohort/version
genos_bucket_path = f'{eqtl_bucket_path}/genotypes/{cohort_version}.chr*'
# the expression table is renamed on the way down (bucket name -> local name)
quants_bucket_path = (
    f'{data_bucket}/loni/RNAB/aggregated_expression/{quant_type}Table.tsv'
)
quants_local_file = f'{expr_dir}/{quant_type}.genes.tsv'
gencode_bucket_path = (
    f'{eqtl_bucket_path}/expression/'
    'gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
)
tissue_genes_bckt_path = (
    f'{eqtl_bucket_path}/expression/tissue_category_rna_brain_Tissue.tsv'
)
fasta_index_bucket_path = (
    'gs://gcp-public-data--broad-references/hg38/v0/'
    'Homo_sapiens_assembly38.fasta.fai'
)
sample_info_bckt_path = eqtl_bucket_path + '/sample_info'
# the whole gwas folder is used rather than the single file
# {eqtl_bucket_path}/gwas/pd_meta5v2_cojo_results.jma.cojo.csv
gwas_results_bckt_path = eqtl_bucket_path + '/gwas'
scaden_fracs_bckt_path = (
    f'{data_bucket}/analysis/deconvolution/rnab_cell_fracs_scaden.csv'
)
# wildcards below pick up the companion/per-part files of the public results
hipsci_results_bckt_path = (
    f'{eqtl_bucket_path}/public/hipsci/eqtl_summary_stats_renamed/'
    'D52.DA.untreated.qtl_results_all.sorted.txt.gz*'
)
metabrain_results_bckt_path = (
    f'{eqtl_bucket_path}/public/metabrain/'
    '2020-05-26-Cortex-EUR-*-biogenformat.txt.gz'
)

In [4]:
# create the local working tree; exist_ok makes the cell safe to re-run
for this_dir in (wrk_dir, geno_dir, expr_dir, info_dir, public_dir):
    os.makedirs(this_dir, exist_ok=True)

#### pull down the rest of the analysis notebooks

In [5]:
this_cmd = f'gsutil -mq cp -P {nbs_bucket_path}/* {nbs_dir}/'
print(this_cmd)
!{this_cmd}
!ls -lh {nbs_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/notebooks/* /home/jupyter/notebooks/
total 6.5M
-rw-r--r-- 1 jupyter jupyter 7.4K Feb  9 16:52 Readme.md
-rw-r--r-- 1 jupyter jupyter 5.2K Feb 17 22:39 finish_and_push_back.ipynb
-rw-r--r-- 1 jupyter jupyter 4.6K Oct  2 04:03 foundin_cis_eqtl_runner.ipynb
-rw-r--r-- 1 jupyter jupyter  16K Feb 12 05:54 foundin_day_cis_eqtl.ipynb
-rw-r--r-- 1 jupyter jupyter  16K Feb 17 23:20 foundin_day_cis_eqtl_plink2.ipynb
-rw-r--r-- 1 jupyter jupyter  16K Feb 17 23:51 foundin_day_cis_eqtl_tensorqtl.ipynb
-rw-r--r-- 1 jupyter jupyter 891K Feb 18 00:08 foundin_day_expression_prep.ipynb
-rw-r--r-- 1 jupyter jupyter 4.9K Oct  1 21:02 foundin_expr_prep_runner.ipynb
-rw-r--r-- 1 jupyter jupyter 3.1M Feb  9 23:35 foundin_format_expression_covariates.ipynb
-rw-r--r-- 1 jupyter jupyter 9.2K Feb 17 23:50 foundin_frmt_tensorqt_genos.ipynb
-rw-r--r-- 1 jupyter jupyter  19K Oct  1 20:01 foundin_gcs_raw.ipynb
-rw-r--r-- 1 jupyter jupyter 7.6K Oct  1 20:01

#### mirror down input data from the input buckets

In [6]:
# pull genotypes to local disk
this_cmd = f'gsutil -mq cp -P {genos_bucket_path} {geno_dir}/'
print(this_cmd)
!{this_cmd}
!ls -lh {geno_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/genotypes/foundin.amppdv1.chr* /home/jupyter/foundin/eqtl/genotypes/
total 16G
-rw-r--r-- 1 jupyter jupyter 1.3K Feb 18 17:21 foundin.amppdv1.chr1.log
-rw-r--r-- 1 jupyter jupyter  14M Feb 18 17:21 foundin.amppdv1.chr1.pgen
-rw-r--r-- 1 jupyter jupyter 3.2K Feb 18 17:21 foundin.amppdv1.chr1.psam
-rw-r--r-- 1 jupyter jupyter 1.2G Feb 18 17:22 foundin.amppdv1.chr1.pvar
-rw-r--r-- 1 jupyter jupyter 1.3K Feb 18 17:21 foundin.amppdv1.chr10.log
-rw-r--r-- 1 jupyter jupyter 9.2M Feb 18 17:21 foundin.amppdv1.chr10.pgen
-rw-r--r-- 1 jupyter jupyter 3.2K Feb 18 17:21 foundin.amppdv1.chr10.psam
-rw-r--r-- 1 jupyter jupyter 763M Feb 18 17:21 foundin.amppdv1.chr10.pvar
-rw-r--r-- 1 jupyter jupyter 1.3K Feb 18 17:21 foundin.amppdv1.chr11.log
-rw-r--r-- 1 jupyter jupyter 9.0M Feb 18 17:21 foundin.amppdv1.chr11.pgen
-rw-r--r-- 1 jupyter jupyter 3.2K Feb 18 17:21 foundin.amppdv1.chr11.psam
-rw-r--r-- 1 jupyter jupyter 801M Feb 18 17:22 foundin

#### pull expression phenos to local disk

In [7]:
this_cmd = f'gsutil -mq cp -P {quants_bucket_path} {quants_local_file}'
print(this_cmd)
!{this_cmd}

gsutil -mq cp -P gs://foundin-processed-assay/loni/RNAB/aggregated_expression/vstTable.tsv /home/jupyter/foundin/eqtl/expression/vst.genes.tsv


#### pull genome fasta index file

In [8]:
fasta_index_bucket_path
this_cmd = f'gsutil -mq cp -P {fasta_index_bucket_path} {expr_dir}/'
print(this_cmd)
!{this_cmd}

gsutil -mq cp -P gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai /home/jupyter/foundin/eqtl/expression/


#### pull gencode annotation file and the brain tissue gene category file

In [9]:
this_cmd = f'gsutil -mq cp -P {gencode_bucket_path} {expr_dir}/'
print(this_cmd)
!{this_cmd}

this_cmd = f'gsutil -mq cp -P {tissue_genes_bckt_path} {expr_dir}/'
print(this_cmd)
!{this_cmd}

!ls -lh {expr_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/expression/gencode_v29.lncipedia_v5_2_hc.annotation.pkl /home/jupyter/foundin/eqtl/expression/
gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/expression/tissue_category_rna_brain_Tissue.tsv /home/jupyter/foundin/eqtl/expression/
total 1.9G
-rw-r--r-- 1 jupyter jupyter 158K Feb 18 17:24 Homo_sapiens_assembly38.fasta.fai
-rw-r--r-- 1 jupyter jupyter 1.5G Oct  1 20:22 gencode_v29.lncipedia_v5_2_hc.annotation.pkl
-rw-r--r-- 1 jupyter jupyter 5.5M Feb 18 17:24 tissue_category_rna_brain_Tissue.tsv
-rw-r--r-- 1 jupyter jupyter 464M Feb 18 17:24 vst.genes.tsv


#### pull down the sample info

In [10]:
this_cmd = f'gsutil -mq cp -P {sample_info_bckt_path}/* {info_dir}/'
print(this_cmd)
!{this_cmd}

this_cmd = f'gsutil -mq cp -P {scaden_fracs_bckt_path} {info_dir}/'
print(this_cmd)
!{this_cmd}

!ls -lh {info_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/sample_info/* /home/jupyter/foundin/eqtl/sample_info/
gsutil -mq cp -P gs://foundin-processed-assay/analysis/deconvolution/rnab_cell_fracs_scaden.csv /home/jupyter/foundin/eqtl/sample_info/
total 1.1M
-rw-r--r-- 1 jupyter jupyter  11K Feb 18 17:24 Expanded_overview_of_included_PPMI_samples_GRS.csv
-rw-r--r-- 1 jupyter jupyter  21K Feb 18 17:24 Expanded_overview_of_included_PPMI_samples_overview.csv
-rw-r--r-- 1 jupyter jupyter 856K Feb 18 17:24 amppd_demographicsPlus_2019_v1release_1015.csv
-rw-r--r-- 1 jupyter jupyter  13K Feb 18 17:24 cell_metadata.csv
-rw-r--r-- 1 jupyter jupyter 2.5K Oct  1 22:29 foundin.d0.psam
-rw-r--r-- 1 jupyter jupyter 3.7K Oct  1 20:47 foundin.d0.umap.covs.csv
-rw-r--r-- 1 jupyter jupyter 2.4K Oct  2 00:12 foundin.d25.psam
-rw-r--r-- 1 jupyter jupyter 4.2K Oct  1 20:52 foundin.d25.umap.covs.csv
-rw-r--r-- 1 jupyter jupyter 2.4K Oct  2 02:05 foundin.d65.psam
-rw-r--r-- 1 jupyter jupyter 3.7K Oct  1 20

#### pull down any needed GWAS results

In [2]:
this_cmd = f'gsutil -mq cp -P {gwas_results_bckt_path}/* {public_dir}/'
print(this_cmd)
!{this_cmd}
!ls -lh {public_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/gwas/* /home/jupyter/foundin/eqtl/gwas/
total 28K
-rw-r--r-- 1 jupyter jupyter 1.1K Feb 18 23:18 pd.risk.variants
-rw-r--r-- 1 jupyter jupyter 6.9K Apr 15 14:09 pd.table_s2.clean.txt
-rw-r--r-- 1 jupyter jupyter  15K Apr 15 14:09 pd_meta5v2_cojo_results.jma.cojo.csv


#### pull down other relevant CNS eQTL for comparisons
- HipSci (Jerber et al 2021) differentiated day 52 untreated DA neurons
- MetaBrain (de Klein et al 2021) meta eQTL analysis of brain cortex data

In [9]:
this_cmd = f'gsutil -mq cp -P {hipsci_results_bckt_path} {public_dir}/'
print(this_cmd)
!{this_cmd}

this_cmd = f'gsutil -mq cp -P {metabrain_results_bckt_path} {public_dir}/'
print(this_cmd)
!{this_cmd}

!ls -lh {public_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/public/hipsci/eqtl_summary_stats_renamed/D52.DA.untreated.qtl_results_all.sorted.txt.gz* /home/jupyter/foundin/eqtl/public/
gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/public/metabrain/2020-05-26-Cortex-EUR-*-biogenformat.txt.gz /home/jupyter/foundin/eqtl/public/
total 28G
-rw-r--r-- 1 jupyter jupyter 2.5G Aug  1 21:20 2020-05-26-Cortex-EUR-1-biogenformat.txt.gz
-rw-r--r-- 1 jupyter jupyter 1.1G Aug  1 21:20 2020-05-26-Cortex-EUR-10-biogenformat.txt.gz
-rw-r--r-- 1 jupyter jupyter 1.9G Aug  1 21:20 2020-05-26-Cortex-EUR-11-biogenformat.txt.gz
-rw-r--r-- 1 jupyter jupyter 1.4G Aug  1 21:20 2020-05-26-Cortex-EUR-12-biogenformat.txt.gz
-rw-r--r-- 1 jupyter jupyter 462M Aug  1 21:19 2020-05-26-Cortex-EUR-13-biogenformat.txt.gz
-rw-r--r-- 1 jupyter jupyter 899M Aug  1 21:20 2020-05-26-Cortex-EUR-14-biogenformat.txt.gz
-rw-r--r-- 1 jupyter jupyter 735M Aug  1 21:20 2020-05-26-Cortex-EUR-15-biogenformat.txt.gz
-rw-r--

#### add plink2

In [2]:
!wget http://s3.amazonaws.com/plink2-assets/alpha2/plink2_linux_x86_64.zip -O /tmp/plink_linux_x86_64.zip
!unzip /tmp/plink_linux_x86_64.zip -d /tmp/plink
!rm -f /tmp/plink_linux_x86_64.zip
!sudo mv /tmp/plink/plink2 /usr/local/bin/
!rm -rf /tmp/plink

--2021-07-22 03:40:21--  http://s3.amazonaws.com/plink2-assets/alpha2/plink2_linux_x86_64.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.166.232
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.166.232|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8879151 (8.5M) [application/zip]
Saving to: ‘/tmp/plink_linux_x86_64.zip’


2021-07-22 03:40:22 (31.6 MB/s) - ‘/tmp/plink_linux_x86_64.zip’ saved [8879151/8879151]

Archive:  /tmp/plink_linux_x86_64.zip
  inflating: /tmp/plink/plink2       


In [3]:
# sanity check that plink2 is on the PATH; run with no arguments it prints its version banner and usage
!plink2

PLINK v2.00a2.3LM 64-bit Intel (24 Jan 2020)   www.cog-genomics.org/plink/2.0/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink2 <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink2 --help [flag name(s)...]

Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,
--sample-counts, --missing, --hardy, --indep-pairwise, --ld, --sample-diff,
--make-king, --king-cutoff, --write-samples, --write-snplist, --make-grm-list,
--pca, --glm, --adjust-file, --score, --variant-score, --genotyping-rate,
--pgen-info, --validate, and --zst-decompress.

"plink2 --help | more" describes all functions.


#### install htslib and bcftools

In [8]:
!sudo apt-get --quiet install -y libbz2-dev liblzma-dev libtool-bin libncurses-dev

samtools_version = '1.12'
# htslib
!wget --quiet https://github.com/samtools/htslib/releases/download/{samtools_version}/htslib-{samtools_version}.tar.bz2 -O {home_dir}/htslib-{samtools_version}.tar.bz2
!tar -xf {home_dir}/htslib-{samtools_version}.tar.bz2 -C {home_dir}
# !cd {tools_dir}/htslib-{samtools_version}
os.chdir(f'{home_dir}/htslib-{samtools_version}')
!./configure --quiet --prefix=/usr/local
!make --quiet
!sudo make --quiet install
!rm {home_dir}/htslib-{samtools_version}.tar.bz2
# also need bcftools for subsetting vcf
!wget --quiet https://github.com/samtools/bcftools/releases/download/{samtools_version}/bcftools-{samtools_version}.tar.bz2 -O {home_dir}/bcftools-{samtools_version}.tar.bz2
!tar -xf /{home_dir}/bcftools-{samtools_version}.tar.bz2 -C {home_dir}
# !cd {tools_dir}/bcftools-{samtools_version}
os.chdir(f'{home_dir}/bcftools-{samtools_version}')
!./configure --quiet --prefix=/usr/local
!make --quiet {home_dir}/
!sudo make --quiet install
!rm {home_dir}/bcftools-{samtools_version}.tar.bz2

Reading package lists...
Building dependency tree...
Reading state information...
libbz2-dev is already the newest version (1.0.6-9.2~deb10u1).
libtool-bin is already the newest version (2.4.6-9).
libncurses-dev is already the newest version (6.1+20181013-2+deb10u2).
liblzma-dev is already the newest version (5.2.4-1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


#### add plink1.9

In [14]:
!wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip -O /tmp/plink_linux_x86_64_20201019.zip
!unzip /tmp/plink_linux_x86_64_20201019.zip -d /tmp/plink
!rm -f /tmp/plink_linux_x86_64_20201019.zip
!sudo mv /tmp/plink/plink /usr/local/bin/
!rm -rf /tmp/plink

--2021-02-18 17:24:45--  http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.111.69
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.111.69|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8916695 (8.5M) [application/zip]
Saving to: ‘/tmp/plink_linux_x86_64_20201019.zip’


2021-02-18 17:24:46 (31.8 MB/s) - ‘/tmp/plink_linux_x86_64_20201019.zip’ saved [8916695/8916695]

Archive:  /tmp/plink_linux_x86_64_20201019.zip
  inflating: /tmp/plink/plink        
  inflating: /tmp/plink/LICENSE      
  inflating: /tmp/plink/toy.ped      
  inflating: /tmp/plink/toy.map      
  inflating: /tmp/plink/prettify     


In [15]:
# sanity check that plink (1.9) is on the PATH; run with no arguments it prints its version banner and usage
!plink

PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink --help [flag name(s)...]

Commands include --make-bed, --recode, --flip-scan, --merge-list,
--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,
--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,
--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,
--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,
--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,
--make-perm-pheno, --tdt, --qfam, --annotate, --clump, --gene-report,
--meta-analysis, --epistasis, --fast-epistasis, and --score.



#### install umap and ppscore

In [2]:
!pip install -U --quiet umap-learn ppscore dask[complete] tables plotly-express

In [17]:
import umap
import ppscore
import dask.dataframe as dd

#### install tensorQTL
There is an older version on PyPI, but a couple of more recent bug fixes are needed, so install from GitHub instead.

In [1]:
# required by tensorQTL
!pip install --quiet pandas-plink

Collecting pandas-plink
  Downloading pandas_plink-2.2.9-cp37-cp37m-manylinux2010_x86_64.whl (100 kB)
[K     |████████████████████████████████| 100 kB 4.5 MB/s ta 0:00:011
[?25hCollecting pytest>=5.2.2
  Downloading pytest-6.2.4-py3-none-any.whl (280 kB)
[K     |████████████████████████████████| 280 kB 9.6 MB/s eta 0:00:01
Collecting zstandard>=0.13.0
  Downloading zstandard-0.15.2-cp37-cp37m-manylinux2014_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 7.4 MB/s eta 0:00:01
Collecting Deprecated>=1.2.6
  Downloading Deprecated-1.2.12-py2.py3-none-any.whl (9.5 kB)
Collecting xarray>=0.18.2
  Downloading xarray-0.18.2-py3-none-any.whl (807 kB)
[K     |████████████████████████████████| 807 kB 28.5 MB/s eta 0:00:01
Collecting py>=1.8.2
  Using cached py-1.10.0-py2.py3-none-any.whl (97 kB)
Collecting iniconfig
  Using cached iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Collecting pluggy<1.0.0a1,>=0.12
  Using cached pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Insta

In [None]:
### NOTE: had some permission problems cloning from this cell, so JupyterLab
### was used to clone the git repository instead; the intended steps are kept
### below, commented out, for reference
# NOTE(review): the clone URL is the SSH form, which presumably needs a GitHub
# key on the instance — confirm whether that was the permission problem

# !git clone --quiet git@github.com:broadinstitute/tensorqtl.git
# os.chdir(f'{nbs_dir}/tensorqtl')
# !pip install -r install/requirements.txt

#### if you want to use the Storey q-value calculation with tensorQTL, R needs to be added

In [None]:
sudo apt-get --quiet install -y r-base build-essential libcurl4-gnutls-dev \
libxml2-dev libssl-dev

In [None]:
# manual steps for installing the qvalue package from GitHub; these are shell/R
# commands rather than Python, so they are kept commented out for reference
#sudo -i R
# install.packages("devtools")
# library("devtools")
# install_github("jdstorey/qvalue")

In [21]:
import tensorqtl.tensorqtl as tensorqtl

