## Notebook to do any additional setup on instance and pull input data for analysis

In [1]:
# print the current system date/time so the saved output shows when this setup was run
!date

Tue Aug  3 22:37:16 UTC 2021


In [2]:
import os

In [4]:
# parameter variables
cohort = 'foundin'
version = 'amppdv1'
cohort_version = f'{cohort}.{version}'
# NOTE(review): the saved output of the quants download cell shows the local
# file as 'sc.avgnormbroad.csv', which does not match this value; confirm
# which quant_type the downstream notebooks expect.
quant_type = 'scrn'

# directories for initial setup; every local path is derived from home_dir
home_dir = '/home/jupyter'
nbs_dir = f'{home_dir}/notebooks'
wrk_dir = f'{home_dir}/sceqtl'
geno_dir = f'{wrk_dir}/genotypes'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'

# input data; every bucket path is derived from data_bucket
data_bucket = 'gs://foundin-processed-assay'
qtl_bucket_path = f'{data_bucket}/analysis/sceqtl'
nbs_bucket_path = f'{qtl_bucket_path}/notebooks'
# genotypes, feature annotations and gwas results are read from the
# analysis/eqtl prefix rather than from qtl_bucket_path
eqtl_bucket_path = f'{data_bucket}/analysis/eqtl'
# trailing wildcard matches every per-chromosome file for this cohort/version
genos_bucket_path = f'{eqtl_bucket_path}/genotypes/{cohort_version}.chr*'
quants_bucket_path = (
    f'{data_bucket}/analysis/scrn_eqtl/'
    'Matrix_AvgGeneExpressionSampleBroadCellTypeCluster_NormRNA_18Nov2020.txt'
)
# local name the quantification matrix is saved under
quants_local_file = f'{quants_dir}/{quant_type}.avgnormbroad.csv'
features_bucket_path = (
    f'{eqtl_bucket_path}/expression/'
    'gencode_v29.lncipedia_v5_2_hc.annotation.pkl'
)
tissue_genes_bckt_path = (
    f'{eqtl_bucket_path}/expression/'
    'tissue_category_rna_brain_Tissue.tsv'
)
# fasta_index_bucket_path = 'gs://gcp-public-data--broad-references/hg38/v0/\
# Homo_sapiens_assembly38.fasta.fai'
# NOTE(review): the saved output of the sample-info download cell reports
# "No URLs matched" for this path; confirm where the sample info lives.
sample_info_bckt_path = f'{qtl_bucket_path}/sample_info'
# gwas_results_bckt_path = f'{qtl_bucket_path}/gwas/pd_meta5v2_cojo_results.jma.cojo.csv'
gwas_results_bckt_path = f'{eqtl_bucket_path}/gwas'

In [4]:
# create the local working tree (parent first); existing directories are left alone
for local_dir in (wrk_dir, geno_dir, quants_dir, info_dir, public_dir):
    os.makedirs(local_dir, exist_ok=True)

#### mirror down input data from the input buckets

In [5]:
# pull genotypes to local disk
this_cmd = f'gsutil -mq cp -P {genos_bucket_path} {geno_dir}/'
print(this_cmd)
!{this_cmd}
!ls -lh {geno_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/genotypes/foundin.amppdv1.chr* /home/jupyter/sceqtl/genotypes/
total 16G
-rw-r--r-- 1 jupyter jupyter 1.3K Aug  3 22:37 foundin.amppdv1.chr1.log
-rw-r--r-- 1 jupyter jupyter  14M Aug  3 22:37 foundin.amppdv1.chr1.pgen
-rw-r--r-- 1 jupyter jupyter 3.2K Aug  3 22:37 foundin.amppdv1.chr1.psam
-rw-r--r-- 1 jupyter jupyter 1.2G Aug  3 22:37 foundin.amppdv1.chr1.pvar
-rw-r--r-- 1 jupyter jupyter 1.3K Aug  3 22:37 foundin.amppdv1.chr10.log
-rw-r--r-- 1 jupyter jupyter 9.2M Aug  3 22:37 foundin.amppdv1.chr10.pgen
-rw-r--r-- 1 jupyter jupyter 3.2K Aug  3 22:37 foundin.amppdv1.chr10.psam
-rw-r--r-- 1 jupyter jupyter 763M Aug  3 22:37 foundin.amppdv1.chr10.pvar
-rw-r--r-- 1 jupyter jupyter 1.3K Aug  3 22:37 foundin.amppdv1.chr11.log
-rw-r--r-- 1 jupyter jupyter 9.0M Aug  3 22:37 foundin.amppdv1.chr11.pgen
-rw-r--r-- 1 jupyter jupyter 3.2K Aug  3 22:37 foundin.amppdv1.chr11.psam
-rw-r--r-- 1 jupyter jupyter 801M Aug  3 22:37 foundin.amppd

#### pull quantified features to local disk

In [5]:
this_cmd = f'gsutil -mq cp -P {quants_bucket_path} {quants_local_file}'
print(this_cmd)
!{this_cmd}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/scrn_eqtl/Matrix_AvgGeneExpressionSampleBroadCellTypeCluster_NormRNA_18Nov2020.txt /home/jupyter/sceqtl/quants/sc.avgnormbroad.csv


#### pull genome fasta index file

In [9]:
# this_cmd = f'gsutil -mq cp -P {fasta_index_bucket_path} {quants_dir}/'
# print(this_cmd)
# !{this_cmd}

#### pull features annotation file

In [6]:
this_cmd = f'gsutil -mq cp -P {features_bucket_path} {quants_dir}/'
print(this_cmd)
!{this_cmd}

this_cmd = f'gsutil -mq cp -P {tissue_genes_bckt_path} {quants_dir}/'
print(this_cmd)
!{this_cmd}

!ls -lh {quants_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/expression/gencode_v29.lncipedia_v5_2_hc.annotation.pkl /home/jupyter/sceqtl/quants/
gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/expression/tissue_category_rna_brain_Tissue.tsv /home/jupyter/sceqtl/quants/
total 1.7G
-rw-r--r-- 1 jupyter jupyter 1.5G Oct  1  2020 gencode_v29.lncipedia_v5_2_hc.annotation.pkl
-rw-r--r-- 1 jupyter jupyter 277M Aug  4 20:50 sc.avgnormbroad.csv
-rw-r--r-- 1 jupyter jupyter 5.5M Feb 18 17:24 tissue_category_rna_brain_Tissue.tsv


#### pull down the sample info

In [9]:
this_cmd = f'gsutil -mq cp -P {sample_info_bckt_path}/* {info_dir}/'
print(this_cmd)
!{this_cmd}

!ls -lh {info_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/sceqtl/sample_info/* /home/jupyter/sceqtl/sample_info/
CommandException: No URLs matched: gs://foundin-processed-assay/analysis/sceqtl/sample_info/*
CommandException: 1 file/object could not be transferred.
total 0


#### pull down any needed GWAS results

In [10]:
this_cmd = f'gsutil -mq cp -P {gwas_results_bckt_path}/* {public_dir}/'
print(this_cmd)
!{this_cmd}
!ls -lh {public_dir}

gsutil -mq cp -P gs://foundin-processed-assay/analysis/eqtl/gwas/* /home/jupyter/sceqtl/public/
total 825M
-rw-r--r-- 1 jupyter jupyter 6.9K Aug  3 22:43 pd.table_s2.clean.txt
-rw-r--r-- 1 jupyter jupyter  15K Aug  3 22:43 pd_meta5v2_cojo_results.jma.cojo.csv
-rw-r--r-- 1 jupyter jupyter 825M Aug  3 22:43 pdmeta_sumstats_hg38.h5


#### add plink2

In [11]:
!wget http://s3.amazonaws.com/plink2-assets/alpha2/plink2_linux_x86_64.zip -O /tmp/plink_linux_x86_64.zip
!unzip /tmp/plink_linux_x86_64.zip -d /tmp/plink
!rm -f /tmp/plink_linux_x86_64.zip
!sudo mv /tmp/plink/plink2 /usr/local/bin/
!rm -rf /tmp/plink

--2021-08-03 22:43:26--  http://s3.amazonaws.com/plink2-assets/alpha2/plink2_linux_x86_64.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.95.184
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.95.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8879151 (8.5M) [application/zip]
Saving to: ‘/tmp/plink_linux_x86_64.zip’


2021-08-03 22:43:26 (30.8 MB/s) - ‘/tmp/plink_linux_x86_64.zip’ saved [8879151/8879151]

Archive:  /tmp/plink_linux_x86_64.zip
  inflating: /tmp/plink/plink2       


In [12]:
# run plink2 with no arguments to confirm the binary is found on PATH; it prints its version banner and usage
!plink2

PLINK v2.00a2.3LM 64-bit Intel (24 Jan 2020)   www.cog-genomics.org/plink/2.0/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink2 <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink2 --help [flag name(s)...]

Commands include --rm-dup list, --make-bpgen, --export, --freq, --geno-counts,
--sample-counts, --missing, --hardy, --indep-pairwise, --ld, --sample-diff,
--make-king, --king-cutoff, --write-samples, --write-snplist, --make-grm-list,
--pca, --glm, --adjust-file, --score, --variant-score, --genotyping-rate,
--pgen-info, --validate, and --zst-decompress.

"plink2 --help | more" describes all functions.


#### install htslib and bcftools

In [13]:
!sudo apt-get --quiet install -y libbz2-dev liblzma-dev libtool-bin libncurses-dev

samtools_version = '1.12'
# htslib
!wget --quiet https://github.com/samtools/htslib/releases/download/{samtools_version}/htslib-{samtools_version}.tar.bz2 -O {home_dir}/htslib-{samtools_version}.tar.bz2
!tar -xf {home_dir}/htslib-{samtools_version}.tar.bz2 -C {home_dir}
# !cd {tools_dir}/htslib-{samtools_version}
os.chdir(f'{home_dir}/htslib-{samtools_version}')
!./configure --quiet --prefix=/usr/local
!make --quiet
!sudo make --quiet install
!rm {home_dir}/htslib-{samtools_version}.tar.bz2
# also need bcftools for subsetting vcf
!wget --quiet https://github.com/samtools/bcftools/releases/download/{samtools_version}/bcftools-{samtools_version}.tar.bz2 -O {home_dir}/bcftools-{samtools_version}.tar.bz2
!tar -xf /{home_dir}/bcftools-{samtools_version}.tar.bz2 -C {home_dir}
# !cd {tools_dir}/bcftools-{samtools_version}
os.chdir(f'{home_dir}/bcftools-{samtools_version}')
!./configure --quiet --prefix=/usr/local
!make --quiet {home_dir}/
!sudo make --quiet install
!rm {home_dir}/bcftools-{samtools_version}.tar.bz2

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  autoconf automake autotools-dev bzip2-doc libltdl-dev libsigsegv2 libtool m4
Suggested packages:
  autoconf-archive gnu-standards autoconf-doc gettext libtool-doc liblzma-doc
  ncurses-doc gfortran | fortran95-compiler gcj-jdk m4-doc
The following NEW packages will be installed:
  autoconf automake autotools-dev bzip2-doc libbz2-dev libltdl-dev liblzma-dev
  libncurses-dev libsigsegv2 libtool libtool-bin m4
0 upgraded, 12 newly installed, 0 to remove and 1 not upgraded.
Need to get 3632 kB of archives.
After this operation, 10.9 MB of additional disk space will be used.
Get:1 http://deb.debian.org/debian buster/main amd64 libsigsegv2 amd64 2.12-2 [32.8 kB]
Get:2 http://deb.debian.org/debian buster/main amd64 m4 amd64 1.4.18-2 [203 kB]
Get:3 http://deb.debian.org/debian buster/main amd64 autoconf all 2.69-11 [341 kB]
Get:4 http://deb.debian.org/debian 

#### add plink1.9

In [14]:
!wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip -O /tmp/plink_linux_x86_64_20201019.zip
!unzip /tmp/plink_linux_x86_64_20201019.zip -d /tmp/plink
!rm -f /tmp/plink_linux_x86_64_20201019.zip
!sudo mv /tmp/plink/plink /usr/local/bin/
!rm -rf /tmp/plink

--2021-08-03 22:46:15--  http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.68.150
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.68.150|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8916695 (8.5M) [application/zip]
Saving to: ‘/tmp/plink_linux_x86_64_20201019.zip’


2021-08-03 22:46:15 (31.4 MB/s) - ‘/tmp/plink_linux_x86_64_20201019.zip’ saved [8916695/8916695]

Archive:  /tmp/plink_linux_x86_64_20201019.zip
  inflating: /tmp/plink/plink        
  inflating: /tmp/plink/LICENSE      
  inflating: /tmp/plink/toy.ped      
  inflating: /tmp/plink/toy.map      
  inflating: /tmp/plink/prettify     


In [15]:
# run plink with no arguments to confirm the binary is found on PATH; it prints its version banner and usage
!plink

PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3

  plink <input flag(s)...> [command flag(s)...] [other flag(s)...]
  plink --help [flag name(s)...]

Commands include --make-bed, --recode, --flip-scan, --merge-list,
--write-snplist, --list-duplicate-vars, --freqx, --missing, --test-mishap,
--hardy, --mendel, --ibc, --impute-sex, --indep-pairphase, --r2, --show-tags,
--blocks, --distance, --genome, --homozyg, --make-rel, --make-grm-gz,
--rel-cutoff, --cluster, --pca, --neighbour, --ibs-test, --regress-distance,
--model, --bd, --gxe, --logistic, --dosage, --lasso, --test-missing,
--make-perm-pheno, --tdt, --qfam, --annotate, --clump, --gene-report,
--meta-analysis, --epistasis, --fast-epistasis, and --score.



#### install umap, ppscore, dask, tables, and plotly-express

In [16]:
!pip install -U --quiet umap-learn ppscore dask[complete] tables plotly-express

In [17]:
import umap
import ppscore
import dask.dataframe as dd

#### install tensorQTL
PyPI has an older release, but a couple of bug fixes are needed, so install from GitHub instead.

In [18]:
# required by tensorQTL
!pip install --quiet pandas-plink

In [21]:
## NOTE: had some permission problems, used jupyterlab to clone the git

# !git clone --quiet git@github.com:broadinstitute/tensorqtl.git
# os.chdir(f'{nbs_dir}/tensorqtl')
# !pip install -r install/requirements.txt

The authenticity of host 'github.com (140.82.112.4)' can't be established.
RSA key fingerprint is SHA256:nThbg6kXUpJWGl7E1IGOCspRomTxdCARLviKw6E5SY8.
Are you sure you want to continue connecting (yes/no)? ^C


In [19]:
import tensorqtl.tensorqtl as tensorqtl



#### to use the Storey q-value calculation with tensorQTL, R needs to be installed

In [None]:
# sudo apt-get --quiet install -y r-base build-essential libcurl4-gnutls-dev \
# libxml2-dev libssl-dev

In [None]:
#sudo -i R
# install.packages("devtools")
# library("devtools")
# install_github("jdstorey/qvalue")

#### install GSEApy for gene set enrichment analysis

In [7]:
!pip install -U --quiet gseapy