Merge pull request #436 from ENCODE-DCC/dev

v2.2.3
ENCODE-DCC · Feb 15, 2024 · 47ba8df · 47ba8df
2 parents 5fbf14b + 915c4d4
commit 47ba8df
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 15 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -66,7 +66,7 @@ jobs:
           name: build image
           command: |
             source ${BASH_ENV}
-            export DOCKER_CACHE_TAG=v2.0.0
+            export DOCKER_CACHE_TAG=v2.2.3-cache2
             echo "pulling ${DOCKER_CACHE_TAG}!"
             docker pull encodedcc/atac-seq-pipeline:${DOCKER_CACHE_TAG}
             docker login -u=${DOCKERHUB_USER} -p=${DOCKERHUB_PASS}

diff --git a/README.md b/README.md
@@ -2,13 +2,17 @@
 
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.156534.svg)](https://doi.org/10.5281/zenodo.156534)[![CircleCI](https://circleci.com/gh/ENCODE-DCC/atac-seq-pipeline/tree/master.svg?style=svg)](https://circleci.com/gh/ENCODE-DCC/atac-seq-pipeline/tree/master)
 
-
 ## Introduction
 
 This pipeline is designed for automated end-to-end quality control and processing of ATAC-seq and DNase-seq data. The pipeline can be run on compute clusters with job submission engines as well as on standalone machines. It inherently makes use of parallelized/distributed computing. Pipeline installation is also easy as most dependencies are automatically installed. The pipeline can be run end-to-end, starting from raw FASTQ files all the way to peak calling and signal track generation using a single caper submit command. One can also start the pipeline from intermediate stages (for example, using alignment files as input). The pipeline supports both single-end and paired-end data as well as replicated or non-replicated datasets. The outputs produced by the pipeline include 1) formatted HTML reports that include quality control measures specifically designed for ATAC-seq and DNase-seq data, 2) analysis of reproducibility, 3) stringent and relaxed thresholding of peaks, 4) fold-enrichment and p-value signal tracks. The pipeline also supports detailed error reporting and allows for easy resumption of interrupted runs. It has been tested on some human, mouse and yeast ATAC-seq datasets as well as on human and mouse DNase-seq datasets.
 
 The ATAC-seq pipeline protocol specification is [here](https://docs.google.com/document/d/1f0Cm4vRyDQDu0bMehHD7P7KOMxTOP-HiNoIvL1VcBt8/edit?usp=sharing). Some parts of the ATAC-seq pipeline were developed in collaboration with Jason Buenrostro, Alicia Schep and Will Greenleaf at Stanford.
 
+## Issues with PE Fastqs downloaded from SRA
+
+Read names in PE Fastqs should be consistent across the file pair. Do not use `--readids` in `fastq-dump`, so that both reads in a pair have the same read name. Inconsistent read names (for example, `READNAME.1` in FQ1 and `READNAME.2` in FQ2) will result in an empty BAM error in the `filter` step.
+
+
 ### Features
 
 * **Portability**: The pipeline run can be performed across different cloud platforms such as Google, AWS and DNAnexus, as well as on cluster engines such as SLURM, SGE and PBS.

diff --git a/atac.wdl b/atac.wdl
@@ -7,10 +7,10 @@ struct RuntimeEnvironment {
 }
 
 workflow atac {
-    String pipeline_ver = 'v2.2.2'
+    String pipeline_ver = 'v2.2.3'
 
     meta {
-        version: 'v2.2.2'
+        version: 'v2.2.3'
 
         author: 'Jin wook Lee'
         email: 'leepc12@gmail.com'
@@ -19,8 +19,8 @@ workflow atac {
 
         specification_document: 'https://docs.google.com/document/d/1f0Cm4vRyDQDu0bMehHD7P7KOMxTOP-HiNoIvL1VcBt8/edit?usp=sharing'
 
-        default_docker: 'encodedcc/atac-seq-pipeline:v2.2.2'
-        default_singularity: 'https://encode-pipeline-singularity-image.s3.us-west-2.amazonaws.com/atac-seq-pipeline_v2.2.2.sif'
+        default_docker: 'encodedcc/atac-seq-pipeline:v2.2.3'
+        default_singularity: 'https://encode-pipeline-singularity-image.s3.us-west-2.amazonaws.com/atac-seq-pipeline_v2.2.3.sif'
         default_conda: 'encd-atac'
         croo_out_def: 'https://storage.googleapis.com/encode-pipeline-output-definition/atac.croo.v5.json'
 
@@ -72,8 +72,8 @@ workflow atac {
     }
     input {
         # group: runtime_environment
-        String docker = 'encodedcc/atac-seq-pipeline:v2.2.2'
-        String singularity = 'https://encode-pipeline-singularity-image.s3.us-west-2.amazonaws.com/atac-seq-pipeline_v2.2.2.sif'
+        String docker = 'encodedcc/atac-seq-pipeline:v2.2.3'
+        String singularity = 'https://encode-pipeline-singularity-image.s3.us-west-2.amazonaws.com/atac-seq-pipeline_v2.2.3.sif'
         String conda = 'encd-atac'
         String conda_macs2 = 'encd-atac-macs2'
         String conda_spp = 'encd-atac-spp'

diff --git a/dev/docker_image/Dockerfile b/dev/docker_image/Dockerfile
@@ -30,7 +30,7 @@ ENV PATH="/software:${PATH}"
 RUN wget https://github.com/openssl/openssl/archive/OpenSSL_1_0_2t.tar.gz && tar zxvf OpenSSL_1_0_2t.tar.gz && cd openssl-OpenSSL_1_0_2t/ && ./config && make && make install && cd ../ && rm -rf openssl-OpenSSL_1_0_2t* && rm /usr/bin/openssl && ln -s /usr/local/ssl/bin/openssl /usr/bin/openssl
 
 # Install system/math python packages (python3)
-RUN pip3 install --no-cache-dir jsondiff==1.1.1 common python-dateutil cython pandas==0.25.1 jinja2==2.10.1 matplotlib==3.1.1
+RUN pip3 install --no-cache-dir jsondiff==1.1.1 common python-dateutil pandas==0.25.1 jinja2==2.10.1 matplotlib==3.1.1
 
 # Install genomic python package (python3)
 RUN pip3 install --no-cache-dir pyBigwig==0.3.13 cutadapt==2.5 pyfaidx==0.5.5.2 pybedtools==0.8.0 pysam==0.15.3 deeptools==3.3.1
@@ -40,9 +40,10 @@ RUN echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.r-project.org'; opt
     Rscript -e "install.packages('snow')" && \
     Rscript -e "install.packages('snowfall')" && \
     Rscript -e "install.packages('bitops')" && \
-    Rscript -e "install.packages('caTools')" && \
     Rscript -e "install.packages('Rcpp')"
 
+RUN wget "https://cran.r-project.org/src/contrib/Archive/caTools/caTools_1.17.1.4.tar.gz" && R CMD INSTALL caTools_1.17.1.4.tar.gz && rm -f caTools_1.17.1.4.tar.gz
+
 # Install bioconductor and Rsamtools which is required by spp package
 RUN Rscript -e "source('http://bioconductor.org/biocLite.R'); biocLite('Rsamtools')"
 
@@ -94,13 +95,16 @@ RUN git clone --branch 2.0.4.2 --single-branch https://github.com/kundajelab/idr
 RUN pip2 install --no-cache-dir numpy scipy matplotlib==2.2.4 bx-python==0.8.2 biopython==1.76
 RUN pip3 install --no-cache-dir biopython==1.76
 
-# Install genomic python packages (python2)
-RUN pip2 install --no-cache-dir metaseq==0.5.6
-
 # Install MACS2 (python3)
-RUN pip3 install --no-cache-dir Cython
+RUN pip3 install --no-cache-dir Cython==0.29.0
 RUN pip3 install --no-cache-dir macs2==2.2.4
 
+# Install genomic python packages (python2)
+RUN pip2 install --no-cache-dir Cython==0.29.0 versioneer setuptools==44.1.1
+RUN pip2 install --no-cache-dir pybedtools==0.6.9
+RUN pip2 install --no-cache-dir metaseq==0.5.6
+RUN pip2 install --no-cache-dir gffutils==0.10.1
+
 # Install UCSC tools (v377)
 RUN git clone https://github.com/ENCODE-DCC/kentUtils_bin_v377
 ENV PATH=${PATH}:/software/kentUtils_bin_v377/bin
@@ -143,4 +147,3 @@ ENV PYTHONPATH="/software/atac-seq-pipeline/src"
 COPY src atac-seq-pipeline/src/
 COPY atac.wdl atac-seq-pipeline/
 COPY dev/test/test_py atac-seq-pipeline/dev/test/test_py/
-