diff --git a/paper/.gitignore b/paper/.gitignore index a34144a..ca5d677 100644 --- a/paper/.gitignore +++ b/paper/.gitignore @@ -35,6 +35,11 @@ SyntheticStrain/results/hg19* ENCODE_CellLines/results/BAM ENCODE_CellLines/results/BAM-nospike ENCODE_CellLines/results/ID +BY4742-chipseq/logs/*.out +BY4742-chipseq/logs/*.err +BY4742-chipseq/results/FASTQ +BY4742-chipseq/results/BAM +BY4742-chipseq/results/ID CENPK-chipseq/logs/*.out CENPK-chipseq/logs/*.err CENPK-chipseq/results/FASTQ diff --git a/paper/BY4742-chipseq/README b/paper/BY4742-chipseq/README new file mode 100644 index 0000000..33dfa46 --- /dev/null +++ b/paper/BY4742-chipseq/README @@ -0,0 +1,13 @@ +# Run StrainID on BY4742 datasets to evaluate StrainID's ability to detect the variant-based strain background + +# "Molecular mechanisms that distinguish TFIID housekeeping from regulatable SAGA promoters" +# (de Jonge et al, 2017) + +# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81787 + +# The default sacCer3 StrainID database is used +# Download data for the SRA accessions using `job/00_download_data.pbs` +# Align FASTQ files and process using `job/01_align_data.pbs` +# Run StrainID on BAM inputs using `job/02_indexed_runSID.pbs` to determine if StrainID can successfully identify the strain background + +# Specifically look into performance in distinguishing two closely related strains: BY4741 and BY4742 diff --git a/paper/BY4742-chipseq/SraRunInfo.csv b/paper/BY4742-chipseq/SraRunInfo.csv new file mode 100644 index 0000000..e310d19 --- /dev/null +++ b/paper/BY4742-chipseq/SraRunInfo.csv @@ -0,0 +1,12 @@ 
+Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,g1k_pop_code,source,g1k_analysis_group,Subject_ID,Sex,Disease,Tumor,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash +SRR3497399,2017-05-11 00:00:55,2016-05-11 11:04:54,67640691,3281838684,0,48,1816,GCF_000146045.2,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3497399/SRR3497399.1,SRX1756249,Hsf1_ChIP,ChIP-Seq,ChIP,GENOMIC,SINGLE,0,0,ABI_SOLID,AB 5500xl-W Genetic Analysis System,SRP074822,PRJNA321111,,321111,SRS1432863,SAMN04966256,simple,4932,Saccharomyces cerevisiae,Hsf1_ChIP,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,F73F96964C383156F61A669AC0074AD9,443BEFEC858B5AF9C065249419528A69 +SRR3497410,2017-05-11 00:00:55,2016-05-11 11:08:21,80694279,3905823162,0,48,2028,GCF_000146045.2,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3497410/SRR3497410.1,SRX1756250,Hsf1_input,ChIP-Seq,unspecified,GENOMIC,SINGLE,0,0,ABI_SOLID,AB 5500xl-W Genetic Analysis System,SRP074822,PRJNA321111,,321111,SRS1432864,SAMN04966257,simple,4932,Saccharomyces cerevisiae,Hsf1_input,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,C76260FC0AED0743E40E038EEC232960,D96A93109D0C695D8263594CD89E9473 +SRR3497446,2017-05-11 00:00:55,2016-05-17 11:46:57,56996253,8537022864,56996253,149,3350,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497446/SRR3497446.1,SRX1756259,Hsf1_t0_0.05U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432872,SAMN04966258,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.05U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR 
PEDIATRIC ONCOLOGY,SRA424700,,public,1D79B28E58E73793E3F2416BE52FC99B,A4C02F8943A763A0C413FF046BA716B5 +SRR3497452,2017-05-11 00:00:56,2016-05-11 11:24:59,62257597,9364630504,62257597,150,3611,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497452/SRR3497452.1,SRX1756270,Hsf1_t0_0.2U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432882,SAMN04966260,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.2U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,103A6282A4032934D6050C6DD2B5E6BF,871F9BAEC203D09E57EB5726F7A84CD1 +SRR3497453,2017-05-11 00:00:56,2016-05-11 12:06:21,61425579,9217294875,61425579,150,3523,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497453/SRR3497453.1,SRX1756274,Hsf1_t0_0.8U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432886,SAMN04966262,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.8U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,851EA8BF587B0C7F08802CD998CA5246,5BDD7B50B89F79F3CEC01DA4ABA31660 +SRR3497454,2017-05-11 00:00:56,2016-05-11 12:01:49,48284496,7235073133,48284496,149,2828,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497454/SRR3497454.1,SRX1756275,Hsf1_t0_3.0U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432887,SAMN04966264,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_3.0U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,220B1999B9CB4227F5A35B51BCC72724,2F01C0E9269BD0FA1523461BC01D8198 +SRR3497449,2017-05-11 00:00:56,2016-05-11 13:19:10,53608235,8028655564,53608235,149,3153,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497449/SRR3497449.1,SRX1756263,Hsf1_t30_0.05U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432875,SAMN04966259,simple,4932,Saccharomyces 
cerevisiae,Hsf1_t30_0.05U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,A5A12C5FFFE69D3ABB4F0C42AE0FF7EA,5D151CA79CC01ACCE3C98CF5E646DD47 +SRR3497456,2017-05-11 00:00:56,2016-05-11 14:19:38,62404674,9388451136,62404674,150,3646,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos1/sra-pub-run-8/SRR3497456/SRR3497456.1,SRX1756283,Hsf1_t30_0.2U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432895,SAMN04966261,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.2U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,4D316816A01301B7B30F9CC1381B5018,DDDB3A1D0A7A3F6255418B0C27D316E6 +SRR3497459,2017-05-11 00:00:56,2016-05-11 14:21:03,70846857,10638854137,70846857,150,4074,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497459/SRR3497459.1,SRX1756286,Hsf1_t30_0.8U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432898,SAMN04966263,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.8U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,9C84D57EE58BF1B09AF37E3504851620,C6850B43795D5AF9504DF7A5F4625B13 +SRR3497461,2017-05-11 00:00:56,2016-05-11 15:19:34,52591831,7857035482,52591831,149,3032,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos1/sra-pub-run-8/SRR3497461/SRR3497461.1,SRX1756287,Hsf1_t30_3.0U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432899,SAMN04966265,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_3.0U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,00F78D41D1508EDEFAC4DE473028A626,6EB7361E2E83CF786951734E6596C8F0 +
diff --git a/paper/BY4742-chipseq/job/00_download_data.pbs b/paper/BY4742-chipseq/job/00_download_data.pbs
new file mode 100644
index 0000000..1f946aa
--- /dev/null
+++ b/paper/BY4742-chipseq/job/00_download_data.pbs
@@ -0,0 +1,42 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=6
+#PBS -l pmem=24gb
+#PBS -l walltime=03:00:00
+#PBS -A open
+#PBS -o logs/download.data.log.out
+#PBS -e logs/download.data.log.err
+#PBS -t 1-10
+
+# Download the FASTQ files for one SRA run per PBS array task.
+# Array task N (1-10) handles the N-th data row of SraRunInfo.csv.
+#
+# Requires
+# parallel fastq dump v2.8.0
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/BY4742-chipseq
+cd "$WRK" || exit 1
+
+module load anaconda3
+source activate ~/work/myconda/genopipe/
+
+mkdir -p logs results/FASTQ
+
+# +1 skips the CSV header line
+INDEX=$((PBS_ARRAYID + 1))
+
+METADATA=SraRunInfo.csv
+INFO=$(sed "${INDEX}q;d" "$METADATA")
+SRR=$(echo "$INFO" | cut -d"," -f1)
+#echo $INFO
+
+# Stop early instead of calling the downloader without an accession
+if [[ -z "$SRR" ]]; then
+  echo "($INDEX) No SRA accession found on line $INDEX of $METADATA" >&2
+  exit 1
+fi
+
+# FASTQ-DUMP (thread count matches the ppn=6 requested above)
+echo "($INDEX) Begin downloading $SRR FASTQ..."
+parallel-fastq-dump --gzip --split-files -t 6 -O results/FASTQ -s "$SRR"
+echo "Complete"
diff --git a/paper/BY4742-chipseq/job/01_align_data.pbs b/paper/BY4742-chipseq/job/01_align_data.pbs
new file mode 100644
index 0000000..88c5aa1
--- /dev/null
+++ b/paper/BY4742-chipseq/job/01_align_data.pbs
@@ -0,0 +1,71 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=4
+#PBS -l pmem=16gb
+#PBS -l walltime=02:00:00
+#PBS -A open
+#PBS -o logs/align.data.log.out
+#PBS -e logs/align.data.log.err
+#PBS -t 1-10
+
+# Align the FASTQ reads of one SRA run per PBS array task and write a
+# sorted, indexed BAM named after the run's LibraryName.
+# ABI_SOLID runs are aligned with bowtie in color space (single-end);
+# ILLUMINA runs are aligned with bwa mem (paired-end).
+
+module load gcc
+module load samtools
+module load bwa
+module load anaconda3
+source activate ~/work/myconda/genopipe/
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/BY4742-chipseq
+cd "$WRK" || exit 1
+
+mkdir -p logs results/BAM results/uniq-BAM
+
+YGENOME=$WRK/../input/sacCer3.fa
+CSGENOME=$WRK/../input/sacCer3_index
+
+# +1 skips the CSV header line
+INDEX=$((PBS_ARRAYID + 1))
+
+METADATA=SraRunInfo.csv
+INFO=$(sed "${INDEX}q;d" "$METADATA")
+SRR=$(echo "$INFO" | cut -d"," -f1)
+SAMPLE=$(echo "$INFO" | cut -d"," -f12)
+PLATFORM=$(echo "$INFO" | cut -d"," -f19)
+#PAIR=`echo $INFO | cut -d"," -f16`
+#echo $INFO
+
+FQ=$WRK/results/FASTQ/$SRR
+BAM=$WRK/results/BAM/$SAMPLE
+
+echo "($PBS_ARRAYID) Aligned $SRR $PLATFORM reads > $BAM"
+case "$PLATFORM" in
+  ABI_SOLID)
+    # Only the reads are decompressed for bowtie; the reference is
+    # supplied through the color-space index ($CSGENOME).
+    bowtie -C -S "$CSGENOME" <(gzip -dc "${FQ}_1.fastq.gz") \
+      | samtools sort \
+      > "$BAM.bam"
+    echo "($PBS_ARRAYID) $BAM single aligned (bowtie color space)"
+    ;;
+  ILLUMINA)
+    bwa mem -t 4 "$YGENOME" "${FQ}_1.fastq.gz" "${FQ}_2.fastq.gz" \
+      | samtools sort \
+      > "$BAM.bam"
+    echo "($PBS_ARRAYID) $BAM pair aligned (BWA)"
+    ;;
+  *)
+    # No aligner is configured for this platform, so there is no BAM to index
+    echo "($PBS_ARRAYID) Unsupported platform '$PLATFORM' for $SRR" >&2
+    exit 1
+    ;;
+esac
+
+#samtools view -b -F4 $BAM > $WRK/results/uniq-BAM/$SAMPLE.bam
+
+echo "($PBS_ARRAYID) Indexing..."
+samtools index "$BAM.bam"
+echo "($PBS_ARRAYID) Complete!"
diff --git a/paper/BY4742-chipseq/job/02_indexed_runSID.pbs b/paper/BY4742-chipseq/job/02_indexed_runSID.pbs
new file mode 100644
index 0000000..f3df29e
--- /dev/null
+++ b/paper/BY4742-chipseq/job/02_indexed_runSID.pbs
@@ -0,0 +1,57 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=6
+#PBS -l pmem=24gb
+#PBS -l walltime=03:00:00
+#PBS -A open
+#PBS -o logs/sid.log.out
+#PBS -e logs/sid.log.err
+#PBS -t 1-10
+
+# Run StrainID on the BAM of one sample per PBS array task. StrainID's
+# standard output goes to results/ID/<sample>.std; the `time` report and
+# anything written to standard error go to results/ID/<sample>.time.
+
+module load anaconda3
+source activate genopipe
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/BY4742-chipseq
+cd "$WRK" || exit 1
+
+mkdir -p logs results/ID
+
+# +1 skips the CSV header line
+INDEX=$((PBS_ARRAYID + 1))
+
+METADATA=SraRunInfo.csv
+INFO=$(sed "${INDEX}q;d" "$METADATA")
+SAMPLE=$(echo "$INFO" | cut -d"," -f12)
+#echo $INFO
+
+# Store directory paths
+DATABASE=$WRK/../db/sacCer3_VCF
+GENOME=$WRK/../input/sacCer3.fa
+SEED=$PBS_ARRAYID
+GENOPIPE=$WRK/../..
+
+BAM=$WRK/results/BAM/$SAMPLE
+#BAM=$WRK/results/uniq-BAM/$SAMPLE
+ID=$WRK/results/ID/
+
+# Set-up Temp directory
+# (-f replaces links left behind by an interrupted earlier run)
+TEMP=$WRK/temp-$PBS_ARRAYID
+mkdir -p "$TEMP"
+cd "$TEMP" || exit 1
+ln -sf "$BAM.bam"
+ln -sf "$BAM.bam.bai"
+
+## Execute Single StrainID and record time
+cd "$GENOPIPE/StrainID" || exit 1
+echo "**Begin executing StrainID for ${SAMPLE}..."
+{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -s "$SEED" -o "$ID" > "$ID/$SAMPLE.std" ; } 2> "$ID/$SAMPLE.time"
+echo "...single StrainID for ($PBS_ARRAYID) ${SAMPLE} finished."
+
+## Clean-up
+rm -r "$TEMP"
+
diff --git a/paper/BY4742-chipseq/logs/README b/paper/BY4742-chipseq/logs/README
new file mode 100644
index 0000000..c418cee
--- /dev/null
+++ b/paper/BY4742-chipseq/logs/README
@@ -0,0 +1 @@
+# logfiles from STDERR and STDOUT of running job files go here
diff --git a/paper/BY4742-chipseq/results/README b/paper/BY4742-chipseq/results/README
new file mode 100644
index 0000000..c4eea82
--- /dev/null
+++ b/paper/BY4742-chipseq/results/README
@@ -0,0 +1 @@
+# Downloaded FASTQ files and StrainID results go here
diff --git a/paper/README b/paper/README
index 61091de..ee71424 100644
--- a/paper/README
+++ b/paper/README
@@ -16,6 +16,7 @@ paper
 |--ENCODEdata-CellLines
 |--HIV_samples
 |--YKOC-wgs
+|--BY4742-chipseq
 |--CENPK-chipseq
 
 
@@ -55,5 +56,8 @@ contains the scripts and information for downloading, processing, and running Ep
 ## YKOC-wgs
 contains the scripts and information for downloading, processing, and running DeletionID on the Puddu et al, 2019 dataset for identifying deletions
 
+## BY4742-chipseq
+contains the scripts and information for downloading, processing, and running StrainID on the BAM files
+
 ## CENPK-chipseq
-contains the scripts and information for downloading, processing, and running StrainID on the
+contains the scripts and information for downloading, processing, and running StrainID on the BAM files
diff --git a/paper/setup.sh b/paper/setup.sh
index 12df39c..bef4593 100644
--- a/paper/setup.sh
+++ b/paper/setup.sh
@@ -13,8 +13,10 @@
 
 # Required software:
 # wget
+# Python 3
 # Perl 5.18+
 # bwa v0.7.14+
+# bowtie v1.2.3
 #
 # Optional software:
 # twoBitToFa
@@ -149,3 +151,8 @@ cd $WRK/db
 ln -s ../../StrainID/sacCer3_VCF
 ln -s ../../StrainID/hg19_VCF
 cd $WRK
+
+# Setup color-space index for yeast genome
+# (used by BY4742-chipseq)
+bowtie-build -C input/sacCer3.fa input/sacCer3_index
+