From 4067d6e8cf15a272e725a03606660a5b46cb93cd Mon Sep 17 00:00:00 2001 From: Olivia Wen-Mei Lang Date: Sat, 23 Oct 2021 11:23:50 -0400 Subject: [PATCH 1/5] setup BY4742 test of StrainID on real data Using the Hsf1 ChIPseq data from de Jonge et al 2017, we can test StrainID's ability to identify the BY4742 strain background. This commit includes the metadata and README updates including placeholders for results and log directories. SraRunInfo.csv pulled from NCBI SRA search. --- paper/BY4742-chipseq/README | 13 +++++++++++++ paper/BY4742-chipseq/SraRunInfo.csv | 12 ++++++++++++ paper/BY4742-chipseq/logs/README | 1 + paper/BY4742-chipseq/results/README | 1 + paper/README | 6 +++++- 5 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 paper/BY4742-chipseq/README create mode 100644 paper/BY4742-chipseq/SraRunInfo.csv create mode 100644 paper/BY4742-chipseq/logs/README create mode 100644 paper/BY4742-chipseq/results/README diff --git a/paper/BY4742-chipseq/README b/paper/BY4742-chipseq/README new file mode 100644 index 0000000..33dfa46 --- /dev/null +++ b/paper/BY4742-chipseq/README @@ -0,0 +1,13 @@ +# Run StrainID on BY4742 datasets to evaluate StrainID's ability to detect the variant-based strain background + +# "Molecular mechanisms that distinguish TFIID housekeeping from regulatable SAGA promoters" +# (de Jonge et al, 2017) + +# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81787 + +# The default sacCer3 StrainID database is used +# Download data for the SRA accessions using `job/00_download_data.pbs` +# Align FASTQ files and process using `job/01_align_data.pbs` +# Run StrainID on BAM inputs using `job/02_indexed_runSID.pbs` to determine if StrainID can successfully identify the strain background + +# Specifically look into performance in distinguishing two closely related strains: BY4741 and BY4742 diff --git a/paper/BY4742-chipseq/SraRunInfo.csv b/paper/BY4742-chipseq/SraRunInfo.csv new file mode 100644 index 0000000..e310d19 ---
/dev/null +++ b/paper/BY4742-chipseq/SraRunInfo.csv @@ -0,0 +1,12 @@ +Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,g1k_pop_code,source,g1k_analysis_group,Subject_ID,Sex,Disease,Tumor,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash +SRR3497399,2017-05-11 00:00:55,2016-05-11 11:04:54,67640691,3281838684,0,48,1816,GCF_000146045.2,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3497399/SRR3497399.1,SRX1756249,Hsf1_ChIP,ChIP-Seq,ChIP,GENOMIC,SINGLE,0,0,ABI_SOLID,AB 5500xl-W Genetic Analysis System,SRP074822,PRJNA321111,,321111,SRS1432863,SAMN04966256,simple,4932,Saccharomyces cerevisiae,Hsf1_ChIP,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,F73F96964C383156F61A669AC0074AD9,443BEFEC858B5AF9C065249419528A69 +SRR3497410,2017-05-11 00:00:55,2016-05-11 11:08:21,80694279,3905823162,0,48,2028,GCF_000146045.2,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-7/SRR3497410/SRR3497410.1,SRX1756250,Hsf1_input,ChIP-Seq,unspecified,GENOMIC,SINGLE,0,0,ABI_SOLID,AB 5500xl-W Genetic Analysis System,SRP074822,PRJNA321111,,321111,SRS1432864,SAMN04966257,simple,4932,Saccharomyces cerevisiae,Hsf1_input,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,C76260FC0AED0743E40E038EEC232960,D96A93109D0C695D8263594CD89E9473 +SRR3497446,2017-05-11 00:00:55,2016-05-17 11:46:57,56996253,8537022864,56996253,149,3350,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497446/SRR3497446.1,SRX1756259,Hsf1_t0_0.05U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432872,SAMN04966258,simple,4932,Saccharomyces 
cerevisiae,Hsf1_t0_0.05U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,1D79B28E58E73793E3F2416BE52FC99B,A4C02F8943A763A0C413FF046BA716B5 +SRR3497452,2017-05-11 00:00:56,2016-05-11 11:24:59,62257597,9364630504,62257597,150,3611,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497452/SRR3497452.1,SRX1756270,Hsf1_t0_0.2U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432882,SAMN04966260,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.2U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,103A6282A4032934D6050C6DD2B5E6BF,871F9BAEC203D09E57EB5726F7A84CD1 +SRR3497453,2017-05-11 00:00:56,2016-05-11 12:06:21,61425579,9217294875,61425579,150,3523,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497453/SRR3497453.1,SRX1756274,Hsf1_t0_0.8U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432886,SAMN04966262,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_0.8U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,851EA8BF587B0C7F08802CD998CA5246,5BDD7B50B89F79F3CEC01DA4ABA31660 +SRR3497454,2017-05-11 00:00:56,2016-05-11 12:01:49,48284496,7235073133,48284496,149,2828,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497454/SRR3497454.1,SRX1756275,Hsf1_t0_3.0U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432887,SAMN04966264,simple,4932,Saccharomyces cerevisiae,Hsf1_t0_3.0U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,220B1999B9CB4227F5A35B51BCC72724,2F01C0E9269BD0FA1523461BC01D8198 +SRR3497449,2017-05-11 00:00:56,2016-05-11 13:19:10,53608235,8028655564,53608235,149,3153,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497449/SRR3497449.1,SRX1756263,Hsf1_t30_0.05U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 
500,SRP074822,PRJNA321111,,321111,SRS1432875,SAMN04966259,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.05U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,A5A12C5FFFE69D3ABB4F0C42AE0FF7EA,5D151CA79CC01ACCE3C98CF5E646DD47 +SRR3497456,2017-05-11 00:00:56,2016-05-11 14:19:38,62404674,9388451136,62404674,150,3646,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos1/sra-pub-run-8/SRR3497456/SRR3497456.1,SRX1756283,Hsf1_t30_0.2U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432895,SAMN04966261,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.2U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,4D316816A01301B7B30F9CC1381B5018,DDDB3A1D0A7A3F6255418B0C27D316E6 +SRR3497459,2017-05-11 00:00:56,2016-05-11 14:21:03,70846857,10638854137,70846857,150,4074,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos2/sra-pub-run-3/SRR3497459/SRR3497459.1,SRX1756286,Hsf1_t30_0.8U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432898,SAMN04966263,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_0.8U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,9C84D57EE58BF1B09AF37E3504851620,C6850B43795D5AF9504DF7A5F4625B13 +SRR3497461,2017-05-11 00:00:56,2016-05-11 15:19:34,52591831,7857035482,52591831,149,3032,,https://sra-downloadb.st-va.ncbi.nlm.nih.gov/sos1/sra-pub-run-8/SRR3497461/SRR3497461.1,SRX1756287,Hsf1_t30_3.0U,MNase-Seq,MNase,GENOMIC,PAIRED,0,0,ILLUMINA,NextSeq 500,SRP074822,PRJNA321111,,321111,SRS1432899,SAMN04966265,simple,4932,Saccharomyces cerevisiae,Hsf1_t30_3.0U,,,,,,,no,,,,,PRINCESS MÁXIMA CENTER FOR PEDIATRIC ONCOLOGY,SRA424700,,public,00F78D41D1508EDEFAC4DE473028A626,6EB7361E2E83CF786951734E6596C8F0 + diff --git a/paper/BY4742-chipseq/logs/README b/paper/BY4742-chipseq/logs/README new file mode 100644 index 0000000..c418cee --- /dev/null +++ b/paper/BY4742-chipseq/logs/README @@ -0,0 +1 @@ +# logfiles from STDERR and 
STDOUT of running job files go here diff --git a/paper/BY4742-chipseq/results/README b/paper/BY4742-chipseq/results/README new file mode 100644 index 0000000..c4eea82 --- /dev/null +++ b/paper/BY4742-chipseq/results/README @@ -0,0 +1 @@ +# Downloaded FASTQ files and StrainID results go here diff --git a/paper/README b/paper/README index 61091de..ee71424 100644 --- a/paper/README +++ b/paper/README @@ -16,6 +16,7 @@ paper |--ENCODEdata-CellLines |--HIV_samples |--YKOC-wgs +|--BY4742-chipseq |--CENPK-chipseq @@ -55,5 +56,8 @@ contains the scripts and information for downloading, processing, and running Ep ## YKOC-wgs contains the scripts and information for downloading, processing, and running DeletionID on the Puddu et al, 2019 dataset for identifying deletions +## BY4742-chipseq +contains the scripts and information for downloading, processing, and running StrainID on the BAM files + ## CENPK-chipseq -contains the scripts and information for downloading, processing, and running StrainID on the +contains the scripts and information for downloading, processing, and running StrainID on the BAM files From ccc43a677158bdb2c8a64cb35955b738a61061d6 Mon Sep 17 00:00:00 2001 From: Olivia Wen-Mei Lang Date: Sat, 23 Oct 2021 11:34:20 -0400 Subject: [PATCH 2/5] add scripts download BY4742 fastq This commit includes the PBS script for downloading the FASTQ files from the Sra metadata associated with de Jonge 2017. The `.gitignore` file is updated appropriately to ignore log and FASTQ files. 
---
 paper/.gitignore                              |  3 ++
 paper/BY4742-chipseq/job/00_download_data.pbs | 47 +++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 paper/BY4742-chipseq/job/00_download_data.pbs

diff --git a/paper/.gitignore b/paper/.gitignore
index a34144a..8d0d56b 100644
--- a/paper/.gitignore
+++ b/paper/.gitignore
@@ -35,6 +35,9 @@ SyntheticStrain/results/hg19*
 ENCODE_CellLines/results/BAM
 ENCODE_CellLines/results/BAM-nospike
 ENCODE_CellLines/results/ID
+BY4742-chipseq/logs/*.out
+BY4742-chipseq/logs/*.err
+BY4742-chipseq/results/FASTQ
 CENPK-chipseq/logs/*.out
 CENPK-chipseq/logs/*.err
 CENPK-chipseq/results/FASTQ
diff --git a/paper/BY4742-chipseq/job/00_download_data.pbs b/paper/BY4742-chipseq/job/00_download_data.pbs
new file mode 100644
index 0000000..1f946aa
--- /dev/null
+++ b/paper/BY4742-chipseq/job/00_download_data.pbs
@@ -0,0 +1,47 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=6
+#PBS -l pmem=24gb
+#PBS -l walltime=03:00:00
+#PBS -A open
+#PBS -o logs/download.data.log.out
+#PBS -e logs/download.data.log.err
+#PBS -t 1-10
+
+# Download the FASTQ files of one SRA run per array task.
+# Array task N reads line N+1 of SraRunInfo.csv (line 1 is the CSV header)
+# and takes the run accession from the first column.
+#
+# Requires
+# parallel fastq dump v2.8.0
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/BY4742-chipseq
+# Abort if WRK is wrong so nothing is written outside the project directory
+cd "$WRK" || exit 1
+
+module load anaconda3
+source activate ~/work/myconda/genopipe/
+
+[ -d logs ] || mkdir logs
+[ -d results/FASTQ ] || mkdir -p results/FASTQ
+
+# Download threads; keep equal to ppn in the "#PBS -l nodes" request above
+THREADS=6
+
+INDEX=$((PBS_ARRAYID+1))
+
+METADATA=SraRunInfo.csv
+INFO=$(sed "${INDEX}q;d" "$METADATA")
+SRR=$(echo "$INFO" | cut -d"," -f1)
+#echo $INFO
+
+# A blank accession means this task has no matching metadata row
+if [ -z "$SRR" ]; then
+  echo "($INDEX) No SRA run found on line $INDEX of $METADATA" >&2
+  exit 1
+fi
+
+# FASTQ-DUMP
+echo "($INDEX) Begin downloading $SRR FASTQ..."
+parallel-fastq-dump --gzip --split-files -t "$THREADS" -O results/FASTQ -s "$SRR" || exit 1
+echo "Complete"
From da96f8792d0f3711def82f9d423525ebdf50525a Mon Sep 17 00:00:00 2001
From: Olivia Wen-Mei Lang
Date: Tue, 26 Oct 2021 05:53:10 -0400
Subject: [PATCH 3/5] scripts to align BY4742 data

This commit includes scripts and setup needed to align the BY4742 ChIPseq data which includes some ABI SOLiD data.
`paper/setup.sh` was updated to create a bowtie colorspace index of sacCer3 for the ABI samples
`paper/.gitignore` was updated to include the BAM alignment output
`paper/BY4742-chipseq/job/01_align_data.pbs` is the PBS script to call either BWA or bowtie (as appropriate for sequencing platform used) to align the raw FASTQ data.
---
 paper/.gitignore                           |  1 +
 paper/BY4742-chipseq/job/01_align_data.pbs | 74 ++++++++++++++++++++++
 paper/setup.sh                             |  7 +++
 3 files changed, 82 insertions(+)
 create mode 100644 paper/BY4742-chipseq/job/01_align_data.pbs

diff --git a/paper/.gitignore b/paper/.gitignore
index 8d0d56b..b088e62 100644
--- a/paper/.gitignore
+++ b/paper/.gitignore
@@ -38,6 +38,7 @@ ENCODE_CellLines/results/ID
 BY4742-chipseq/logs/*.out
 BY4742-chipseq/logs/*.err
 BY4742-chipseq/results/FASTQ
+BY4742-chipseq/results/BAM
 CENPK-chipseq/logs/*.out
 CENPK-chipseq/logs/*.err
 CENPK-chipseq/results/FASTQ
diff --git a/paper/BY4742-chipseq/job/01_align_data.pbs b/paper/BY4742-chipseq/job/01_align_data.pbs
new file mode 100644
index 0000000..88c5aa1
--- /dev/null
+++ b/paper/BY4742-chipseq/job/01_align_data.pbs
@@ -0,0 +1,74 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=4
+#PBS -l pmem=16gb
+#PBS -l walltime=02:00:00
+#PBS -A open
+#PBS -o logs/align.data.log.out
+#PBS -e logs/align.data.log.err
+#PBS -t 1-10
+
+# Align the FASTQ reads of one SRA run per array task and index the BAM.
+# Array task N reads line N+1 of SraRunInfo.csv (line 1 is the CSV header).
+# ABI_SOLID runs are aligned as single-end color-space reads with bowtie;
+# ILLUMINA runs are aligned as paired-end reads with BWA.
+
+module load gcc
+module load samtools
+module load bwa
+module load anaconda3
+source activate ~/work/myconda/genopipe/
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/BY4742-chipseq
+# Abort if WRK is wrong so nothing is written outside the project directory
+cd "$WRK" || exit 1
+
+# Report a failed aligner even though samtools sort is the last pipeline stage
+set -o pipefail
+
+[ -d logs ] || mkdir logs
+[ -d results/BAM ] || mkdir -p results/BAM
+[ -d results/uniq-BAM ] || mkdir -p results/uniq-BAM
+
+YGENOME=$WRK/../input/sacCer3.fa
+CSGENOME=$WRK/../input/sacCer3_index
+
+INDEX=$((PBS_ARRAYID+1))
+
+METADATA=SraRunInfo.csv
+INFO=$(sed "${INDEX}q;d" "$METADATA")
+SRR=$(echo "$INFO" | cut -d"," -f1)
+SAMPLE=$(echo "$INFO" | cut -d"," -f12)
+PLATFORM=$(echo "$INFO" | cut -d"," -f19)
+#PAIR=`echo $INFO | cut -d"," -f16`
+#echo $INFO
+
+FQ=$WRK/results/FASTQ/$SRR
+BAM=$WRK/results/BAM/$SAMPLE
+
+echo "($PBS_ARRAYID) Aligned $SRR $PLATFORM reads > $BAM"
+if [[ "$PLATFORM" == "ABI_SOLID" ]]; then
+  # Only the gzipped reads are decompressed; the reference is supplied
+  # through the color-space index ($CSGENOME) built by paper/setup.sh
+  bowtie -C -S "$CSGENOME" <(gzip -dc "${FQ}_1.fastq.gz") \
+    | samtools sort \
+    > "$BAM.bam" \
+    || { echo "($PBS_ARRAYID) bowtie alignment of $SRR failed" >&2; exit 1; }
+  echo "($PBS_ARRAYID) $BAM single aligned (bowtie color space)"
+elif [[ "$PLATFORM" == "ILLUMINA" ]]; then
+  bwa mem -t 4 "$YGENOME" "${FQ}_1.fastq.gz" "${FQ}_2.fastq.gz" \
+    | samtools sort \
+    > "$BAM.bam" \
+    || { echo "($PBS_ARRAYID) BWA alignment of $SRR failed" >&2; exit 1; }
+  echo "($PBS_ARRAYID) $BAM pair aligned (BWA)"
+else
+  # Do not fall through to indexing a BAM that was never written
+  echo "($PBS_ARRAYID) Unsupported platform '$PLATFORM' for $SRR" >&2
+  exit 1
+fi
+
+#samtools view -b -F4 $BAM > $WRK/results/uniq-BAM/$SAMPLE.bam
+
+echo "($PBS_ARRAYID) Indexing..."
+samtools index "$BAM.bam" || exit 1
+echo "($PBS_ARRAYID) Complete!"
diff --git a/paper/setup.sh b/paper/setup.sh
index 12df39c..bef4593 100644
--- a/paper/setup.sh
+++ b/paper/setup.sh
@@ -13,8 +13,10 @@
 
 # Required software:
 # wget
+# Python 3
 # Perl 5.18+
 # bwa v0.7.14+
+# bowtie v1.2.3
 #
 # Optional software:
 # twoBitToFa
@@ -149,3 +151,8 @@ cd $WRK/db
 ln -s ../../StrainID/sacCer3_VCF
 ln -s ../../StrainID/hg19_VCF
 cd $WRK
+
+# Setup color-space index for yeast genome
+# (used by BY4742-chipseq)
+bowtie-build -C input/sacCer3.fa input/sacCer3_index
+
From 5ad1d70b6a5a807073178b1970030ae8ab494ac2 Mon Sep 17 00:00:00 2001
From: Olivia Wen-Mei Lang
Date: Tue, 26 Oct 2021 06:04:24 -0400
Subject: [PATCH 4/5] add PBS script to run StrainID on BY4742 data

This PBS script wraps up a sample-by-sample run of calling StrainID on all the BY4742 samples.
---
 .../BY4742-chipseq/job/02_indexed_runSID.pbs  | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 paper/BY4742-chipseq/job/02_indexed_runSID.pbs

diff --git a/paper/BY4742-chipseq/job/02_indexed_runSID.pbs b/paper/BY4742-chipseq/job/02_indexed_runSID.pbs
new file mode 100644
index 0000000..f3df29e
--- /dev/null
+++ b/paper/BY4742-chipseq/job/02_indexed_runSID.pbs
@@ -0,0 +1,63 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=6
+#PBS -l pmem=24gb
+#PBS -l walltime=03:00:00
+#PBS -A open
+#PBS -o logs/sid.log.out
+#PBS -e logs/sid.log.err
+#PBS -t 1-10
+
+# Run StrainID on the aligned BAM of one sample per array task.
+# Array task N reads line N+1 of SraRunInfo.csv (line 1 is the CSV header)
+# and takes the sample name from column 12 (LibraryName).
+
+module load anaconda3
+# Same conda environment as 00_download_data.pbs and 01_align_data.pbs
+source activate ~/work/myconda/genopipe/
+
+# FIRST CHANGE PATH TO EXECUTE
+WRK=/path/to/GenoPipe/paper/BY4742-chipseq
+# Abort if WRK is wrong so nothing is written outside the project directory
+cd "$WRK" || exit 1
+
+[ -d logs ] || mkdir logs
+[ -d results/ID ] || mkdir -p results/ID
+
+INDEX=$((PBS_ARRAYID+1))
+
+METADATA=SraRunInfo.csv
+INFO=$(sed "${INDEX}q;d" "$METADATA")
+SAMPLE=$(echo "$INFO" | cut -d"," -f12)
+#echo $INFO
+
+# Store directory paths
+DATABASE=$WRK/../db/sacCer3_VCF
+GENOME=$WRK/../input/sacCer3.fa
+SEED=$PBS_ARRAYID
+GENOPIPE=$WRK/../..
+
+BAM=$WRK/results/BAM/$SAMPLE
+#BAM=$WRK/results/uniq-BAM/$SAMPLE
+ID=$WRK/results/ID/
+
+# Without the BAM and its index the links below would dangle
+if [ ! -f "$BAM.bam" ] || [ ! -f "$BAM.bam.bai" ]; then
+  echo "($PBS_ARRAYID) Missing $BAM.bam or $BAM.bam.bai" >&2
+  exit 1
+fi
+
+# Set-up Temp directory
+TEMP=$WRK/temp-$PBS_ARRAYID
+[ -d "$TEMP" ] || mkdir "$TEMP" || exit 1
+# -f replaces links left behind by an earlier, interrupted run of this task
+ln -sf "$BAM.bam" "$BAM.bam.bai" "$TEMP"/ || exit 1
+
+## Execute Single StrainID and record time
+cd "$GENOPIPE/StrainID" || exit 1
+echo "**Begin executing StrainID for ${SAMPLE}..."
+{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -s "$SEED" -o "$ID" > "$ID/$SAMPLE.std" ; } 2> "$ID/$SAMPLE.time"
+echo "...single StrainID for ($PBS_ARRAYID) ${SAMPLE} finished."
+
+## Clean-up
+rm -r -- "$TEMP"
+
From 3d09e83a84d14f111bf770339139b715dd634681 Mon Sep 17 00:00:00 2001
From: Olivia Wen-Mei Lang
Date: Tue, 26 Oct 2021 06:06:11 -0400
Subject: [PATCH 5/5] update .gitignore with BY4742 StrainID output

The .gitignore file is updated with the output from StrainID running on BY4742 data
---
 paper/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paper/.gitignore b/paper/.gitignore
index b088e62..ca5d677 100644
--- a/paper/.gitignore
+++ b/paper/.gitignore
@@ -39,6 +39,7 @@ BY4742-chipseq/logs/*.out
 BY4742-chipseq/logs/*.err
 BY4742-chipseq/results/FASTQ
 BY4742-chipseq/results/BAM
+BY4742-chipseq/results/ID
 CENPK-chipseq/logs/*.out
 CENPK-chipseq/logs/*.err
 CENPK-chipseq/results/FASTQ