## RiboSeq processing pipeline 

#### Steps

This pipeline assumes the fastq files to process are in a subdirectory called `data` inside a dedicated working directory (the directory this notebook is run from).

1) Run FastQC to check the read length distribution. Size selection was used during the experiment, so expect reads of both 26-34 nt (monosome) and 54-68 nt (disome), according to Erica's email.

2) Run cutadapt using the parameters provided by Ezrabio: `-j 8 -g "^GGG" -a "A{10}" -n 2 -m 15 --max-n=0.1 --discard-casava -o output.fastq.gz input.fastq.gz`

3) Remove reads where the first position quality score is <=10

4) Align reads with Bowtie2 to non-coding RNA (https://downloads.yeastgenome.org/sequence/S288C_reference/rna/archive/rna_coding_R64-1-1_20110203.fasta.gz); reads that align will be discarded. Allow 1 mismatch in the Bowtie2 alignment.

5) Align the remaining reads with Bowtie to YPS1009 and S288C reference genomes.

6) Run samtools mpileup to generate counts for all genes in YPS1009 and S288C.





In [3]:
# import required Python modules
import glob
import os
import re
import sys

# Global locations shared by the cells below.
# referenceDir is presumably where the reference files live.
# NOTE(review): no cell visible in this notebook reads referenceDir, and the
# bowtie2 wrapper script hard-codes a different directory
# (.../scripts/riboseq/reference/) -- confirm which one is correct.
referenceDir = "/mnt/bigdata/linuxhome/mplace/scripts/riboSeqPipeline/reference/"
# parentDir is the directory the notebook is run from; the trailing slash lets
# later cells build paths by plain string concatenation.
parentDir = f"{os.getcwd()}/"


In [2]:
# Build inputFastq.txt: one fastq path per line. The file is consumed by the
# "Queue fastqFile from inputFastq.txt" statement in the FastQC submit file and
# is read again when the cutadapt input list is built.
# The original fastq files are assumed to be in a directory called data.
dataDir = parentDir + "data/"
# glob.glob() returns matches in arbitrary order; sort so the list (and the
# order in which jobs are queued) is the same on every run.
fastqFiles = sorted(glob.glob(dataDir + "*.fastq"))
if not fastqFiles:
    # An empty list would queue zero jobs without any error, so report it now.
    print("WARNING: no *.fastq files found in " + dataDir)
with open('inputFastq.txt', 'w') as out:
    for fstq in fastqFiles:
        out.write(fstq + "\n")


In [4]:
# Step 1)
# Write the HTCondor submit file for FastQC. One job is queued per line of
# inputFastq.txt, and that line is passed to runFastqc.sh as $(fastqFile).
# NOTE(review): every queued job shares the same Error file name, and the
# Error/Log names start with "fastq." rather than "fastqc." -- confirm intended.
with open('fastqc.submit', 'w') as submit:
    submit.write("Universe                 = vanilla\n")
    submit.write("Executable               = runFastqc.sh\n")
    submit.write("Arguments                = $(fastqFile)\n")
    submit.write("Error                    = fastq.submit.err\n")
    submit.write("Log                      = fastq.submit.log\n")
    submit.write("Requirements             = OpSysandVer == \"CentOS7\"\n")
    submit.write("Queue fastqFile from inputFastq.txt\n")
# The with-block closes the file on exit, so no explicit close() is needed.

# Write the wrapper script that activates the riboSeq conda environment and
# runs fastqc on the single fastq file passed as the first argument.
with open('runFastqc.sh', 'w') as out:
    out.write("#!/bin/bash\n")
    out.write("source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh\n")
    out.write("unset PYTHONPATH\n")
    out.write("conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq\n")
    # Quote the positional parameter so the path reaches fastqc as one word.
    out.write('fastqc "$1"\n')
    out.write("conda deactivate")

# rwxr-xr-x: executable by everyone but writable only by the owner. A
# world-writable (0o777) script that is later executed by a batch job can be
# modified by any user on the system.
os.chmod('runFastqc.sh', 0o755)


### Using the GLBRC condor submit node, submit the above fastqc.submit file

 condor_submit fastqc.submit  

 check that the job is running

 condor_q , you should see something like the following

Schedd: scarcity-submit.glbrc.org : <144.92.98.21:9618?... @ 12/04/23 14:27:03

OWNER  BATCH_NAME    SUBMITTED   DONE   RUN    IDLE  TOTAL JOB_IDS

mplace ID: 814372  12/4  14:25      _      _      2      2 814372.0-1



In [5]:
# Step 2)
# Set up the input file for cutadapt. Each line of cutadaptInput.txt holds
# "<input fastq> <output fastq>", matching the two-variable Queue statement
# ("Queue fastqFile, outFastq from cutadaptInput.txt") in cutadapt.submit.
cutadaptOutDir = parentDir + 'cutadapt/'
# isdir() rather than exists(): a regular file named "cutadapt" must not be
# reported as an existing directory; os.mkdir() will raise for it instead.
if os.path.isdir(cutadaptOutDir):
    print("Directory exists.")
else:
    os.mkdir(cutadaptOutDir)

with open('inputFastq.txt', 'r') as f, open('cutadaptInput.txt', 'w') as out:
    for fstq in f:
        fstqPath = fstq.rstrip()
        if not fstqPath:
            # A blank line would otherwise produce a row with no input file.
            continue
        # Escape the dot and anchor at the end of the name so only the real
        # extension is rewritten. The unescaped, unanchored pattern '.fastq'
        # also matched e.g. "_fastq" in the middle of a file name.
        fstqName = re.sub(r'\.fastq$', '-clean.fastq', os.path.basename(fstqPath))
        fstqOutName = cutadaptOutDir + fstqName
        out.write(f'{fstqPath} {fstqOutName}\n')
# Both files are closed by the with-block; no explicit close() is needed.

In [6]:
# Write the HTCondor submit file for cutadapt. One job is queued per line of
# cutadaptInput.txt; the two whitespace-separated fields on each line become
# $(fastqFile) and $(outFastq) and are passed to runCutAdapt.sh.
# NOTE(review): the wrapper runs cutadapt with -j 8 but this submit file does
# not set request_cpus -- confirm the slot size matches the thread count.
with open('cutadapt.submit', 'w') as submit:
    submit.write("Universe                 = vanilla\n")
    submit.write("Executable               = runCutAdapt.sh\n")
    submit.write("Arguments                = $(fastqFile) $(outFastq)\n")
    submit.write("Error                    = cutadapt.submit.err\n")
    submit.write("Log                      = cutadapt.submit.log\n")
    submit.write("Requirements             = OpSysandVer == \"CentOS7\"\n")
    submit.write("Queue fastqFile, outFastq from cutadaptInput.txt\n")
# The with-block closes the file on exit, so no explicit close() is needed.

# Write the wrapper script that activates the riboSeq conda environment and
# runs cutadapt: $1 is the input fastq, $2 is the cleaned output fastq.
with open('runCutAdapt.sh', 'w') as out:
    out.write("#!/bin/bash\n")
    out.write("source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh\n")
    out.write("unset PYTHONPATH\n")
    out.write("conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq\n")
    # Quote the adapter patterns (as in the vendor-supplied command) and the
    # positional parameters so the shell passes each one through as one word.
    out.write('cutadapt -j 8 -g "^GGG" -a "A{10}" -n 2 -m 15 --max-n=0.1 --discard-casava -o "$2" "$1"\n')
    out.write("conda deactivate")

# rwxr-xr-x: executable by everyone but writable only by the owner. A
# world-writable (0o777) script that is later executed by a batch job can be
# modified by any user on the system.
os.chmod('runCutAdapt.sh', 0o755)

In [None]:
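# Step 3) TODO (not implemented yet): remove reads where the first position quality score is <= 10.
# Until this is added, Step 4 reads the cutadapt output files directly.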

In [9]:
# Step 4) Align reads with Bowtie2 to non-coding RNA,
# https://downloads.yeastgenome.org/sequence/S288C_reference/rna/archive/rna_coding_R64-1-1_20110203.fasta.gz
# Reads that align will be discarded.
# bowtie2 -p 8 --phred33 -N 1 -x $REFERENCE -U $file -S $out.sam
# -p number of threads
# -N number of mismatches allowed in a seed alignment (0 or 1)
# -x The basename of the index for the reference genome
# -U file to align (unpaired)
# -S File to write SAM alignments to

# Set up the input file for the bowtie2 alignment to non-coding RNA. Each line
# of alignmentInput.txt holds "<cleaned fastq> <output sam>", matching the
# "Queue fastqFile, sam from alignmentInput.txt" statement in bowtie2.submit.
nonCodingOutDir = parentDir + 'alignNonCodingRNA/'
# isdir() rather than exists(): a regular file with this name must not be
# reported as an existing directory; os.mkdir() will raise for it instead.
if os.path.isdir(nonCodingOutDir):
    print("Directory exists.")
else:
    os.mkdir(nonCodingOutDir)

# List the cutadapt-cleaned fastq files for alignment. glob.glob() returns
# matches in arbitrary order, so sort to make the list reproducible.
with open('alignmentInput.txt', 'w') as out:
    for cleanfstq in sorted(glob.glob(cutadaptOutDir + '*clean.fastq')):
        # Build the SAM path from the file name and the output directory.
        # Substituting 'cutadapt' in the full path would also rewrite any
        # parent directory or sample name that happens to contain that word.
        samName = re.sub(r'-clean\.fastq$', '.sam', os.path.basename(cleanfstq))
        samFile = nonCodingOutDir + samName
        out.write(cleanfstq + ' ' + samFile + '\n')
# The with-block closes the file on exit, so no explicit close() is needed.

Directory exists.


In [8]:
# Write the HTCondor submit file for bowtie2. One job is queued per line of
# alignmentInput.txt; the two whitespace-separated fields on each line become
# $(fastqFile) and $(sam) and are passed to runBowtie2.sh.
# NOTE(review): the wrapper runs bowtie2 with -p 8 but this submit file does
# not set request_cpus -- confirm the slot size matches the thread count.
with open('bowtie2.submit', 'w') as submit:
    submit.write("Universe                 = vanilla\n")
    submit.write("Executable               = runBowtie2.sh\n")
    submit.write("Arguments                = $(fastqFile) $(sam)\n")
    submit.write("Error                    = bowtie2.submit.err\n")
    submit.write("Log                      = bowtie2.submit.log\n")
    submit.write("Requirements             = OpSysandVer == \"CentOS7\"\n")
    submit.write("Queue fastqFile, sam from alignmentInput.txt\n")
# The with-block closes the file on exit, so no explicit close() is needed.

# Write the wrapper script that activates the riboSeq conda environment and
# runs bowtie2: $1 is the fastq file to align, $2 is the SAM file to write.
# NOTE(review): the index path below is under .../scripts/riboseq/reference/,
# which differs from the notebook-level referenceDir
# (.../scripts/riboSeqPipeline/reference/) -- confirm which one is correct.
with open('runBowtie2.sh', 'w') as out:
    out.write("#!/bin/bash\n")
    out.write("source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh\n")
    out.write("unset PYTHONPATH\n")
    out.write("conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq\n")
    # Quote the positional parameters so each path reaches bowtie2 as one word.
    out.write('bowtie2 -p 8 --phred33 -N 1 -x /mnt/bigdata/linuxhome/mplace/scripts/riboseq/reference/rna_coding_R64-1-1 -U "$1" -S "$2"\n')
    out.write("conda deactivate")

# rwxr-xr-x: executable by everyone but writable only by the owner. A
# world-writable (0o777) script that is later executed by a batch job can be
# modified by any user on the system.
os.chmod('runBowtie2.sh', 0o755)