## RiboSeq processing pipeline 

#### Steps

This pipeline assumes that the fastq files to process are in a subdirectory called `data` inside a dedicated project directory.

1) Run FastQC to check the read length distribution.  Size selection was used during the experiment, so expect reads of both 26-34nt (monosome) and 54-68nt (disome), according to Erica's email.

2) Run cutadapt using the parameters provided by Ezrabio:    `cutadapt -j 8 -g "^GGG" -a "A{10}" -n 2 -m 15 --max-n=0.1 --discard-casava -o output.fastq.gz input.fastq.gz`

3) Remove reads where the first position quality score is <=10

4) Align reads with Bowtie2 to non-coding RNA (https://downloads.yeastgenome.org/sequence/S288C_reference/rna/archive/rna_coding_R64-1-1_20110203.fasta.gz); reads that align will be discarded.  Allow 1 mismatch in the Bowtie2 alignment (`-N 1`).

5) Align the remaining reads with Bowtie2 to the YPS1009 and S288C reference genomes.

6) Run samtools mpileup to generate counts for all genes in YPS1009 and S288C.





Samples labeled as -RNA are strictly RNA-Seq and will be processed with the normal condor RNA-Seq pipeline.

/mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/RNA-Seq

~/scripts/Condor-RNA-Seq-Pipeline/rnaSeqCondor_paired.py -e mplace@wisc.edu -f filelist -ref R64

~/scripts/Condor-RNA-Seq-Pipeline/rnaSeqCondor_paired.py -e mplace@wisc.edu -f filelist -ref YPS1009

In [1]:
# import required Python modules
import glob
import itertools
import os
import re
import subprocess
import sys

# set global variables
# parentDir is the current working directory with a trailing slash, so file
# and directory names can be appended to it directly.
parentDir = f"{os.getcwd()}/"
referenceDir = "/mnt/bigdata/linuxhome/mplace/scripts/riboSeqPipeline/reference/"
print(parentDir)

/mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/


In [2]:
# Write inputFastq.txt: one line per *.fastq file found in <parentDir>/data/.
# The original fastq files are assumed to live in that "data" directory.
dataDir = parentDir + "data/"
fastqPaths = glob.glob(dataDir + "*.fastq")
with open('inputFastq.txt', 'w') as out:
    out.writelines(fastqPath + "\n" for fastqPath in fastqPaths)

In [3]:
# Step 1)
# HTCondor submit description for FastQC: one job is queued per line of
# inputFastq.txt, and that line is handed to runFastqc.sh as $(fastqFile).
fastqcSubmitLines = [
    "Universe                 = vanilla\n",
    "Executable               = runFastqc.sh\n",
    "Arguments                = $(fastqFile)\n",
    "Error                    = fastqc.submit.err\n",
    "Log                      = fastqc.submit.log\n",
    'Requirements             = OpSysandVer == "CentOS7"\n',
    "Queue fastqFile from inputFastq.txt\n",
]
with open('fastqc.submit', 'w') as submit:
    submit.writelines(fastqcSubmitLines)

# Wrapper script run by each job: activate the riboSeq conda environment,
# run fastqc on the file passed as $1, then deactivate the environment.
fastqcScriptLines = [
    "#!/bin/bash",
    "source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh",
    "unset PYTHONPATH",
    "conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq",
    "fastqc $1",
    "conda deactivate",
]
with open('runFastqc.sh', 'w') as out:
    # no trailing newline after the last command, as before
    out.write("\n".join(fastqcScriptLines))

# make the wrapper executable so condor can run it
os.chmod('runFastqc.sh', 0o0777)


### Using the GLBRC condor submit node, submit the above fastqc.submit file

 condor_submit fastqc.submit  

 check that the job is running

 condor_q , you should see something like the following

Schedd: scarcity-submit.glbrc.org : <144.92.98.21:9618?... @ 12/04/23 14:27:03

OWNER  BATCH_NAME    SUBMITTED   DONE   RUN    IDLE  TOTAL JOB_IDS

mplace ID: 814372  12/4  14:25      _      _      2      2 814372.0-1



In [2]:
# Step 2)
# setup input file for cutadapt
cutadaptOutDir = parentDir + 'cutadapt/'
if os.path.exists(cutadaptOutDir):
    print("Directory exists.")
else:
    os.mkdir(cutadaptOutDir)

# Each line of cutadaptInput.txt is "<input fastq> <cleaned output fastq>";
# the cleaned file goes to cutadaptOutDir with a "-clean.fastq" suffix.
with open('inputFastq.txt', 'r') as f, open('cutadaptInput.txt', 'w') as out:
    for fstq in f:
        fstq = fstq.rstrip()
        if not fstq:
            # a blank line would otherwise produce a bogus " <dir>" entry
            continue
        # Escape the dot and anchor at the end so only the real ".fastq"
        # extension is rewritten (an unescaped '.fastq' also matches e.g.
        # "_fastq" in the middle of a sample name).
        fstqName = re.sub(r'\.fastq$', '-clean.fastq', os.path.basename(fstq))
        fstqOutName = cutadaptOutDir + fstqName
        out.write(f'{fstq} {fstqOutName}\n')

In [3]:
# HTCondor submit description for cutadapt: one job per line of
# cutadaptInput.txt, whose two columns become $(fastqFile) and $(outFastq).
cutadaptSubmitLines = [
    "Universe                 = vanilla\n",
    "Executable               = runCutAdapt.sh\n",
    "Arguments                = $(fastqFile) $(outFastq)\n",
    "Error                    = cutadapt.submit.err\n",
    "Log                      = cutadapt.submit.log\n",
    'Requirements             = OpSysandVer == "CentOS7"\n',
    "Queue fastqFile, outFastq from cutadaptInput.txt\n",
]
with open('cutadapt.submit', 'w') as submit:
    submit.writelines(cutadaptSubmitLines)

# Wrapper script run by each job: $1 is the input fastq, $2 the output fastq.
cutadaptScriptLines = [
    "#!/bin/bash",
    "source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh",
    "unset PYTHONPATH",
    "conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq",
    "cutadapt -j 8 -g ^GGG -a A{10} -n 2 -m 15 --max-n=0.1 --discard-casava -o $2 $1",
    "conda deactivate",
]
with open('runCutAdapt.sh', 'w') as out:
    # no trailing newline after the last command, as before
    out.write("\n".join(cutadaptScriptLines))

# make the wrapper executable so condor can run it
os.chmod('runCutAdapt.sh', 0o0777)

In [4]:
# Step 3) Remove reads where the first position quality score is <=10

# Phred+33 quality characters for Q0..Q10, i.e. '!' (ASCII 33) to '+' (ASCII 43).
qualScores = ['+', '*', ')','(', "'", '&', '%', '$', '#', '"', '!']


def filterFirstBaseQuality(fastqPath, outPath, lowQual=frozenset(qualScores)):
    """Copy a FASTQ file, dropping reads whose first base quality is in lowQual.

    fastqPath -- input FASTQ, four lines per record
    outPath   -- output FASTQ receiving the records that pass
    lowQual   -- quality characters that cause a read to be dropped
                 (default: the Q0..Q10 characters in qualScores)

    Dropped reads are printed (header and quality line), as before.
    An incomplete record at the end of the file is skipped instead of
    raising a TypeError.  Returns a (kept, dropped) tuple of read counts.
    """
    kept = dropped = 0
    with open(fastqPath, 'r') as f, open(outPath, 'w') as out:
        # zip_longest over four references to the same iterator yields the
        # file four lines at a time, padding a short final group with None.
        for hdr, seq, plus, qual in itertools.zip_longest(*[f]*4):
            if qual is None:
                if hdr.strip():
                    print('Skipping incomplete FASTQ record at end of', fastqPath)
                break
            if qual[0] in lowQual:
                dropped += 1
                print(hdr, '  ', qual)
            else:
                kept += 1
                out.writelines((hdr, seq, plus, qual))
    return kept, dropped


for fastq in glob.glob('cutadapt/*-clean.fastq'):
    # escape the dot and anchor so only the file suffix is rewritten
    name = re.sub(r'-clean\.fastq$', '-filt.fastq', fastq)
    filterFirstBaseQuality(fastq, name)



@VH00552:28:AAF7KG5M5:1:1401:19292:1000 1:N:0:TGACCA
    #CCCCC;--CCC;CCC;CCCCCCC-C-CCC-;#

@VH00552:28:AAF7KG5M5:1:1401:19708:1000 1:N:0:TGACCA
    #;;;C;CC-CCCCCCCCCCCCCCCC;C;CCCC#

@VH00552:28:AAF7KG5M5:1:1401:19898:1000 1:N:0:TGACCA
    #;CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC#

@VH00552:28:AAF7KG5M5:1:1101:21753:1000 1:N:0:CGATGT
    #-C;CCCCCCCCCCCCCC;CCCCCCCCCCCCCCCCC;;C--CCCCCCC

@VH00552:28:AAF7KG5M5:1:1401:18610:1000 1:N:0:CGATGT
    #CCCCC;CCCCCCCCCC

@VH00552:28:AAF7KG5M5:1:1401:20276:1000 1:N:0:CGATGT
    #;;CCC-CCCCCCCCCCCC;CC-CCC;CCC--#CCCCCCCCCC-C;CC;CC

@VH00552:28:AAF7KG5M5:1:1401:18231:1000 1:N:0:ACTTGA
    #CCCCCCCCCCCCCCCCCCC;CCCC-CCCCCC#CCC-CCCCCCCCCCCCCC

@VH00552:28:AAF7KG5M5:1:1101:21488:1000 1:N:0:ATCACG
    #-;;CC-CCCCCCCCCCCCCCCCCCCCCCCC

@VH00552:28:AAF7KG5M5:1:1101:22094:1000 1:N:0:ATCACG
    #-CCCC-C-CCC;CCCCCC;CCCCCCCCCC

@VH00552:28:AAF7KG5M5:1:1101:22814:1000 1:N:0:ATCACG
    #-CCCC-CCCCCCCCCCCCCCCCCCCCCCCC

@VH00552:28:AAF7KG5M5:1:1101:22852:1000 1:N:0:ATCACG

In [5]:
# Step 4 ) Align reads with Bowtie2 to non-coding RNA,
# https://downloads.yeastgenome.org/sequence/S288C_reference/rna/archive/rna_coding_R64-1-1_20110203.fasta.gz
# reads that align will be discarded.
# bowtie2 -p 8 --phred33 -N 1 -x $REFERENCE -U $file -S $out.sam
# -p number of threads
# -N number of mismatches allowed in a seed alignment (0 or 1)
# -x The basename of the index for the reference genome
# -U file to align (unpaired)
# -S File to write SAM alignments to

# setup input file for bowtie2 alignment to non-coding RNA
nonCodingOutDir = parentDir + 'alignNonCodingRNA/'
if os.path.exists(nonCodingOutDir):
    print("Directory exists.")
else:
    os.mkdir(nonCodingOutDir)

# get a list of cutadapt cleaned & filtered fastq files for alignment;
# each line of alignmentInput.txt is "<filtered fastq> <output sam>"
with open('alignmentInput.txt', 'w') as out:
    for cleanfstq in glob.glob(cutadaptOutDir + '*-filt.fastq'):
        # Build the SAM path from the output directory and the file name.
        # Substituting 'cutadapt' in the full path would also rewrite any
        # parent directory or sample name that contains that word.
        samName = re.sub(r'-filt\.fastq$', '.sam', os.path.basename(cleanfstq))
        samFile = nonCodingOutDir + samName
        out.write(cleanfstq + ' ' + samFile + '\n')

In [6]:
# HTCondor submit description for the non-coding RNA alignment: one job per
# line of alignmentInput.txt, whose columns become $(fastqFile) and $(sam).
ncBowtieSubmitLines = [
    "Universe                 = vanilla\n",
    "Executable               = runncBowtie2.sh\n",
    "Arguments                = $(fastqFile) $(sam)\n",
    "Error                    = ncbowtie2.submit.err\n",
    "Log                      = ncbowtie2.submit.log\n",
    'Requirements             = OpSysandVer == "CentOS7"\n',
    "Queue fastqFile, sam from alignmentInput.txt\n",
]
with open('ncbowtie2.submit', 'w') as submit:
    submit.writelines(ncBowtieSubmitLines)

# Wrapper script that runs bowtie2: $1 is the fastq to align, $2 the SAM output.
# NOTE(review): the index path below is under .../scripts/riboseq/reference/,
# while referenceDir points at .../scripts/riboSeqPipeline/reference/ -- confirm
# which location is the intended one.
ncBowtieScriptLines = [
    "#!/bin/bash",
    "source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh",
    "unset PYTHONPATH",
    "conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq",
    "bowtie2 -p 8 --phred33 -N 1 -x /mnt/bigdata/linuxhome/mplace/scripts/riboseq/reference/rna_coding_R64-1-1 -U $1 -S $2",
    "conda deactivate",
]
with open('runncBowtie2.sh', 'w') as out:
    # no trailing newline after the last command, as before
    out.write("\n".join(ncBowtieScriptLines))

# make the wrapper executable so condor can run it
os.chmod('runncBowtie2.sh', 0o0777)

In [7]:
# get a list of reads which aligned to Non-Coding RNA and remove them from the clean.fastq files
#samtools view -F 4 -u  -O SAM -o mapped.sam TestSample1.sam
# Each line of filterSamInput.txt is "<alignment sam> <unmapped-reads sam>".
with open('filterSamInput.txt', 'w') as out:
    for sam in glob.glob(nonCodingOutDir + '*.sam'):
        if sam.endswith('-unmapped.sam'):
            # output of an earlier run of this step, not an alignment to filter
            continue
        # Strip only the trailing ".sam" extension.  The unescaped pattern
        # '.sam' removes any character followed by "sam", e.g. the "-sam"
        # in a file called "EU-sample1.sam".
        sampleName = re.sub(r'\.sam$', '', os.path.basename(sam))
        outSam = nonCodingOutDir + sampleName + '-unmapped.sam'
        out.write(sam + ' ' + outSam + '\n')

In [8]:
# HTCondor submit description for the samtools filter step, which keeps the
# UNMAPPED reads (reads which did not align to Non-Coding RNA).
# These reads will be aligned to S288C and YPS1009.
filterSubmitLines = [
    "Universe                 = vanilla\n",
    "Executable               = runfilter.sh\n",
    "Arguments                = $(sam) $(ncsam)\n",
    "Error                    = filter.submit.err\n",
    "Log                      = filter.submit.log\n",
    'Requirements             = OpSysandVer == "CentOS7"\n',
    "Queue sam, ncsam from filterSamInput.txt\n",
]
with open('filter.submit', 'w') as submit:
    submit.writelines(filterSubmitLines)

# Wrapper script that runs samtools view: $1 is the input SAM, $2 the output
# file; "-f 4" selects reads with the unmapped flag set.
filterScriptLines = [
    "#!/bin/bash",
    "source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh",
    "unset PYTHONPATH",
    "conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq",
    "samtools view -f 4 -u -O SAM -o $2 $1",
    "conda deactivate",
]
with open('runfilter.sh', 'w') as out:
    # no trailing newline after the last command, as before
    out.write("\n".join(filterScriptLines))

# make the wrapper executable so condor can run it
os.chmod('runfilter.sh', 0o0777)

In [9]:
# create output directories for genome alignments
# Each directory is created independently, so a pre-existing "alignments"
# directory no longer prevents the per-genome sub-directories from being made.
for subDir in ('alignments', 'alignments/S288C', 'alignments/YPS1009'):
    os.makedirs(parentDir + subDir, exist_ok=True)

In [10]:
# Create new fastq files by filtering for reads that DID NOT ALIGN to the Non-Coding RNA
# First create a new file containing the read names (for reads we want to keep)
# Input:  <nonCodingOutDir>/<sample>-unmapped.sam
# Output: <parentDir>/alignments/<sample>-names.txt, one read name per line
for unmapped in glob.glob(nonCodingOutDir + '*-unmapped.sam'):
    nameFile = re.sub(r'-unmapped\.sam$', '', os.path.basename(unmapped))
    outFile  = parentDir + 'alignments/' + nameFile + '-names.txt'
    with open(unmapped) as f, open(outFile, 'w') as out:
        for line in f:
            # SAM header lines start with '@' (a read name cannot), and blank
            # lines carry no read; neither belongs in the name list.
            if line.startswith('@') or not line.strip():
                continue
            name = line.split('\t')[0]
            out.write(f'{name}\n')

In [11]:

# Use seqtk to subset the unmapped reads for use with bowtie2
# For every filtered fastq, "seqtk subseq <fastq> <names file>" extracts the
# reads listed in alignments/<sample>-names.txt into alignments/<sample>.fastq
for fstq in glob.glob(cutadaptOutDir + '*-filt.fastq'):
    print('processing: ', fstq)
    baseName = os.path.basename(fstq)
    outFile = parentDir + 'alignments/' + re.sub(r'-filt\.fastq$', '.fastq', baseName)      # create output file name
    nameLst = parentDir + 'alignments/' + re.sub(r'-filt\.fastq$', '-names.txt', baseName)
    cmd = [ 'seqtk', 'subseq', fstq, nameLst ]
    # Stream seqtk's stdout straight into the output file instead of holding
    # the whole fastq in memory.  check=True raises CalledProcessError when
    # seqtk fails (e.g. missing names file) rather than silently leaving an
    # empty or partial fastq; stderr is no longer captured and discarded.
    with open(outFile, 'w') as out:
        subprocess.run(cmd, stdout=out, check=True)

processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/EU-LOG-30_S4_R1_001-filt.fastq


processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/ANEU-Q-60_S5_R1_001-filt.fastq
processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/ANEU-LOG-30_S2_R1_001-filt.fastq
processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/EU-Q-60_S7_R1_001-filt.fastq
processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/ANEU-Q-30_S1_R1_001-filt.fastq
processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/ANEU-LOG-60_S6_R1_001-filt.fastq
processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/EU-Q-30_S3_R1_001-filt.fastq
processing:  /mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/cutadapt/EU-LOG-60_S8_R1_001-filt.fastq


In [12]:
# Create input file for alignment to the S288C reference genome.
# Each line is "<input fastq> <output sam> <bowtie2 index basename>".
outputPath = parentDir + 'alignments/S288C/'
with open('refAlignmentS288C_input.txt', 'w') as out:
    for inFastq in glob.glob(parentDir + 'alignments/*.fastq'):
        # escape the dot and anchor so only the file extension is replaced
        sampleName = re.sub(r'\.fastq$', '.sam', os.path.basename(inFastq))
        out.write(inFastq + ' ' + outputPath + sampleName + ' /home/glbrc.org/mplace/data/reference/S288C_reference_genome_R64-1-1_20110203/s.cerevisiae-R64-1-1' + '\n')
# the with-statement closes the file; the former bare "out.close" did nothing

<function TextIOWrapper.close()>

In [13]:
# Create input file for alignment to the YPS1009 reference genome.
# Each line is "<input fastq> <output sam> <bowtie2 index basename>".
outputPath = parentDir + 'alignments/YPS1009/'
with open('refAlignmentYPS1009_input.txt', 'w') as out:
    for inFastq in glob.glob(parentDir + 'alignments/*.fastq'):
        # escape the dot and anchor so only the file extension is replaced
        sampleName = re.sub(r'\.fastq$', '.sam', os.path.basename(inFastq))
        out.write(inFastq + ' ' + outputPath + sampleName + ' /home/glbrc.org/mplace/data/reference/YPS1009/YPS1009' + ' \n')
# the with-statement closes the file; the former bare "out.close" did nothing

<function TextIOWrapper.close()>

In [14]:
# Step 5) Align reads to reference genomes S288C and YPS1009
# Write one bowtie2 HTCondor submit file per reference genome.  Both use the
# same wrapper script and differ only in the error/log file names and in the
# input list the jobs are queued from.
for submitName, tag, queueFile in (
    ('s288cbowtie2.submit', 's288c', 'refAlignmentS288C_input.txt'),        # S288C
    ('yps1009bowtie2.submit', 'yps1009', 'refAlignmentYPS1009_input.txt'),  # YPS1009
):
    with open(submitName, 'w') as submit:
        submit.writelines([
            "Universe                 = vanilla\n",
            "Executable               = runBowtie2.sh\n",
            "Arguments                = $(fastqFile) $(sam) $(ref)\n",
            f"Error                    = {tag}_bowtie2.submit.err\n",
            f"Log                      = {tag}_bowtie2.submit.log\n",
            'Requirements             = OpSysandVer == "CentOS7"\n',
            f"Queue fastqFile, sam, ref from {queueFile}\n",
        ])

# Wrapper script that runs bowtie2: $1 is the fastq to align, $2 the SAM
# output and $3 the basename of the bowtie2 index.
bowtieScriptLines = [
    "#!/bin/bash",
    "source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh",
    "unset PYTHONPATH",
    "conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq",
    "bowtie2 -p 8 --phred33 -N 1 -x $3 -U $1 -S $2",
    "conda deactivate",
]
with open('runBowtie2.sh', 'w') as out:
    # no trailing newline after the last command, as before
    out.write("\n".join(bowtieScriptLines))

# make the wrapper executable so condor can run it
os.chmod('runBowtie2.sh', 0o0777)

In [10]:
# samtools sort ANEU-LOG-30_S2_R1_001.sam > ANEU-LOG-30_S2_R1_001-sorted.sam
# samtools mpileup -f ~/data/reference/S288C_reference_genome_R64-1-1_20110203/S288C_reference_sequence_R64-1-1_20110203.fsa ANEU-LOG-30_S2_R1_001-sorted.sam > ANEU-LOG-30_pileup.txt
# Create input file for samtools sort (YPS1009 alignments).
# Each line is "<input sam> <output bam>", both in alignments/YPS1009/.
inputFilePath = parentDir + 'alignments/YPS1009/'
print(inputFilePath)
with open('sort-YPS1009_input.txt', 'w') as out:
    for sam in glob.glob(inputFilePath + '*.sam'):
        # Replace only the trailing ".sam" extension; the unescaped pattern
        # '.sam' also rewrites e.g. the "-sam" inside "EU-sample1.sam".
        sampleName = re.sub(r'\.sam$', '.bam', os.path.basename(sam))
        out.write(sam + ' ' + inputFilePath + sampleName + ' \n')
# the with-statement closes the file; the former bare "out.close" did nothing

/mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/alignments/YPS1009/


<function TextIOWrapper.close()>

In [11]:
# Create input file for samtools sort (S288C alignments).
# Each line is "<input sam> <output bam>", both in alignments/S288C/.
inputFilePath = parentDir + 'alignments/S288C/'
print(inputFilePath)
with open('sort-S288C_input.txt', 'w') as out:
    for sam in glob.glob(inputFilePath + '*.sam'):
        # Replace only the trailing ".sam" extension; the unescaped pattern
        # '.sam' also rewrites e.g. the "-sam" inside "EU-sample1.sam".
        sampleName = re.sub(r'\.sam$', '.bam', os.path.basename(sam))
        out.write(sam + ' ' + inputFilePath + sampleName + ' \n')
# the with-statement closes the file; the former bare "out.close" did nothing

/mnt/bigdata/linuxhome/mplace/data/Ribo-seq/Leah-12142023/alignments/S288C/


<function TextIOWrapper.close()>

In [13]:
# HTCondor submit description for samtools sort: one job per line of the
# queued input list, whose two columns become $(sam) and $(bam).
# Only the YPS1009 list is queued here; the S288C variant is kept below as a
# comment and has to be swapped in by hand to sort the S288C alignments.
sortSubmitLines = [
    "Universe                 = vanilla\n",
    "Executable               = runsort.sh\n",
    "Arguments                = $(sam) $(bam)\n",
    "Error                    = sort.submit.err\n",
    "Log                      = sort.submit.log\n",
    'Requirements             = OpSysandVer == "CentOS7"\n',
    # "Queue sam, bam from  sort-S288C_input.txt\n",
    "Queue sam, bam from sort-YPS1009_input.txt\n",
]
with open('sort.submit', 'w') as submit:
    submit.writelines(sortSubmitLines)

# Wrapper script that runs samtools sort: $1 is the input SAM and the sorted
# result is redirected to $2.
sortScriptLines = [
    "#!/bin/bash",
    "source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh",
    "unset PYTHONPATH",
    "conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq",
    "samtools sort $1  > $2",
    "conda deactivate",
]
with open('runsort.sh', 'w') as out:
    # no trailing newline after the last command, as before
    out.write("\n".join(sortScriptLines))

# make the wrapper executable so condor can run it
os.chmod('runsort.sh', 0o0777)

In [15]:
# create pileup output directories
# exist_ok=True lets this cell be re-run without raising FileExistsError.
pileupPath = parentDir + 'pileup/'
for genomeDir in ('YPS1009', 'S288C'):
    # makedirs also creates pileupPath itself when it is missing
    os.makedirs(pileupPath + genomeDir, exist_ok=True)

In [21]:
# Create samtools pileup input file for YPS1009
# Each line is "<sorted bam> <pileup output> <reference fasta>".
yps1009Ref = '/home/glbrc.org/mplace/data/reference/YPS1009/YPS1009.fasta'
tmpPath = pileupPath + 'YPS1009/'
with open('pileUp_YPS1009_input.txt', 'w') as out:
    for bam in glob.glob(parentDir + 'alignments/YPS1009/*.bam'):
        # escape the dot and anchor so only the file extension is replaced
        sampleName = re.sub(r'\.bam$', '-pileup.txt', os.path.basename(bam))
        out.write(bam + ' ' + tmpPath + sampleName + ' ' + yps1009Ref + '\n')
# the with-statement closes the file; the former bare "out.close" did nothing



<function TextIOWrapper.close()>

In [25]:
# Create pileup.sh input file for S288C
# Each line is "<sorted bam> <pileup output> <reference fasta>".
s288cRef = '/mnt/bigdata/linuxhome/mplace/data/reference/S288C_reference_genome_R64-1-1_20110203/S288C_reference_sequence_R64-1-1_20110203.fsa'
tmpPath = pileupPath + 'S288C/'
with open('pileUp_S288C_input.txt', 'w') as out:
    for bam in glob.glob(parentDir + 'alignments/S288C/*.bam'):
        # escape the dot and anchor so only the file extension is replaced
        sampleName = re.sub(r'\.bam$', '-pileup.txt', os.path.basename(bam))
        out.write(bam + ' ' + tmpPath + sampleName + ' ' + s288cRef + '\n')
# the with-statement closes the file; the former bare "out.close" did nothing

<function TextIOWrapper.close()>

In [None]:
# samtools mpileup

#samtools mpileup -f ~/data/reference/S288C_reference_genome_R64-1-1_20110203/S288C_reference_sequence_R64-1-1_20110203.fsa /
# ANEU-LOG-30_S2_R1_001-sorted.sam > ANEU-LOG-30_pileup.txt
# create input files for 

In [23]:
# Setup the samtools mpileup jobs: one HTCondor submit file per reference
# genome.  Both use the same wrapper script and differ only in the error/log
# file names and in the input list the jobs are queued from.
for submitName, tag, queueFile in (
    ('s288cPileUp.submit', 's288c', 'pileUp_S288C_input.txt'),        # S288C
    ('yps1009PileUp.submit', 'yps1009', 'pileUp_YPS1009_input.txt'),  # YPS1009
):
    with open(submitName, 'w') as submit:
        submit.writelines([
            "Universe                 = vanilla\n",
            "Executable               = runPileUp.sh\n",
            "Arguments                = $(bam) $(pileup) $(ref)\n",
            f"Error                    = {tag}_pileup.submit.err\n",
            f"Log                      = {tag}_pileup.submit.log\n",
            'Requirements             = OpSysandVer == "CentOS7"\n',
            f"Queue bam, pileup, ref from {queueFile}\n",
        ])

# Wrapper script that runs samtools mpileup: $1 is the sorted BAM, $2 the
# pileup output file and $3 the reference fasta.
pileupScriptLines = [
    "#!/bin/bash",
    "source /opt/bifxapps/miniconda3/etc/profile.d/conda.sh",
    "unset PYTHONPATH",
    "conda activate /home/glbrc.org/mplace/.conda/envs/riboSeq",
    "samtools mpileup -f $3 $1 > $2",
    "conda deactivate",
]
with open('runPileUp.sh', 'w') as out:
    # no trailing newline after the last command, as before
    out.write("\n".join(pileupScriptLines))

# make the wrapper executable so condor can run it
os.chmod('runPileUp.sh', 0o0777)

In [13]:
# S288C chromosome sizes
# Maps chromosome name -> length (int), read from a two-column chromsizes
# file.  The '2-micron' entry is left out, and when a name appears more than
# once the first length seen is kept.
S288C_chromSizes = {}
chromSizesFile = '/mnt/bigdata/linuxhome/mplace/data/reference/S288C_reference_genome_R64-1-1_20110203/individualChroms/S288C.chromsizes'
with open(chromSizesFile, 'r') as f:
    for ln in f:
        chrom, length = ln.rstrip().split()
        if chrom == '2-micron' or chrom in S288C_chromSizes:
            continue
        S288C_chromSizes[chrom] = int(length)

print(S288C_chromSizes)

{'ref|NC_001133|': 230218, 'ref|NC_001134|': 813184, 'ref|NC_001135|': 316620, 'ref|NC_001136|': 1531933, 'ref|NC_001137|': 576874, 'ref|NC_001138|': 270161, 'ref|NC_001139|': 1090940, 'ref|NC_001140|': 562643, 'ref|NC_001141|': 439888, 'ref|NC_001142|': 745751, 'ref|NC_001143|': 666816, 'ref|NC_001144|': 1078177, 'ref|NC_001145|': 924431, 'ref|NC_001146|': 784333, 'ref|NC_001147|': 1091291, 'ref|NC_001148|': 948066, 'ref|NC_001224|': 85779}


In [15]:
# Create count table for S288C
# Builds geneLookUp from the 'gene' rows of the GFF: for every chromosome
# listed in S288C_chromSizes, the start of each gene's padded window is
# mapped to the gene name.
gff = '/mnt/bigdata/linuxhome/mplace/data/reference/S288C_reference_genome_R64-1-1_20110203/saccharomyces_cerevisiae_R64-1-1_20110208_noFasta.gff'
geneLookUp = {}    # dictionary of dictionaries key = chrom : {key = padded window start, value = gene name }
gffDict = {}       # dictionary of dictionaries key = chrom : {key = {start : 0, end : 0, strand : '+'}

with open(gff, 'r') as g:
    for line in g:
        if line.startswith('#'):   # skip comment rows
            continue
        dat = line.rstrip('\n').split('\t')
        chrom = dat[0]
        if chrom not in S288C_chromSizes:
            continue
        chromGenes = geneLookUp.setdefault(chrom, {})
        # identify genes; rows with fewer than 9 columns cannot be parsed
        if len(dat) < 9 or dat[2] != 'gene':
            continue
        geneName = re.sub('ID=', '', dat[8].split(';')[0])
        geneStart, geneEnd = int(dat[3]), int(dat[4])

        # The window extends 72 nt before / 60 nt after the gene on the
        # positive strand, and 60 nt before / 72 nt after on the negative one.
        if dat[6] == '+':          # POSITIVE STRAND
            padBefore, padAfter = 72, 60
            dupMsg = 'Duplicate start '
        else:                      # NEGATIVE STRAND
            padBefore, padAfter = 60, 72
            dupMsg = 'Duplicate start minus strand '

        # Clamp the window to the chromosome.  The negative-strand branch
        # used to test "start - 60 < 60", which zeroed the window start for
        # genes beginning at positions 60..119; both strands now clamp at 0.
        start = max(0, geneStart - padBefore)
        end = min(S288C_chromSizes[chrom], geneEnd + padAfter)
        # NOTE(review): `end` is computed but not stored anywhere in this
        # cell and gffDict is never filled -- presumably unfinished; confirm.

        # Look for the start in this chromosome's dictionary.  The old check
        # tested the outer dictionary (keyed by chromosome name), so it never
        # reported a duplicate and a later gene silently replaced an earlier
        # one; now the first gene is kept and the clash is printed.
        if start not in chromGenes:
            chromGenes[start] = geneName
        else:
            print(dupMsg, geneName, chromGenes[start])

        

In [16]:
geneLookUp

{'ref|NC_001133|': {263: 'YAL069W',
  466: 'YAL068W-A',
  1747: 'YAL068C',
  2408: 'YAL067W-A',
  7175: 'YAL067C',
  10019: 'YAL066W',
  11505: 'YAL065C',
  11974: 'YAL064W-B',
  13303: 'YAL064C-A',
  21494: 'YAL064W',
  22335: 'YAL063C-A',
  23940: 'YAL063C',
  31495: 'YAL062W',
  33376: 'YAL061W',
  35083: 'YAL060W',
  36436: 'YAL059C-A',
  36437: 'YAL059W',
  37392: 'YAL058W',
  38636: 'YAL056C-A',
  39187: 'YAL056W',
  42105: 'YAL055W',
  42821: 'YAL054C',
  45827: 'YAL053W',
  48492: 'YAL051W',
  51795: 'YAL049C',
  52741: 'YAL048C',
  54512: 'YAL047W-A',
  54929: 'YAL047C',
  56969: 'YAL046C',
  57428: 'YAL045C',
  57446: 'YAL044W-A',
  57890: 'YAL044C',
  58635: 'YAL043C',
  61171: 'YAL042C-A',
  61244: 'YAL042W',
  62768: 'YAL041W',
  65718: 'YAL040C',
  68656: 'YAL039C',
  71714: 'YAL038W',
  72266: 'YAL037C-B',
  73366: 'YAL037C-A',
  73948: 'YAL037W',
  74983: 'YAL036C',
  76355: 'YAL035W',
  79429: 'YAL034C-B',
  79646: 'YAL034W-A',
  80650: 'YAL034C',
  82634: 'YAL033W',
 