In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
import scipy.stats
import os
import scipy.io as sio
import dnatools
from collections import Counter
%matplotlib inline
from pylab import *
# Plotting Params:
# `rc` is matplotlib's rc-settings function, pulled in by `from pylab import *`.
# This sets the 'mathtext' group's 'default' option to 'regular'.
rc('mathtext', default='regular')
# Font-size constant for figures; it is not referenced by any cell visible in this part of the notebook.
fsize=14

  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))


In [2]:
# Output folders used by this notebook; each one is created if it does not exist yet.
resultsdir = '../results/N0_A3SS_Fastq_to_Splice_Reads/'
figdir = '../figures/N0_A3SS_Fastq_to_Splice_Reads/'
for outdir in (resultsdir, figdir):
    if not os.path.exists(outdir):
        os.makedirs(outdir)

# Switch controlling whether plots are actually written to disk:
SAVEFIGS = True

In [3]:
# Reference pattern for the intron read. The DNA-read cell compares only the
# middle slice alt3SS_seq[25:-25] with read positions 25..48; the flanking '.'
# characters are never compared (presumably they stand for the randomized
# positions - TODO confirm).
alt3SS_seq = '.........................GCTTGGATCTGATCTCAACAGGGT.........................'
# Reference pattern for the barcode read. Only the first 10 characters
# (alt3SS_tag[:10]) are used: they are compared with positions 10..19 of the
# reverse-complemented R2 read.
alt3SS_tag = 'CATTACCTGC.........................'

### Creating a map between 3'UTR barcodes and randomized intronic sequences

Create a dictionary of dictionaries. Top level keys are the 3'UTR barcode sequences. For each barcode there is a counter dictionary, which counts the occurrences of each intron sequence with that barcode. These sequences are only counted if the non-randomized regions match exactly to the plasmid sequence (no mismatches or deletions). One sequence count is added from the forward read and another sequence count is added from the reverse read.

In [4]:
f = {}
f[0] = open('../fastq/A3SS_dna_R1.fq','r')
f[1] = open('../fastq/A3SS_dna_R2.fq','r')
tags = Counter()
c = 0
p = 0
header = {}
seq = {}
strand = {}
quality ={}
tag_seqs = {}
d = 0
while True:
    for i in range(2):
        header[i] = f[i].readline()[:-1]
        seq[i] = f[i].readline()[:-1]
        strand[i] = f[i].readline()[:-1]
        quality[i] = f[i].readline()[:-1]

    cur_tag = dnatools.reverse_complement(seq[1])
    if(len(header[0])==0):
        break
        
    # Check passing reads and that the sequence after the random tag matches
    # the plasmid sequence.
    if (int(header[0].split('#')[0][-1])==1) &\
       (int(header[1].split('#')[0][-1])==1) &\
       (cur_tag[10:20]==alt3SS_tag[:10]):
        p += 1
        #Check that the non-randomized sequences match perfectly to the reference
        if(seq[0][25:25+24]==alt3SS_seq[25:-25]):
            d+=1
            try:
                tag_seqs[cur_tag]
            except:
                tag_seqs[cur_tag] = Counter()
            tag_seqs[cur_tag][seq[0]]+=1

    if(c%1000000)==0:
        print c,p,'|',
    c+=1
    
for i in range(2):
    f[i].close()

0 0 | 1000000 795826 | 2000000 1600187 | 3000000 2398971 | 4000000 3198607 | 5000000 3994250 | 6000000 4786859 | 7000000 5578120 | 8000000 6357719 | 9000000 7129306 | 10000000 7919234 | 11000000 8718215 | 12000000 9513516 | 13000000 10307821 | 14000000 11092251 | 15000000 11863855 | 16000000 12637931 | 17000000 13397479 | 18000000 14145314 |


For each tag, I find the intron sequence that occurred the most times with that tag. I will only keep tag-sequence pairs that occurred at least twice.

In [5]:
ks = tag_seqs.keys()
tag_map = {}
tag_map_counts = {}
c = 0
for k in ks:
    max_seq = max(tag_seqs[k]) # Get seq
    max_seq_counts = tag_seqs[k][max_seq]
    if(max_seq_counts>=2):
        tag_map[k] = max_seq
        tag_map_counts[k] = max_seq_counts
    if(c%100000)==0:
        print c,
    c+=1
seq_series = pd.Series(tag_map)
seq_counts = pd.Series(tag_map_counts)

0 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000


Right now, I've kept 30nt of the barcode, even though only 20nt of that is randomized. Let's trim this to 20nt:

In [6]:
# Re-key the mapping on the last 20 characters of each barcode. Barcodes that
# become identical after trimming collapse into a single dict entry.
trimmed_tags = [full_tag[-20:] for full_tag in seq_series.index]
seq_series = pd.Series(dict(zip(trimmed_tags, seq_series.values)))

Save the barcode-sequence mapping

In [7]:
# Label the index ('Tag') and the values ('Seq'), then write the
# barcode -> sequence table to CSV with a header row.
seq_series.index.name = 'Tag'
seq_series.name = 'Seq'
seq_series.to_csv('../data/A3SS_Seqs.csv', header=True, index_label='Tag')

### Count the spliced reads

Make a dictionary to specify the row of each barcode in the read count matrix

In [8]:
# Despite its name, this maps each barcode to its row number (0..n-1, in
# seq_series order) in the read-count matrix.
row_numbers = np.arange(len(seq_series))
tag2seq_dict = dict(zip(seq_series.index, row_numbers))

Here is the sequence to which we will map reads. The first nucleotide corresponds to the first nucleotide (5') in the intron. I have included sequence in the second exon, in case downstream splice acceptors are used.

In [9]:
# Reference sequence that the RNA-read cell searches with str.find() to locate
# where the tail of each read lands. The literal mixes lower-case and
# upper-case segments and contains runs of 'N' (presumably the randomized
# positions - TODO confirm).
alt3SS_full_seq = 'gtaagttatcaccttcgtggctacagagtttccttatttgtctctgttgccggcttatatggacaagcatatcacagccatttatcggagcgcctccgtacacgctattatcggacgcctcgcgagatcaatacgtataccagctgccctcgatacatgtcttggacggggtcggtgttgatatcgtatNNNNNNNNNNNNNNNNNNNNNNNNNGCTTGGATCTGATCTCAACAGGGTNNNNNNNNNNNNNNNNNNNNNNNNNatgattacacatatagacacgcgagcacccatcttttatagaatgggtagaacccgtcctaaggactcagattgagcatcgtttgcttctcgagtactacctggtacagatgtctcttcaaacaggacggcagcgtgcagctcgccgaccactaccagcagaacacccccatcggcgacggccccgtgctgctgcccgacaaccactacctgagctaccagtccgccctgagcaaagaccccaacgagaagcgcgatcacatggtcctgctggagttcgtgaccgccgccgggatcactctcggcatggacgagctgtacaaggactgatagtaaggcccattacctgcNNNNNNNNNNNNNNNNNNNNGCAGAACACAGCGGTTCGACCTGCGTGATATCTCGTATGCCGTCTTCTGCTTG'
# Normalize everything to upper case so the lower-case segments compare equal
# to upper-case read text (NOTE(review): assumes reads are upper-case - confirm).
alt3SS_full_seq = alt3SS_full_seq.upper()

To map the position of the spliced exon-exon junction, I map the last 20 nt of the read. For example, if the read was spliced at the last SA (most 3'), then the 100-120 nt of the read will map 100-120 nt into the second exon. If the read was spliced 20nt 5' of the last SA, the 100-120 nt of the read will map 80-100 nt into the second exon. If there is no splicing, the 100-120 nt of the read will map within 100-120 nt into the intron. However, requiring an exact match for these 20 nt is very stringent and we may lose reads. So if there is no match in the 100-120 nt of the read, I then check for a match in the 80-100 nt of the read, and then finally the 60-80 nt of the read.

In [10]:
c = 0
header = {}
seq = {}
strand = {}
quality ={}

tag_list = []
ss_list = []

f = {}
f[0] = open('../fastq/A3SS_rna_R1.fq','r')
f[1] = open('../fastq/A3SS_rna_R2.fq','r')

while True:
    for i in range(2):
        header[i] = f[i].readline()[:-1]
        seq[i] = f[i].readline()[:-1]
        strand[i] = f[i].readline()[:-1]
        quality[i] = f[i].readline()[:-1]
    if(len(header[i])==0):
        break
        #min_qual[i] = min(quality[i])
    tag = dnatools.reverse_complement(seq[1][:20])

    try:
        tag_ind = tag2seq_dict[tag]
    except:
        pass
    else:
        # Check if the end of the read 100-120 matches the second exon
        # of citrine. In case of mismatches, I check for matches to 3
        # different 20nt regions.
        s_start = alt3SS_full_seq.find(seq[0][100:120])-100
        if(s_start<-100):
            s_start = alt3SS_full_seq.find(seq[0][80:100])-80
            if(s_start<-80):
                s_start = alt3SS_full_seq.find(seq[0][60:80])-60
        if(s_start>=0):
            tag_list.append(tag_ind)
            ss_list.append(s_start)
    if(c%1000000)==0:
        print c,
    c+=1

for i in range(2):
    f[i].close()

0 1000000 2000000 3000000 4000000 5000000 6000000 7000000 8000000 9000000 10000000 11000000 12000000 13000000 14000000 15000000 16000000


Make the sparse matrix and save:

In [13]:
# One entry of weight 1 per mapped read at (barcode row, splice position);
# csr_matrix sums repeated coordinates, so each cell ends up as a read count.
# A single zero-valued sentinel at (last barcode row, column 565) is appended
# so the inferred shape has a row for every barcode and at least 566 columns.
read_weights = list(np.ones_like(ss_list)) + [0]
row_idx = tag_list + [len(seq_series) - 1]
col_idx = ss_list + [565]
a3ss_counts = scipy.sparse.csr_matrix((read_weights, (row_idx, col_idx)),
                                      dtype=np.float64)
splices = {'A3SS': a3ss_counts}

In [14]:
# Write the read-count matrix to a MATLAB .mat file under the variable name 'A3SS'.
sio.savemat(file_name='../data/A3SS_Reads.mat', mdict=splices)