In [1]:
import pysam
import collections
import random
import pandas as pd
import numpy as np
%load_ext rpy2.ipython


  rpy2.rinterface.initr()

  rpy2.rinterface.initr()

  rpy2.rinterface.initr()

  rpy2.rinterface.initr()

  rpy2.rinterface.initr()

  rpy2.rinterface.initr()

  rpy2.rinterface.initr()

  rpy2.rinterface.initr()


In [2]:
# BAM from mapping.dir for Nxf1-GFP replicate 1; parsed below as the undeduplicated sample.
# NOTE(review): absolute cluster path -- the notebook only runs where /ifs is mounted.
infile = "/ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/Nxf1-GFP-R1.bam"

This notebook contains test code to examine whether indel mutations in UMIs have a substantial effect. To test this, we first recognise that a one base pair deletion in the UMI will suck a base of the actual genomic sequence into the UMI, shifting the mapping position of the read by one base, and including a genome-determined base as the final base of the UMI.

To test this we will parse the BAM file and fill a dictionary: key="[contig][position]", value = counter of UMIs at that position. Then we'll go through the dictionary keys and check whether position +1 also exists in the dictionary. If it does, we'll compare the UMI profiles between the two positions.

First we will filter for UMIs that have the genomic base in the final UMI position. Then we will check each possible one bp deletion in the reference UMI and see if that UMI exists at the +1 position. We will calculate what % of the UMIs at the reference position exist as deletion versions at +1 and what % of reads at +1 are deletion versions relative to the reference base.

This will then be compared to a random sample

A function to calculate the fraction of the UMIs at the reference position that are part of deletion pairs (i.e. have a one-base-deleted counterpart at the +1 position), weighted by the frequency of each UMI.

In [20]:
def getDelFraction(counter1, counter2, genomic_base):
    '''Return the count-weighted fraction of counter1 that has a deletion
    partner in counter2.

    A UMI in counter1 is "found" when removing any single base from it and
    appending genomic_base gives a UMI that is a key of counter2.

    counter1     -- mapping of UMI string -> count (reference position)
    counter2     -- mapping of UMI string -> count (compared position)
    genomic_base -- base that must terminate the partner UMI in counter2

    Returns the summed counts of the found counter1 UMIs divided by the
    total count in counter1, as a float. Returns 0.0 when counter1 has no
    counts, rather than dividing by zero.
    '''

    total = sum(counter1.values())
    if not total:
        return 0.0

    # counter2 UMIs that end in the genomic base, with that base stripped.
    # umi[-1:] rather than umi[-1] so an empty UMI string cannot raise.
    candidates = set(umi[:-1] for umi in counter2
                     if umi[-1:] == genomic_base)

    matched = 0
    for umi, count in counter1.items():
        # try every single-base deletion of this UMI
        if any(umi[:i] + umi[i + 1:] in candidates
               for i in range(len(umi))):
            matched += count

    return float(matched) / total

Now a function to parse a BAM file and create the dictionary.

Because whether we care about +1 or -1, and whether we need to reverse-complement the sequence, depends on the strand, we will deal with plus and minus strands separately.

In [4]:
def parse_samfile(infile, paired=False):
    '''Parse a BAM file and tally UMIs by read start position.

    Only mapped, forward-strand reads that are not read2 are used. The UMI
    is the text after the last "_" in the read name. If the first CIGAR
    operation is a soft clip (op code 4), the clipped length is subtracted
    from the reported position.

    infile -- path of the BAM file, opened with pysam in "rb" mode
    paired -- when true, also skip reads whose mate is unmapped. This was
              previously read from an undefined global name.

    Returns a tuple (umi_pos, umi_dist, genomic_bases):
      umi_pos[contig][position]       -> Counter of UMI -> number of reads
      umi_dist                        -> Counter of UMI -> number of reads
                                         over all reads used
      genomic_bases[contig][position] -> first base of a read at that
                                         position whose first CIGAR
                                         operation is a match (op code 0);
                                         "" if no such read was seen
    '''
    umi_pos = collections.defaultdict(
        lambda: collections.defaultdict(collections.Counter))
    umi_dist = collections.Counter()
    genomic_bases = collections.defaultdict(
        lambda: collections.defaultdict(str))

    insam = pysam.Samfile(infile, "rb")
    try:
        for read in insam.fetch():
            if read.is_unmapped or read.is_read2:
                continue

            if paired and read.mate_is_unmapped:
                continue

            # minus-strand reads are not analysed here
            if read.is_reverse:
                continue

            first_op, first_len = read.cigar[0]

            pos = read.pos
            if first_op == 4:
                # shift the start back over the soft-clipped bases
                pos -= first_len

            # The spliced / soft-clip flag computed here previously was
            # never used and depended on an undefined threshold, so it is
            # omitted.

            umi = read.qname.split("_")[-1]
            chrom = insam.get_reference_name(read.tid)
            umi_pos[chrom][pos][umi] += 1
            umi_dist[umi] += 1

            if first_op == 0:
                genomic_bases[chrom][pos] = read.query_sequence[0]
    finally:
        # release the file handle even if parsing fails part-way
        insam.close()

    return umi_pos, umi_dist, genomic_bases

Start with an undeduped sample. 

In [46]:
# Parse the undeduplicated BAM: per-position UMI counters, file-wide UMI counts, per-position first bases.
undeduped_umi_pos, undeduped_umi_dist, undeduped_bases = parse_samfile(infile)

In [48]:
# Sanity check: report any chr1 position key whose type is not exactly int.
for i, position in enumerate(undeduped_umi_pos["chr1"]):
    if type(position) is not int:
        # one pre-formatted string prints identically on Python 2 and 3
        print("%s %s" % (i, position))
        


Now we need to go through each pair of adjacent positions and calculate the fraction of the UMIs at position +1 that could be explained as deletions of UMIs at the reference position. Then randomise the UMIs at the +1 position and do the same.

In [6]:
def randomise_position(umi_counter, umi_dist):
    '''Build a randomised stand-in for the UMI counts at one position.

    As many distinct UMIs as there are keys in umi_counter are drawn
    without replacement from umi_dist["index"], using umi_dist["freq"] as
    the sampling probabilities. The drawn UMIs are paired, in order, with
    the counts in umi_counter.

    Returns a plain dict of sampled UMI -> count.
    '''

    n_umis = len(umi_counter.keys())
    sampled_umis = np.random.choice(umi_dist["index"],
                                    size=n_umis,
                                    replace=False,
                                    p=umi_dist["freq"])

    return dict(zip(sampled_umis, umi_counter.values()))

In [52]:
def calculate_deletion_rate(umi_pos, umi_dist, genomic_bases):
    '''Compute deletion fractions for every pair of adjacent positions.

    For each contig and each position whose +1 neighbour is also present
    in umi_pos, getDelFraction is evaluated twice: once against the UMIs
    actually seen at +1, and once against a randomised replacement drawn
    by randomise_position. Positions with no recorded genomic base (empty
    string) are skipped.

    Returns a DataFrame with columns "deletion_rate" and "random"
    (False for the observed rows, True for the randomised rows); the
    observed rows come first.
    '''

    # table of UMIs with "index" (the UMI), "count" and "freq" columns
    umi_freqs = pd.Series(umi_dist, name="count").reset_index()
    umi_freqs["freq"] = umi_freqs["count"]/umi_freqs["count"].sum()

    observed_rates = []
    randomised_rates = []
    for contig in umi_pos:
        positions = umi_pos[contig]
        for position in positions:
            neighbour = position + 1
            if neighbour not in positions:
                continue

            genomic_base = genomic_bases[contig][position]
            if genomic_base == "":
                continue

            reference_umis = positions[position]
            neighbour_umis = positions[neighbour]
            shuffled_umis = randomise_position(neighbour_umis, umi_freqs)

            observed_rates.append(
                getDelFraction(reference_umis, neighbour_umis, genomic_base))
            randomised_rates.append(
                getDelFraction(reference_umis, shuffled_umis, genomic_base))

    observed_frame = pd.DataFrame({"deletion_rate": observed_rates})
    observed_frame["random"] = False

    randomised_frame = pd.DataFrame({"deletion_rate": randomised_rates})
    randomised_frame["random"] = True

    return pd.concat([observed_frame, randomised_frame])



In [53]:
# Observed and randomised deletion fractions for every adjacent-position pair in the undeduplicated sample.
undeduped_deletion_rates = calculate_deletion_rate(undeduped_umi_pos, undeduped_umi_dist, undeduped_bases)

Having calculated the deletion rates, let's look at the distribution for both the randomised and non-randomised data.

In [60]:
# Summary statistics of deletion_rate, split into observed (random=False) and randomised (random=True) rows.
undeduped_deletion_rates.groupby("random").describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,deletion_rate
random,Unnamed: 1_level_1,Unnamed: 2_level_1
False,count,58160.0
False,mean,0.073596
False,std,0.19324
False,min,0.0
False,25%,0.0
False,50%,0.0
False,75%,0.0
False,max,1.0
True,count,58160.0
True,mean,0.06288


There is an enrichment compared to the randomisations. Small but clearly present. The overlap is zero in the vast majority of cases. The important measure is the UMIs as a fraction of the +1 position - these are UMIs that are not real. The means are different, but the overlap is zero in the vast majority of cases. In how many cases is it non-zero?

In [56]:
# Count the values > 0 in each group. The lambda receives the whole group frame, so in the
# output shown the boolean "random" column is counted too; only "deletion_rate" is of interest.
undeduped_deletion_rates.groupby("random").apply(lambda x: (x>0).sum())

Unnamed: 0_level_0,deletion_rate,random
random,Unnamed: 1_level_1,Unnamed: 2_level_1
False,13893,0
True,12846,58160


It's a very small difference at this level: roughly 1,000 positions (13,893 vs 12,846 in the table above). The real question is whether or not this enrichment is still present after deduplication.

Compare to the deduped files:

In [61]:
# Same analysis on the BAM for the same sample from dedup_directional-adjacency.dir.
deduped_umi_pos, deduped_umi_dist, deduped_bases = parse_samfile(
    "/ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/dedup_directional-adjacency.dir/Nxf1-GFP-R1.bam")
deduped_deletion_rates = calculate_deletion_rate(deduped_umi_pos, deduped_umi_dist, deduped_bases)

In [62]:
# Mean deletion_rate for observed (random=False) vs randomised (random=True) rows after deduplication.
deduped_deletion_rates.groupby("random").mean()

Unnamed: 0_level_0,deletion_rate
random,Unnamed: 1_level_1
False,0.053554
True,0.043328


In [63]:
# Count the values > 0 in each group. The lambda receives the whole group frame, so in the
# output shown the boolean "random" column is counted too; only "deletion_rate" is of interest.
deduped_deletion_rates.groupby("random").apply(lambda x: (x>0).sum())

Unnamed: 0_level_0,deletion_rate,random
random,Unnamed: 1_level_1,Unnamed: 2_level_1
False,13107,0
True,11878,58160


The enrichment is smaller, but not by much. The biggest difference is that there is a smaller number of overlapping positions in the randomised positions.

How much are the enrichments:

In [64]:
# Fold enrichment of the observed mean deletion rate over the randomised mean.
undeduped_means = undeduped_deletion_rates.groupby("random").mean()
# The group index holds the booleans False/True; address them by their real
# labels instead of relying on 0/1 comparing equal to False/True.
undeduped_means.loc[False]/undeduped_means.loc[True]

deletion_rate    1.170417
dtype: float64

In [65]:
# Fold enrichment of the observed mean deletion rate over the randomised mean.
deduped_means = deduped_deletion_rates.groupby("random").mean()
# The group index holds the booleans False/True; address them by their real
# labels instead of relying on 0/1 comparing equal to False/True.
deduped_means.loc[False]/deduped_means.loc[True]

deletion_rate    1.236016
dtype: float64

Run this computation for all of the samples in replicate 1. First wrap in function:

In [74]:
def run_complete_analysis(infile):
    '''Run the whole deletion-rate analysis on one BAM file.

    Prints a progress line, parses the file with parse_samfile, computes
    observed and randomised deletion fractions with
    calculate_deletion_rate, and returns the mean of each group: a
    DataFrame indexed by "random" (False = observed, True = randomised).
    '''
    # One pre-formatted string, so the same text (two spaces after
    # "analysing") is printed whether print is a statement or a function.
    print("analysing  %s" % (infile,))
    umi_pos, umi_dist, bases = parse_samfile(infile)
    deletion_rates = calculate_deletion_rate(umi_pos, umi_dist, bases)
    return deletion_rates.groupby("random").mean()


Now find the input files: the undeduplicated replicate 1 BAM files in mapping.dir.

In [73]:
# NOTE(review): glob and os are imported here rather than in the first cell with the other imports.
import glob
import os
# One path per file in mapping.dir whose name ends in "R1.bam", labelled by its base name.
infiles = pd.Series(glob.glob("/ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/*R1.bam"))
infiles.index = infiles.apply(os.path.basename)
infiles

SRSF5-GFP-R1.bam      /ifs/projects/ians/umisdeduping/iCLIP_deduping...
SRSF7-GFP-R1.bam      /ifs/projects/ians/umisdeduping/iCLIP_deduping...
Control-GFP-R1.bam    /ifs/projects/ians/umisdeduping/iCLIP_deduping...
SRSF3-GFP-R1.bam      /ifs/projects/ians/umisdeduping/iCLIP_deduping...
SRSF6-GFP-R1.bam      /ifs/projects/ians/umisdeduping/iCLIP_deduping...
SRSF4-GFP-R1.bam      /ifs/projects/ians/umisdeduping/iCLIP_deduping...
Nxf1-GFP-R1.bam       /ifs/projects/ians/umisdeduping/iCLIP_deduping...
SRSF1-GFP-R1.bam      /ifs/projects/ians/umisdeduping/iCLIP_deduping...
SRSF2-GFP-R1.bam      /ifs/projects/ians/umisdeduping/iCLIP_deduping...
dtype: object

In [79]:
# Run the full analysis on every file; each call parses a whole BAM.
results = infiles.apply(run_complete_analysis)

analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/SRSF5-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/SRSF7-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/Control-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/SRSF3-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/SRSF6-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/SRSF4-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/Nxf1-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/SRSF1-GFP-R1.bam
analysing  /ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/mapping.dir/SRSF2-GFP-R1.bam


In [102]:
# Per-sample fold enrichment: observed mean deletion rate over the randomised mean.
results[False]/results[True]

Control-GFP-R1.bam    2.604803
Nxf1-GFP-R1.bam       1.164694
SRSF1-GFP-R1.bam      1.174697
SRSF2-GFP-R1.bam      1.726313
SRSF3-GFP-R1.bam      1.337105
SRSF4-GFP-R1.bam      1.204101
SRSF5-GFP-R1.bam      1.188362
SRSF6-GFP-R1.bam      1.083955
SRSF7-GFP-R1.bam      1.879418
dtype: float64

In [97]:
# Per-sample mean deletion rates: observed (False) and randomised (True).
results

random,False,True
Control-GFP-R1.bam,0.064308,0.024688
Nxf1-GFP-R1.bam,0.073596,0.063189
SRSF1-GFP-R1.bam,0.030228,0.025733
SRSF2-GFP-R1.bam,0.083943,0.048625
SRSF3-GFP-R1.bam,0.029753,0.022252
SRSF4-GFP-R1.bam,0.061406,0.050998
SRSF5-GFP-R1.bam,0.051041,0.042951
SRSF6-GFP-R1.bam,0.020549,0.018958
SRSF7-GFP-R1.bam,0.090106,0.047944


How do these enrichments compare to the enrichments seen in the edit-distance analysis?

In [76]:
# Load the tab-separated edit-distance table and index it by its "edit_distance" column.
# NOTE(review): this file is Nxf1-GFP-R2, whereas the deletion analysis above used
# Nxf1-GFP-R1 -- confirm this is the intended sample.
edit_distance = pd.read_csv(
    "/ifs/projects/ians/umisdeduping/iCLIP_deduping/SR_iCLIP_test3/dedup_directional-adjacency.dir/Nxf1-GFP-R2_edit_distance.tsv", sep="\t")
edit_distance = edit_distance.set_index("edit_distance")
edit_distance


Unnamed: 0_level_0,directional-adjacency,directional-adjacency_null,unique,unique_null
edit_distance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Single_UMI,676164,676164,661329,661329
0,0,31,0,37
1,137,388,14791,592
2,3312,2311,3540,3511
3,7697,7980,8181,11627
4,43520,43954,43709,50401
5,8595,8597,7875,11928
6,0,0,0,0


In [77]:
# Drop the "Single_UMI" row, then divide each column by its own total.
edit_distance_fractions = edit_distance.drop("Single_UMI", axis=0).apply(lambda x: x/sum(x))

In [78]:
# Ratio of the "unique" fraction to the "unique_null" fraction at each edit distance.
edit_distance_fractions["unique"]/edit_distance_fractions["unique_null"]

edit_distance
0     0.000000
1    24.984797
2     1.008260
3     0.703621
4     0.867225
5     0.660211
6          NaN
dtype: float64

So the substitution errors are a 25x enrichment, rather than a 1.3 fold enrichment for the deletions. 