In [5]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sbn
from Bio import SeqIO

import SearchTools

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the seaborn defaults with fonts scaled 1.5x, then switch to the
# 'white' axes style.
sbn.set(font_scale=1.5)
sbn.set_style('white')


# Data Import

Sequence data is read in from the Los Alamos Database that is stored in the included `.tar.gz` file. These alignments were produced by the LANL tools and are arranged such that the HXB2 sequence is the last in the file. The alignment information is used to determine the HXB2 positions of each sequence.

In [6]:
import tarfile
from collections import Counter
from copy import deepcopy
from itertools import compress
from collections import deque

def get_start_stop(seqR, hxb2_pos):
    """Return the HXB2 coordinates of the first and last residue of a record.

    Parameters
    ----------
    seqR : object with a ``seq`` attribute
        An aligned (gapped) record; ``str(seqR.seq)`` is the aligned string
        in which ``'-'`` marks a gap column.
    hxb2_pos : sequence of int
        HXB2 coordinate of every alignment column, in column order.

    Returns
    -------
    (start, stop) : tuple of int
        Coordinates of the first and of the last non-gap column. They are
        equal when the record holds a single residue.

    Raises
    ------
    ValueError
        If the record consists of gap characters only.
    """

    # Materialise the coordinates of the occupied columns once. This does not
    # rely on ``zip`` returning a re-iterable list, nor on the Python-2-only
    # ``iterator.next()`` method.
    occupied = [pos for letter, pos in zip(str(seqR.seq), hxb2_pos)
                if letter != '-']
    if not occupied:
        raise ValueError('sequence contains only gap characters')
    return occupied[0], occupied[-1]


seq_locs = []

with tarfile.open('data/LANLdata.tar.gz', mode='r:gz') as tr:
    for f in tr:

        # Directory entries and links in the archive hold no alignment data;
        # extractfile() would not return a readable handle for them.
        if not f.isfile():
            continue

        # Member names look like `hiv-<start>-<stop>.fixed.fst`; <start> is
        # the HXB2 coordinate given to the first HXB2 base of the alignment.
        parts = f.name.split('/')[-1].split('.')[0].split('-')
        offset = int(parts[1])
        # Single pre-formatted argument: prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print('Importing %s' % f.name)

        seqs = list(SeqIO.parse(tr.extractfile(f), 'fasta'))
        if not seqs:
            # Nothing to anchor against in an empty alignment.
            continue

        # The HXB2 reference is the last record of every alignment.
        hxb2 = seqs[-1]

        # HXB2 coordinate of every alignment column. A column where HXB2 has
        # a gap repeats the coordinate of the preceding HXB2 base.
        hxb2_pos = []
        pos = offset-1
        for hx_l in str(hxb2.seq):
            if hx_l != '-':
                pos += 1
            hxb2_pos.append(pos)

        for seqR in seqs[:-1]:
            # Keep only records with more than 50 real (non-gap) residues.
            # Counting gaps avoids building an ungapped copy of each record.
            aligned = str(seqR.seq)
            if len(aligned) - aligned.count('-') > 50:
                start, stop = get_start_stop(seqR, hxb2_pos)
                seq_locs.append({'sStart': start,
                                 'sStop': stop,
                                 'SeqR': deepcopy(seqR),
                                 'Name': seqR.id})


seq_df = pd.DataFrame(seq_locs)
print('Total sequences imported %i' % len(seq_locs))
seq_df.head()



Importing hiv-2500-3000.fixed.fst
Importing hiv-1100-1950.fixed.fst
Importing hiv-8300-8900.fixed.fst
Importing hiv-7100-7500.fixed.fst
Importing hiv-5200-5600.fixed.fst
Importing hiv-1-700.fixed.fst
Importing hiv-8500-8800.fixed.fst
Importing hiv-6200-6900.fixed.fst
Importing hiv-700-1150.fixed.fst
Importing hiv-2000-2500.fixed.fst
Importing hiv-5700-6100.fixed.fst
Importing hiv-9086-9717.fixed.fst
Importing hiv-4100-4900.fixed.fst
Importing hiv-3000-3500.fixed.fst
Importing hiv-7500-7900.fixed.fst
Importing hiv-3200-3500.fixed.fst
Total sequences imported 390290


Unnamed: 0,Name,SeqR,sStart,sStop
0,B.US.1997.ARES2.AB078005,"(A, C, A, T, A, A, T, T, G, -, -, -, -, -, G, ...",2500,3000
1,B.US.1985.Ba_L.AB221005,"(A, C, A, T, A, A, T, T, G, -, -, -, -, -, G, ...",2500,3000
2,B.US.1985.Ba_L.AB253432,"(A, C, A, T, A, A, T, T, G, -, -, -, -, -, G, ...",2500,3000
3,B.US.1991.US2.AB485638,"(A, C, A, T, A, A, T, T, G, -, -, -, -, -, G, ...",2500,3000
4,B.US.1991.US2.AB485639,"(A, C, A, T, A, A, T, T, G, -, -, -, -, -, G, ...",2500,3000


In [7]:
# Read the curated gRNA list, then order the rows by target start position.
ex_df = pd.read_excel('data/gRNAList.xlsx')
ex_df = ex_df.sort_values(by='Start')
ex_df.head()

Unnamed: 0,Citation,Name,Start,Stop,gRNA,Region
142,26581162,sg45F,27,47,CGACAAGAGATCCTTGATCTG.NGG,LTR
0,26607397,LTR-1,28,47,GACAAGATATCCTTGATCTG.NGG,LTR
19,27341108,gRNA 1,28,47,GACAAGATATCCTTGATCTG.NGG,LTR
7,26775808,sgRNA 1,28,47,GACAAGATATCCTTGATCTG.NGG,LTR
72,25808449,T1,28,47,GACAAGATATCCTTGATCTG.NGG,LTR


# Sequence Processing

Now that the gRNAs and sequences have been loaded, we'll compare them. Each gRNA will be parsed to extract the protospacer region. For each gRNA we extract all sequences that overlap the target region. Then the MIT penalty matrix is applied exhaustively across the entire sequence to determine the ideal binding location.

In [8]:
from Bio.Seq import reverse_complement


In [9]:
def parse_grna(gRNA):
    """Extract the protospacer of a gRNA and normalize it to 20 nt.

    The gRNA is written as two dot-separated fields. When the second field is
    three characters long it is taken to be the PAM (``<protospacer>.NGG``)
    and the guide is 'Forward'. Otherwise the first field is the PAM
    (``CCN.<protospacer>``) and the guide is 'Reverse'.

    Only the 20 nt next to the PAM are kept. A shorter protospacer is padded
    with ``N`` on its PAM-distal end, so that the real bases keep their
    distance to the PAM: on the left for forward guides, on the right for
    reverse guides.

    Parameters
    ----------
    gRNA : str
        Guide description such as ``'GACAAGATATCCTTGATCTG.NGG'``.

    Returns
    -------
    (direc, proto) : tuple of str
        ``direc`` is ``'Forward'`` or ``'Reverse'``; ``proto`` is the 20-nt
        protospacer as written on the strand given in ``gRNA``.
    """

    proto_len = 20
    parts = gRNA.split('.')

    if len(parts[1]) == 3:
        # <protospacer>.<PAM>: the PAM-proximal bases are the rightmost ones.
        direc = 'Forward'
        proto = parts[0][-proto_len:].rjust(proto_len, 'N')
    else:
        # <PAM>.<protospacer>: the PAM-proximal bases are the leftmost ones,
        # so padding must go on the right to keep them next to the PAM.
        direc = 'Reverse'
        proto = parts[1][:proto_len].ljust(proto_len, 'N')

    return direc, proto


def check_whole_seq(seq, gRNA):
    """Find the best-scoring protospacer match of a gRNA on a sequence.

    Every 20-mer of the ungapped sequence is scored against the protospacer
    with ``SearchTools.gRNA_score_hit`` and the highest-scoring window is
    reported together with the 3 nt that follow it (the PAM position).

    For 'Reverse' guides (``CCN.<protospacer>``) both the sequence and the
    query are reversed (not complemented) before scanning, so that the PAM
    side sits at the end of every window just as for a forward guide. The
    reported site is flipped back to the original orientation.

    Parameters
    ----------
    seq : Bio.Seq.Seq
        Aligned sequence; gap characters (``'-'``) are removed before scoring.
    gRNA : str
        Guide description understood by ``parse_grna``.

    Returns
    -------
    (oseq, bscore, direc, has_pam) : tuple
        ``oseq``    - best site as a string: protospacer plus PAM, at most
                      23 nt, shorter if the site touches the sequence end.
        ``bscore``  - score of that site.
        ``direc``   - ``'Forward'`` or ``'Reverse'``.
        ``has_pam`` - True when the two PAM bases next to the N are ``GG``
                      (forward) or ``CC`` (reverse).

    Raises
    ------
    ValueError
        If the ungapped sequence is shorter than one protospacer.
    """

    proto_len, pam_len = 20, 3
    direc, query = parse_grna(gRNA)

    # Normalize the orientation and ungap the sequence
    nseq = seq.ungap('-')
    if direc == 'Reverse':
        nseq = nseq[::-1]
        query = query[::-1]

    # Check every 20-mer in the sequence, including the very last one.
    n_windows = len(nseq) - proto_len + 1
    if n_windows < 1:
        raise ValueError('sequence is shorter than the %i-nt protospacer'
                         % proto_len)
    scores = pd.Series([SearchTools.gRNA_score_hit(query, nseq[start:start+proto_len])
                        for start in range(n_windows)])

    # Find the best score
    bscore, bloc = scores.max(), scores.idxmax()

    # Prep outputs. The PAM is tested at its fixed offset in the normalized
    # orientation, so a site cut short by the end of the sequence can never
    # be reported as carrying a PAM.
    site = str(nseq[bloc:bloc+proto_len+pam_len])
    pam_pair = 'CC' if direc == 'Reverse' else 'GG'
    has_pam = site[proto_len+1:] == pam_pair

    # Report the site in the orientation of the input sequence.
    oseq = site[::-1] if direc == 'Reverse' else site

    return oseq, bscore, direc, has_pam
    

In [14]:
num = 0
scores = []

# Run through each gRNA
for _, row in ex_df.iterrows():

    # Extract all sequences that overlap the target region with more than
    # 20 nt of flank on either side.
    mask = (seq_df['sStart'] < (row['Start']-20)) & (seq_df['sStop'] > (row['Stop']+20))
    # At most the first 1000 matching sequences are scored per gRNA.
    # `.loc` with a boolean mask selects the same rows as the removed `.ix`.
    for _, seq_row in seq_df.loc[mask].head(1000).iterrows():
        num += 1
        if num % 50000 == 0:
            # Progress marker; a single pre-formatted argument prints the
            # same text under Python 2 and Python 3.
            print('%i %s %s' % (num, row['Name'], row['Start']))

        # Check this sequence for the presence of this gRNA
        hit, score, direc, has_pam = check_whole_seq(seq_row['SeqR'].seq, row['gRNA'])

        # Extract some information
        scores.append({'Hit': str(hit),
                       'Score': score,
                       'gRNA': row['gRNA'],
                       'Citation': row['Citation'],
                       'Direc': direc,
                       'SeqID': seq_row['Name'],
                       'gStart': row['Start'],
                       'gStop': row['Stop'],
                       'HasPam': has_pam,
                       'Name': row['Name'],
                       'Region': row['Region']})

score_df = pd.DataFrame(scores)
# Score with a missing PAM counted as zero
score_df['ProtoPam'] = score_df['Score']*(score_df['HasPam'].astype(float))
# Flag hits whose site contains an ambiguous base
score_df['HasN'] = score_df['Hit'].str.contains('N')
score_df.head()

50000 LTR-M 312
100000 gRNA-TAR3 485
150000 gEnv2 7794


Unnamed: 0,Citation,Direc,HasPam,Hit,Name,Region,Score,SeqID,gRNA,gStart,gStop,ProtoPam,HasN
0,26581162,Forward,True,GAAAAGAGATCCTTGATCTGTGG,sg45F,LTR,0.986,A0026-R06-PBMC-Genomic-LTR,CGACAAGAGATCCTTGATCTG.NGG,27,47,0.986,False
1,26581162,Forward,True,GACAAGACATCCTTGATCTGTGG,sg45F,LTR,1.0,A0044-R06-PBMC-Genomic-LTR,CGACAAGAGATCCTTGATCTG.NGG,27,47,1.0,False
2,26581162,Forward,True,GACAAGACATCCTTGATNNGTGG,sg45F,LTR,0.06174,A0044-R07-PBMC-Genomic-LTR,CGACAAGAGATCCTTGATCTG.NGG,27,47,0.06174,True
3,26581162,Forward,True,GACAAGATATCCTTGATNTGTGG,sg45F,LTR,0.196,A0068-R00-PBMC-Genomic-LTR,CGACAAGAGATCCTTGATCTG.NGG,27,47,0.196,True
4,26581162,Forward,True,GGCAAGAGATCCTTGACCTGTGG,sg45F,LTR,0.385,A0068-R02-PBMC-Genomic-LTR,CGACAAGAGATCCTTGATCTG.NGG,27,47,0.385,False


In [16]:
from scipy.stats import entropy

def calc_entropy(ser):
    """Return the Shannon entropy, in bits, of the values in a Series.

    The relative frequency of each distinct value is used as its
    probability.
    """
    freqs = ser.value_counts() / len(ser)
    return entropy(freqs, base=2)


# Entropy of the set of best-hit strings observed for each gRNA

entropies = score_df['Hit'].groupby(score_df['gRNA']).agg(calc_entropy)

In [19]:


sum_data = []
# Aggregate statistics for every (gRNA, Citation) pair, leaving out hits
# that contain an ambiguous base
unambiguous = score_df.query('HasN == False')
for (grna, cit), rows in unambiguous.groupby(['gRNA', 'Citation']):

    # Keep a single record per source sequence
    rows = rows.groupby('SeqID', as_index=False).first()

    mit = rows['Score']         # protospacer score on its own
    mit_pam = rows['ProtoPam']  # same score, zero where the PAM is missing

    # Collect aggregate data
    sum_data.append({
        'gRNA': grna,
        'Citation': cit,
        'NumSeqs': len(rows),
        'HasPam': rows['HasPam'].mean(),           # fraction of hits with an adjacent PAM
        'ProtoMean': mit.mean(),                   # average score
        'ProtoBind': (mit > 0.5).mean(),           # fraction of sequences scoring > 0.5
        'ProtoCut': (mit > 0.75).mean(),           # fraction of sequences scoring > 0.75
        'ProtoPamMean': mit_pam.mean(),            # as above, missing PAM counts as zero
        'ProtoPamBind': (mit_pam > 0.5).mean(),
        'ProtoPamCut': (mit_pam > 0.75).mean(),
        'Start': rows['gStart'].iloc[0],
        'Stop': rows['gStop'].iloc[0],
        'Name': rows['Name'].iloc[0],
        'Entropy': entropies[grna],
        'Region': rows['Region'].iloc[0],
    })

# Column layout of the summary table
order = ['Region', u'Citation', u'Name', u'Start', u'Stop', u'gRNA',
         u'ProtoMean', 'ProtoBind', u'ProtoCut',
         u'HasPam', u'ProtoPamMean', 'ProtoPamBind', u'ProtoPamCut',
         'Entropy', u'NumSeqs']
sum_df = (pd.DataFrame(sum_data)[order]
          .sort_values(by=['Citation', 'Name'])
          .reset_index(drop=True))
sum_df.head()

Unnamed: 0,Region,Citation,Name,Start,Stop,gRNA,ProtoMean,ProtoBind,ProtoCut,HasPam,ProtoPamMean,ProtoPamBind,ProtoPamCut,Entropy,NumSeqs
0,LTR,23974631,T5,465,484,GTTAGACCAGATCTGAGCCT.NGG,0.798512,0.710499,0.705403,0.979613,0.793878,0.707441,0.702345,2.418876,981
1,LTR,25049410,LTR-A,98,127,AGGGCCAGGGATCAGATATCCACTGACCTT.NGG,0.682982,0.628352,0.554278,0.947637,0.676152,0.623244,0.550447,5.249049,783
2,LTR,25049410,LTR-B,312,341,CCN.GAGTACTTCAAGAACTGCTGACATCGAGCT,0.474444,0.285566,0.272066,0.933541,0.468332,0.285566,0.272066,4.29888,963
3,LTR,25049410,LTR-C,78,97,GATTGGCAGAACTACACACC.NGG,0.785196,0.754065,0.705285,0.98374,0.782252,0.752033,0.703252,2.660178,492
4,LTR,25049410,LTR-D,380,399,GCGTGGCCTGGGCGGGACTG.NGG,0.756672,0.75,0.602459,0.989754,0.752059,0.745902,0.598361,4.051778,976


# Results

In [16]:
# Output raw results to Excel sheet for later processing

# Create the output folder on first use so the export does not fail merely
# because `results/` is missing.
if not os.path.isdir('results'):
    os.makedirs('results')
sum_df.to_excel('results/summary_res_all_gRNAs.xlsx', index=False)

In [22]:
from collections import defaultdict

# Extract the nucleotide frequencies for the LTR-2 gRNA

# Upper-cased sites of every unambiguous LTR-2 hit
ltr2_hits = score_df.query('Name == "LTR-2" & HasN == False')['Hit'].str.upper().values

# counts[(position, letter)] -> number of hits carrying `letter` at `position`
counts = defaultdict(float)
tot = len(ltr2_hits)
for n in ltr2_hits:
    for p, l in enumerate(n):
        counts[(p, l)] += 1

# One row per site position (20-nt protospacer + 3-nt PAM = 23), one column
# per base; positions a hit does not cover simply contribute no count.
order = 'ACGT'
d = [[counts[(site_pos, base)] for base in order] for site_pos in range(23)]

# Bases as rows, 1-based site positions as columns, values as fractions of hits
res = pd.DataFrame(d, columns = list(order), index=range(1, 24)).T/tot

# Create the output folder on first use so the export does not fail merely
# because `results/` is missing.
if not os.path.isdir('results'):
    os.makedirs('results')
res.to_excel('results/LTR-2.freqs.xlsx')
res.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,14,15,16,17,18,19,20,21,22,23
A,0.006085,0.979716,0.004057,0.002028,0.016227,0.010142,0.01217,0.971602,0.020284,0.947262,...,1.0,0.0,0.995943,0.004057,0.88641,0.056795,0.068966,0.98783,0.008114,0.002028
C,0.01217,0.002028,0.022312,0.006085,0.004057,0.0,0.975659,0.006085,0.016227,0.004057,...,0.0,0.993915,0.004057,0.993915,0.099391,0.935091,0.920892,0.0,0.002028,0.004057
G,0.979716,0.004057,0.01217,0.006085,0.977688,0.98783,0.008114,0.006085,0.955375,0.004057,...,0.0,0.002028,0.0,0.002028,0.010142,0.002028,0.0,0.010142,0.985801,0.98783
T,0.002028,0.014199,0.96146,0.985801,0.002028,0.002028,0.004057,0.016227,0.008114,0.042596,...,0.0,0.004057,0.0,0.0,0.004057,0.004057,0.010142,0.002028,0.004057,0.006085
