In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
# Plot appearance for the whole notebook: scale seaborn's fonts by 1.5x,
# then switch to the 'white' axes style.
sbn.set(font_scale=1.5)
sbn.set_style('white')
import os

%matplotlib inline

# MIT Webserver Processing

Due to the 250 bp input limit of the http://crispr.mit.edu/ website, we uploaded 250 bp segments with 50 bp overlaps. Using the batch-mode submission, the results were exported as a set of GenBank files in which the potential gRNAs are encoded as sequence features. These files are compressed in the `data/MITData.tar.gz` file.

In [4]:
import io
import re
import sys
import tarfile

from Bio import SeqIO

# Pattern that pulls the integer score out of a feature note such as
# "score: 81". A raw string is used so that `\d` is a regex digit class and
# not an (invalid) string escape, and `\d+` is used instead of `\d{1,2}` so a
# three-digit score (e.g. 100) is captured whole rather than truncated to 10.
reg = re.compile(r'score: (\d+)')


def process_feat(feat, record=None):
    """Extract the score, start, stop, sequence and strand from a Genbank feature.

    Parameters
    ----------
    feat
        A feature taken from a parsed Genbank record. Its first ``note``
        qualifier must contain text matching the module-level ``reg``
        pattern (``score: <digits>``).
    record : optional
        The parsed record that ``feat`` belongs to; the feature's sequence is
        extracted from it. When omitted, the module-level ``seqR`` (set by the
        loading loop) is used, which keeps existing ``process_feat(feat)``
        calls working.

    Returns
    -------
    dict
        Keys ``'Start'``, ``'Stop'``, ``'MITScore'`` (float), ``'Seq'`` (str)
        and ``'Strand'``. Start/Stop are the feature's own coordinates within
        its record; no submission offset is applied here.

    Raises
    ------
    KeyError
        If the feature has no ``note`` qualifier.
    ValueError
        If the note does not contain a score.
    """
    if record is None:
        # Backward-compatible fallback to the record currently being processed
        # by the loading loop.
        record = seqR

    note = feat.qualifiers['note'][0]
    m = reg.search(note)
    if m is None:
        # Fail with a message naming the offending note instead of an opaque
        # AttributeError on None.
        raise ValueError('No score found in feature note: %r' % (note,))

    tdict = {'Start': feat.location.start.real,
             'Stop': feat.location.end.real,
             'MITScore': float(m.group(1)),
             'Seq': str(feat.extract(record).seq),
             'Strand': feat.location.strand}

    return tdict

# One dict per candidate gRNA feature, accumulated over every submission file.
locs = []
with tarfile.open('data/MITData.tar.gz', mode='r:gz') as tr:

    for f in tr:
        # Archives may also list directory entries; those have no numeric
        # prefix and no file content, so only regular files are processed.
        if not f.isfile():
            continue

        # The offset of the submission is encoded as the number before the _
        # For example: 3601_all_guides.gb starts at an offset of 3601
        # NOTE(review): the extra +1 presumably aligns these coordinates with
        # the Start column of the known-gRNA table -- confirm.
        offset = int(f.name.split('/')[-1].split('_')[0])+1

        handle = tr.extractfile(f)
        if sys.version_info[0] >= 3:
            # extractfile() yields a binary stream on Python 3, while the
            # GenBank parser expects text.
            handle = io.TextIOWrapper(handle, encoding='utf-8')

        # Each file is read as a single record. The builtin next() works on
        # both Python 2 and 3, unlike the Python-2-only `.next()` method.
        # The name `seqR` is kept because process_feat reads it as a global.
        seqR = next(SeqIO.parse(handle, 'genbank'))

        for feat in seqR.features:
            row = process_feat(feat)
            # Shift from record-local coordinates to full-sequence coordinates
            # before storing the row.
            row['Start'] += offset
            row['Stop'] += offset
            locs.append(row)

df = pd.DataFrame(locs).sort_values(by='Start')
df.head()

Unnamed: 0,MITScore,Seq,Start,Stop,Strand
233,81.0,AAGGATATCTTGTCTTCGTTGGG,19,42,-1
234,81.0,CAAGGATATCTTGTCTTCGTTGG,20,43,-1
243,69.0,GACAAGATATCCTTGATCTGTGG,28,51,1
235,73.0,TGTGGTAGATCCACAGATCAAGG,38,61,-1
244,61.0,CTGTGGATCTACCACACACAAGG,45,68,1


In [7]:
# Load the per-gRNA summary table written by the earlier sequence-processing
# step and preview its first rows.
known = pd.read_excel(
    'results/summary_res_all_gRNAs.xlsx',
)
known.head(5)

Unnamed: 0,Region,Citation,Name,Start,Stop,gRNA,ProtoMean,ProtoBind,ProtoCut,HasPam,ProtoPamMean,ProtoPamBind,ProtoPamCut,Entropy,NumSeqs
0,LTR,23974631,T5,465,484,GTTAGACCAGATCTGAGCCT.NGG,0.746405,0.606061,0.606061,0.989899,0.736304,0.59596,0.59596,2.15669,99
1,LTR,25049410,LTR-A,98,127,AGGGCCAGGGATCAGATATCCACTGACCTT.NGG,0.71645,0.707071,0.626263,0.949495,0.714305,0.707071,0.626263,4.231717,99
2,LTR,25049410,LTR-B,312,341,CCN.GAGTACTTCAAGAACTGCTGACATCGAGCT,0.578449,0.381443,0.371134,1.0,0.578449,0.381443,0.371134,2.915684,97
3,LTR,25049410,LTR-C,78,97,GATTGGCAGAACTACACACC.NGG,0.846881,0.804878,0.756098,0.987805,0.846719,0.804878,0.756098,2.880366,82
4,LTR,25049410,LTR-D,380,399,GCGTGGCCTGGGCGGGACTG.NGG,0.818171,0.767677,0.757576,1.0,0.818171,0.767677,0.757576,2.506792,99


In [8]:

# Attach the MIT score to each known gRNA and keep only the reported columns.

# Columns carried over from the known-gRNA summary.
cols = [
    u'Citation', u'Name', u'Start', 'Stop', u'gRNA',
    u'ProtoPamCut', u'Entropy', u'NumSeqs', 'Region',
]

# Internal column name -> header used in the exported table.
rd = {
    'ProtoPamCut': 'Percent cleaved',
    'Entropy': 'Entropy (bits)',
    'NumSeqs': 'Number of Sequences Evaluated',
    'MITScore': 'MIT Score',
}

# Left join on Start: every known gRNA is kept, and those without an MIT guide
# beginning at the same position end up with a missing MIT score.
mdf = (
    known[cols]
    .merge(df.drop(['Seq', 'Strand', 'Stop'], axis=1), how='left', on='Start')
    .rename(columns=rd)
)

# Express the cleaved fraction as a percentage.
mdf['Percent cleaved'] = mdf['Percent cleaved'] * 100
mdf.head()

Unnamed: 0,Citation,Name,Start,Stop,gRNA,Percent cleaved,Entropy (bits),Number of Sequences Evaluated,Region,MIT Score
0,23974631,T5,465,484,GTTAGACCAGATCTGAGCCT.NGG,59.59596,2.15669,99,LTR,67.0
1,25049410,LTR-A,98,127,AGGGCCAGGGATCAGATATCCACTGACCTT.NGG,62.626263,4.231717,99,LTR,
2,25049410,LTR-B,312,341,CCN.GAGTACTTCAAGAACTGCTGACATCGAGCT,37.113402,2.915684,97,LTR,
3,25049410,LTR-C,78,97,GATTGGCAGAACTACACACC.NGG,75.609756,2.880366,82,LTR,81.0
4,25049410,LTR-D,380,399,GCGTGGCCTGGGCGGGACTG.NGG,75.757576,2.506792,99,LTR,


In [11]:
# Write the merged table to disk as Supplemental Table 1 (row index omitted).
mdf.to_excel(
    'results/SupTable1.xlsx',
    index=False,
)