In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
# Notebook-wide seaborn styling: scale fonts by 1.5 and use the 'white' style.
sbn.set(font_scale=1.5)
sbn.set_style('white')
import os

%matplotlib inline

# MIT Webserver Processing

Because the http://crispr.mit.edu/ website limits each submission to 250 bp, we uploaded 250 bp segments with 50 bp overlaps. The segments were submitted in batch mode, and the results were exported as a set of GenBank files in which the potential gRNAs are encoded as sequence features. These files are compressed in the `data/MITData.tar.gz` file.

In [2]:
import tarfile
from Bio import SeqIO
import re

# Captures the numeric guide score from a feature note such as "score: 81".
# Up to three digits are accepted so that a score of 100 is not truncated to
# 10, and a raw string is used so that "\d" is not an invalid string escape.
reg = re.compile(r'score: (\d{1,3})')


def process_feat(feat, record=None):
    """Extract the score, position, sequence, and strand from a Genbank feature.

    Parameters
    ----------
    feat
        A feature taken from a parsed Genbank record. Its first ``note``
        qualifier must contain the text matched by the module-level ``reg``
        pattern.
    record : optional
        The record the feature belongs to; the guide sequence is extracted
        from it. When omitted, the module-level ``seqR`` assigned by the
        loading loop is used, so existing ``process_feat(feat)`` calls keep
        working.

    Returns
    -------
    dict
        Keys ``Start``, ``Stop``, ``MITScore``, ``Seq`` and ``Strand``.
        ``Start`` and ``Stop`` are the feature's own coordinates within the
        record; no file offset is applied here.

    Raises
    ------
    ValueError
        If the note does not contain a score.
    """
    if record is None:
        # Backward-compatible fallback to the record currently being parsed.
        record = seqR

    note = feat.qualifiers['note'][0]
    m = reg.search(note)
    if m is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('No MIT score found in feature note: %r' % (note,))

    tdict = {'Start': int(feat.location.start),
             'Stop': int(feat.location.end),
             'MITScore': float(m.group(1)),
             'Seq': str(feat.extract(record).seq),
             'Strand': feat.location.strand}

    return tdict

# Collect one dict per candidate gRNA across every Genbank file in the archive.
locs = []
with tarfile.open('data/MITData.tar.gz', mode='r:gz') as tr:

    for f in tr:
        # Directory entries and other non-file members have no parsable
        # content and no offset-encoded name, so they are skipped.
        if not f.isfile():
            continue

        # The offset of the submission is encoded as the number before the _
        # For example: 3601_all_guides.gb starts at an offset of 3601
        # NOTE(review): the extra +1 presumably converts 0-based feature
        # coordinates to 1-based positions -- confirm.
        offset = int(f.name.split('/')[-1].split('_')[0])+1

        # Only the first record of each file is read. The builtin next() is
        # used because the iterator method .next() exists only in Python 2.
        # `seqR` must remain a module-level name: process_feat reads it.
        # NOTE(review): on Python 3 extractfile() returns a binary stream; if
        # Biopython rejects it, wrap it in io.TextIOWrapper -- confirm.
        seqR = next(SeqIO.parse(tr.extractfile(f), 'genbank'))

        for feat in seqR.features:
            # Shift the record-relative coordinates by the file offset before
            # storing the row, rather than mutating the list's last element.
            row = process_feat(feat)
            row['Start'] += offset
            row['Stop'] += offset
            locs.append(row)

df = pd.DataFrame(locs).sort_values(by='Start')
df.head()

Unnamed: 0,MITScore,Seq,Start,Stop,Strand
233,81.0,AAGGATATCTTGTCTTCGTTGGG,19,42,-1
234,81.0,CAAGGATATCTTGTCTTCGTTGG,20,43,-1
243,69.0,GACAAGATATCCTTGATCTGTGG,28,51,1
235,73.0,TGTGGTAGATCCACAGATCAAGG,38,61,-1
244,61.0,CTGTGGATCTACCACACACAAGG,45,68,1


In [3]:
# Load the per-gRNA summary table written by the earlier sequence-processing
# script; it is merged with the MIT scores further down.
summary_path = 'results/summary_res_all_gRNAs.xlsx'
known = pd.read_excel(summary_path)
known.head()

Unnamed: 0,Region,Citation,Name,Start,Stop,gRNA,ProtoMean,ProtoBind,ProtoCut,HasPam,ProtoPamMean,ProtoPamBind,ProtoPamCut,Entropy,NumSeqs
0,LTR,23974631,T5,465,484,GTTAGACCAGATCTGAGCCT.NGG,0.77627,0.696307,0.688018,0.963828,0.769499,0.691032,0.682743,2.615312,1327
1,LTR,23974631,T6,343,362,GCTACAAGGGACTTTCCGCT.NGG,0.904882,0.887794,0.854435,0.976497,0.891003,0.872631,0.843063,2.959877,1319
2,LTR,25049410,LTR-A,98,127,AGGGCCAGGGATCAGATATCCACTGACCTT.NGG,0.682982,0.628352,0.554278,0.947637,0.676152,0.623244,0.550447,5.249049,783
3,LTR,25049410,LTR-B,312,341,CCN.GAGTACTTCAAGAACTGCTGACATCGAGCT,0.431888,0.226766,0.214126,0.896654,0.422759,0.226022,0.214126,4.605056,1345
4,LTR,25049410,LTR-C,78,97,GATTGGCAGAACTACACACC.NGG,0.785196,0.754065,0.705285,0.98374,0.782252,0.752033,0.703252,2.660178,492


In [4]:

# The two datasets are merged and then the desired columns are extracted

cols = [u'Citation', u'Name', u'Start', 'Stop', u'gRNA', u'ProtoPamCut', u'Entropy', u'NumSeqs', 'Region']

# Publication-ready column names for the supplemental table.
rd = {'ProtoPamCut': 'Percent cleaved',
      'Entropy': 'Entropy (bits)',
      'NumSeqs': 'Number of Sequences Evaluated',
      'MITScore': 'MIT Score'}

# The submitted windows overlap, so a guide lying in an overlap can be reported
# by two Genbank files and show up twice in `df`. Identical (Start, MITScore)
# rows are collapsed first; otherwise the left merge below would repeat the
# matching known gRNA row in the output table.
mit_scores = df.drop(['Seq', 'Strand', 'Stop'], axis=1).drop_duplicates()

# Left merge keeps every known gRNA; those without a guide at the same Start
# get a missing MIT score.
mdf = pd.merge(known[cols],
               mit_scores,
               on='Start', how='left').rename(columns=rd)

# ProtoPamCut is stored as a fraction; report it as a percentage.
mdf['Percent cleaved'] *= 100
mdf.head()

Unnamed: 0,Citation,Name,Start,Stop,gRNA,Percent cleaved,Entropy (bits),Number of Sequences Evaluated,Region,MIT Score
0,23974631,T5,465,484,GTTAGACCAGATCTGAGCCT.NGG,68.274303,2.615312,1327,LTR,67.0
1,23974631,T6,343,362,GCTACAAGGGACTTTCCGCT.NGG,84.306293,2.959877,1319,LTR,
2,25049410,LTR-A,98,127,AGGGCCAGGGATCAGATATCCACTGACCTT.NGG,55.0447,5.249049,783,LTR,
3,25049410,LTR-B,312,341,CCN.GAGTACTTCAAGAACTGCTGACATCGAGCT,21.412639,4.605056,1345,LTR,
4,25049410,LTR-C,78,97,GATTGGCAGAACTACACACC.NGG,70.325203,2.660178,492,LTR,81.0


In [5]:
# Write the merged table to disk as the final Supplemental Table 1.
# The DataFrame index is omitted from the spreadsheet.
sup_table_path = 'results/SupTable1.xlsx'
mdf.to_excel(sup_table_path, index=False)