In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from bs4 import BeautifulSoup
import urllib2
import json
sbn.set(font_scale=1.5)
sbn.set_style('white')
import os

%matplotlib inline

In [2]:
# One tuple per downloaded export. The download loop below unpacks each as
# (start, _, key): `start` is formatted into the local file name, the second
# value is ignored, and `key` is interpolated into the crispr.mit.edu export URLs.
# NOTE(review): the first two values look like the first/last position of a
# 150-long window and `key` like a server-side job id -- confirm with the author.
check_data = [(9001, 9150, 5277118505611227),
              (9401, 9550, 6071990716277723),
              (3401, 3550, 6848268606800618),
              (2801, 2950, 3845265521163193),
              (3201, 3350, 9279028973269044),
              (201, 350, 1848027062245798),
              (1, 150, 3188469332317261),
              (401, 550, 7277914941094421),
              (1601, 1750, 9694467932424530),
              (2201, 2350, 527498301827653),
              (601, 750, 3149823577943946),
              (2001, 2150, 8417149389350654),
              (1401, 1550, 5615916155850639),
              (801, 950, 7896567212397158),
              (1001, 1150, 802154456686067),
              (1801, 1950, 551724045144731),
              (3801, 3950, 9493518067183864),
              (1201, 1350, 9411113253667286),
              (4601, 4750, 2087044263522217),
              (2601, 2750, 9445877752785914),
              (3601, 3750, 9346233873222692),
              (2401, 2550, 7687588622343901),
              (5201, 5350, 8983770186959720),
              (4001, 4150, 2246595015637350),
              (4201, 4350, 9840130137150152),
              (5001, 5150, 9064830231388592),
              (4401, 4550, 6459109703625097),
              (5401, 5550, 1348014272887902),
              (4801, 4950, 6105569958918344),
              (5601, 5750, 8916846076588580),
              (6201, 6350, 8558584311319150),
              (6001, 6150, 8156059463940812),
              (7601, 7750, 5527889073981775),
              (6801, 6950, 6959110445423825),
              (5801, 5950, 1077864001893774),
              (6401, 6550, 8402755201789021),
              (8201, 8350, 9725305118888168),
              (8801, 8950, 3341989723350018),
              (8601, 8750, 1045669196535672),
              (8001, 8150, 3692000092679442),
              (8401, 8550, 8894260965073495),
              (7801, 7950, 8627812289816805),
              (7001, 7150, 4162064775468252),
              (7401, 7550, 6537327837896284),
              (7201, 7350, 6981572308848824),
              (6601, 6750, 378348456399327),
              (9201, 9350, 3348001586555534),
              (3001, 3150, 5565717754015577)]

# (local file-name pattern, export URL pattern) pairs. The file-name pattern is
# filled with the tuple's first value, the URL pattern with its key.
urls = [('%4i_all_guides.gb', 'http://crispr.mit.edu/export/guides_gb/%i'),
        ('%4i_all_guides.csv', 'http://crispr.mit.edu/export/csv_all_guides/%i'),
       ]

# Download every export that is not already cached on disk.
for start, _, key in check_data:
    for fname, url in urls:
        path = os.path.join('../data/MITData/', fname % start)
        if os.path.exists(path):
            continue  # already downloaded on a previous run

        # Fetch the whole payload BEFORE touching the filesystem: creating the
        # file first would leave an empty file behind when the request fails,
        # and the exists() guard above would then never retry it.
        response = urllib2.urlopen(url % key)
        try:
            payload = response.read()
        finally:
            response.close()

        # Write under a temporary name and rename once complete, so an
        # interrupted write can never be mistaken for a finished download.
        # Binary mode keeps the bytes exactly as served.
        tmp_path = path + '.part'
        with open(tmp_path, 'wb') as handle:
            handle.write(payload)
        os.rename(tmp_path, path)


    


In [3]:
import glob
from Bio import SeqIO
import re

# All GenBank exports found in the download directory, in sorted path order.
files = sorted(glob.glob('../data/MITData/*.gb'))

# Captures the integer that follows "score: " in a feature note.
# `\d+` (instead of `\d{1,2}`) reads a three-digit score such as 100 in full
# rather than truncating it to 10; the raw string keeps `\d` from being treated
# as an (invalid) string escape.
reg = re.compile(r'score: (\d+)')

def process_feat(feat, record=None):
    """Turn one annotated feature into a flat dict describing the guide.

    Parameters
    ----------
    feat : feature object
        Must expose ``qualifiers`` (mapping whose 'note' entry is a list of
        strings), ``location`` (with ``start``, ``end`` and ``strand``) and
        ``extract(record)``.
        NOTE(review): presumably a Biopython SeqFeature -- confirm.
    record : optional
        The parent record handed to ``feat.extract``. When omitted, the
        module-level ``seqR`` is used, which is how existing callers work.

    Returns
    -------
    dict
        Keys 'Start', 'Stop', 'MITScore' (float), 'Seq' (str) and 'Strand'.
        'Start'/'Stop' are taken from ``feat.location`` unchanged.

    Raises
    ------
    ValueError
        If the feature has no 'note' qualifier or the note has no
        ``score: <int>`` part (previously an opaque AttributeError on None).
    """
    if record is None:
        # Backward-compatible fallback to the notebook-global current record.
        record = seqR

    note = feat.qualifiers.get('note', [''])[0]
    m = reg.search(note)
    if m is None:
        raise ValueError('no MIT score found in feature note: %r' % (note,))

    return {'Start': feat.location.start.real,
            'Stop': feat.location.end.real,
            'MITScore': float(m.group(1)),
            'Seq': str(feat.extract(record).seq),
            'Strand': feat.location.strand}

# One dict per guide feature, collected across every GenBank export.
locs = []

for f in files:
    # The leading number of the file name is the value the download step
    # formatted into it; basename() is used instead of splitting on '/' so the
    # parse does not depend on the platform's path separator.
    # NOTE(review): the extra +1 is kept from the original -- confirm it matches
    # the coordinate convention of the table these positions are merged with.
    offset = int(os.path.basename(f).split('_')[0]) + 1
    with open(f) as handle:
        # Only the first record of each file is used. Builtin next() replaces
        # the Python-2-only iterator.next() method.
        # `seqR` stays a module-level name because process_feat reads it.
        seqR = next(SeqIO.parse(handle, 'genbank'))

    for feat in seqR.features:
        guide = process_feat(feat)
        # Shift file-local coordinates by the file's offset.
        guide['Start'] += offset
        guide['Stop'] += offset
        locs.append(guide)


df = pd.DataFrame(locs).sort_values(by='Start')
df.head()

Unnamed: 0,MITScore,Seq,Start,Stop,Strand
0,81.0,AAGGATATCTTGTCTTCGTTGGG,19,42,-1
1,81.0,CAAGGATATCTTGTCTTCGTTGG,20,43,-1
10,69.0,GACAAGATATCCTTGATCTGTGG,28,51,1
2,73.0,TGTGGTAGATCCACAGATCAAGG,38,61,-1
11,61.0,CTGTGGATCTACCACACACAAGG,45,68,1


In [4]:
# Load the per-gRNA summary spreadsheet; it is merged with the guide scores
# on its 'Start' column further down.
known = pd.read_excel('../results/summary_res_all_gRNAs.xlsx')
# Preview the first rows as the cell output.
known.head()

Unnamed: 0,Type,Region,Citation,Name,Start,Stop,gRNA,ProtoMean,ProtoBind,ProtoCut,HasPam,ProtoPamMean,ProtoPamBind,ProtoPamCut,Entropy,NumSeqs
0,Activation,LTR,26607397,LTR-1,28,47,GACAAGATATCCTTGATCTG.NGG,0.744457,0.761658,0.634715,0.981865,0.742035,0.759067,0.634715,3.746268,386
1,Activation,LTR,26607397,LTR-2,78,97,GATTGACAGAACTACACACC.NGG,0.474226,0.703854,0.0,0.979716,0.472204,0.701826,0.0,2.660178,493
2,Activation,LTR,26607397,LTR-3,108,127,GTCAGATATCCACTGACCTT.NGG,0.683203,0.63171,0.553059,0.948814,0.676517,0.626717,0.549313,5.273464,801
3,Activation,LTR,26607397,LTR-4,343,362,GCTACAAGGGACTTTCCGCT.NGG,0.904882,0.887794,0.854435,0.976497,0.891003,0.872631,0.843063,2.959877,1319
4,Activation,LTR,26607397,LTR-5,379,398,GGCGTGGCCTGGGCGGGACT.NGG,0.805577,0.767235,0.680504,0.984433,0.801911,0.76427,0.679021,4.190446,1349


In [5]:
# Columns of `known` that are kept for the merged table, in output order.
cols = [
    u'Citation',
    u'Name',
    u'Start',
    'Stop',
    u'gRNA',
    u'ProtoPamCut',
    u'Entropy',
    u'NumSeqs',
    'Region',
]


In [6]:
# Internal column name -> heading used in the exported table.
rd = {
    'ProtoPamCut': 'Percent cleaved',
    'Entropy': 'Entropy (bits)',
    'NumSeqs': 'Number of Sequences Evaluated',
    'MITScore': 'MIT Score',
}

# Left-join the guide scores onto the selected `known` columns by 'Start'.
# Rows of `known` without a matching 'Start' in `df` keep an empty score.
# 'Seq', 'Strand' and 'Stop' are dropped from `df` before joining.
mdf = (known[cols]
       .merge(df.drop(['Seq', 'Strand', 'Stop'], axis=1), how='left', on='Start')
       .rename(columns=rd))

# Scale the renamed 'ProtoPamCut' values by 100.
mdf['Percent cleaved'] = mdf['Percent cleaved'] * 100
mdf.head()

Unnamed: 0,Citation,Name,Start,Stop,gRNA,Percent cleaved,Entropy (bits),Number of Sequences Evaluated,Region,MIT Score
0,26607397,LTR-1,28,47,GACAAGATATCCTTGATCTG.NGG,63.471503,3.746268,386,LTR,69.0
1,26607397,LTR-2,78,97,GATTGACAGAACTACACACC.NGG,0.0,2.660178,493,LTR,81.0
2,26607397,LTR-3,108,127,GTCAGATATCCACTGACCTT.NGG,54.931336,5.273464,801,LTR,65.0
3,26607397,LTR-4,343,362,GCTACAAGGGACTTTCCGCT.NGG,84.306293,2.959877,1319,LTR,
4,26607397,LTR-5,379,398,GGCGTGGCCTGGGCGGGACT.NGG,67.90215,4.190446,1349,LTR,


In [7]:
# Write the merged table to the supplementary-table spreadsheet, leaving out
# the DataFrame index so only the named columns appear.
mdf.to_excel('../papers/Sullivan-2017-NatBiotech/SupTable1.xlsx', index=False)