In [1]:
import sys
sys.path.append('../')
import crisprtree
from crisprtree import utils
from crisprtree import estimators
from crisprtree import annotators

In [2]:
from Bio import SeqIO
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn

In [3]:
# Load the first record of the genome assembly. The parser is lazy, so taking
# next() reads only the first record instead of materialising every record in
# the file just to discard all but one.
with open('data/GCA_002855745.1_ASM285574v1_genomic.gbff') as handle:
    genome = next(SeqIO.parse(handle, 'genbank'))

In [4]:
# Load the plasmid (first record of the GenBank file) and pull out the
# feature whose 'product' qualifier starts with 'enhanced GFP'.
with open('data/addgene-plasmid-54622-sequence-158642.gbk') as handle:
    plasmid = next(SeqIO.parse(handle, 'genbank'))

for rec in plasmid.features:
    # Features without a 'product' qualifier fall back to '' and never match.
    if rec.qualifiers.get('product', [''])[0].startswith('enhanced GFP'):
        print('Found')
        egfp_feature = rec
        break
else:
    # Fail loudly: without this, egfp_feature would be undefined on a fresh
    # kernel, or silently stale from a previous run of this cell.
    raise ValueError("No feature with a product starting with 'enhanced GFP' "
                     "was found in the plasmid record")

# Sub-record covering only the eGFP feature; used by the cells below.
egfp_record = egfp_feature.extract(plasmid)

Found


In [5]:
# Collect the candidate gRNA targets found in the extracted eGFP record.
# TODO: refactor extract_possible_targets to exclude the PAM; the current
# behaviour is an old strategy.
# NOTE(review): it is not visible from this notebook whether the PAM is part of
# the returned targets or only used for matching -- confirm before refactoring.
possible_targets = utils.extract_possible_targets(egfp_record)
# Show how many candidates were found.
len(possible_targets)

117

In [6]:
# Build the CFDEstimator pipeline. Its predict_proba is used by the off-target
# scoring loop, and the same object is passed to annotate_grna_binding later.
# NOTE(review): presumably CFD = Cutting Frequency Determination -- confirm.
estimator = estimators.CFDEstimator.build_pipeline()

In [7]:
# Number of leading genome bases scanned for off-target binding. This is a
# small window, not the whole genome, so the scores below are only a partial
# picture of off-target risk.
OFF_TARGET_SCAN_BP = 1000

# Slice once: the window does not depend on the gRNA, so there is no need to
# rebuild the sliced record on every loop iteration.
genome_window = genome[:OFF_TARGET_SCAN_BP]

# Map each candidate gRNA to the highest score the estimator assigns to any
# tiled position in the window (its worst-case off-target score).
worst_off_target = {}
for gRNA in possible_targets:
    # TODO: refactor to use Cas-OFFinder for the off-target search.
    possible_binding = utils.tile_seqrecord(gRNA, genome_window)
    worst_off_score = max(estimator.predict_proba(possible_binding.values))
    worst_off_target[gRNA] = worst_off_score

# Ascending sort: the gRNAs with the lowest worst-case score come first.
results = pd.Series(worst_off_target).sort_values()
results.head()


CCGTCCAGCTCGACCAGGAT    0.000616
CGGTGGTGCAGATGAACTTC    0.001904
GGGCGAGGAGCTGTTCACCG    0.002075
ATGGCCGACAAGCAGAAGAA    0.002223
CTGAAGTTCATCTGCACCAC    0.002320
dtype: float64

In [9]:
# Annotate the five gRNAs with the lowest worst-case off-target score onto the
# eGFP record, storing that score as an extra qualifier.
# Iterating the Series directly keeps the sorted order; a round-trip through a
# dict is unnecessary and loses the order where dicts are unordered.
# NOTE: the annotations appear to be added to egfp_record in place (the next
# cell reads them back from egfp_record.features), so re-running this cell
# without re-creating egfp_record will presumably duplicate them.
for key, val in results.head().items():
    annotators.annotate_grna_binding(key, egfp_record,
                                     estimator,
                                     extra_qualifiers={'Off Target Score': val},
                                     exhaustive=True)

In [10]:
# Report every feature on the eGFP record that carries a 'gRNA' qualifier:
# start position, strand, gRNA sequence and its stored off-target score.
grna_features = [feat for feat in egfp_record.features
                 if 'gRNA' in feat.qualifiers]

for feat in grna_features:
    loc = feat.location
    quals = feat.qualifiers
    print(loc.start, loc.strand, quals['gRNA'], quals['Off Target Score'])

459 1 ATGGCCGACAAGCAGAAGAA 0.00222330264208
11 1 GGGCGAGGAGCTGTTCACCG 0.00207506825033
39 -1 CCGTCCAGCTCGACCAGGAT 0.0006162427814
132 1 CTGAAGTTCATCTGCACCAC 0.0023197807762
131 -1 CGGTGGTGCAGATGAACTTC 0.00190428133879
