In [1]:
import sys
sys.path.append('../')
import crisprtree
from crisprtree import utils
from crisprtree import estimators
from crisprtree import annotators

In [2]:
from Bio import SeqIO
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from scipy.stats import hmean

In [3]:
with open('data/GCA_002855745.1_ASM285574v1_genomic.gbff') as handle:
    genome = list(SeqIO.parse(handle, 'genbank'))[0]

In [4]:
estimator = estimators.CFDEstimator.build_pipeline()

In [5]:
library_grnas = []

for feat in genome.features[:20]:
    if feat.type == 'CDS':
        product = feat.qualifiers['product'][0]
        tag = feat.qualifiers['locus_tag'][0]
        gene_record = feat.extract(genome)
        possible_targets = utils.extract_possible_targets(gene_record)
        
        results = {}
        for gRNA in possible_targets[:20]:

            #Refactor to use cas9-offinder!!!
            possible_binding = utils.tile_seqrecord(gRNA, genome[:1000])
            off_scores = estimator.predict_proba(possible_binding.values)
            results[gRNA] = hmean(off_scores[off_scores>0])
        results = pd.Series(results).sort_values()
        
        for protospacer, off_score in results.head().to_dict().items():
            library_grnas.append({'Product': product,
                                  'Tag': tag,
                                  'Protospacer': protospacer,
                                  'Off Target Score': off_score})
        
        
library_df = pd.DataFrame(library_grnas)

In [6]:
library_df

Unnamed: 0,Off Target Score,Product,Protospacer,Tag
0,9.441762e-09,hypothetical protein,AGGCATGTAGGACGCCGTCT,B7L53_00005
1,1.502881e-08,hypothetical protein,AGGAGGTGAAGAAGCTGCTC,B7L53_00005
2,1.137217e-08,hypothetical protein,AGCGAGGAGGCACACCACTA,B7L53_00005
3,1.694319e-08,hypothetical protein,AGGCCAGCTCTCCTTCCAGC,B7L53_00005
4,4.566358e-08,hypothetical protein,ACATGCCTTCTGCCCTCTGC,B7L53_00005
5,1.748891e-08,hypothetical protein,AGAAAGTGAGGAAACAGCAA,B7L53_00010
6,1.455874e-08,hypothetical protein,AATACGGCTTCATCTACTAC,B7L53_00010
7,3.002206e-08,hypothetical protein,ATTGCGATAGCAAAGGCACT,B7L53_00010
8,9.305839e-09,hypothetical protein,AGAAAACCCGCTCACCGCTC,B7L53_00010
9,2.197842e-08,hypothetical protein,AACGGTTTATAGATCAGCAC,B7L53_00010
