In [1]:
from Bio import SeqIO
import csv
import sys
sys.path.append('../../code')
import visualize
from crispr import CasTemplate
from pipeline import NDNFPipeline

In [2]:
# The number of protospacers to extract from the nomination set.
# Passed as `top_n` to every NDNFPipeline.nominate call in this notebook,
# so both experiments below use the same cut-off.
TOP_N_KMERS = 20_000

In [3]:
# Cas templates evaluated by both experiments below, in reporting order.
# Every template uses a protospacer length of 20; they differ in PAM motif,
# type and name.
CAS_LIST = [
    CasTemplate(pam_motif='NGG', typ='II',
                protospacer_length=20, name='spCas9'),
    CasTemplate(pam_motif='NGGRRT', typ='II',
                protospacer_length=20, name='saCas9'),
    CasTemplate(pam_motif='TTTV', typ='V',
                protospacer_length=20, name='LbCas12a_Cpf1'),
]

In [4]:
# Five numbered FASTA files per role. The position of a path in its list is
# what the experiment loops below record as `seed`.
NOM_FASTAS = [
    f'../../data/nominate.{i}.fasta'
    for i in range(5)
]
VAL_FASTAS = [
    f'../../data/test.{i}.fasta'
    for i in range(5)
]

In [5]:
def get_seqs(path):
    """Return all records of the FASTA file at `path` as a list.

    The records are materialised while the file is still open, so the
    returned list stays usable after the handle has been closed.
    """
    with open(path) as fasta_file:
        records = SeqIO.parse(fasta_file, 'fasta')
        return list(records)

In [6]:
def stream_nomination_results(nom_seqs, cas, **extra_args):
    """Yield one tidy record per protospacer counted during nomination.

    Calls ``NDNFPipeline.nominate`` for ``cas`` on ``nom_seqs`` with
    ``top_n=TOP_N_KMERS`` and ``return_counts=True``, then yields a dict with
    the keys ``'protospacer'`` and ``'nomination_count'`` for every entry of
    the returned count mapping. ``extra_args`` are merged into each record
    and take precedence if a key clashes.

    This is a generator: nomination only runs once the first record is
    requested.
    """
    # Only the counts are streamed; the first return value is not needed here.
    _, counts_by_proto = NDNFPipeline.nominate(
        cas, nom_seqs, top_n=TOP_N_KMERS, return_counts=True)

    for protospacer, n_nominated in counts_by_proto.items():
        yield {'protospacer': protospacer,
               'nomination_count': n_nominated,
               **extra_args}

In [7]:
# Nomination stability experiment: for every nomination FASTA (its index is
# recorded as `seed`) and every Cas template, stream the per-protospacer
# nomination counts into a single tidy CSV.
# newline='' leaves line endings to the csv module, as its documentation
# requires for file objects handed to a writer.
with open('nomination_stability_exp.csv', 'w', newline='') as handle:
    writer = csv.DictWriter(handle, ['name', 'protospacer', 'nomination_count', 
                                     'seed'])
    writer.writeheader()
    for seed, nom_path in enumerate(NOM_FASTAS):
        # Parse each FASTA once and reuse the records for all Cas templates.
        nom_seqs = get_seqs(nom_path)
    
        # Iterate the templates defined in CAS_LIST; the previous name,
        # `CASNDNFPipelineIST`, is defined nowhere and raised NameError on a
        # fresh kernel.
        for cas in CAS_LIST:
            print(f'Seed: {seed}, Cas: {cas.name}')
            writer.writerows(stream_nomination_results(nom_seqs, cas, seed=seed, name=cas.name))

Seed: 0, Cas: spCas9


100%|██████████| 3228/3228 [00:36<00:00, 89.48it/s]


Seed: 0, Cas: saCas9


100%|██████████| 3228/3228 [00:11<00:00, 292.48it/s]


Seed: 0, Cas: LbCas12a_Cpf1


100%|██████████| 3228/3228 [00:14<00:00, 215.43it/s]


Seed: 1, Cas: spCas9


100%|██████████| 3228/3228 [00:35<00:00, 90.02it/s]


Seed: 1, Cas: saCas9


100%|██████████| 3228/3228 [00:11<00:00, 293.10it/s]


Seed: 1, Cas: LbCas12a_Cpf1


100%|██████████| 3228/3228 [00:14<00:00, 216.41it/s]


Seed: 2, Cas: spCas9


100%|██████████| 3228/3228 [00:35<00:00, 89.85it/s]


Seed: 2, Cas: saCas9


100%|██████████| 3228/3228 [00:11<00:00, 292.06it/s]


Seed: 2, Cas: LbCas12a_Cpf1


100%|██████████| 3228/3228 [00:15<00:00, 215.16it/s]


Seed: 3, Cas: spCas9


100%|██████████| 3228/3228 [00:37<00:00, 86.68it/s]


Seed: 3, Cas: saCas9


100%|██████████| 3228/3228 [00:11<00:00, 288.10it/s]


Seed: 3, Cas: LbCas12a_Cpf1


100%|██████████| 3228/3228 [00:15<00:00, 212.20it/s]


Seed: 4, Cas: spCas9


100%|██████████| 3228/3228 [00:35<00:00, 89.82it/s]


Seed: 4, Cas: saCas9


100%|██████████| 3228/3228 [00:11<00:00, 276.21it/s]


Seed: 4, Cas: LbCas12a_Cpf1


100%|██████████| 3228/3228 [00:15<00:00, 214.23it/s]


In [11]:
def stream_validation_results(ndnf_pipe, val_path, **extra_args):
    """Yield one tidy record per protospacer scored against `val_path`.

    Calls ``ndnf_pipe.narrow(val_path, min_rate=0, mismatch=2,
    return_freqs=True)`` and yields a dict with the keys ``'protospacer'``
    and ``'validation_hit_rate'`` for every entry of the returned frequency
    mapping. ``extra_args`` are merged into each record and take precedence
    if a key clashes.

    This is a generator: ``narrow`` only runs once the first record is
    requested.
    """
    # Only the per-protospacer frequencies are used; the first return value
    # of narrow() is discarded.
    _, hit_rate_by_proto = ndnf_pipe.narrow(
        val_path, min_rate=0, mismatch=2, return_freqs=True)

    for protospacer, hit_rate in hit_rate_by_proto.items():
        yield {'protospacer': protospacer,
               'validation_hit_rate': hit_rate,
               **extra_args}

In [12]:
# Validation stability experiment: nominate protospacers once per Cas
# template from the LANL reference sequences, then score that same pipeline
# against every validation FASTA (its index is recorded as `seed`).
LANL_seqs = get_seqs('../../data/LANL.reference.fasta')

# newline='' leaves line endings to the csv module, as its documentation
# requires for file objects handed to a writer; without it rows end in
# '\r\r\n' on platforms that translate '\n' on write.
with open('validation_stability_exp.csv', 'w', newline='') as handle:
    writer = csv.DictWriter(handle, ['name', 'protospacer', 
                                     'validation_hit_rate', 'seed'])
    writer.writeheader()
    
    for cas in CAS_LIST:
        
        # Nomination does not depend on the validation file, so it is done
        # once per template and reused for all validation files.
        pipe = NDNFPipeline.nominate(cas, LANL_seqs, 
                                     top_n=TOP_N_KMERS)
        
        for seed, val_path in enumerate(VAL_FASTAS):
            print(f'Seed: {seed}, Cas: {cas.name}')
            writer.writerows(stream_validation_results(pipe, val_path, seed=seed, name=cas.name))

100%|██████████| 4725/4725 [00:55<00:00, 84.95it/s]


Seed: 0, Cas: spCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.0.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
50.1351 seconds elapsed.


Seed: 1, Cas: spCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.1.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
49.0359 seconds elapsed.


Seed: 2, Cas: spCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.2.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
49.2266 seconds elapsed.


Seed: 3, Cas: spCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.3.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
49.1282 seconds elapsed.


Seed: 4, Cas: spCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.4.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
60.7684 seconds elapsed.
100%|██████████| 4725/4725 [00:16<00:00, 293.02it/s]
Total 1 device(s) found.
Loading input file...


Seed: 0, Cas: saCas9


Reading ../../data/test.0.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
11.5205 seconds elapsed.


Seed: 1, Cas: saCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.1.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
8.73849 seconds elapsed.


Seed: 2, Cas: saCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.2.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
9.19179 seconds elapsed.


Seed: 3, Cas: saCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.3.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
11.7795 seconds elapsed.


Seed: 4, Cas: saCas9


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.4.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
13.5793 seconds elapsed.
100%|██████████| 4725/4725 [00:22<00:00, 213.91it/s]


Seed: 0, Cas: LbCas12a_Cpf1


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.0.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
24.1858 seconds elapsed.


Seed: 1, Cas: LbCas12a_Cpf1


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.1.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
24.4583 seconds elapsed.


Seed: 2, Cas: LbCas12a_Cpf1


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.2.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
24.423 seconds elapsed.


Seed: 3, Cas: LbCas12a_Cpf1


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.3.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
23.5345 seconds elapsed.


Seed: 4, Cas: LbCas12a_Cpf1


Total 1 device(s) found.
Loading input file...
Reading ../../data/test.4.fasta...
Sending data to devices...
Chunk load started.
1 devices selected to analyze...
Finding pattern in chunk #1...
Comparing patterns in chunk #1...
22.2578 seconds elapsed.
