# Verify final inserts

Notebook to work out the details of verifying the contents of the final inserts before working this into a script and adding it to the pipeline. This notebook will only run after the pipeline has completed.

In [1]:
from pathlib import Path
from Bio.Restriction import *
from pydna.genbankrecord import GenbankRecord
from pydna.readers import read
from Bio import SeqIO
from pydna.dseqrecord import Dseqrecord

In [2]:
# Paths under ../output: one generated insert (VR-1) and the anchor sequence.
# Per the notebook introduction, these exist only after the pipeline has run.
insert_path = '../output/insert_sequences/inserts/genbank_files/VR-1.insert.gb'
anchor_path = '../output/insert_sequences/sequences/anchor.gb'

In [3]:
# Parse both GenBank files with pydna's read() and wrap each result in a GenbankRecord.
insert_record = GenbankRecord(read(insert_path))
anchor_record = GenbankRecord(read(anchor_path))

In [4]:
# Tabulate the features annotated on the insert record.
insert_record.list_features()

| Ft# | Label or Note    | Dir | Sta | End | Len | type | orf? |
|-----|------------------|-----|-----|-----|-----|------|------|
|   0 | L:5_prime_HR     | --> | 0   | 30  |  30 | CDS  |  no  |
|   1 | L:Anchor region  | --> | 30  | 45  |  15 | CDS  |  no  |
|   2 | L:Variable regio | --> | 45  | 245 | 200 | CDS  |  no  |
|   3 | L:3_prime_HR     | --> | 245 | 275 |  30 | CDS  |  no  |

In [5]:
def get_feature_by_name(record, name):
    """Return the feature of ``record`` whose name matches ``name``, ignoring case.

    Every feature is extracted with ``record.extract_feature(i)`` and indexed
    by its lower-cased ``name`` attribute. When several features share a name
    (case-insensitively) the last one in ``record.features`` wins.

    Parameters
    ----------
    record : object exposing ``features`` and ``extract_feature(index)``
        The record to search.
    name : str
        Feature name to look for; compared case-insensitively.

    Returns
    -------
    The extracted feature, or ``-1`` when no feature carries that name.
    """
    features_by_name = {}
    for index in range(len(record.features)):
        feature = record.extract_feature(index)
        features_by_name[feature.name.lower()] = feature

    # The keys are lower-cased, so the lookup must use the lower-cased name
    # too; indexing with the raw name raised KeyError for mixed-case input.
    return features_by_name.get(name.lower(), -1)

In [20]:
# Extract the variable region from the insert; `vr` is reused by the
# sequence-content cells at the end of the notebook.
vr = get_feature_by_name(insert_record, 'variable_region')
vr

Dseqrecord(-200)

In [25]:
# Exploratory: dump the raw attributes of the extracted feature record.
vr.__dict__

{'_seq': Dseq(-200)
 CGCT..ATAA
 GCGA..TATT,
 'id': 'Variable_region',
 'name': 'Variable_region',
 'description': 'insert_VR-1',
 'dbxrefs': [],
 'annotations': {'molecule_type': 'DNA',
  'topology': 'linear',
  'data_file_division': 'SYN',
  'date': '11-JUL-2021',
  'accessions': ['56U0DC-tdIipxX5CUaSVmEaSO7E'],
  'keywords': [''],
  'source': '',
  'organism': '.',
  'taxonomy': []},
 '_per_letter_annotations': {},
 'features': [SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(200), strand=1), type='CDS')],
 'map_target': None,
 'n': 5e-14,
 'path': '../output/insert_sequences/inserts/genbank_files/VR-1.insert.gb',
 'item': 'accession',
 'start': None,
 'stop': None,
 'strand': 1,
 '_repr': 'accession',
 '_linktemplate': "<a href='https://www.ncbi.nlm.nih.gov/nuccore/{item}?from={start}&to={stop}&strand={strand}' target='_blank'>{text}</a>",
 'hyperlink': <a href='https://www.ncbi.nlm.nih.gov/nuccore/accession?from=&to=&strand=1' target='_blank'>accession</a>}

In [7]:
# Look up the insert feature whose name matches the anchor record's name.
get_feature_by_name(insert_record, anchor_record.name)

Dseqrecord(-15)

Check to make sure insert contains anchor region

In [8]:
def check_insert_for_anchor(insert_record, anchor_record):
    """Assert that the insert has a feature named like the anchor with the same sequence.

    The feature is looked up in ``insert_record`` by ``anchor_record.name``
    via ``get_feature_by_name`` and its ``seq`` is compared to
    ``anchor_record.seq``.

    Raises
    ------
    AssertionError
        If no feature with the anchor's name exists in the insert, or if the
        feature's sequence differs from the anchor's sequence.
    """
    insert_anchor = get_feature_by_name(insert_record, anchor_record.name)
    # get_feature_by_name returns the int -1 when nothing matches; catch that
    # here instead of failing below with an AttributeError on `int`.
    assert not isinstance(insert_anchor, int), (
        "insert has no feature named {!r}".format(anchor_record.name)
    )
    assert insert_anchor.seq == anchor_record.seq, (
        "sequence of insert feature {!r} differs from the anchor sequence".format(
            anchor_record.name
        )
    )

In [9]:
# Raises on failure; no output means the check passed.
check_insert_for_anchor(insert_record, anchor_record)

Check to make sure anchor does not contain prohibited cutters.

In [10]:
# Enzyme names that must each appear in the anchor's no_cutters() result.
prohibited_cutters = ['HindIII', 'KpnI', 'EcoRI']

In [11]:
def check_anchor_for_prohibited_cutters(insert_record, anchor_record, prohibited_cutters):
    """Assert that every prohibited enzyme is listed in ``anchor_record.no_cutters()``.

    Parameters
    ----------
    insert_record
        Accepted for backward compatibility only. It is not used: the check
        runs on ``anchor_record`` alone. The previous implementation looked
        the anchor up in the insert and then discarded the result.
    anchor_record : object exposing ``no_cutters()``
        The anchor sequence record.
    prohibited_cutters : iterable of str
        Enzyme names that must be among the anchor's non-cutters.

    Raises
    ------
    AssertionError
        If any name in ``prohibited_cutters`` is not among the string forms
        of the enzymes returned by ``anchor_record.no_cutters()``.
    """
    non_cutting_enzymes = {str(enzyme) for enzyme in anchor_record.no_cutters()}
    for each_prohibited_cutter in prohibited_cutters:
        assert each_prohibited_cutter in non_cutting_enzymes, (
            "{} is not among the enzymes reported by anchor_record.no_cutters()".format(
                each_prohibited_cutter
            )
        )

In [12]:
# Raises on failure; no output means the check passed.
check_anchor_for_prohibited_cutters(insert_record, anchor_record, prohibited_cutters)

Check that the homology arms match the input arms and that the arms are present in the specified plasmid.

In [13]:
# Input GenBank files: the two homology arms and the pFC9 plasmid that is
# passed as the source backbone to check_homology_arms below.
three_prime_arm_path = '../resources/files/genbank/3_prime_homology_arm.gb'
five_prime_arm_path = '../resources/files/genbank/5_prime_homology_arm.gb'
pFC9_path = '../resources/files/genbank/pFC9.gb'

In [14]:
# Parse the arms and the plasmid the same way as the insert and anchor records.
three_prime_record =  GenbankRecord(read(three_prime_arm_path))
five_prime_record =  GenbankRecord(read(five_prime_arm_path))
pFC9 = GenbankRecord(read(pFC9_path))

In [15]:
def check_homology_arms(insert_record, homology_arm, source_backbone):
    """Assert that the homology arm occurs in both the insert and the backbone.

    The arm's sequence is converted to a string and searched for with
    ``.seq.find`` on ``insert_record`` first and ``source_backbone`` second.

    Raises
    ------
    AssertionError
        If ``find`` returns -1 for either record.
    """
    arm_sequence = str(homology_arm.seq)
    for container in (insert_record, source_backbone):
        assert container.seq.find(arm_sequence) != -1

In [16]:
# Each call raises AssertionError if the arm is missing from the insert or from pFC9.
check_homology_arms(insert_record, three_prime_record, pFC9)
check_homology_arms(insert_record, five_prime_record, pFC9)

In [17]:
def no_cutters_as_string_set(record):
    """Return the string form of each item in ``record.no_cutters()`` as a set."""
    unique_enzymes = set(record.no_cutters())
    return {str(enzyme) for enzyme in unique_enzymes}

def one_cutters_as_string_set(record):
    """Return the string form of each item in ``record.once_cutters()`` as a set."""
    unique_enzymes = set(record.once_cutters())
    return {str(enzyme) for enzyme in unique_enzymes}

In [18]:
def check_homology_arms_for_required_cutter(homology_arm, cutter):
    """Assert that ``cutter`` is among the once-cutters of ``homology_arm``.

    ``cutter`` is an enzyme name; it is compared against the string set
    produced by ``one_cutters_as_string_set(homology_arm)``.

    Raises
    ------
    AssertionError
        If ``cutter`` is not in that set.
    """
    single_cut_enzymes = one_cutters_as_string_set(homology_arm)
    assert cutter in single_cut_enzymes

In [19]:
# prohibited_cutters[1] is 'KpnI' and prohibited_cutters[2] is 'EcoRI' in the
# list defined earlier: the 5' arm must list KpnI, and the 3' arm EcoRI, among
# its once_cutters().
# NOTE(review): indexing by position couples these checks to the order of
# prohibited_cutters — confirm this is intended.
check_homology_arms_for_required_cutter(
    five_prime_record, prohibited_cutters[1]
)
check_homology_arms_for_required_cutter(
    three_prime_record, prohibited_cutters[2]
)

Check variable regions for sequence content

In [26]:
def calculate_content(record, nuc_a, nuc_b):
    """Return the fraction of ``record`` made up of ``nuc_a`` and ``nuc_b``.

    Occurrences of each nucleotide are counted in ``str(record.seq)`` with a
    case-sensitive ``str.count``, and their sum is divided by ``len(record)``.
    """
    sequence = str(record.seq)
    combined_count = sum(sequence.count(nucleotide) for nucleotide in (nuc_a, nuc_b))
    return combined_count / len(record)

def calculate_skew(record, nuc_a, nuc_b):
    """Return the skew ``(a - b) / (a + b)`` between two nucleotides in ``record``.

    ``a`` and ``b`` are the case-sensitive ``str.count`` results for ``nuc_a``
    and ``nuc_b`` in ``str(record.seq)``.

    Returns
    -------
    float
        The skew, in the range [-1, 1]. When neither nucleotide occurs the
        skew is undefined and ``0.0`` is returned instead of raising
        ZeroDivisionError.
    """
    sequence = str(record.seq)
    a_count = sequence.count(nuc_a)
    b_count = sequence.count(nuc_b)
    total = a_count + b_count

    # Guard the denominator: a sequence with neither nucleotide has no skew.
    if total == 0:
        return 0.0
    return (a_count - b_count) / total

In [31]:
# Display the value returned by the insert record's seguid() method.
insert_record.seguid()

iNwXzb-2HaF216umO3c-EhdHv7M

In [28]:
# G/C skew of the variable region (`vr` comes from the get_feature_by_name cell above).
calculate_skew(vr, 'G', 'C')

0.2

In [29]:
# Combined G + C fraction of the variable region.
calculate_content(vr, 'G', 'C')

0.4