# Debug simulate assembly

In [1]:
import os
import sys
from pathlib import Path
import random
import numpy as np

import pytest
from pydna.genbankrecord import GenbankRecord
from pydna.readers import read
from pydna.dseqrecord import Dseqrecord



In [2]:
# Locations used by this notebook, relative to its working directory.
# Every path is derived from SCRIPTS so the tree only has to be moved in one place.
SCRIPTS = Path('../scripts')
TEST_FILES = SCRIPTS / 'test_files'
INSERT_DIR = TEST_FILES / 'assembled_inserts'
T7_INIT_BACKBONE = TEST_FILES / 'backbones' / 'pFC9.gb'
T7_TERM_BACKBONE = TEST_FILES / 'backbones' / 'pFC8.gb'

In [3]:
# Put the scripts directory on the import path so its modules can be imported below.
sys.path.append(str(SCRIPTS))

In [4]:
from simulate_assembly import *

## Test termination series

The termination series is throwing errors, specifically when trying to assemble in the strong initiator. In theory this should not be a problem, because the initiator and the backbone should share compatible sticky ends after HindIII and EcoRI digestion.

In [5]:
def strong_initiator(length=200, rng=None):
    """Build a placeholder strong-initiator fragment.

    The real strong initiator sequence is not known until the experiments
    with initiators are done. The sequence does not matter here as long as
    it starts with a HindIII site and ends with an EcoRI site, so a random
    filler is generated between the two recognition sites.

    Parameters
    ----------
    length : int, optional
        Number of random nucleotides placed between the two sites.
        Defaults to 200.
    rng : optional
        Object exposing ``choice(a, size, replace=...)``, for example
        ``np.random.default_rng(seed)``; pass one to get a reproducible
        sequence. Defaults to the global ``np.random`` state.

    Returns
    -------
    Dseqrecord
        Record whose sequence is ``HindIII.site + filler + EcoRI.site``.
    """
    source = np.random if rng is None else rng
    # NOTE(review): the random filler is not checked for internal HindIII or
    # EcoRI sites, so a digest may occasionally yield an extra fragment.
    filler = ''.join(source.choice(['A', 'T', 'C', 'G'], length, replace=True))
    return Dseqrecord(HindIII.site + filler + EcoRI.site)

In [6]:
def read_genbank_record(filepath):
    """Read the sequence file at ``filepath`` and wrap the result in a GenbankRecord."""
    parsed = read(filepath)
    return GenbankRecord(parsed)

In [7]:
def get_assembled_inserts():
    """Load every entry of INSERT_DIR as a GenbankRecord, in ``iterdir()`` order."""
    return [read_genbank_record(path) for path in INSERT_DIR.iterdir()]

assembled_inserts = get_assembled_inserts()

In [8]:
# Backbone record for the termination series, read from T7_TERM_BACKBONE.
T7_term_backbone = read_genbank_record(T7_TERM_BACKBONE)

In [9]:
# random.randint includes both endpoints, so randint(0, len(seq)) can return
# len(seq) and raise IndexError; random.choice always picks a valid element.
random_insert = random.choice(assembled_inserts)

In [10]:
# Build the termination-series construct from the backbone, the randomly
# chosen assembled insert and a freshly generated placeholder initiator.
term = T7TerminationSeries('T7-term', T7_term_backbone, random_insert, strong_initiator())
print(term)

<simulate_assembly.T7TerminationSeries object at 0x145d2daf0e80>


Get the initiator after digestion.

In [11]:
di = term.digested_initiator
# Show the 5' end and then the 3' end of the digested initiator, one per line.
print(di.seq.five_prime_end(), di.seq.three_prime_end(), sep='\n')

("5'", 'agct')
("5'", 'aatt')


Sticky ends are as expected.

Take a look at the digested backbone.

In [12]:
# Display the large fragment of the backbone digest (rendered as the cell output).
term.backbone_initiator_digest_lf

Dseqrecord(-3105)

Hold on, why do we get a Dseqrecord here and not with `digested_initiator`? Also, the sticky ends look good.

In [13]:
# Check which class the digested backbone fragment is.
type(term.backbone_initiator_digest_lf)

pydna.dseqrecord.Dseqrecord

In [14]:
# Check which class the digested initiator is, for comparison with the backbone fragment.
type(term.digested_initiator)

pydna.dseqrecord.Dseqrecord

Never mind, we do. **Oh**, since this is not Gibson assembly, there is no additional homology besides the restriction site, so the `limit` argument needs to be reduced to at *most* the length of the restriction recognition site (6 nucleotides). If the cell below works, this would be important to note for any assembly that uses restriction recognition sites as the source of sequence homology.

In [15]:
# Assemble the digested backbone and initiator with the minimum shared
# sequence length lowered to 6 (the default shown in the quoted source is 25).
a = Assembly(
    (term.backbone_initiator_digest_lf, term.digested_initiator),
    limit=6,
)
a

Assembly
fragments..: 3105bp 210bp
limit(bp)..: 6
G.nodes....: 324
algorithm..: common_sub_strings

Lots of nodes because of short limit.

Reduce max number of nodes, not sure exactly how many are needed but not 317.

In [16]:
# NOTE: this call uses the same fragments and the same limit as `a`;
# no node-limiting argument is passed here.
a1 = Assembly(
    (term.backbone_initiator_digest_lf, term.digested_initiator),
    limit=6,
)
a1

Assembly
fragments..: 3105bp 210bp
limit(bp)..: 6
G.nodes....: 324
algorithm..: common_sub_strings

In [30]:
# Look for circular products; the recorded run returned an empty list.
a1.assemble_circular()

[]

This still took forever. Try a different algorithm that only looks for [terminal overlaps](https://pydna.readthedocs.io/#pydna.common_sub_strings.terminal_overlap).

In [16]:
from pydna.common_sub_strings import terminal_overlap

In [19]:
# The exception is the point of this cell: it shows that Assembly rejects the
# `algorithmfunction` keyword, so the error is caught and printed, not raised.
try:
    a2 = Assembly(
        (term.backbone_initiator_digest_lf, term.digested_initiator),
        limit=6,
        algorithmfunction=terminal_overlap,
    )
except Exception as err:
    print(err)

__init__() got an unexpected keyword argument 'algorithmfunction'


Ok it says in docs I could do this.

Source code.

```
def __init__(self, frags=None, limit=25, algorithm=common_sub_strings):

        # Fragments is a string subclass with some extra properties
        # The order of the fragments has significance
        fragments = []
        for f in frags:
            fragments.append(
                {
                    "upper": str(f.seq).upper(),
                    "mixed": str(f.seq),
                    "name": f.name,
                    "features": f.features,
                    "nodes": [],
                }
            )
```
The argument is actually just `algorithm`.

In [62]:
# Name both fragments so they can be told apart in the assembly products.
lf = term.backbone_initiator_digest_lf
lf.name = 'Large boy'
insert = term.digested_initiator
insert.name = 'Small boy'

# limit=4 matches the 4-nt overhangs ('agct', 'aatt') printed for the digested
# initiator; terminal_overlap is passed through the `algorithm` keyword.
a2 = Assembly((lf, insert), limit=4, algorithm=terminal_overlap)
print(a2)
a2.nodemap

Assembly
fragments..: 3105bp 210bp
limit(bp)..: 4
G.nodes....: 2
algorithm..: terminal_overlap


{'begin': 'end',
 'end': 'begin',
 'begin_rc': 'end_rc',
 'end_rc': 'begin_rc',
 'AATT': 'AATT',
 'AGCT': 'AGCT'}

Test assemble linear for fun

In [63]:
# List every linear product of the two-fragment assembly.
a2.assemble_linear()

[Contig(-3311),
 Contig(-3311),
 Contig(-3105),
 Contig(-3105),
 Contig(-210),
 Contig(-210),
 Contig(-4),
 Contig(-4)]

In [64]:
# Inspect the first linear contig in the returned list.
a2.assemble_linear()[0]

In [65]:
# List every circular product of the two-fragment assembly.
a2.assemble_circular()

[Contig(o6202), Contig(o3307), Contig(o3307), Contig(o412)]

Make sure to select a contig that is not assembled with the reverse complement of the large fragment. In this case those are the second and third contigs.

In [67]:
# Index 1 is the first of the two o3307 contigs in the recorded assemble_circular() output.
a2.assemble_circular()[1]

This seems to work. Using the terminal overlap algorithm is the way to go when there are lots of possible substrings because of the short overlap length required for RE digests.

In [None]:
# Exploratory: list the attributes available on the selected circular contig.
dir(a2.assemble_circular()[1])