<img src="golden-gator.png" width="400">

# Godlen Gator notebook

## setup 

In [1]:
import os
import sys
from typing import List, Tuple

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from collections import OrderedDict
import pandas as pd
from itertools import product

from dawdlib.golden_gate.gate import Gate
from dawdlib.golden_gate.gate_data import GGData
from dawdlib.golden_gate.gg_path_finder import dijkstra_all_paths, find_compatible_paths
from dawdlib.golden_gate.graph_maker import (
    GraphMaker,
    SOURCE_NODE,
    SINK_NODE,
    make_default_graph,
    build_custom_graph,
    create_default_valid_node_function,
    create_default_weight_func,
)
from dawdlib.golden_gate.utils import find_dna_var_poss, parse_dna, expand_dna_var_poss

from dawdlib.golden_gate.find_gg import deg_table_to_dict

from dawdlib.degenerate_dna.codon_selector import CodonSelector
from dawdlib.degenerate_dna.deg_table import generate_deg_csv, TableColNames
from dawdlib.golden_gate.dijkstra import all_shortest_paths

from Bio import SeqFeature, SeqIO

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed',)).History will not be written to the database.


In [2]:
MIN_OLIGO_LENGTH: int = 20
MAX_OLIGO_LENGTH: int = 80
MIN_CONST_OLIGO_LENGTH: int = 20
MAX_NUM_GATES = 12

prefix = "CGTGCGGTCTCG"
suffix = "CGAGACCGCGCCGGGC"

gg_data = GGData()
np.random.seed(0)

W_PATH = f"./example"
resfile_file = f"{W_PATH}/chosen_18Dec.resfile"

SEQ_RECORD = SeqIO.read(f"{W_PATH}/embl.embl", "embl")
dna = parse_dna(f"{W_PATH}/wt_dna.fasta")

## generate the degenerate codon table

In [3]:
generate_deg_csv(resfile_file, csv_filename=f"{W_PATH}/deg_table.csv")

In [4]:
deg_table = pd.read_csv(
    f"{W_PATH}/deg_table.csv", na_filter=True, keep_default_na=False,
)
var_poss = expand_dna_var_poss(deg_table[TableColNames.DNA_POS.value].tolist())

## create a graph. either use the default, or custom blocks

### default graph

In [5]:
gm = GraphMaker(gg_data)
d_graph, src, target = make_default_graph(
    GraphMaker(gg_data), dna, var_poss, deg_table_to_dict(deg_table), 20, 50, 20
)

### custom graph

In [6]:
gm = GraphMaker(gg_data)
acceptable_fcws = gm.gg_data.filter_self_binding_gates(2000)
is_valid_node = create_default_valid_node_function(acceptable_fcws, var_poss)

# def valid_node_wrapper(fcw: str, ind: int):
#     if not is_valid_node(fcw, ind):
#         return False
#     if palyndrome(fcw):
#         return False
#     return True


is_valid_edge = gm.create_default_valid_edge_func(
    var_poss,
    MIN_OLIGO_LENGTH,
    MAX_OLIGO_LENGTH - len(prefix) - len(suffix) - 3 + 1,
    MIN_CONST_OLIGO_LENGTH,
    1000,
)

# def hamming_dist_over1(gate1: Gate, gate2: Gate) -> bool:
#     dist = sum([1 for a, b in zip(gate1.bps, gate2.bps) if a != b])
#     return dist > 1

# def valid_edge_wrapper(gate1: Gate, gate2: Gate) -> bool:
#     if not is_valid_edge(gate1, gate2):
#         return False
#     if gate2.idx - gate1.idx + len(prefix) + len(suffix) + 3 > MAX_OLIGO_LENGTH:
#         return False
#     return True

# # def edge_cost(*args, **kwrags):
# #      return 1


def cost_func(nd1, nd2):
    default = create_default_weight_func(deg_table_to_dict(deg_table))
    return default(nd1, nd2) + len(suffix) + len(prefix) + 3


# d_graph, src, target = make_default_graph(
#    GraphMaker(ggdata), dna, var_poss, deg_table_to_dict(deg_table), 20, 80, 0
# )

In [7]:
d_graph, src, target = build_custom_graph(
    gm, dna, var_poss, is_valid_node, is_valid_edge, cost_func
)

## prepare degenerate codon features

In [8]:
def parse_degenerate_codon_csv(csv_file: str) -> pd.DataFrame:
    return pd.read_csv(
        csv_file,
        index_col=False,
        na_values="NaN",
        converters={
            "ENCODED_AAS": lambda x: x.strip("[]").replace("'", "").split(", "),
            "ENCODED_COUNT": lambda x: [
                int(a) for a in x.strip("[]").replace("'", "").split(", ")
            ],
        },
    )


deg_parsed_df = parse_degenerate_codon_csv(f"{W_PATH}/deg_table.csv")
print(deg_parsed_df)

    AA_POS  DNA_POS               ENCODED_AAS             ENCODED_COUNT  \
0       16       46                    [I, V]                    [1, 1]   
1       42      124                    [L, V]                    [1, 1]   
2       61      181                 [A, L, V]                 [1, 1, 1]   
3       65      193                    [S, T]                    [1, 1]   
4       68      202                 [A, M, V]                 [1, 1, 1]   
5       69      205              [A, L, P, Q]              [1, 1, 1, 1]   
6       72      214           [A, C, S, T, V]           [1, 1, 1, 1, 1]   
7      108      322           [E, I, L, T, V]           [1, 1, 1, 1, 1]   
8      112      334                    [I, V]                    [1, 1]   
9      145      433  [A, F, I, M, S, T, V, Y]  [1, 1, 1, 1, 1, 1, 1, 1]   
10     150      448                    [I, V]                    [1, 1]   
11     167      499                    [T, V]                    [1, 1]   
12     181      541      

In [9]:
FEATURES: List[SeqFeature.SeqFeature] = []
for i, row in deg_parsed_df.iterrows():
    num = 0
    for col in [c for c in deg_parsed_df.columns if c.startswith("AMBIGUOUS_CODONS")]:
        if str(row[col]) != "nan":
            num += 1
    name = f"{''.join(row['ENCODED_AAS'])}={num}"
    ftr = SeqFeature.SeqFeature(
        SeqFeature.FeatureLocation(row["DNA_POS"] - 1, row["DNA_POS"] + 2, 1),
        type=name,
        id=name,
    )
    FEATURES.append(ftr)

## find shortest paths!

In [10]:
shortest_paths = all_shortest_paths(d_graph, src, target, "weight", len_cutoff=16)
# shortest_paths = nx.all_shortest_paths(d_graph, src, target, "weight")

In [11]:
def all_gates_hamming_dist(pth: List[Gate]) -> bool:
    for gate1 in pth[1:-1]:
        for gate2 in pth[1:-1]:
            if gate1 == gate2:
                continue
            if not hamming_dist_over1(gate1, gate2):
                return False
    return True


best_paths = {}
best_costs = {}
for pth, cost in shortest_paths:
    if len(pth) not in best_paths.keys():
        if not gg_data.gate_set_has_off_target([a.bps for a in pth[1:-1]]):
            best_paths[len(pth)] = pth
            best_costs[len(pth)] = cost
for k, v in best_paths.items():
    print(k, best_costs[k], v)

16 1022 [Gate(idx=-1, bps='src'), Gate(idx=27, bps='GGGG'), Gate(idx=48, bps='GAGC'), Gate(idx=119, bps='CAAG'), Gate(idx=168, bps='TGGC'), Gate(idx=195, bps='TACG'), Gate(idx=216, bps='CGCT'), Gate(idx=315, bps='TACA'), Gate(idx=336, bps='AAGT'), Gate(idx=428, bps='CAAC'), Gate(idx=454, bps='TCAT'), Gate(idx=503, bps='CCAC'), Gate(idx=543, bps='TACC'), Gate(idx=651, bps='ATGG'), Gate(idx=672, bps='GAGG'), Gate(idx=717, bps='snk')]


### choose whatever path you want

In [12]:
chosen_path = best_paths[min(best_paths.keys())]

In [13]:
chosen_path

[Gate(idx=-1, bps='src'),
 Gate(idx=27, bps='GGGG'),
 Gate(idx=48, bps='GAGC'),
 Gate(idx=119, bps='CAAG'),
 Gate(idx=168, bps='TGGC'),
 Gate(idx=195, bps='TACG'),
 Gate(idx=216, bps='CGCT'),
 Gate(idx=315, bps='TACA'),
 Gate(idx=336, bps='AAGT'),
 Gate(idx=428, bps='CAAC'),
 Gate(idx=454, bps='TCAT'),
 Gate(idx=503, bps='CCAC'),
 Gate(idx=543, bps='TACC'),
 Gate(idx=651, bps='ATGG'),
 Gate(idx=672, bps='GAGG'),
 Gate(idx=717, bps='snk')]

## add best path gates to features

In [14]:
for gate in chosen_path[1:-1]:
    name = f"G.{gate.idx}.{gate.bps}"
    ftr = SeqFeature.SeqFeature(
        SeqFeature.FeatureLocation(gate.idx, gate.idx + 4, 1), type=name, id=name
    )
    FEATURES.append(ftr)

## save embl file with all features

In [15]:
SEQ_RECORD = SeqIO.read(f"{W_PATH}/embl.embl", "embl")
SEQ_RECORD.features.extend(FEATURES)
with open(f"{W_PATH}/w_features.gb", "w+") as fout:
    SeqIO.write(SEQ_RECORD, fout, "gb")

## create oligos for ordering

In [16]:
def find_codons_for_oligo(
    oligo: Tuple[int, int], dc_df: pd.DataFrame
) -> List[List[Tuple[int, str]]]:
    oligo_codons: List[List[Tuple[int, str]]] = []
    sub_df = dc_df.loc[
        dc_df["DNA_POS"].between(oligo[0], oligo[1]),
        ["DNA_POS"] + [c for c in dc_df.columns if c.startswith("AMBIGUOUS_CODONS")],
    ]
    pos_codon: List[List[Tuple[int, str]]] = []
    for _, row in sub_df.iterrows():
        pos_codon.append([])
        for col in sub_df.columns[1:]:
            if row[col] is np.nan:
                continue
            pos_codon[-1].append((row["DNA_POS"], row[col]))
    for prod in product(*pos_codon):
        combination: List[Tuple[int, str]] = []
        for (pos, codon) in prod:
            combination.append((pos, codon))
        oligo_codons.append(combination)
    return oligo_codons


def create_dc_oligo(
    dna: str, pos_codons: List[Tuple[int, str]], oligo: Tuple[int, int]
) -> str:
    dna_copy = dna
    for (pos, codon) in pos_codons:
        dna_copy = dna_copy[: pos - 1] + codon + dna_copy[pos + 3 - 1 :]
    return dna_copy[oligo[0] : oligo[1] + 4]

In [17]:
to_order_df = pd.DataFrame()
for gate1, gate2 in zip(chosen_path[1:-2], chosen_path[2:-1]):
    oligo_codons = find_codons_for_oligo((gate1.idx, gate2.idx), deg_parsed_df)
    row = {
        "gate1": gate1,
        "gate2": gate2,
        "gate_gate_dist": gate2.idx - gate1.idx + 3,
        "oligo_codons": oligo_codons,
        "const": oligo_codons == [[]],
    }
    wt_dna = create_dc_oligo(dna, [], (gate1.idx, gate2.idx))
    row["oligo_dna"] = wt_dna
    row["full_oligo_dna"] = prefix + wt_dna + suffix
    row["name"] = f"{gate1.idx}-{gate2.idx}.wt_dna"
    to_order_df = to_order_df.append(row, ignore_index=True)
    for ind, oligo_codon in enumerate(oligo_codons):
        oligo_dna = create_dc_oligo(dna, oligo_codon, (gate1.idx, gate2.idx))
        if oligo_codon == []:
            continue
        row["oligo_dna"] = oligo_dna
        row["full_oligo_dna"] = prefix + oligo_dna + suffix
        row["name"] = f"{gate1.idx}-{gate2.idx}.{len(oligo_codons)}.{ind}"
        to_order_df = to_order_df.append(row, ignore_index=True)

print(to_order_df)

    const                                     full_oligo_dna        gate1  \
0     0.0  CGTGCGGTCTCGGGGGTGGTGCCCATCCTGGTCGAGCCGAGACCGC...   (27, GGGG)   
1     0.0  CGTGCGGTCTCGGGGGTGGTGCCCATCCTGRTTGAGCCGAGACCGC...   (27, GGGG)   
2     1.0  CGTGCGGTCTCGGAGCTGGACGGCGACGTAAACGGCCACAAGTTCA...   (48, GAGC)   
3     0.0  CGTGCGGTCTCGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGC...  (119, CAAG)   
4     0.0  CGTGCGGTCTCGCAAGSTGACCCTGAAGTTCATCTGCACCACCGGC...  (119, CAAG)   
5     0.0  CGTGCGGTCTCGTGGCCCACCCTCGTGACCACCCTGACCTACGCGA...  (168, TGGC)   
6     0.0  CGTGCGGTCTCGTGGCCCACCCTCSTGACCACCCTGASCTACGCGA...  (168, TGGC)   
7     0.0  CGTGCGGTCTCGTGGCCCACCCTCGCAACCACCCTGASCTACGCGA...  (168, TGGC)   
8     0.0  CGTGCGGTCTCGTACGGCGTCCAGTGCTTCAGCCGCTCGAGACCGC...  (195, TACG)   
9     0.0  CGTGCGGTCTCGTACGGCRTGCHGTGCTTCWSCCGCTCGAGACCGC...  (195, TACG)   
10    0.0  CGTGCGGTCTCGTACGGCRTGCHGTGCTTCGYGCGCTCGAGACCGC...  (195, TACG)   
11    0.0  CGTGCGGTCTCGTACGGCRTGGCATGCTTCWSCCGCTCGAGACCGC...  (195, TACG)   

## golden gate simulators to validate the oligos and gates

In [18]:
from flab.repertoire.simulator.fragment import parse_order_to_fragments
from flab.repertoire.simulator.simulator import GeneSimulator

### this is where you insert the sequences that come before and after the 1st and last gate respectively

In [19]:
VEC_TO_GATE_1 = "GTTAGCAAGGGCGAGGAGCTGTTCACCGGGGCGAGACCGCGCCGGGC"
LAST_GATE_TO_END = "CGTGCGGTCTCGGAGGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTATAAGTAA"

### validation 1. Wild-Type fragments only

In [20]:
fragments_df = pd.DataFrame(columns=["id", "seq"])
fragments_df["id"] = to_order_df.loc[to_order_df["name"].str.contains("wt_dna"), "name"]
fragments_df["seq"] = to_order_df.loc[
    to_order_df["name"].str.contains("wt_dna"), "full_oligo_dna"
]

fragments_df = fragments_df.append(
    {"id": "vec-", "seq": VEC_TO_GATE_1}, ignore_index=True
)
fragments_df = fragments_df.append(
    {"id": "-vec", "seq": LAST_GATE_TO_END}, ignore_index=True
)

fragments = parse_order_to_fragments(fragments_df)
simulator = GeneSimulator(
    fragments, prefix=VEC_TO_GATE_1[:5], suffix=LAST_GATE_TO_END[-5:]
)
genes = simulator.get_all_possible_genes()
# optimally, there's only one gene, and it is the WT. DO NOT CONTINUE if it is not
for gene in genes:
    print(str(gene))
    assert gene == dna, "if not equal, something is wrong"

creates graph
connects source sink
GTTAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTCCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCTGCTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGCACGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTTCAACAGCCACAACGTCTATATCATGCCCGACAAGCAGAACAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACTACCACTACCTGCACACCTGGTCCGAGCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGGAGGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTATAAGTAA


### the above output should only have one gene, the WT. make sure this is the case.
### if this test fails, DO NOT GO ON

In [21]:
des_fragments_df = pd.DataFrame(columns=["id", "seq"])
des_fragments_df["id"] = to_order_df["name"]
des_fragments_df["seq"] = to_order_df["full_oligo_dna"]

des_fragments_df = des_fragments_df.append(
    {"id": "vec-", "seq": VEC_TO_GATE_1}, ignore_index=True
)
des_fragments_df = des_fragments_df.append(
    {"id": "-vec", "seq": LAST_GATE_TO_END}, ignore_index=True
)

fragments = parse_order_to_fragments(des_fragments_df)
simulator = GeneSimulator(
    fragments, prefix=VEC_TO_GATE_1[:5], suffix=LAST_GATE_TO_END[-5:]
)
genes = simulator.get_all_possible_genes()

print(f"found {len(genes)} genes")
gene_set = set()
n = 5
for gene in genes[:n]:
    print(str(gene))
    gene_set.add(str(gene))

print(f"there are {len(gene_set)} unique genes in the first {n} genes simulated")

creates graph
connects source sink
found 23328 genes
GTTAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTCCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCTGCTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGCACGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTTCAACAGCCACAACGTCTATATCATGCCCGACAAGCAGAACAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACTACCACTACCTGCACACCTGGTCCGAGCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGGAGGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTATAAGTAA
GTTAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTCCAGTGCTTCAGCCGCTACCCCGACC

### troubleshooting
in case the above simulators found no genes, use this block to go over the path,
looking for the gate that isn't connecting

In [22]:
print(simulator.graph.nodes)

['27-48.wt_dna_s_0', '27-48.wt_dna_s_1', '27-48.wt_dna_s_2', '27-48.1.0_s_0', '27-48.1.0_s_1', '27-48.1.0_s_2', '48-119.wt_dna_s_0', '48-119.wt_dna_s_1', '48-119.wt_dna_s_2', '119-168.wt_dna_s_0', '119-168.wt_dna_s_1', '119-168.wt_dna_s_2', '119-168.1.0_s_0', '119-168.1.0_s_1', '119-168.1.0_s_2', '168-195.wt_dna_s_0', '168-195.wt_dna_s_1', '168-195.wt_dna_s_2', '168-195.2.0_s_0', '168-195.2.0_s_1', '168-195.2.0_s_2', '168-195.2.1_s_0', '168-195.2.1_s_1', '168-195.2.1_s_2', '195-216.wt_dna_s_0', '195-216.wt_dna_s_1', '195-216.wt_dna_s_2', '195-216.8.0_s_0', '195-216.8.0_s_1', '195-216.8.0_s_2', '195-216.8.1_s_0', '195-216.8.1_s_1', '195-216.8.1_s_2', '195-216.8.2_s_0', '195-216.8.2_s_1', '195-216.8.2_s_2', '195-216.8.3_s_0', '195-216.8.3_s_1', '195-216.8.3_s_2', '195-216.8.4_s_0', '195-216.8.4_s_1', '195-216.8.4_s_2', '195-216.8.5_s_0', '195-216.8.5_s_1', '195-216.8.5_s_2', '195-216.8.6_s_0', '195-216.8.6_s_1', '195-216.8.6_s_2', '195-216.8.7_s_0', '195-216.8.7_s_1', '195-216.8.7_s_2', 

In [23]:
for pth in simulator.get_simlpe_paths(start="vec-_s_0", end="-vec_s_1"):
    print(pth)

['vec-_s_0', '27-48.wt_dna_s_1', '48-119.wt_dna_s_1', '119-168.wt_dna_s_1', '168-195.wt_dna_s_1', '195-216.wt_dna_s_1', '216-315.wt_dna_s_1', '315-336.wt_dna_s_1', '336-428.wt_dna_s_1', '428-454.wt_dna_s_1', '454-503.wt_dna_s_1', '503-543.wt_dna_s_1', '543-651.wt_dna_s_1', '651-672.wt_dna_s_1', '-vec_s_1']
['vec-_s_0', '27-48.wt_dna_s_1', '48-119.wt_dna_s_1', '119-168.wt_dna_s_1', '168-195.wt_dna_s_1', '195-216.wt_dna_s_1', '216-315.wt_dna_s_1', '315-336.wt_dna_s_1', '336-428.wt_dna_s_1', '428-454.wt_dna_s_1', '454-503.wt_dna_s_1', '503-543.wt_dna_s_1', '543-651.wt_dna_s_1', '651-672.1.0_s_1', '-vec_s_1']
['vec-_s_0', '27-48.wt_dna_s_1', '48-119.wt_dna_s_1', '119-168.wt_dna_s_1', '168-195.wt_dna_s_1', '195-216.wt_dna_s_1', '216-315.wt_dna_s_1', '315-336.wt_dna_s_1', '336-428.wt_dna_s_1', '428-454.wt_dna_s_1', '454-503.wt_dna_s_1', '503-543.2.0_s_1', '543-651.wt_dna_s_1', '651-672.wt_dna_s_1', '-vec_s_1']
['vec-_s_0', '27-48.wt_dna_s_1', '48-119.wt_dna_s_1', '119-168.wt_dna_s_1', '168-1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



['vec-_s_0', '27-48.1.0_s_1', '48-119.wt_dna_s_1', '119-168.1.0_s_1', '168-195.2.0_s_1', '195-216.8.4_s_1', '216-315.wt_dna_s_1', '315-336.3.1_s_1', '336-428.wt_dna_s_1', '428-454.2.1_s_1', '454-503.2.0_s_1', '503-543.wt_dna_s_1', '543-651.wt_dna_s_1', '651-672.1.0_s_1', '-vec_s_1']
['vec-_s_0', '27-48.1.0_s_1', '48-119.wt_dna_s_1', '119-168.1.0_s_1', '168-195.2.0_s_1', '195-216.8.4_s_1', '216-315.wt_dna_s_1', '315-336.3.1_s_1', '336-428.wt_dna_s_1', '428-454.2.1_s_1', '454-503.2.0_s_1', '503-543.2.0_s_1', '543-651.wt_dna_s_1', '651-672.wt_dna_s_1', '-vec_s_1']
['vec-_s_0', '27-48.1.0_s_1', '48-119.wt_dna_s_1', '119-168.1.0_s_1', '168-195.2.0_s_1', '195-216.8.4_s_1', '216-315.wt_dna_s_1', '315-336.3.1_s_1', '336-428.wt_dna_s_1', '428-454.2.1_s_1', '454-503.2.0_s_1', '503-543.2.0_s_1', '543-651.wt_dna_s_1', '651-672.1.0_s_1', '-vec_s_1']
['vec-_s_0', '27-48.1.0_s_1', '48-119.wt_dna_s_1', '119-168.1.0_s_1', '168-195.2.0_s_1', '195-216.8.4_s_1', '216-315.wt_dna_s_1', '315-336.3.1_s_1', '3

In [24]:
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "max_colwidth", 1000
):  # more options can be specified also
    print(des_fragments_df)

                id  \
0     27-48.wt_dna   
1        27-48.1.0   
2    48-119.wt_dna   
3   119-168.wt_dna   
4      119-168.1.0   
5   168-195.wt_dna   
6      168-195.2.0   
7      168-195.2.1   
8   195-216.wt_dna   
9      195-216.8.0   
10     195-216.8.1   
11     195-216.8.2   
12     195-216.8.3   
13     195-216.8.4   
14     195-216.8.5   
15     195-216.8.6   
16     195-216.8.7   
17  216-315.wt_dna   
18  315-336.wt_dna   
19     315-336.3.0   
20     315-336.3.1   
21     315-336.3.2   
22  336-428.wt_dna   
23  428-454.wt_dna   
24     428-454.2.0   
25     428-454.2.1   
26  454-503.wt_dna   
27     454-503.2.0   
28     454-503.2.1   
29  503-543.wt_dna   
30     503-543.2.0   
31     503-543.2.1   
32  543-651.wt_dna   
33  651-672.wt_dna   
34     651-672.1.0   
35            vec-   
36            -vec   

                                                                                                                                             seq  
0               

## write the to-order oligos to a table

In [25]:
to_order_df[["name", "const", "full_oligo_dna"]].to_csv(f"{W_PATH}/to_order_df.csv")

## validating the to order table
after you have prepared the oligos for ordering, and have planned all the constant 
segments, use these blocks to validate

### the below read table should have the following columns "Positions" and "Sequence"
(these are the names used in the IDT server

In [26]:
idt_df = pd.read_csv(f"{W_PATH}/idt_order.csv", sep=",")
print(idt_df)

   Well        Position                                           Sequence
0    A1    27-48.wt_dna  CGTGCGGTCTCGGGGGTGGTGCCCATCCTGGTCGAGCCGAGACCGC...
1    A2       27-48.1.0  CGTGCGGTCTCGGGGGTGGTGCCCATCCTGRTTGAGCCGAGACCGC...
2    A3  119-168.wt_dna  CGTGCGGTCTCGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGC...
3    A4     119-168.1.0  CGTGCGGTCTCGCAAGSTGACCCTGAAGTTCATCTGCACCACCGGC...
4    A5  168-195.wt_dna  CGTGCGGTCTCGTGGCCCACCCTCGTGACCACCCTGACCTACGCGA...
5    A6     168-195.2.0  CGTGCGGTCTCGTGGCCCACCCTCSTGACCACCCTGASCTACGCGA...
6    A7     168-195.2.1  CGTGCGGTCTCGTGGCCCACCCTCGCAACCACCCTGASCTACGCGA...
7    B1  195-216.wt_dna  CGTGCGGTCTCGTACGGCGTCCAGTGCTTCAGCCGCTCGAGACCGC...
8    B2     195-216.8.0  CGTGCGGTCTCGTACGGCRTGCHGTGCTTCWSCCGCTCGAGACCGC...
9    B3     195-216.8.1  CGTGCGGTCTCGTACGGCRTGCHGTGCTTCGYGCGCTCGAGACCGC...
10   B4     195-216.8.2  CGTGCGGTCTCGTACGGCRTGGCATGCTTCWSCCGCTCGAGACCGC...
11   B5     195-216.8.3  CGTGCGGTCTCGTACGGCRTGGCATGCTTCGYGCGCTCGAGACCGC...
12   B6     195-216.8.4  

### this is where you define your constant segments
give each one a name, and the sequence of the fragment that will be added to the 
Golden-Gate mix

In [27]:
CONSTANT_SEGMENTS = {
    "vec-": "GTTAGCAAGGGCGAGGAGCTGTTCACCGGGGCGAGACCGCGCCGGGCTGGTGCCCTCGTG",
    "-vec": "GCGCCGGGCTGGTGCCCTCGTGCGTGCGGTCTCGGAGGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTATAAGTAA",
    "const1": "CGTGCGGTCTCGGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCGAGACCGCGCCGGGC",
    "const2": "CGTGCGGTCTCGCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCTGCTACACGAGACCGCGCCGGGC",
    "const3": "CGTGCGGTCTCGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGCACGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACCGAGACCGCGCCGGGC",
    "const4": "CGTGCGGTCTCGTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACTACCACTACCTGCACACCTGGTCCGAGCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGCGAGACCGCGCCGGGC",
}

In [29]:
val_fragments_df = pd.DataFrame(columns=["id", "seq"])
val_fragments_df["id"] = idt_df["Position"]
val_fragments_df["seq"] = idt_df["Sequence"]

for c_name, c_seq in CONSTANT_SEGMENTS.items():
    val_fragments_df = val_fragments_df.append(
        {"id": c_name, "seq": c_seq}, ignore_index=True
    )

val_fragments = parse_order_to_fragments(val_fragments_df)
simulator = GeneSimulator(
    val_fragments, prefix=VEC_TO_GATE_1[:5], suffix=LAST_GATE_TO_END[-5:]
)
val_genes = simulator.get_all_possible_genes()

print(f"found {len(val_genes)} val_genes")
gene_set = set()
n = 5
for gene in val_genes[:n]:
    print(str(gene))
    gene_set.add(str(gene))

print(f"there are {len(gene_set)} unique genes in the first {n} genes simulated")

creates graph
connects source sink
found 23328 val_genes
GTTAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTCCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCTGCTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGCACGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTTCAACAGCCACAACGTCTATATCATGCCCGACAAGCAGAACAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACTACCACTACCTGCACACCTGGTCCGAGCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGGAGGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTATAAGTAA
GTTAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGCGCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTCCAGTGCTTCAGCCGCTACCCC

# Good Luck