[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Fleishman-Lab/GGAssembler/blob/master/example/colab_oligos_design.ipynb)

<img src="https://github.com/Fleishman-Lab/GGAssembler/blob/master/example/golden-gator.png?raw=1" width="100">

## Golden Gator
A self-contained DNA gene segmentation tool for generating combinatorially assembled variant libraries.


## 1. setup

In [None]:
# @title Install prerequisites and import python libraries { display-mode: "form" }
# Colab detection: importing `google.colab` only succeeds on Google Colab.
# When it does, rustc and cargo are installed through apt and GGAssembler is
# pip-installed straight from its GitHub repository.
try:
    import google.colab
    !apt install rustc cargo > /dev/null
    !pip install -q --no-warn-conflicts git+https://github.com/Fleishman-Lab/GGAssembler
except ImportError:
    # Not running on Colab: skip installation.
    # NOTE(review): presumably the packages are expected to be installed already - confirm.
    pass

import ast
import os

import networkx as nx
import pandas as pd
from typing import List
from Bio import SeqIO

from dawdlib.embl_utils.embl_maker import create_dc_features, create_path_features
from dawdlib.degenerate_dna.deg_table import TableColNames, generate_deg_csv
from dawdlib.degenerate_dna.utils import parse_degenerate_codon_csv
from dawdlib.dijkstra import colorful
from dawdlib.dijkstra.len_limit import all_shortest_paths
from dawdlib.gg_dc_combine.gg_dc_combine import dc_df_codon_list, gate_cdn_oligos
from dawdlib.golden_gate.find_gg import deg_table_to_dict
from dawdlib.golden_gate.gate_data import GGData
from dawdlib.golden_gate.graph_maker import (
    GraphMaker,
    build_custom_graph,
    create_default_valid_node_function,
    create_default_weight_func,
    make_default_graph,
)
from dawdlib.golden_gate.reaction_sim import ReactionSim, ReactionGraphWt
from dawdlib.golden_gate.utils import RequirementsFactory, expand_dna_var_poss, parse_dna, check_for_restriction_sites

In [None]:
# @title Define project parameters { display-mode: "form" }

# Name handed to the oligo-table builder when the oligos are generated.
PROJECT_NAME = 'example' #@param {type:"string"}

#@markdown ###Set DNA oligo restrictions
MIN_OLIGO_LENGTH = 4 # @param {type:"integer"}
MAX_OLIGO_LENGTH = 100 # @param {type:"integer"}
MIN_CONST_OLIGO_LENGTH= 15 # @param {type:"integer"}
# Range of gate counts tried by the colorful search; MAX_NUM_GATES is also the
# length cutoff of the shortest-path search.
MIN_NUM_GATES = 12 # @param {type:"integer"}
MAX_NUM_GATES = 26 # @param {type:"integer"}

#@markdown ###Base pair cost of a DNA segment devoid of diversity
CONST_COST = 40 # @param {type:"number"}

#@markdown ###Golden gate overhang and fidelity requirements
MIN_EFFICIENCY = 0.25 # @param {type:"number"}
#@markdown - The amount of trace mismatch allowed = 10 * (1 - MIN_FIDELITY)
MIN_FIDELITY = 0.1 # @param {type:"number"}

#@markdown ###Restriction enzyme settings:
RESTRICTION_ENZYME = "BsaI" # @param {type:"string"}
# Wrap the single enzyme name in a list; the list form is what is passed on to
# check_for_restriction_sites and ReactionSim.
RESTRICTION_ENZYME = [RESTRICTION_ENZYME]
#@markdown ###### Set overhanglength to 3 base pairs instead of 4 if you're using a restriction enzyme which produces an overhang of 3 bps such as SapI.
overhanglength = 4 # @param {type:"integer"}
#@markdown ###### Must include the enzyme's restriction pattern at the end
PREFIX = "GACATTGGTCTCA" # @param {type:"string"}
#@markdown ###### Must include the enzyme's restriction pattern at the beginning
SUFFIX = "TGAGACCAACGACGCCGTACTCTTTGTCAAC" # @param {type:"string"}

# Collect the form values into one keyword mapping, then build the
# requirements object that the rest of the notebook reads from.
_requirement_kwargs = {
    "min_oligo_length": MIN_OLIGO_LENGTH,
    "max_oligo_length": MAX_OLIGO_LENGTH,
    "min_const_oligo_length": MIN_CONST_OLIGO_LENGTH,
    "min_efficiency": MIN_EFFICIENCY,
    "min_fidelity": MIN_FIDELITY,
    "oligo_prefix": PREFIX,
    "oligo_suffix": SUFFIX,
    "const_cost": CONST_COST,
    "filter_gc_overhangs": False,
}
reqs = RequirementsFactory(**_requirement_kwargs)

# The golden gate data object takes its reaction conditions (temperature,
# hours) and its efficiency/fidelity thresholds from the requirements object.
_ggdata_kwargs = {
    "temperature": reqs.gg_temp,
    "hours": reqs.gg_hours,
    "min_efficiency": reqs.min_efficiency,
    "min_fidelity": reqs.min_fidelity,
}
ggdata = GGData(**_ggdata_kwargs)

In [None]:
# @title Load input data { display-mode: "form" }
# @markdown ####Upload files by clicking the folder icon on the left bar followed by clicking the upload icon.
# @markdown **Only after you've uploaded and set the file names run this cell.**

# @markdown - Upload rosetta style Resfile specifying Amino acid diversity
# @markdown and set the input file name in the following text-box.

# @markdown you can use the example data found here [example data](https://raw.githubusercontent.com/Fleishman-Lab/GGAssembler/master/example/input.resfile)
resfile = 'input.resfile' # @param {type:"string"}

# @markdown - Upload the WT DNA sequence file in fasta format
# @markdown and set the input file name in the following text-box

# @markdown you can use the example WT fasta found here [example data](https://raw.githubusercontent.com/Fleishman-Lab/GGAssembler/master/example/wt_dna.fasta)
wt_dna = 'wt_dna.fasta' # @param {type:"string"}

# Working directory holding both the uploaded inputs and every generated output.
W_PATH = "/content"

# Input files: use the names entered in the form above. The resfile name used
# to be hard-coded to "input.resfile", silently ignoring the `resfile` field.
resfile_path = os.path.join(W_PATH, resfile)
dna_path = os.path.join(W_PATH, wt_dna)

# Output files written by the later cells.
embl_path = os.path.join(W_PATH, "output.embl")
deg_table_path = os.path.join(W_PATH, "deg_table.csv")
chosen_path_path = os.path.join(W_PATH, 'chosen_path.csv')
order_table_path = os.path.join(W_PATH, "order_table.csv")
oligo_table_path = os.path.join(W_PATH, "oligos.csv")

# Upper-case the WT DNA once here; every later cell works on this value.
dna = parse_dna(dna_path).upper()

In [None]:
# @title View available ligation data { display-mode: "form" }
import dawdlib.golden_gate.resources as gg_resources

# Print the key of every entry in the package's ligation data mapping, one per line.
for table_name in gg_resources.ligation_data.keys():
    print(table_name)

In [None]:
# @title Use custom ligation data { display-mode: "form" }
# @markdown Enter a name from the dictionary printed above or upload a custom table and provide its name
ligation_data_table = 'FileS_T4_18h_37C.csv' # @param {type:"string"}

# Select the ligation table first, then initialise the golden gate data object.
# NOTE(review): presumably init() loads the table chosen by set_default_df - confirm.
ggdata.set_default_df(ligation_data_table)
ggdata.init()

In [None]:
# @title Restriction enzyme verification { display-mode: "form" }
# @markdown Verifies the used restriction enzyme's restriction site doesn't appear in WT DNA sequence
sites = check_for_restriction_sites(dna, RESTRICTION_ENZYME)
# sites[0] is the pass/fail flag; sites[1] and sites[2] are only read to
# build the failure message.
if not sites[0]:
    raise AssertionError(
        f'Restriction enzyme {sites[1]} recognition site were found at positions {sites[2]} in the dna.'
    )

## 2. Degenerate codons

In [None]:
# @title ### 2.1 Generate degenerate codon table { display-mode: "form" }
# Write the degenerate codon table to disk, then read it back as a DataFrame.
generate_deg_csv(resfile_path, csv_filename=deg_table_path)
deg_table = pd.read_csv(
    deg_table_path,
    na_filter=True,
    keep_default_na=False,
)

# Each ENCODED_COUNT cell is parsed as a Python literal and summed; the
# product of those per-row sums is the total number of encoded variants.
_parsed_counts = deg_table["ENCODED_COUNT"].apply(ast.literal_eval)
_row_totals = _parsed_counts.apply(sum)
encoded_diversity = _row_totals.prod()
print(f'The encoded diversity has {encoded_diversity} variants.')

In [None]:
# @title ### 2.2 View degenerate codon table { display-mode: "form" }
# Bare expression: the notebook renders the DataFrame as this cell's output.
deg_table

## 3. Find golden gates

In [None]:
# @title ### 3.1 Create a graph { display-mode: "form" }
# @markdown **Either use the default, or custom blocks.**
use_default = True # @param {type:"boolean"}

# Both graph flavours need a graph maker and the expanded variable positions,
# so build them once instead of once per branch.
gm = GraphMaker(ggdata)
var_poss = expand_dna_var_poss(deg_table[TableColNames.DNA_POS.value].tolist())

if use_default:
    graph, src, target = make_default_graph(
        gm, dna, var_poss, deg_table_to_dict(deg_table), reqs, overhanglength
    )
else:
    # The prefix and suffix are part of every ordered oligo, so they are
    # subtracted from the length budget available to the gene segment.
    is_valid_edge = gm.create_default_valid_edge_func(
        dna_var_poss=var_poss,
        min_oligo_length=MIN_OLIGO_LENGTH,
        max_oligo_length=MAX_OLIGO_LENGTH - len(PREFIX) - len(SUFFIX),
        min_const_oligo_length=MIN_CONST_OLIGO_LENGTH,
        min_fidelity=MIN_FIDELITY,
    )

    # Build the base weight function once. It used to be re-created (together
    # with the codon dictionary) on every single cost_func call.
    _base_weight = create_default_weight_func(
        dna_pos_n_codons=deg_table_to_dict(deg_table),
        oligo_addition=0,
        const_cost=0,
    )
    _suffix_len = len(SUFFIX)
    _prefix_len = len(PREFIX)


    def cost_func(nd1, nd2):
        """Return the default weight between two nodes plus the suffix and prefix lengths."""
        return _base_weight(nd1, nd2) + _suffix_len + _prefix_len


    acceptable_fcws = ggdata.filter_self_binding_gates(filter_gc=True)
    is_valid_node = create_default_valid_node_function(acceptable_fcws, var_poss)

    graph, src, target = build_custom_graph(
        dna, is_valid_node, is_valid_edge, cost_func
    )

In [None]:
# @title ### 3.2 Find shortest paths { display-mode: "form" }
from collections import defaultdict
from itertools import islice

shortest_paths = all_shortest_paths(
    graph, src, target, weight="weight", len_cutoff=MAX_NUM_GATES
)
# Both mappings are keyed by the number of gates in the path (source and
# target nodes excluded).
#   best_paths entries: (path, cost, fidelity, enumeration index)
#   bad_paths entries:  (path, fidelity), or (path, None) when the fidelity
#                       could not be computed
best_paths = defaultdict(list)
bad_paths = defaultdict(list)
max_shortest_paths = int(1e4)
try:
    # islice caps the search at exactly max_shortest_paths paths. The previous
    # `i > max_shortest_paths` check let one extra path through and pulled yet
    # another one from the generator before breaking.
    for i, (pth, cost) in enumerate(islice(shortest_paths, max_shortest_paths)):
        rpth = [p for p in pth if not p.src_or_target]
        overhangs = [a.bps for a in rpth]
        try:
            reaction_fidelities = ggdata.reaction_fidelity(*overhangs)
        except ValueError:
            # Fidelity could not be computed for this overhang set.
            bad_paths[len(rpth)].append((pth, None))
            continue
        neb_fidelity = reaction_fidelities[0]
        if neb_fidelity > MIN_FIDELITY:
            best_paths[len(rpth)].append((pth, cost, neb_fidelity, i))
        else:
            bad_paths[len(rpth)].append((pth, neb_fidelity))
except nx.NetworkXNoPath:
    print(f"No path was found between {src} and {target}")


In [None]:
# @title #### 3.2.1 View found shortest paths { display-mode: "form" }
# One line per stored path. The ID is "<gate count>-<position in that list>",
# the format the path-selection cell expects.
for n_gates, candidates in best_paths.items():
    for position, entry in enumerate(candidates):
        nodes, cost, fidelity = entry[0], entry[1], entry[2]
        n_overhangs = sum(1 for node in nodes if not node.src_or_target)
        print(f"Path ID: {n_gates}-{position}. Number of overhangs: {n_overhangs}. Cost: {cost}. Fidelity: {fidelity}")

In [None]:
# @title #### 3.2.2 Find __*colorful*__ shortest paths { display-mode: "form" }
colorful_retries = 100 # @param {type:"integer"}
spf = colorful.ShortestPathFinder(graph, ggdata, src, target)
# For every allowed gate count, query the finder `colorful_retries` times and
# keep each returned path whose fidelity reaches MIN_FIDELITY.
# NOTE(review): the retries suggest find_shortest_path is randomised - confirm.
for max_gates in range(MIN_NUM_GATES, MAX_NUM_GATES+1):
    for i in range(colorful_retries):
        pth = spf.find_shortest_path(len_cutoff=max_gates, no_colors=max_gates+1)
        if not pth:
            continue
        rpth = [p for p in pth if not p.src_or_target]
        overhangs = [a.bps for a in rpth]
        try:
            reaction_fidelities = ggdata.reaction_fidelity(*overhangs)
        except ValueError:
            # The shortest-path cell (3.2) already tolerates overhang sets whose
            # fidelity cannot be computed; skip them here too instead of
            # aborting the whole search.
            continue
        neb_fidelity = reaction_fidelities[0]
        if neb_fidelity < MIN_FIDELITY:
            continue
        # Path cost = sum of the edge weights along consecutive node pairs.
        cost = sum(
            (graph.edges[n1, n2]["weight"] for n1, n2 in zip(pth[:-1], pth[1:]))
        )
        # Same tuple layout as cell 3.2; the last item here is the retry index.
        best_paths[len(rpth)].append((pth, cost, neb_fidelity, i))

In [None]:
# @title ##### 3.2.2.2 View found *colorful* paths { display-mode: "form" }
# Print every path currently stored in best_paths, grouped by gate count.
for gate_count, stored_paths in best_paths.items():
    for slot, record in enumerate(stored_paths):
        inner_nodes = [node for node in record[0] if not node.src_or_target]
        overhang_total = len(inner_nodes)
        path_cost = record[1]
        path_fidelity = record[2]
        print(f"Path ID: {gate_count}-{slot}. Number of overhangs: {overhang_total}. Cost: {path_cost}. Fidelity: {path_fidelity}")

In [None]:
# @title ### 3.3 choose whichever solution you prefer by it's ID. { display-mode: "form" }
# @markdown Example: Path ID: '15-14' translates to best_paths[15][14].
chosen_path_id = '15-14' #@param {type:"string"}

# The ID is "<gate count>-<index>": split it once and convert both halves.
_id_fields = chosen_path_id.strip().split('-')
chosen_path_length = int(_id_fields[0])
chosen_path_index = int(_id_fields[1])

# Each best_paths entry is a tuple whose first item is the path itself.
chosen_entry = best_paths[chosen_path_length][chosen_path_index]
chosen_path = chosen_entry[0]

In [None]:
# @title #### 3.3.1 View chosen path { display-mode: "form" }
# Bare expression: the notebook displays the selected path as this cell's output.
chosen_path

In [None]:
# @title #### 3.3.2 save chosen path to csv { display-mode: "form" }
# Column names are taken from the annotated fields of the first path node.
_node_fields = chosen_path[0].__annotations__.keys()
path_df = pd.DataFrame.from_records(chosen_path, columns=_node_fields)
path_df.to_csv(chosen_path_path)

## 4. Create embl feature view

In [None]:
# @title ### 4.1 Prepare degenerate codon and gates features { display-mode: "form" }
# Re-parse the degenerate codon CSV from disk and derive features from it.
deg_parsed_df = parse_degenerate_codon_csv(deg_table_path)
seq_features = create_dc_features(deg_parsed_df)
# Features derived from the chosen path.
pth_features = create_path_features(chosen_path)

In [None]:
# @title ### 4.2 Save embl file with all features { display-mode: "form" }
# Start from the WT fasta record and tag it as DNA before writing it as EMBL.
seq_rec = SeqIO.read(dna_path, format="fasta")
seq_rec.annotations["molecule_type"] = "DNA"

# Attach the degenerate-codon features first, then the path features.
for _feature_group in (seq_features, pth_features):
    seq_rec.features.extend(_feature_group)

SeqIO.write(seq_rec, embl_path, "embl")

In [None]:
# @title ## 5. Create oligo table { display-mode: "form" }
# Build the oligo table from the chosen path, the degenerate codon table, the
# WT DNA and the configured prefix/suffix, then save it as CSV.
oligo_df = gate_cdn_oligos(chosen_path, dc_df_codon_list(deg_table), dna, reqs.oligo_prefix, reqs.oligo_suffix, PROJECT_NAME)
oligo_df.to_csv(oligo_table_path)

## 6. Verify golden gate reaction

In [None]:
# @title ### 6.1 Create golden gate simulator and load oligo table { display-mode: "form" }
rs = ReactionSim(ggdata, reqs, RESTRICTION_ENZYME)
# Load the oligo table from the same path it was written to, instead of
# rebuilding that path by hand.
res = rs.create_reaction_graph(oligo_table_path)
# A non-None result reports a problem: res[0] is a message and res[1] is the
# offending oligo entry.
if res is not None:
    msg = res[0]
    oligo_entry = res[1]
    # RESTRICTION_ENZYME is a list of enzyme names; report the configured
    # enzyme(s) rather than a hard-coded "BsaI".
    enzyme_names = ', '.join(RESTRICTION_ENZYME)
    print(msg)
    print(f'The choice of degenerate codons {oligo_entry.oligo_codons} in oligo named "{oligo_entry.name}" created a new enzyme restriction site!')
    # deg_table.csv is generated by box 2.1 (box 2.2 only displays it), so the
    # instructions point at box 2.1.
    print(f'''This error must be resolved manually!
Open the file “deg_table.csv” that was created by box 2.1 and find the relevant segment by the name given above.
Try to edit the selected codons to eliminate the creation of the {enzyme_names} site.
Then, comment out the generate_deg_csv(...) line in box 2.1 (which created deg_table.csv) and rerun the notebook from box 2.1.
The notebook will use your edited file without the enzyme restriction site.''')


In [None]:
# @title ### 6.2 Check for WT sequence { display-mode: "form" }
# The simulation must reconstruct exactly one WT product, identical to the input DNA.
reaction_wts: List[ReactionGraphWt] = list(rs.get_wt_dna())
# The message needs the f-prefix and the `reaction_wts` list. It used to print
# the raw placeholder and named `reaction_wt`, which is not defined yet here.
assert 1 == len(reaction_wts), f"Error: {len(reaction_wts)} WT DNA sequences found! expected 1!!"
reaction_wt = reaction_wts[0]
assert reaction_wt.dna == dna, "Error: reaction DNA doesn't match input DNA!!!"

In [None]:
# @title ### 6.3 Verify all golden gate products { display-mode: "form" }
# @markdown **Checks that all products are constructed correctly and have the same length and gates as WT**

# @markdown  * Note: This might take a while!
# Verify every product against the WT product's length, segment count and fidelity sum.
result = rs.verify_reaction(reaction_wt.end - reaction_wt.start, reaction_wt.no_segments, reaction_wt.fidelity_sum)
# result[0] is the pass/fail flag. result[1] is the product count on success
# and the failing product on failure.
if result[0]:
    # The simulated product count must match the diversity computed from the
    # degenerate codon table. (The closing parenthesis of the second line was
    # missing from this message.)
    msg = '\n'.join([f'The diversity of the degenerate table ({encoded_diversity}) differs from the one found by the simulation {result[1]}',
           'Do not continue or use the product of this run! (unless you know exactly what you\'re doing)',
           'Either the golden gate reaction failed, a restriction site appeared or some of the diversity disappeared somewhere!'])
    assert result[1] == encoded_diversity, msg
    print(f"Golden gate simulation passed! the number of different products is {result[1]}")
else:
    print("Verifying golden gate reaction failed!!!")
    print("The following product failed verification:\n")
    print(result[1])

In [None]:
# @title ## 7. Write order table { display-mode: "form" }
# @markdown #### Write constant segments?
output_const = True # @param {type:"boolean"}
# @markdown #### write WT segments?
output_wt = True # @param {type:"boolean"}

# `column <= flag`: with the flag True every row passes; with it False only
# rows whose column value is False pass (assuming boolean columns).
_wt_mask = oligo_df.wt <= output_wt
_const_mask = oligo_df.const <= output_const
_order_rows = oligo_df[_wt_mask & _const_mask]
_order_rows[['name', 'full_oligo_dna']].to_csv(order_table_path)

In [None]:
# @title ## 8. Download results { display-mode: "form" }
# Remove the /content/sample_data directory so it is not picked up by the archive.
!rm -r /content/sample_data/
# Archive the entries matched by `*` (no recursion into directories).
# NOTE(review): `*` is expanded by the shell in the current working directory,
# which is presumably /content on Colab - confirm.
!tar -czf /content/results.tar.gz --no-recursion -C /content *
# `google.colab` is only bound when the import in the setup cell succeeded,
# so this line works on Colab only.
google.colab.files.download('/content/results.tar.gz')