# DNA assembly project: ____

This notebook template shows a standard DNA assembly design workflow using Edinburgh Genome Foundry (EGF) software. Each section begins with a cell of parameters that must be defined before running the rest of that section.

## Obtain parts

In [None]:
# List of all and new parts in assembly plan:
f1 = open('all_parts.txt', 'r')
f2 = open('new_parts.txt', 'r')

In [None]:
all_parts = f1.read().splitlines()
new_parts = f2.read().splitlines()
f1.close()
f2.close()

In [None]:
# Names listed in new_parts that do not occur in all_parts.
difference = set(new_parts).difference(all_parts)
if not difference:
    # Every new part also appears in the full list: show the parts that are
    # in all_parts but not in new_parts.
    print("All new parts are accounted for in the assembly. Retrieving these remaining parts from repository:")
    print(set(all_parts).difference(new_parts))
else:
    # At least one new part is missing from the full list.
    print("Some parts are not in the plan:")
    print(difference)

In [None]:
# TODO: alternatively, retrieve the remaining parts from the repository via its API

---
## Domestication of new parts

In [None]:
# Parameters for the domestication section (fill these in before running it).
# Directory containing genbanks; the records to domesticate are loaded from this folder
dir_to_domesticate = ""
# Path to CSV of GoldenGateDomesticator spreadsheet, used to build the domestication standard
GGdomesticator_spreadsheet = ""
# Output path: passed as the target of the batch domestication; "order_ids.csv" and
# the "domesticated_genbanks" folder are later read from this location
domestication_target = ""

In [None]:
import os
import genedom
import dnacauldron as dc
# import proglog
# proglog.notebook()

In [None]:
# Load every record found in the input folder, using file names as record IDs.
records_to_domesticate = dc.biotools.load_records_from_files(
    folder=dir_to_domesticate,
    use_file_names_as_ids=True,
)

# Build the domestication standard from the spreadsheet.
EMMA_PLUS = genedom.GoldenGateDomesticator.standard_from_spreadsheet(
    GGdomesticator_spreadsheet
)

# Domesticate all loaded records with that standard, writing to the target path.
genedom.batch_domestication(
    records=records_to_domesticate,
    standard=EMMA_PLUS,
    target=domestication_target,
)

In [None]:
# Check if any names were truncated: load the order-ID table found in the
# domestication output folder (compared row by row in the next cell).
import pandas as pd
order_ids_path = os.path.join(domestication_target, "order_ids.csv")
order_ids = pd.read_csv(order_ids_path)

In [None]:
# Report every row whose "order_id" differs from its "sequence" name.
any_truncated = False
for _index, row in order_ids.iterrows():
    if row["sequence"] != row["order_id"]:
        any_truncated = True
        # Convert each cell with str() before joining: str.join raises
        # TypeError on non-string values (e.g. NaN from an empty cell, or a
        # name that pandas parsed as a number).
        print("Truncated name:", " --> ".join(map(str, row)))
if not any_truncated:
    print("Part names were not truncated")

---

## Check overhangs

In [None]:
import os
import overhang as oh

In [None]:
# Parameters for the overhang checks (fill these in before running this section).
# Used as the overhang set name and embedded in every report file name below
projectname = "Project_name"
# Directory in which the PDF report and PNG figures are saved
report_dir = ""
# Overhangs to evaluate; passed to the overhang set and converted to kappagate slots
overhangs = ["TAGG", "ACGA"]
# Enzyme passed to the overhang set
enzyme = "Esp3I"
# Second element of the annealing_data tuple given to the interaction plot.
# NOTE(review): presumably this should match `enzyme` above -- confirm.
kappagate_dataset = "2020_01h_Esp3I"  # or 2020_01h_BsaI

In [None]:
# Build the overhang set for this project and write its PDF report.
overhangset = oh.OverhangSet(overhangs=overhangs, name=projectname, enzyme=enzyme)
overhang_report_path = os.path.join(report_dir, "overhang_report_" + projectname + ".pdf")
oh.write_overhangset_report(overhang_report_path, overhangset)
# The following cells produce the Tatapov plot (37 Celsius, 1 hour):

In [None]:
from kappagate import overhangs_list_to_slots, plot_circular_interactions, predict_assembly_accuracy, plot_colony_picking_graph, success_rate_facts

In [None]:
# Convert the overhang list to slots, draw the circular interaction plot
# with the 37C annealing data, and save the figure next to the other reports.
slots = overhangs_list_to_slots(overhangs)
annealing_data = ('37C', kappagate_dataset)
ax = plot_circular_interactions(slots, annealing_data=annealing_data, rate_limit=200)
interactions_png = os.path.join(report_dir, "interactions_" + projectname + ".png")
ax.figure.savefig(interactions_png, bbox_inches='tight')

In [None]:
# Predict the assembly accuracy for the slots, plot the colony-picking graph
# for that rate, and save it alongside the other reports.
# NOTE(review): only `slots` is passed here, so the library's default
# annealing settings apply -- confirm they match the 37C / kappagate_dataset
# settings used for the interaction plot.
predicted_rate, _, _ = predict_assembly_accuracy(slots)
ax = plot_colony_picking_graph(success_rate=predicted_rate)
# File name uses the same "<prefix>_<projectname>" pattern as the overhang
# report and the interaction plot (the underscore separator was missing).
ax.figure.savefig(os.path.join(report_dir, "success_rate_facts_" + projectname + ".png"), bbox_inches='tight')

print(success_rate_facts(predicted_rate, plain_text=True))

---
## Cloning simulation

In [None]:
# Parameters for the cloning simulation (fill these in before running this section).
# Dir of domesticated sequences: the "domesticated_genbanks" subfolder of the
# domestication output path defined in the domestication section
dir_domesticated =  os.path.join(domestication_target, "domesticated_genbanks")
# Dir of available parts; imported into the sequence repository together with
# the domesticated sequences
dir_available_parts = ""
# Assembly plan: name given to the plan and path of the spreadsheet it is read from
assembly_plan_name = "Assembly_plan"
assembly_plan_path = "assembly_plan.csv"
########################################
# Location where the simulation report is written
simulation_target_path = "predicted_simulation"

# NOTE(review): backbone_first is not referenced by any other cell in this
# notebook -- confirm whether it is still needed.
backbone_first = True
# ID of the repository record that gets flagged as the backbone
backbone_name = "HC_Amp_ccdB"

In [None]:
import dnacauldron as dc

# Collect the domesticated parts and the already-available parts in one
# repository; both folders are imported as circular records whose IDs are
# the file names.
repository = dc.SequenceRepository()
for parts_folder in (dir_domesticated, dir_available_parts):
    repository.import_records(
        folder=parts_folder,
        use_file_names_as_ids=True,
        topology="circular",
    )

In [None]:
# Flag the record named by backbone_name as the backbone.
backbone_record = repository.get_record(backbone_name)
backbone_record.is_backbone = True

In [None]:
# Read the assembly plan from the spreadsheet; every assembly in it is
# treated as a Type2sRestrictionAssembly.
assembly_plan = dc.AssemblyPlan.from_spreadsheet(
    name=assembly_plan_name,
    path=assembly_plan_path,
    assembly_class=dc.Type2sRestrictionAssembly
)

In [None]:
# Simulate the plan against the sequence repository and print the summary
# statistics computed from the simulation.
simulation = assembly_plan.simulate(sequence_repository=repository)
stats = simulation.compute_stats()
print(stats)

In [None]:
# Write the simulation report to simulation_target_path, with assembly plots,
# mix graphs and the PDF report all enabled.
report_writer = dc.AssemblyReportWriter(
    include_assembly_plots=True,
    include_mix_graphs=True,
    include_pdf_report=True
)
simulation.write_report(simulation_target_path, assembly_report_writer=report_writer)

---
## Calculate total length of DNA domesticated (bp)

In [None]:
# Load the records from the domesticated-sequences folder, using file names as IDs.
parts_to_order = dc.biotools.load_records_from_files(folder=dir_domesticated, use_file_names_as_ids=True)

In [None]:
# Add up the sequence lengths of all loaded parts and print the total.
total_length = sum(len(record.seq) for record in parts_to_order)
print(total_length)