In [26]:
# Import Modules and Packages for Library Generation

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import Bio.Data.CodonTable as CodonTable
from itertools import product
from dnachisel import *

# Define Functions

def csv_to_fasta(csv_file, fasta_file, seq_column):
    """Write one FASTA record per row of a CSV file.

    Args:
        csv_file: Path of the CSV file, read with ``pd.read_csv``.
        fasta_file: Path of the FASTA file to write (opened in 'w' mode, so
            an existing file is overwritten).
        seq_column: Name of the CSV column whose value becomes the sequence
            line of each record.

    Raises:
        KeyError: If ``seq_column`` is not a column of the CSV. This is raised
            before the output file is opened.

    The header line of each record is the value found in the first CSV column.
    """
    df = pd.read_csv(csv_file)

    # Pick the header column by position explicitly. Indexing a label-indexed
    # row with an integer key (``row[0]``) is a deprecated positional fallback.
    headers = df.iloc[:, 0]
    sequences = df[seq_column]

    with open(fasta_file, 'w') as f:
        for header, sequence in zip(headers, sequences):
            f.write(f">{header}\n{sequence}\n")

def initial_reverse_translate(protein_seq):
    """Back-translate a protein sequence into a first-pass DNA sequence.

    Each residue is replaced by the first codon that the standard DNA codon
    table lists for it (in the table's iteration order). No codon-usage
    weighting is applied here.

    Args:
        protein_seq: Iterable of one-letter amino-acid codes.

    Returns:
        The concatenated codons as a string ("" for an empty input).

    Raises:
        ValueError: If a residue has no entry in the forward codon table
            (the table is keyed by the exact characters it defines, so any
            other symbol is rejected).
    """
    codon_table = CodonTable.standard_dna_table.forward_table

    # Build the amino-acid -> first-codon lookup once instead of rescanning the
    # whole codon table for every residue. ``setdefault`` keeps the first codon
    # encountered, matching a "first match in table order" selection.
    first_codon_for = {}
    for codon, amino_acid in codon_table.items():
        first_codon_for.setdefault(amino_acid, codon)

    codons = []
    for aa in protein_seq:
        if aa not in first_codon_for:
            raise ValueError(f"No codon found for amino acid: {aa}")
        codons.append(first_codon_for[aa])

    # Join once at the end rather than growing a string inside the loop.
    return "".join(codons)

def _enzyme_site_pattern(entry):
    """Return ``entry`` in the notation DnaChisel documents for enzyme sites.

    A string shaped like a restriction-enzyme name (one capital letter, two
    lowercase letters, then only letters/digits, e.g. "BbsI") gets the
    "_site" suffix. Anything else (plain DNA such as "GAAGAC", strings that
    already end in "_site", other pattern notations, non-strings) is
    returned unchanged.
    """
    looks_like_enzyme_name = (
        isinstance(entry, str)
        and len(entry) >= 4
        and entry.isalnum()
        and entry[0].isupper()
        and entry[1:3].islower()
    )
    return f"{entry}_site" if looks_like_enzyme_name else entry


def standard_dna_chisel(dna_seq, restriction_enzymes_to_silence):
    """Optimize a coding DNA sequence with DnaChisel and return the result.

    Constraints applied: hairpin avoidance (stem 20, window 200), GC content
    between 30% and 70% in 50-nt windows, preservation of the translation of
    the whole sequence, and avoidance of every entry in
    ``restriction_enzymes_to_silence``. The objective is codon optimization
    for 'e_coli' with the "match_codon_usage" method over the whole sequence.

    Args:
        dna_seq: Coding DNA sequence; the whole string is treated as one ORF.
        restriction_enzymes_to_silence: Iterable of enzyme names (e.g. "BbsI")
            or of patterns already in a notation ``AvoidPattern`` accepts.

    Returns:
        The optimized sequence (``problem.sequence``).
    """
    coding_span = (0, len(dna_seq))

    constraints = [
        AvoidHairpins(stem_size=20, hairpin_window=200),
        EnforceGCContent(mini=0.30, maxi=0.70, window=50),
        EnforceTranslation(location=coding_span)
    ]

    # DnaChisel's documented notation for an enzyme recognition site is
    # "<EnzymeName>_site"; convert bare enzyme names so they are not read as
    # some other kind of pattern.
    for pattern in restriction_enzymes_to_silence:
        constraints.append(AvoidPattern(_enzyme_site_pattern(pattern)))

    problem = DnaOptimizationProblem(
        sequence=dna_seq,
        constraints=constraints,
        objectives=[
            CodonOptimize(species='e_coli', method="match_codon_usage", location=coding_span)
        ]
    )

    # Must be set before solving: assigning it after resolve_constraints() /
    # optimize() have already run cannot influence them.
    problem.max_random_iters = 5000

    problem.resolve_constraints()
    problem.optimize()

    return problem.sequence

def barcode_builder(barcode_length, restriction_enzymes, restriction_enzymes_to_silence):
    """Enumerate all barcodes of a given length that contain no silenced site.

    Args:
        barcode_length: Number of nucleotides per barcode; 4**barcode_length
            candidates are enumerated.
        restriction_enzymes: Mapping of enzyme name -> list of recognition
            site strings.
        restriction_enzymes_to_silence: Enzyme names whose sites must not
            occur in any barcode. Names missing from ``restriction_enzymes``
            contribute no sites.

    Returns:
        List of barcode strings in ``itertools.product`` order over
        A, T, C, G, excluding every barcode that contains a silenced site.

    Note:
        Only the barcode itself is checked; sites created at the junction
        between a barcode and its flanking sequence are not detected here.
    """
    forbidden_sites = set()
    for enzyme in restriction_enzymes_to_silence:
        forbidden_sites.update(
            site.upper() for site in restriction_enzymes.get(enzyme, [])
        )
    # An empty string is a substring of everything and would reject all barcodes.
    forbidden_sites.discard("")

    # Substring test rather than equality: a barcode longer than a site can
    # contain it, and equality never matches when the lengths differ.
    return [
        barcode
        for barcode in map(''.join, product('ATCG', repeat=barcode_length))
        if not any(site in barcode for site in forbidden_sites)
    ]

def read_fasta_as_list(fasta_file):
    """Return every sequence of a FASTA file as a plain string, in file order.

    Args:
        fasta_file: FASTA source handed to ``SeqIO.parse``.

    Returns:
        List of sequence strings; record identifiers are discarded.
    """
    collected = []
    for entry in SeqIO.parse(fasta_file, "fasta"):
        collected.append(str(entry.seq))
    return collected

def gc_content(sequence):
    """Return the percentage of G and C characters in a sequence.

    The sequence is upper-cased before counting, so the result is
    case-insensitive. An empty sequence yields 0.0.

    Args:
        sequence: Nucleotide string.

    Returns:
        GC percentage as a float between 0 and 100.
    """
    normalized = sequence.upper()

    # Guard against division by zero on empty input.
    if not normalized:
        return 0.0

    gc_total = sum(normalized.count(base) for base in "GC")
    return (gc_total / len(normalized)) * 100

def retranslate(dna_seq, start_pos):
    """Translate a DNA string from ``start_pos`` up to the first stop codon.

    Args:
        dna_seq: DNA sequence; it is upper-cased before translation.
        start_pos: Index of the first base of the reading frame.

    Returns:
        The translated protein as a string, without the stop symbol
        (``to_stop=True``).
    """
    reading_frame = Seq(dna_seq.upper()[start_pos:])
    translated = reading_frame.translate(to_stop=True)
    return str(translated)

def write_sequences_to_fasta(sequences, output_file, header_prefix="Seq"):
    """Write sequences to a FASTA file with numbered headers.

    Args:
        sequences: Iterable of sequence strings.
        output_file: Path of the FASTA file to write (opened in "w" mode).
        header_prefix: Prefix of each record id; ids are
            "<header_prefix>_<n>" with n starting at 1.
    """
    records = [
        SeqRecord(Seq(seq), id=f"{header_prefix}_{number}", description="")
        for number, seq in enumerate(sequences, start=1)
    ]

    with open(output_file, "w") as handle:
        SeqIO.write(records, handle, "fasta")

# Define useful variables

# Recognition sites per supported enzyme. Each list holds the site followed by
# its reverse complement, so both strands can be checked with plain string search.
restriction_enzymes = dict(
    BsmBI=["CGTCTC", "GAGACG"],
    BsaI=["GGTCTC", "GAGACC"],
    BbsI=["GAAGAC", "GTCTTC"],
    SapI=["GCTCTTC", "GAAGAGC"],
    PaqCI=["CACCTGC", "GCAGGTG"],
)

In [27]:
# Define Pertinent Variables

### Directory that holds the input designs and receives every output file.
### Change this one value to relocate all of the paths defined below.
data_dir = "/Users/asmiley/Downloads"

### Filepath of the CSV full of designs -- designs should be AA sequences
designs_csv_path = f"{data_dir}/design_sequences_tier1.csv"

### Filepath and filename for the .FASTA version of the CSV that will be made
designs_fasta_path = f"{data_dir}/design_sequences_tier1.fasta"

### Define the length of a barcode to append to your sequence (4^N = Barcode Diversity)
barcode_length = 6

### Define the restriction enzymes you want to silence in your library as a list of strings
### Supported restriction enzymes -> "BsmBI", "BsaI", "BbsI", "SapI", and "PaqCI"
restriction_enzymes_to_silence = ["BbsI", "PaqCI"]

### Define your favorite stop codon
stop_codon = "TAA"

### Define filepaths for output files
output_csv = f"{data_dir}/test_output.csv"
output_csv_sequences_only = f"{data_dir}/test_output_sequences_only.csv"
output_fasta_sequences_only = f"{data_dir}/test_output_sequences_only.fasta"

### Define any consistent sequences you want to be common to all DNA sequences
### See the final cell on how to implement this
five_prime_constant = "TTAGTATATTAGTTAAGTATAAGAAGGAGATATACATGATAGAAGCACCAGATGTTAAACCTTGGCTATTCTTG"
three_prime_constant = ""
internal_primer_binding_site = ""
five_prime_primer_binding_site = ""
three_prime_primer_binding_site = ""

### Define any golden gate sites that you want to implement on your sequences
### Do NOT include these if you are using OMEGA oligos
### These should include 5 nt of spacer sequence, the cleavage motif, and the overhang
five_prime_golden_gate = ""
three_prime_golden_gate = ""

In [None]:
# DNA Chisel

### Number of leading residues dropped from every design before optimization,
### so that only the remaining section is reverse-translated and optimized.
### Set to 0 to optimize each design in full.
variable_region_start = 30

### Converts Design CSV to .FASTA
csv_to_fasta(designs_csv_path, designs_fasta_path, seq_column='design_sequence')

# Read in .FASTA as a list for optimization and restrict it to the variable section if necessary
list_of_designs = read_fasta_as_list(designs_fasta_path)
pertinent_sections = [sequence[variable_region_start:] for sequence in list_of_designs]

# Make barcodes, ensuring the removal of any desired restriction sites
barcodes = barcode_builder(barcode_length, restriction_enzymes, restriction_enzymes_to_silence)

# DNA chisel on the sequences of interest: each section is first reverse-translated
# into a naive DNA sequence, which DnaChisel then optimizes.
optimized_sequences = [
    standard_dna_chisel(initial_reverse_translate(sequence), restriction_enzymes_to_silence)
    for sequence in pertinent_sections
]
print("DNA Chisel Complete")

In [29]:
# File Export

columns = ['De Novo AA Sequence', 'Re-Translation of DNA Sequence', 'DNA Sequence', 'Barcode Sequence', "Length (bp)", "% GC Content"]

### Index, inside five_prime_constant, of the first base to translate when
### re-translating the assembled sequence. With the five_prime_constant defined
### in the variables cell this is the "ATG" at index 35.
### Update this value whenever five_prime_constant changes.
orf_start_in_five_prime_constant = 35

# Every design consumes one barcode; fail with a clear message instead of an
# IndexError part-way through the loop.
if len(optimized_sequences) > len(barcodes):
    raise ValueError(
        f"{len(optimized_sequences)} designs but only {len(barcodes)} barcodes available; "
        "increase barcode_length"
    )

# Collect the rows first and build the DataFrame once, rather than enlarging it
# one row at a time.
rows = []
for sequence_idx, optimized_sequence in enumerate(optimized_sequences):
    barcode = barcodes[sequence_idx]

    # This is where you define your overall sequences with constant sections
    optimized_full_length_sequence = five_prime_golden_gate + barcode + five_prime_constant + optimized_sequence + stop_codon + three_prime_golden_gate

    # Everything placed 5' of five_prime_constant shifts the reading frame start,
    # so derive the offset from those parts instead of hard-coding it
    # (0 + 6 + 35 = 41 with an empty golden gate site and a 6-nt barcode).
    translation_start = len(five_prime_golden_gate) + len(barcode) + orf_start_in_five_prime_constant

    rows.append([
        pertinent_sections[sequence_idx],
        retranslate(optimized_full_length_sequence, translation_start),
        optimized_full_length_sequence,
        barcode,
        len(optimized_full_length_sequence),
        gc_content(optimized_full_length_sequence),
    ])

df = pd.DataFrame(rows, columns=columns)

df.to_csv(output_csv)

seqs_only = df[['DNA Sequence']]

seqs_only.to_csv(output_csv_sequences_only)

seqs_only_list = df['DNA Sequence'].tolist()

write_sequences_to_fasta(seqs_only_list, output_fasta_sequences_only)