# Gen_Clonr
### For Cloning Genes using Molecular Biology
### Last Updated: 05/21/2024
### Author: Frank Escalante

In [1]:
!pip install Biopython
import Bio
from Bio import SeqIO
import os
import numpy as np
import pandas as pd
import random



DEPRECATION: pandas 0.23.4 has a non-standard dependency specifier pytz>=2011k. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pandas or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


### Golden Gate Functions for DNA assembly Design

In [2]:
def re_enzyme_genesis(enz_dict, enzyme_name, rec_seq, cut_pattern):
    """Register a restriction enzyme in an already existing dictionary.

    The enzyme is stored under the key ``enzyme_name`` as the 3-tuple
    ``(recognition sequence as a Bio.Seq.Seq, length of that sequence,
    cut_pattern)``. ``rec_seq`` is expected to be written 5' --> 3'.

    ``enz_dict`` is modified in place and is also returned.
    """
    recognition_site = Bio.Seq.Seq(rec_seq)
    entry = (recognition_site, len(recognition_site), cut_pattern)
    enz_dict[enzyme_name] = entry
    return enz_dict

def cut_site_chckr(ez_dict, seq):
    """Check a sequence for internal recognition sites of GoldenGate enzymes.

    Inputs:
        ez_dict -- dictionary mapping an enzyme name to a tuple whose first
                   element is the recognition sequence (5' --> 3').
        seq     -- the sequence to check; anything with a ``find`` method
                   that accepts a string (a ``str`` works).

    Returns a list with one tuple per enzyme, in dictionary order:
        (True, enzyme, index)  -- a recognition site IS present (potential
                                  problem for assembly). ``index`` is the
                                  0-based position of the first match on the
                                  strand as given; if the site only occurs in
                                  the reverse orientation, it is the position
                                  where its reverse complement starts.
        (False, enzyme, None)  -- no recognition site on either strand.
    """
    # Only A/C/G/T (either case) are complemented; any other symbol in a
    # recognition sequence is left unchanged.
    complement = str.maketrans('ACGTacgt', 'TGCAtgca')
    internal_cut_sites = []
    for enzyme in ez_dict:
        site = str(ez_dict[enzyme][0])
        site_locus = seq.find(site)
        if site_locus == -1:
            # Type IIS sites are not palindromic, so a site sitting on the
            # opposite strand appears here as its reverse complement.
            site_locus = seq.find(site.translate(complement)[::-1])
        if site_locus != -1:
            internal_cut_sites.append((True, enzyme, site_locus))
        else:
            internal_cut_sites.append((False, enzyme, None))
    return internal_cut_sites

def DNA_cuttr(DNA_seq, max_length_cap):
    """Split a DNA sequence that is too large into pieces of at most
    ``max_length_cap`` bases.

    Returns:
        * ``DNA_seq`` itself when it already fits within the cap.
        * Otherwise a nested 2-tuple ``(rest, first_piece)`` where
          ``first_piece`` is the leading ``max_length_cap`` bases and ``rest``
          is the result of splitting everything after it in the same way.
          Three pieces p1, p2, p3 therefore come back as ``((p3, p2), p1)``.

    Raises:
        ValueError -- if a cut is required but ``max_length_cap`` is not
                      positive (the split could never finish).

    The nesting is built with a loop rather than by recursion so that long
    sequences or small caps cannot exceed the interpreter's recursion limit.
    """
    total_length = len(DNA_seq)
    if total_length <= max_length_cap:  # within specification, do not cut
        return DNA_seq
    if max_length_cap <= 0:
        raise ValueError('max_length_cap must be a positive number of bases')
    pieces = [DNA_seq[start:start + max_length_cap]
              for start in range(0, total_length, max_length_cap)]
    # Wrap from the last piece outwards so the first piece ends up outermost.
    nested = pieces[-1]
    for piece in reversed(pieces[:-1]):
        nested = (nested, piece)
    return nested
        
def fasta_analyzer(file, enzyme_dict, length_max, file_type = 'FASTA'):
    """Analyze DNA sequences coming from a FASTA file or from a dictionary.

    Inputs:
        file        -- with file_type = 'FASTA' (default): whatever
                       SeqIO.parse accepts for the 'fasta' format;
                       with file_type = 'dict': a dictionary mapping a name
                       to a DNA sequence.
        enzyme_dict -- dictionary of restriction enzymes handed to
                       cut_site_chckr.
        length_max  -- longest sequence length considered within spec.

    Returns a dictionary keyed by record name (FASTA) or dictionary key with
    the tuple (sequence, length, cut_site_chckr result, status), where status
    is True when length <= length_max and False otherwise. Any other
    file_type value yields an empty dictionary.
    """
    def summarize(dna):
        # Both input kinds produce exactly the same record layout.
        n_bases = len(dna)
        cut_sites = cut_site_chckr(enzyme_dict, dna)
        return (dna, n_bases, cut_sites, n_bases <= length_max)

    sequence_records = {}
    if file_type == 'FASTA':
        for entry in SeqIO.parse(file, 'fasta'):
            sequence_records[entry.name] = summarize(entry.seq)
    elif file_type == 'dict':
        for label in file:
            sequence_records[label] = summarize(file[label])
    return sequence_records

def seq_trunctr(seq_dict,length_max,additional_cut,enzyme_dict):
    """Break every out-of-spec sequence in ``seq_dict`` into smaller parts.

    Inputs:
        seq_dict       -- dictionary whose values are tuples with the DNA
                          sequence at index 0 and a status flag at index 3
                          (the layout produced by fasta_analyzer). Keys must
                          be strings because part names are built from them.
        length_max     -- maximum allowed length.
        additional_cut -- number of bases subtracted from length_max to get
                          the size of each part.
        enzyme_dict    -- dictionary of restriction enzymes handed to
                          cut_site_chckr for every part.

    Returns a dictionary holding only the parts of sequences whose status is
    false. Parts are keyed ``<name>-0``, ``<name>-1``, ... in 5' --> 3' order
    and each value is (part, length, cut_site_chckr result, True).

    Raises:
        ValueError -- if a sequence has to be cut but
                      length_max - additional_cut is not positive.
    """
    part_size = length_max - additional_cut
    nonconforming_seq_records = {}
    for seq in seq_dict:
        status = seq_dict[seq][3]
        if status:
            continue  # already within specification, nothing to cut
        if part_size <= 0:
            raise ValueError('length_max - additional_cut must be positive')
        DNA_sequence = seq_dict[seq][0]
        # DNA_cuttr returns nested tuples shaped (rest, first_part); peel one
        # layer at a time so any number of parts is handled and the parts
        # come out in their original order.
        nested = DNA_cuttr(DNA_sequence, part_size)
        parts = []
        while isinstance(nested, tuple):
            nested, leading_part = nested
            parts.append(leading_part)
        parts.append(nested)
        for part_number, part in enumerate(parts):
            nonconforming_seq_records[seq + '-' + str(part_number)] = (
                part, len(part), cut_site_chckr(enzyme_dict, part), True)
    return nonconforming_seq_records

def seq_order(seqs):
    """Group sequence parts by gene and put each group in assembly order.

    Input:
        seqs -- dictionary keyed ``<gene name>-<part number>``. The part
                number is whatever follows the LAST hyphen, so gene names
                may themselves contain hyphens.

    Returns a dictionary mapping each gene name to a list of
    (part_number, value) tuples sorted by ascending part number.

    Raises IndexError when a key has no hyphen and ValueError when the text
    after the last hyphen is not an integer.
    """
    seqs_organized = {}
    for seq in seqs:
        # Split on the last hyphen only, so "my-gene-1" is gene "my-gene".
        pieces = seq.rsplit('-', 1)
        name = pieces[0]
        part_number = int(pieces[1])
        seqs_organized.setdefault(name, []).append((part_number, seqs[seq]))
    for name in seqs_organized:
        # Order by part number alone; the stored values are never compared.
        seqs_organized[name].sort(key=lambda part: part[0])

    return seqs_organized

def seq3_appendr(DNA_seq, restriction_site, enzyme_dict):
    """Modify the 3' end of a DNA part following goldengate design.

    Inputs:
        DNA_seq          -- the part to extend.
        restriction_site -- key of the enzyme to use in enzyme_dict.
        enzyme_dict      -- dictionary whose entries hold the recognition
                            sequence at index 0 and the cut pattern at
                            index 2.

    Returns (modified DNA, overhang). The overhang is the last 4 bases of the
    unmodified part, kept so the 5' end of the next part can be matched to it
    without scars. The modified DNA is the part followed by:
        * one random base per character of the text before the '-' in the
          first element of the cut pattern,
        * the reverse complement of the recognition sequence (needed because
          this is the 3' end),
        * the fixed tail 'ATCGAC', a few extra bases to facilitate the biology
          downstream of the restriction site.
    """
    junction_overhang = DNA_seq[-4:]
    enzyme_entry = enzyme_dict[restriction_site]
    site_rc = str(Bio.Seq.Seq(enzyme_entry[0]).reverse_complement())
    upstream_token = enzyme_entry[2][0].split('-')[0]
    spacer = ''.join(random.choice('ACGT') for _ in upstream_token)
    return DNA_seq + spacer + site_rc + 'ATCGAC', junction_overhang
    
def seq5_appendr(DNA_seq, restriction_site, enzyme_dict, overhang):
    """Modify the 5' end of a DNA part following goldengate design.

    Inputs:
        DNA_seq          -- the part to extend.
        restriction_site -- key of the enzyme to use in enzyme_dict.
        enzyme_dict      -- dictionary whose entries hold the recognition
                            sequence at index 0 and the cut pattern at
                            index 2.
        overhang         -- bases placed directly in front of the part.

    Returns the part preceded by, in order:
        * the fixed lead 'ATCGAC', a few extra bases to facilitate the biology
          upstream of the restriction site,
        * the recognition sequence,
        * one random base per character of the text after the '-' in the
          first element of the cut pattern,
        * the overhang.
    """
    enzyme_entry = enzyme_dict[restriction_site]
    downstream_token = enzyme_entry[2][0].split('-')[1]
    spacer = ''.join(random.choice('ACGT') for _ in downstream_token)
    return 'ATCGAC' + enzyme_entry[0] + spacer + overhang + DNA_seq
    
def seq_appendr(seqs_organized, restriction_site, enzyme_dict):
    """Modify the ends of every DNA part for goldengate assembly.

    Inputs:
        seqs_organized   -- dictionary mapping a gene name to its ordered
                            list of parts, each part holding the part number
                            at index 0 and the DNA at index 1.
        restriction_site -- key of the enzyme to use in enzyme_dict.
        enzyme_dict      -- dictionary of restriction enzymes.

    Returns a dictionary mapping each gene name to a list of
    (part number, modified DNA) tuples, in the same order as the input.
    Within a gene, the first part gets only its 3' end modified
    (seq3_appendr), the last part only its 5' end (seq5_appendr), and every
    part in between gets both. The overhang reported for one part's 3' end is
    the one used for the next part's 5' end.
    """
    fixed_seqs = {gene: [] for gene in seqs_organized}

    for gene, parts in seqs_organized.items():
        last_index = len(parts) - 1
        for index, part in enumerate(parts):
            modified = part[1]
            if index == 0:
                # first part: 3' end only
                modified, overhang = seq3_appendr(modified, restriction_site, enzyme_dict)
            elif index < last_index:
                # intermediate part: 5' end first, then 3' end
                modified = seq5_appendr(modified, restriction_site, enzyme_dict, overhang)
                modified, overhang = seq3_appendr(modified, restriction_site, enzyme_dict)
            else:
                # last part: 5' end only
                modified = seq5_appendr(modified, restriction_site, enzyme_dict, overhang)
            fixed_seqs[gene].append((part[0], modified))

    return fixed_seqs
