### 1. Importing Required Modules and Packages

In [44]:
import os
import sys

# Extend the module search path with the parent directory *before* the `src.*`
# imports that follow; presumably the `src` package lives one level above this
# notebook — TODO confirm the expected working directory.
sys.path.append('..')
os.environ["OMP_NUM_THREADS"] = '1'  # KMeans is not parallelized, so set to 1 thread

from src.mutation import Mutation
from src.sequence import Vector, Gene
from src.eblocks import Eblock, EblockDesign
import biotite.sequence as seq
from src.primer import DesignPrimers
from src.plot import Plot

%reload_ext autoreload
%autoreload 2

### 2. Loading and Analyzing the Gene Sequence

The desired mutations should be added to a txt file. 
Here, we create a Mutation() object and parse the specified mutations. 

In [46]:
# Create a Mutation object and parse the input mutations from a text file

# NOTE(review): absolute, machine-specific path — point this at your own mutations file
mutations_file = "/zfsdata/data/rosan/eBlocks/testset_input/Mtb_pINIT_DnaE1/datasets/N10/N10_9_mutations.txt"

mutation_instance = Mutation()
mutation_instance.parse_mutations(mutations_file)

# Print the mutations that were parsed
mutation_instance.print_mutations()

The selected mutations are:
	Combined  	G53E, I87F, A96H
	Mutation  	Q94Q      
	Mutation  	Y105W     
	Deletion  	N114-G117 
	Combined  	E150N, L169I, S160G, F192R, A179G
	Insert    	G161-WQLNTLHY
	Insert    	E162-ASNVCEEG
	Insert    	R173-VRR  
	Insert    	A175-GF   
	Combined  	A179F, H198K, I185K, R215W
	Combined  	E194G, C243D, A240P, H236K, T201F
	Mutation  	D233Y     
	Deletion  	I274-D276 
	Deletion  	S354-F357 
	Deletion  	S369-G375 
	Deletion  	E433-Y437 
	Insert    	I477-SS   
	Insert    	A478-PMCLKPSE
	Mutation  	E509F     
	Combined  	A511A, E534T, P559V, E552C
	Mutation  	Q528Y     
	Deletion  	A530-G532 
	Deletion  	A540-A546 
	Mutation  	I596W     
	Combined  	G609I, P618L, Y625M, L612T
	Insert    	L627-QQVN 
	Combined  	L628F, G669V
	Combined  	V636W, K683H, V660D
	Combined  	D640A, Y679G, K683A, P643T, D681M
	Combined  	G655Y, R686R
	Mutation  	K742V     
	Mutation  	L801M     
	Mutation  	A824R     
	Deletion  	K910-A913 
	Deletion  	G951-T957 
	Insert    	G956-SGYHA

Next, we read the gene sequence and the vector that contains our gene of interest.

In [47]:
# Create Gene and Vector objects and parse the gene sequence and the vector file

# Input files: the gene sequence (.fasta) and the vector containing the gene (.gb)
# NOTE(review): absolute, machine-specific paths — point these at your own input files
sequence_file = "/zfsdata/data/rosan/eBlocks/testset_input/Mtb_pINIT_DnaE1/A0QX55.fasta"
vector_file = "/zfsdata/data/rosan/eBlocks/testset_input/Mtb_pINIT_DnaE1/dnae1-pinit.gb"

# The gene is parsed first because the Vector is constructed with a reference to it
gene_instance = Gene()
gene_instance.parse_sequence(sequence_file)
vector_instance = Vector(gene=gene_instance)
vector_instance.parse_vector(vector_file)

We also define an output directory for the generated files and create a snapgene object for visualization

We create an `EblockDesign` instance that initiates the design of the eBlocks. Here, we choose `cost_optimization` as the optimization method, which aims to use as few base pairs as possible. The alternative, `amount_optimization`, aims to cluster as many mutations as possible together in order to obtain the lowest number of different eBlocks.

In [48]:
# Set up the eBlock design from the parsed mutations, the gene and the vector

# Directory handed to the design object as `output_dir`
output_dir = "/zfsdata/data/rosan/eBlocks/testset_output/Mtb_pINIT_DnaE1/N10"

# Cost optimization is switched on and amount optimization is switched off
design_instance = EblockDesign(
    mutation_instance=mutation_instance,
    gene_instance=gene_instance,
    vector_instance=vector_instance,
    output_dir=output_dir,
    verbose=True,
    cost_optimization=True,
    amount_optimization=False,
)

In [None]:
# Build the Plot helper that visualizes the design; it is given the mutation,
# design, vector and gene objects created in the cells above
plot_instance = Plot(
    mutation_instance=mutation_instance,
    eblocks_design_instance=design_instance,
    vector_instance=vector_instance,
    gene_instance=gene_instance,
    output_dir=output_dir,
    show=True,
)

# Optional sanity check of the input vector (left disabled):
# plot_instance.plot_vector(figsize=(5, 5))

In [None]:
# Run the eBlocks design and print the results
# Optional: uncomment the next line to override the maximum eBlock length before running
# design_instance.max_eblock_length = 1400
design_instance.run_design_eblocks()


In [None]:
# Plot the designed eBlocks together with the mutations they cover
plot_instance.plot_eblocks_mutations(figure_length=25, figure_width=5)

In [None]:
# Primer3 settings files: one for the primers, one for the sequencing primers
# NOTE(review): absolute, user-specific paths — adjust to your own checkout
settingsfile = '/home/rosan/git/design_gene_blocks/settings/primer3-settings-mild.txt'
seq_settingsfile = '/home/rosan/git/design_gene_blocks/settings/primer3-seq-settings.txt'

# Configure the primer design with the finished eBlock design and the vector
primers_instance = DesignPrimers(
    mutation_instance=mutation_instance,
    eblocks_design_instance=design_instance,
    primers_settingsfile=settingsfile,
    seqprimers_settingsfile=seq_settingsfile,
    vector_instance=vector_instance,
    output_dir=output_dir,
)

# Run the primer design
primers_instance.run_design()

In the process, for each mutation a different eBlock is created and a .gb file is made to easily view the clone in a sequence editor. 

In [None]:
# NOTE(review): this cell looks stale. `Plasmid` is not among the names imported in
# the import cell at the top of this notebook, so on a fresh kernel this presumably
# raises NameError — confirm whether the cell should be removed or ported to Gene/Vector.
# NOTE(review): the two assignments below overwrite the `sequence_file` / `vector_file`
# values set earlier in the notebook with relative paths; any later cell that reads
# these names will see the new values.
sequence_file = 'files/A0QX55.fasta'
vector_file = 'files/vector.dna'

sequence_instance = Plasmid()
sequence_instance.parse_vector(vector_file)
sequence_instance.parse_sequence(sequence_file)

# Bare expression: displayed as the cell output
sequence_instance.description

In [23]:
from Bio import SeqIO

def read_single_fasta(fp: str) -> tuple:
    """
    Read a FASTA file that must contain exactly one record.

    Parameters
    ----------
    fp : str
        Path to the FASTA file.

    Returns
    -------
    tuple
        ``(sequence, seqid)``: the ``seq`` and ``id`` attributes of the single record.

    Raises
    ------
    ValueError
        If the file contains no records or more than one record.
    """
    records = SeqIO.parse(fp, "fasta")

    # Use next() with a default so an empty file produces a clear error instead of
    # a leaked StopIteration (or an unbound local variable).
    first_record = next(records, None)
    if first_record is None:
        raise ValueError("Please provide a single sequence in FASTA format.")

    # Only peek at one further record; there is no need to read the rest of the
    # file to know that it is not a single-sequence FASTA.
    if next(records, None) is not None:
        raise ValueError("Please provide a single sequence in FASTA format.")

    return first_record.seq, first_record.id

In [None]:
# Read the single FASTA record and show its sequence and identifier.
# The result names deliberately avoid `seq`, which is the alias under which
# biotite.sequence is imported in the first cell of this notebook.
fasta_sequence, fasta_id = read_single_fasta(sequence_file)
print(fasta_sequence, fasta_id)