### 1. Importing Required Modules and Packages

In [1]:
import os
import sys

sys.path.append('../..')
os.environ["OMP_NUM_THREADS"] = '1'  # KMeans is not parallelized, so set to 1 thread

from src.mutation import Mutation
from src.sequence import Plasmid
from src.eblocks import Eblock, EblockDesign
from src.primer import DesignPrimers
from src.plot import Plot
from src.utils import Utils, SnapGene

%reload_ext autoreload
%autoreload 2

In [8]:
import itertools
import tempfile
import numpy as np
import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
from biotite.sequence import AnnotatedSequence

In [3]:
# variables

genome_id = "U00096" # Ecoli
# genome_id = "CP000480.1" # smegmatis
# genome_id = "AL123456.3" # Ecoli
# genome_id = "fakeidtest123"

# Make sure the the genome is a complete genome (perform some kind of check maybe?)

In [4]:
def get_genome_features(genome_id):
    """
    Get the CDS features of a genome
    """
    try:
        gb_file = gb.GenBankFile.read(
            entrez.fetch(genome_id, tempfile.gettempdir(), "gb", "nuccore", "gb"))
    except Exception as e:
        print(f"Error fetching genome with ID '{genome_id}': {e}")
        return None
    genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])
    if isinstance(genome, AnnotatedSequence):
        return genome
    else:
        raise ValueError("No CDS features found in genome")


def count_codons(genome):
    """
    Count the occurrence of each codon in a genome
    The symbols [0 1 2 3] represent ['A' 'C' 'G' 'T'] respectively
    """
    codon_counter = {
        codon: 0
        for codon in itertools.product(*([range(len(genome.sequence.alphabet))] * 3))}
    return codon_counter


def get_codon_usage(genome_id, codon_counter):
    """
    Get the codon usage of a genome
    """
    genome = get_genome_features(genome_id)
    if genome is None:
        return None
    codon_counter = count_codons(genome)
    for feature in genome.annotation:
        cds = genome[feature]  # Get the coding sequence
        if len(cds) % 3 != 0:  # malformed CDS
            continue
        for i in range(0, len(cds), 3):  # Count the codons
            codon_code = tuple(cds.code[i:i+3])
            codon_counter[codon_code] += 1
    return codon_counter


def get_relative_frequencies(genome, codon_counter):
    """
    Convert the total frequencies into relative frequencies
    """
    table = seq.CodonTable.default_table()  # ID 11 is the bacterial codon table
    relative_frequencies = {}
    for amino_acid_code in range(20):
        codon_codes_for_aa = table[amino_acid_code]
        total = 0  # Get the total amount of codon occurrences for the amino acid
        for codon_code in codon_codes_for_aa:
            total += codon_counter[codon_code]
        for codon_code in codon_codes_for_aa:
            codon_counter[codon_code] /= total
            amino_acid = seq.ProteinSequence.alphabet.decode(amino_acid_code)
            codon = genome.sequence.alphabet.decode_multiple(codon_code)
            codon = "".join(codon)
            freq = codon_counter[codon_code]
            if amino_acid not in relative_frequencies:  # Store relative frequencies in dictionary
                relative_frequencies[amino_acid] = []
            relative_frequencies[amino_acid].append((codon, round(freq, 3)))
    return relative_frequencies


def relative_frequencies_to_csv(genome_id, frequencies_dict):
    """
    Save relative frequencies in a CSV file
    """
    with open(f"{genome_id}_codon_usage.csv", "w") as file:
        file.write("Amino Acid,Codon,Relative Frequency\n")
        for amino_acid, codons in frequencies_dict.items():
            for codon, freq in codons:
                file.write(f"{amino_acid},{codon},{freq}\n")

In [5]:
genome = get_genome_features(genome_id)

In [6]:
codon_counter = count_codons(genome)
print(codon_counter)

{(0, 0, 0): 0, (0, 0, 1): 0, (0, 0, 2): 0, (0, 0, 3): 0, (0, 1, 0): 0, (0, 1, 1): 0, (0, 1, 2): 0, (0, 1, 3): 0, (0, 2, 0): 0, (0, 2, 1): 0, (0, 2, 2): 0, (0, 2, 3): 0, (0, 3, 0): 0, (0, 3, 1): 0, (0, 3, 2): 0, (0, 3, 3): 0, (1, 0, 0): 0, (1, 0, 1): 0, (1, 0, 2): 0, (1, 0, 3): 0, (1, 1, 0): 0, (1, 1, 1): 0, (1, 1, 2): 0, (1, 1, 3): 0, (1, 2, 0): 0, (1, 2, 1): 0, (1, 2, 2): 0, (1, 2, 3): 0, (1, 3, 0): 0, (1, 3, 1): 0, (1, 3, 2): 0, (1, 3, 3): 0, (2, 0, 0): 0, (2, 0, 1): 0, (2, 0, 2): 0, (2, 0, 3): 0, (2, 1, 0): 0, (2, 1, 1): 0, (2, 1, 2): 0, (2, 1, 3): 0, (2, 2, 0): 0, (2, 2, 1): 0, (2, 2, 2): 0, (2, 2, 3): 0, (2, 3, 0): 0, (2, 3, 1): 0, (2, 3, 2): 0, (2, 3, 3): 0, (3, 0, 0): 0, (3, 0, 1): 0, (3, 0, 2): 0, (3, 0, 3): 0, (3, 1, 0): 0, (3, 1, 1): 0, (3, 1, 2): 0, (3, 1, 3): 0, (3, 2, 0): 0, (3, 2, 1): 0, (3, 2, 2): 0, (3, 2, 3): 0, (3, 3, 0): 0, (3, 3, 1): 0, (3, 3, 2): 0, (3, 3, 3): 0}


In [7]:
codon_counter = get_codon_usage(genome_id, codon_counter)
print(codon_counter)

{(0, 0, 0): 45174, (0, 0, 1): 29019, (0, 0, 2): 13739, (0, 0, 3): 23449, (0, 1, 0): 9278, (0, 1, 1): 31499, (0, 1, 2): 19308, (0, 1, 3): 11910, (0, 2, 0): 2644, (0, 2, 1): 21580, (0, 2, 2): 1494, (0, 2, 3): 11635, (0, 3, 0): 5631, (0, 3, 1): 33893, (0, 3, 2): 37401, (0, 3, 3): 40900, (1, 0, 0): 20648, (1, 0, 1): 13034, (1, 0, 2): 38826, (1, 0, 3): 17224, (1, 1, 0): 11301, (1, 1, 1): 7295, (1, 1, 2): 31362, (1, 1, 3): 9319, (1, 2, 0): 4718, (1, 2, 1): 29656, (1, 2, 2): 7173, (1, 2, 3): 28336, (1, 3, 0): 5222, (1, 3, 1): 14949, (1, 3, 2): 71286, (1, 3, 3): 14737, (2, 0, 0): 53303, (2, 0, 1): 25701, (2, 0, 2): 23884, (2, 0, 3): 42970, (2, 1, 0): 27045, (2, 1, 1): 34388, (2, 1, 2): 45512, (2, 1, 3): 20462, (2, 2, 0): 10486, (2, 2, 1): 39976, (2, 2, 2): 14717, (2, 2, 3): 33270, (2, 3, 0): 14633, (2, 3, 1): 20531, (2, 3, 2): 35319, (2, 3, 3): 24576, (3, 0, 0): 2815, (3, 0, 1): 16379, (3, 0, 2): 319, (3, 0, 3): 21514, (3, 1, 0): 9468, (3, 1, 1): 11558, (3, 1, 2): 11960, (3, 1, 3): 11304, (3, 

In [8]:
relative_frequencies = get_relative_frequencies(genome, codon_counter)
print(relative_frequencies)

{'A': [('GCA', 0.212), ('GCC', 0.27), ('GCG', 0.357), ('GCT', 0.161)], 'C': [('TGC', 0.558), ('TGT', 0.442)], 'D': [('GAC', 0.374), ('GAT', 0.626)], 'E': [('GAA', 0.691), ('GAG', 0.309)], 'F': [('TTC', 0.427), ('TTT', 0.573)], 'G': [('GGA', 0.107), ('GGC', 0.406), ('GGG', 0.149), ('GGT', 0.338)], 'H': [('CAC', 0.431), ('CAT', 0.569)], 'I': [('ATA', 0.07), ('ATC', 0.421), ('ATT', 0.509)], 'K': [('AAA', 0.767), ('AAG', 0.233)], 'L': [('CTA', 0.037), ('CTC', 0.105), ('CTG', 0.498), ('CTT', 0.103), ('TTA', 0.129), ('TTG', 0.128)], 'M': [('ATG', 1.0)], 'N': [('AAC', 0.553), ('AAT', 0.447)], 'P': [('CCA', 0.191), ('CCC', 0.123), ('CCG', 0.529), ('CCT', 0.157)], 'Q': [('CAA', 0.347), ('CAG', 0.653)], 'R': [('AGA', 0.036), ('AGG', 0.02), ('CGA', 0.064), ('CGC', 0.401), ('CGG', 0.097), ('CGT', 0.383)], 'S': [('AGC', 0.278), ('AGT', 0.15), ('TCA', 0.122), ('TCC', 0.149), ('TCG', 0.154), ('TCT', 0.146)], 'T': [('ACA', 0.129), ('ACC', 0.438), ('ACG', 0.268), ('ACT', 0.165)], 'V': [('GTA', 0.154), 

In [9]:
relative_frequencies_to_csv(genome_id, relative_frequencies)

In [10]:
genome_id = "U00096" # Ecoli

In [11]:
import itertools
import tempfile
import numpy as np
import biotite.database.entrez as entrez
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb

# Get the E. coli K-12 genome as annotated sequence
gb_file = gb.GenBankFile.read(
    entrez.fetch(genome_id, tempfile.gettempdir(), "gb", "nuccore", "gb")
)
# We are only interested in CDS features
k12_genome = gb.get_annotated_sequence(gb_file, include_only=["CDS"])


# This dictionary will count how often each codon occurs in the genome
# For increased performance the dictionary uses symbol codes ([0 3 2])
# instead of symbols (['A' 'T' 'G']) as keys
codon_counter = {
    codon: 0
    for codon in itertools.product(*([range(len(k12_genome.sequence.alphabet))] * 3))
}
# For demonstration purposes print the 64 codons in symbol code form
print(list(codon_counter.keys()))

[(0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3), (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3), (0, 2, 0), (0, 2, 1), (0, 2, 2), (0, 2, 3), (0, 3, 0), (0, 3, 1), (0, 3, 2), (0, 3, 3), (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 1, 0), (1, 1, 1), (1, 1, 2), (1, 1, 3), (1, 2, 0), (1, 2, 1), (1, 2, 2), (1, 2, 3), (1, 3, 0), (1, 3, 1), (1, 3, 2), (1, 3, 3), (2, 0, 0), (2, 0, 1), (2, 0, 2), (2, 0, 3), (2, 1, 0), (2, 1, 1), (2, 1, 2), (2, 1, 3), (2, 2, 0), (2, 2, 1), (2, 2, 2), (2, 2, 3), (2, 3, 0), (2, 3, 1), (2, 3, 2), (2, 3, 3), (3, 0, 0), (3, 0, 1), (3, 0, 2), (3, 0, 3), (3, 1, 0), (3, 1, 1), (3, 1, 2), (3, 1, 3), (3, 2, 0), (3, 2, 1), (3, 2, 2), (3, 2, 3), (3, 3, 0), (3, 3, 1), (3, 3, 2), (3, 3, 3)]


In [12]:
table = seq.CodonTable.default_table()
# Convert to dictionary with amino acid as key
amino_acid_codon_dict = {}
for amino_acid_code in range(20):
    codon_codes_for_aa = table[amino_acid_code]
    amino_acid = seq.ProteinSequence.alphabet.decode(amino_acid_code)
    codons = []
    for codon_code in codon_codes_for_aa:
        codon = k12_genome.sequence.alphabet.decode_multiple(codon_code)
        codon = "".join(codon)
        codons.append(codon)
    amino_acid_codon_dict[amino_acid] = codons
print(amino_acid_codon_dict)

# Change to format dict[codon] = amino acid
codon_amino_acid_dict = {}
for amino_acid, codons in amino_acid_codon_dict.items():
    for codon in codons:
        codon_amino_acid_dict[codon] = amino_acid
codon_amino_acid_dict['TAA'] = '*'
codon_amino_acid_dict['TAG'] = '*'
codon_amino_acid_dict['TGA'] = '*'

#print(table)
print(codon_amino_acid_dict)

{'A': ['GCA', 'GCC', 'GCG', 'GCT'], 'C': ['TGC', 'TGT'], 'D': ['GAC', 'GAT'], 'E': ['GAA', 'GAG'], 'F': ['TTC', 'TTT'], 'G': ['GGA', 'GGC', 'GGG', 'GGT'], 'H': ['CAC', 'CAT'], 'I': ['ATA', 'ATC', 'ATT'], 'K': ['AAA', 'AAG'], 'L': ['CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'], 'M': ['ATG'], 'N': ['AAC', 'AAT'], 'P': ['CCA', 'CCC', 'CCG', 'CCT'], 'Q': ['CAA', 'CAG'], 'R': ['AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'], 'S': ['AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'], 'T': ['ACA', 'ACC', 'ACG', 'ACT'], 'V': ['GTA', 'GTC', 'GTG', 'GTT'], 'W': ['TGG'], 'Y': ['TAC', 'TAT']}
{'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'TGC': 'C', 'TGT': 'C', 'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', 'TTC': 'F', 'TTT': 'F', 'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'CAC': 'H', 'CAT': 'H', 'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'AAA': 'K', 'AAG': 'K', 'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 'TTA': 'L', 'TTG': 'L', 'ATG': 'M', 'AAC': 'N', 'AAT': 'N', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',

In [13]:
codon_codes_for_aa

((3, 0, 1), (3, 0, 3))

In [15]:
table = seq.CodonTable.default_table()
table['M']

('ATG',)

In [12]:
table['TAG']

'*'

In [21]:
table.start_codons()

('ATG',)

In [3]:
codon = 'ATG'
default_codon_table = seq.CodonTable.default_table()

In [10]:
try:
    result = default_codon_table['ATG']
except:
    result = None
print(result)

M


In [14]:
result = seq.CodonTable.default_table()['GGT']
print(result)

G


In [None]:
result = default_codon_table[codon.upper()] if codon.upper() in default_codon_table else None
print(result)

In [15]:
codon_table = seq.CodonTable.default_table()
codon = 'XXX'
result = codon_table[str(codon)] if str(codon) in codon_table else None
print(result)

: 

In [16]:
codon = 'ATG'

try:
    result = seq.CodonTable.default_table()[str(codon)]
except:
    result = None

print(result)

M


In [18]:
seq.protein_alphabet

AttributeError: module 'biotite.sequence' has no attribute 'protein_alphabet'

In [26]:
test = 'ACTHGGGGILPM'

test.seq.ProteinSequence.is_valid()

AttributeError: 'str' object has no attribute 'seq'

In [1]:
from biotite.sequence import NucleotideSequence

# Define your sequence
sequence_str = "ACGGTACG"

# Create a Biotite ProteinSequence object
sequence = NucleotideSequence(sequence_str)

# Use is_valid() to check if the sequence contains valid codes
is_valid = sequence.is_valid()

# Print the result
print(f"Is the sequence '{sequence_str}' valid?: {is_valid}")

Is the sequence 'ACGGTACG' valid?: True


In [12]:
sequence_str = 'ATG'

seq.CodonTable.default_table().is_start_codon('ATG')


# seq.CodonTable(sequence_str).is_start_codon('ATGCCG', starts='ATG')

ValueError: invalid literal for int() with base 10: 'A'

In [37]:
sequence = 'aaacgta'

sequence = seq.NucleotideSequence(sequence)
result = sequence.reverse().complement()

result2 = seq.NucleotideSequence(sequence).reverse().complement()
result3 = seq.NucleotideSequence(sequence).complement()

print(result)
print(result2)
print(result3)

TACGTTT
TACGTTT
TTTGCAT


In [38]:
def reverse_complement(sequence):
    """
    Reverse complement sequence
    """
    pairs = {"a": "t", "c":"g", "t":"a", "g":"c"}
    reverse = ""
    for nucleotide in sequence:
        rev_nucl = pairs[nucleotide]
        reverse += rev_nucl
    return reverse

sequence = 'aaacgta'
result = reverse_complement(sequence)
print(result)

complementary_inverted = result[::-1]
print(complementary_inverted)



tttgcat
tacgttt


In [17]:
def contains_start_stop_codon1(sequence: str) -> bool:
    """
    This function checks if the sequence contains start and stop codons.
    """
    start_codons = ["ATG"]
    stop_codons = ["TAA", "TAG", "TGA"]
    sequence = sequence.upper()
    if sequence.upper().startswith("ATG") and sequence.upper().endswith(("TAA", "TAG", "TGA")):
        return True
    else:
        print("Please provide a sequence with start and stop codons")

In [20]:
def contains_start_stop_codon2(sequence: str) -> bool:
    """
    This function checks if the sequence contains start and stop codons.
    """
    stop_codons = ["TAA", "TAG", "TGA"]
    sequence = sequence.upper()
    if (sequence.startswith("ATG")) and (sequence[-3:] in stop_codons):
        return True
    else:
        print("Please provide a sequence with start and stop codons")
        return False


sequence = 'ATGATGATGATGATGTGC'
contains_start_stop_codon1(sequence)
contains_start_stop_codon2(sequence)

Please provide a sequence with start and stop codons
Please provide a sequence with start and stop codons
