In [10]:
from typing_extensions import Counter
import collections
import random

# The four DNA bases accepted by the helpers below.
nucleodites = list('ATCG')
# Base-pairing table: each DNA base mapped to its complementary base.
reverse_nucleodites = dict(zip('ATGC', 'TACG'))

def validate_seg_dna(dna):
    """Upper-case ``dna`` and check that every symbol is a known nucleotide.

    Args:
        dna: Sequence string to check (any letter case).

    Returns:
        The upper-cased sequence when each of its characters appears in the
        module-level ``nucleodites`` list, otherwise ``False``.  An empty
        input yields ``''``, which is also falsy, so compare with
        ``is False`` when "empty" and "invalid" must be told apart.
    """
    candidate = dna.upper()
    if all(symbol in nucleodites for symbol in candidate):
        return candidate
    return False

def count_nucleodites(dna):
    """Count how often each of A, T, C and G occurs in ``dna``.

    Args:
        dna: Iterable of upper-case nucleotide symbols.

    Returns:
        A dict with the keys ``'A'``, ``'T'``, ``'C'``, ``'G'`` (in that
        order) mapped to their occurrence counts.

    Raises:
        KeyError: If ``dna`` contains any symbol other than A, T, C or G
            (lower-case letters included).
    """
    tally = dict.fromkeys('ATCG', 0)
    for symbol in dna:
        tally[symbol] = tally[symbol] + 1
    return tally

def transcription(dna):
    """Transcribe a DNA string to RNA by turning every ``'T'`` into ``'U'``.

    Only upper-case ``'T'`` is substituted; every other character is copied
    through unchanged.
    """
    thymine_to_uracil = str.maketrans('T', 'U')
    return dna.translate(thymine_to_uracil)

def reverse_complement(dna):
    """Return the reverse complement of a DNA string.

    Each of A/T/C/G is swapped for its pairing base and the result is read
    back to front.  Characters outside ``ATCG`` (lower-case letters, ``N``,
    ...) are left as they are.
    """
    pairing = str.maketrans('ATCG', 'TAGC')
    # Reversing first and complementing second gives the same string as
    # complementing first and reversing second.
    return dna[::-1].translate(pairing)


def dna_complement(dna):
    """Return the complementary strand of ``dna`` without reversing it.

    Every symbol is looked up in the module-level ``reverse_nucleodites``
    table.

    Raises:
        KeyError: If ``dna`` contains a symbol that is not a key of
            ``reverse_nucleodites``.
    """
    paired = []
    for symbol in dna:
        paired.append(reverse_nucleodites[symbol])
    return ''.join(paired)

def dna_colors(dna):
    """Wrap each character of ``dna`` in an ANSI colour escape sequence.

    A, C, G, T and U each get their own colour code (T and U share one);
    any other character is prefixed with the reset code.  The returned
    string always ends with the reset code so later terminal output is not
    coloured.
    """
    reset_code = '\033[0;0m'
    palette = {
        'A': '\033[92m',
        'C': '\033[94m',
        'G': '\033[93m',
        'T': '\033[91m',
        'U': '\033[91m',
        'reset': reset_code,
    }

    # Collect the coloured pieces and join once instead of growing a string.
    painted = [palette.get(symbol, reset_code) + symbol for symbol in dna]
    painted.append(reset_code)
    return ''.join(painted)

def gc_content(dna):
    """Return the percentage of ``'C'`` and ``'G'`` symbols in ``dna``.

    The result is rounded to six decimal places.  An empty sequence returns
    the integer ``0``.  Counting is case-sensitive: lower-case ``c``/``g``
    are not counted.
    """
    length = len(dna)
    if length == 0:
        return 0
    gc_total = dna.count('C') + dna.count('G')
    return round(gc_total / length * 100, 6)

def gc_content_subsets(dna, k=20):
    """Compute the GC percentage of consecutive, non-overlapping windows.

    Args:
        dna: Sequence to scan.
        k: Window length; windows start at 0, k, 2k, ...

    Returns:
        A list with one ``gc_content`` value per full window.  A trailing
        stretch shorter than ``k`` is not included.
    """
    window_starts = range(0, len(dna) - k + 1, k)
    return [gc_content(dna[start:start + k]) for start in window_starts]


def readTextFile(filePath):
    """Read a text file and return its lines concatenated into one string.

    Leading and trailing whitespace (including the newline) is stripped from
    every line before the lines are joined with no separator.
    """
    pieces = []
    with open(filePath, 'r') as handle:
        for raw_line in handle:
            pieces.append(raw_line.strip())
    return "".join(pieces)


def writeTextFile(filePath, dna, mode='w'):
    """Write ``dna`` followed by a newline to ``filePath``.

    Args:
        filePath: Destination path.
        dna: String to write.
        mode: Mode passed to ``open``; the default ``'w'`` overwrites the
            file, ``'a'`` appends to it.
    """
    with open(filePath, mode) as handle:
        line = dna + '\n'
        handle.write(line)


def read_FASTA(filePath):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Each line is stripped of surrounding whitespace.  A line that starts
    with ``'>'`` opens a new record and is used, ``'>'`` included, as the
    dictionary key; every following non-header line is appended to that
    record's sequence.  Blank lines are ignored wherever they occur.  If a
    header appears more than once, the later record replaces the earlier.

    Args:
        filePath: Path of the FASTA file to read.

    Returns:
        Dict mapping each header line to its concatenated sequence.

    Raises:
        KeyError: If sequence data appears before the first header line
            (the exception type is kept from the previous implementation).
    """
    FASTADict = {}
    FASTALabel = None

    with open(filePath, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data; skipping them also keeps a
                # leading blank line from being treated as headerless data.
                continue
            if line.startswith('>'):
                # Only a leading '>' marks a header; a '>' elsewhere in a
                # line must not start a new record.
                FASTALabel = line
                FASTADict[FASTALabel] = ""
            elif FASTALabel is None:
                raise KeyError(
                    "sequence data found before the first '>' header line")
            else:
                FASTADict[FASTALabel] += line

    return FASTADict

# List the amino acids encoded by a DNA sequence
def translate_dna(dna):
    """Translate ``dna`` codon by codon into a list of amino-acid letters.

    The sequence is read in steps of three starting at index 0; one or two
    leftover bases at the end are ignored.  Each codon is looked up in the
    module-level ``DNA_Codons`` table.

    Raises:
        KeyError: If a codon is not a key of ``DNA_Codons``.
    """
    amino_acids = []
    for start in range(0, len(dna) - 2, 3):
        codon = dna[start:start + 3]
        amino_acids.append(DNA_Codons[codon])
    return amino_acids

# Relative frequency of each codon that encodes a given amino acid
def codon_freq(dna, animoacid):
    """Return how often each codon for ``animoacid`` is used in ``dna``.

    The sequence is read in steps of three starting at index 0 (leftover
    bases at the end are ignored) and every codon is looked up in the
    module-level ``DNA_Codons`` table.

    Args:
        dna: DNA sequence to scan.
        animoacid: One-letter amino-acid code to look for, as used by the
            values of ``DNA_Codons``.

    Returns:
        Dict mapping each matching codon to its share of all matching
        codons, rounded to two decimals, in order of first appearance.
        Empty when no codon in ``dna`` encodes ``animoacid``.

    Raises:
        KeyError: If a codon is not a key of ``DNA_Codons``.
    """
    matching_codons = []
    for start in range(0, len(dna) - 2, 3):
        codon = dna[start:start + 3]
        if DNA_Codons[codon] == animoacid:
            matching_codons.append(codon)

    # collections.Counter is the standard-library class; using it avoids
    # relying on the typing alias re-exported by typing_extensions.
    counts = collections.Counter(matching_codons)
    total = sum(counts.values())
    # With no matches, counts is empty and no division is performed.
    return {codon: round(count / total, 2) for codon, count in counts.items()}


In [8]:
# Alphabet of valid bases for each nucleic-acid type.
NUCLEOTIDE_BASE = dict(
    DNA=list("ATCG"),
    RNA=list("AUCG"),
)

# Lookup table from each three-base DNA codon to a one-letter amino-acid code.
# It holds 64 entries, grouped one amino acid per row; translate_dna and
# codon_freq index it directly, so an unknown codon raises KeyError there.
DNA_Codons = {
    # 'M' - START, '_' - STOP
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "TGT": "C", "TGC": "C",
    "GAT": "D", "GAC": "D",
    "GAA": "E", "GAG": "E",
    "TTT": "F", "TTC": "F",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    "CAT": "H", "CAC": "H",
    "ATA": "I", "ATT": "I", "ATC": "I",
    "AAA": "K", "AAG": "K",
    "TTA": "L", "TTG": "L", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATG": "M",
    "AAT": "N", "AAC": "N",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R", "AGG": "R",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S", "AGC": "S",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "TGG": "W",
    "TAT": "Y", "TAC": "Y",
    "TAA": "_", "TAG": "_", "TGA": "_"
}

# Lookup table from each three-base RNA codon to a one-letter amino-acid code.
# Entry for entry this is the DNA codon table with every 'T' written as 'U'.
RNA_Codons = {
    # 'M' - START, '_' - STOP
    "GCU": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "UGU": "C", "UGC": "C",
    "GAU": "D", "GAC": "D",
    "GAA": "E", "GAG": "E",
    "UUU": "F", "UUC": "F",
    "GGU": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    "CAU": "H", "CAC": "H",
    "AUA": "I", "AUU": "I", "AUC": "I",
    "AAA": "K", "AAG": "K",
    "UUA": "L", "UUG": "L", "CUU": "L", "CUC": "L", "CUA": "L", "CUG": "L",
    "AUG": "M",
    "AAU": "N", "AAC": "N",
    "CCU": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAA": "Q", "CAG": "Q",
    "CGU": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R", "AGG": "R",
    "UCU": "S", "UCC": "S", "UCA": "S", "UCG": "S", "AGU": "S", "AGC": "S",
    "ACU": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GUU": "V", "GUC": "V", "GUA": "V", "GUG": "V",
    "UGG": "W",
    "UAU": "Y", "UAC": "Y",
    "UAA": "_", "UAG": "_", "UGA": "_"
}


In [15]:
# Demo cell: build a random 30-base DNA sequence and run every helper on it.
# No random seed is set, so the sequence (and all output below) differs
# from run to run.
dna_random= ''.join([random.choice(nucleodites) for nuc in range(30)])
print('\n Check if my Data is a DNA Sequence \n')
# validate_seg_dna returns False for an invalid sequence; dna_random is drawn
# only from `nucleodites`, so the upper-cased sequence is what gets coloured.
print(dna_colors(validate_seg_dna(dna_random)))

print('\n Check the Number of Nucleodites I have in my DNA list \n')
dna_counts = count_nucleodites(dna_random)
formatted_counts = f"A: {dna_counts['A']}, T: {dna_counts['T']}, C: {dna_counts['C']}, G: {dna_counts['G']}"
# dna_colors works per character, so only the base letters in the label are
# coloured; digits and punctuation are emitted after a reset code.
print(dna_colors(formatted_counts))
# Same counts as plain space-separated numbers, in A T C G order.
print(f' '.join([str(val) for key,val in count_nucleodites(dna_random).items()]))

print('\n Transcription of DNA to RNA \n')
print(dna_colors(transcription(dna_random)))

print('\n Implement my DNA  Reverse Complement \n')
print(f"DNA + Reverse: \n 5' {dna_colors(dna_random)} 3'")
# One '|' per base, drawn under the strand printed above.
print(f"    {''.join(['|' for nuc in range(len(dna_random))])}")
print(f"\n\n  {dna_colors(reverse_complement(dna_random))}  \n")

print('\n Implement my DNA complement \n')
print(f"DNA + Complement: \n 5' {dna_colors(dna_random)} 3'")
print(f"    {''.join(['|' for nuc in range(len(dna_random))])}")
print(f" 3' {dna_colors(dna_complement(dna_random))} 5'\n")

# GC-rich regions may indicate greater compaction, greater genetic density,
# greater stability and resistance to denaturation, and may also indicate
# genes that originated in foreign genomes.
print('\n Calculate my GC Content total and subsets \n')
print('\nTotal GC Content \n')
print(gc_content(dna_random))
print('\nGC Content Subsets \n')
# k=5 splits the 30-base sequence into six non-overlapping windows.
print(gc_content_subsets(dna_random,k=5))

# Read records from a FASTA file and compute the GC content of each one.
# The relative path means 'fasta_samples.txt' must be in the working directory.

Fastadict=read_FASTA('fasta_samples.txt')

DictGC={key:gc_content(value) for (key,value) in Fastadict.items()}

# Header of the record with the highest GC percentage, then that percentage.
maxGC = (max(DictGC, key=DictGC.get))
print(f'{maxGC}\n{DictGC[maxGC]}')

print(f'\n Amino Acid Sequence \n')
print(translate_dna(dna_random))
print(f'\n Frequency of a codon and returns the amino acid \n')
# Relative usage of the codons that encode amino acid 'L' in dna_random.
print(codon_freq(dna_random,'L'))



 Check if my Data is a DNA Sequence 

[94mC[94mC[93mG[94mC[92mA[93mG[94mC[91mT[91mT[91mT[92mA[91mT[92mA[91mT[92mA[92mA[92mA[91mT[92mA[94mC[91mT[92mA[92mA[91mT[92mA[93mG[93mG[91mT[92mA[93mG[0;0m

 Check the Number of Nucleodites I have in my DNA list 

[92mA[0;0m:[0;0m [0;0m1[0;0m1[0;0m,[0;0m [91mT[0;0m:[0;0m [0;0m9[0;0m,[0;0m [94mC[0;0m:[0;0m [0;0m5[0;0m,[0;0m [93mG[0;0m:[0;0m [0;0m5[0;0m
11 9 5 5

 Transcription of DNA to RNA 

[94mC[94mC[93mG[94mC[92mA[93mG[94mC[91mU[91mU[91mU[92mA[91mU[92mA[91mU[92mA[92mA[92mA[91mU[92mA[94mC[91mU[92mA[92mA[91mU[92mA[93mG[93mG[91mU[92mA[93mG[0;0m

 Implement my DNA  Reverse Complement 

DNA + Reverse: 
 5' [94mC[94mC[93mG[94mC[92mA[93mG[94mC[91mT[91mT[91mT[92mA[91mT[92mA[91mT[92mA[92mA[92mA[91mT[92mA[94mC[91mT[92mA[92mA[91mT[92mA[93mG[93mG[91mT[92mA[93mG[0;0m 3'
    ||||||||||||||||||||||||||||||


  [94mC[91mT[92mA[94mC[94mC[91