In [None]:
from typing_extensions import Counter
import collections
import random

nucleodites =['A','T','C','G']
reverse_nucleodites = {'A':'T','T':'A','G':'C','C':'G'}

def validate_seg_dna(dna):
    """Check that every character of ``dna`` is a known DNA nucleotide.

    The input is upper-cased before the check, so validation is
    case-insensitive.

    Returns:
        The upper-cased sequence when every symbol appears in the
        module-level ``nucleodites`` list, otherwise ``False``.
        An empty input yields the empty string.
    """
    candidate = dna.upper()
    if all(symbol in nucleodites for symbol in candidate):
        return candidate
    return False

def count_nucleodites(dna):
    """Tally how often each DNA base occurs in ``dna`` (case-insensitive).

    Returns:
        A dict with the keys 'A', 'T', 'C', 'G' (in that order) mapped to
        their occurrence counts.

    Raises:
        KeyError: if the sequence contains any other symbol.
    """
    tally = dict.fromkeys('ATCG', 0)
    for base in dna.upper():
        tally[base] = tally[base] + 1
    return tally

def transcription(dna):
    """Transcribe a DNA sequence into RNA.

    The sequence is upper-cased and every 'T' (thymine) is replaced by
    'U' (uracil); all other characters are left untouched.

    Returns:
        The transcribed sequence as a string.
    """
    thymine_to_uracil = str.maketrans('T', 'U')
    return dna.upper().translate(thymine_to_uracil)

def reverse_complement(dna):
    """Return the reverse complement of a DNA sequence as a string.

    The sequence is upper-cased, each base is swapped for its pairing
    partner (A<->T, C<->G) and the result is reversed. Characters other
    than A/T/C/G pass through unchanged.
    """
    pairing = str.maketrans('ATCG', 'TAGC')
    complemented = dna.upper().translate(pairing)
    return ''.join(reversed(complemented))


def dna_complement(dna):
    """Return the complementary strand of ``dna`` (same orientation, not reversed).

    Each base is upper-cased and looked up in the module-level
    ``reverse_nucleodites`` mapping.

    Raises:
        KeyError: if a symbol is missing from ``reverse_nucleodites``.
    """
    paired_bases = (reverse_nucleodites[base.upper()] for base in dna)
    return ''.join(paired_bases)

def dna_colors(dna):
    """Wrap each character of ``dna`` in an ANSI colour escape sequence.

    A, C, G, T and U (upper-case only) get their own colour; every other
    character is preceded by the reset code. The returned string always
    ends with the reset code so later terminal output is not tinted.
    """
    reset = '\033[0;0m'
    palette = {
        'A': '\033[92m',
        'C': '\033[94m',
        'G': '\033[93m',
        'T': '\033[91m',
        'U': '\033[91m',
    }
    painted = [palette.get(symbol, reset) + symbol for symbol in dna]
    return ''.join(painted) + reset

def gc_content(dna):
    """Return the GC percentage of a DNA/RNA sequence (case-insensitive).

    Returns:
        The share of 'C' and 'G' characters as a percentage rounded to
        six decimals, or the integer 0 for an empty sequence.
    """
    sequence = dna.upper()
    total = len(sequence)
    if not total:
        return 0
    gc_total = sequence.count('C') + sequence.count('G')
    return round(gc_total / total * 100, 6)

def gc_content_subsets(dna, k=20):
    """Return the GC percentage of consecutive, non-overlapping windows of length ``k``.

    The sequence is upper-cased and cut into windows starting at
    0, k, 2k, ...; a trailing fragment shorter than ``k`` is not reported.

    Returns:
        A list with one ``gc_content`` value per complete window.
    """
    sequence = dna.upper()
    window_starts = range(0, len(sequence) - k + 1, k)
    return [gc_content(sequence[start:start + k]) for start in window_starts]


def readTextFile(filePath):
    """Read a text file and return its lines joined into one string.

    Leading and trailing whitespace (including the newline) is stripped
    from every line before the lines are concatenated.
    """
    with open(filePath, 'r') as handle:
        stripped_lines = [line.strip() for line in handle]
    return "".join(stripped_lines)


def writeTextFile(filePath, dna, mode='w'):
    """Write ``dna`` followed by a newline to ``filePath``.

    Args:
        filePath: destination file.
        dna: the sequence string to store.
        mode: mode passed to ``open``; 'w' overwrites (default), 'a' appends.
    """
    with open(filePath, mode) as out_handle:
        record = dna + '\n'
        out_handle.write(record)


def read_FASTA(filePath):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    A header is a line that *starts with* '>' and is kept verbatim
    (including the '>') as the dictionary key. Every following line, up to
    the next header, is stripped and appended to that record's sequence.
    Blank lines are ignored. A header that occurs twice restarts its
    sequence, so the last occurrence wins.

    Args:
        filePath: path of the FASTA file to read.

    Returns:
        dict mapping each header line to its concatenated sequence.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    FASTADict = {}
    FASTALabel = None

    with open(filePath, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank separator lines carry no data.
                continue
            if line.startswith('>'):
                # Only a leading '>' marks a header; a '>' elsewhere does not.
                FASTALabel = line
                FASTADict[FASTALabel] = ""
            elif FASTALabel is None:
                raise ValueError(
                    f"{filePath}: sequence data found before the first '>' header line"
                )
            else:
                FASTADict[FASTALabel] += line

    return FASTADict

# Translate codons into the amino acids they encode
def translate_dna(dna, position_initial=0):
    """Translate a DNA sequence into a list of one-letter amino-acid codes.

    Codons are read as consecutive triplets starting at ``position_initial``;
    a trailing fragment shorter than three bases is ignored. The sequence is
    upper-cased first so lower-case input is accepted, in line with the other
    helpers in this notebook.

    Args:
        dna: a DNA string, or a list whose string elements are each
            translated and concatenated (non-string elements are skipped).
        position_initial: index of the first base of the first codon.

    Returns:
        A list of amino-acid symbols taken from ``DNA_Codons`` ('_' marks a
        stop codon); an empty list when ``dna`` is neither a string nor a list.

    Raises:
        KeyError: if a triplet is not a key of ``DNA_Codons``.
    """
    if isinstance(dna, str):
        sequence = dna.upper()
        return [
            DNA_Codons[sequence[pos:pos + 3]]
            for pos in range(position_initial, len(sequence) - 2, 3)
        ]
    if isinstance(dna, list):
        translated_dna = []
        for item in dna:
            # Non-string elements are deliberately ignored.
            if isinstance(item, str):
                translated_dna.extend(translate_dna(item, position_initial))
        return translated_dna
    return []

# Relative frequency of each codon that encodes a given amino acid
def codon_freq(dna, animoacid):
    """Return the relative usage of each codon that encodes ``animoacid``.

    The sequence is upper-cased and read in frame 0 as consecutive triplets;
    a trailing fragment shorter than three bases is ignored.

    Args:
        dna: DNA sequence string.
        animoacid: one-letter amino-acid code as used in ``DNA_Codons``.

    Returns:
        dict mapping each matching codon to its share (rounded to two
        decimals) among all codons for that amino acid; empty when the amino
        acid does not occur.

    Raises:
        KeyError: if a triplet is not a key of ``DNA_Codons``.
    """
    sequence = dna.upper()
    matching_codons = []
    for start in range(0, len(sequence) - 2, 3):
        codon = sequence[start:start + 3]
        if DNA_Codons[codon] == animoacid:
            matching_codons.append(codon)

    # collections.Counter is the real counter class; the Counter exported by
    # typing_extensions is only a typing alias and should not be instantiated.
    codon_counts = collections.Counter(matching_codons)
    total = sum(codon_counts.values())
    return {codon: round(count / total, 2) for codon, count in codon_counts.items()}

#In molecular biology, a reading frame is a specific choice out of the possible ways to read the sequence of nucleotides in a nucleic acid (DNA or RNA)
# molecule as a sequence of triplets. Where these triplets equate to amino acids or stop signals during translation, they are called codons.
def reading_frames(dna):
    """Translate ``dna`` in all six reading frames.

    Returns:
        A list of six amino-acid lists: offsets 0, 1 and 2 of the sequence
        itself, followed by offsets 0, 1 and 2 of its reverse complement.
    """
    offsets = (0, 1, 2)
    forward_frames = [translate_dna(dna, offset) for offset in offsets]
    reverse_strand = reverse_complement(dna)
    reverse_frames = [translate_dna(reverse_strand, offset) for offset in offsets]
    return forward_frames + reverse_frames


def proteins(dna):
    """Extract every candidate protein from a sequence of amino-acid symbols.

    ``dna`` is iterated as amino-acid symbols (e.g. one reading frame).
    Each 'M' opens a new candidate, every open candidate grows with each
    following symbol, and a '_' (stop) closes all open candidates at once.
    Candidates still open when the input ends are discarded.

    Returns:
        A list of protein strings in the order they were closed.
    """
    finished = []
    open_chains = []
    for residue in dna:
        if residue == "_":
            # A stop codon closes every chain that is currently open.
            finished.extend(''.join(chain) for chain in open_chains)
            open_chains = []
            continue
        if residue == "M":
            open_chains.append("")
        open_chains = [chain + residue for chain in open_chains]
    return finished

def frames_proteins(dna, inicio=0, final=0, order='False'):
    """Compute all candidate proteins over the six reading frames of ``dna``.

    Protein search DB: https://www.ncbi.nlm.nih.gov/nuccore/NM_001185097.2
    (an API can be used to pull protein info).

    Args:
        dna: DNA sequence string.
        inicio: start index of the region to scan.
        final: end index (exclusive) of the region to scan. The slice is only
            applied when ``final > inicio``; otherwise the whole sequence is
            scanned.
        order: sort the result by increasing protein length when set to the
            legacy string ``'True'`` or to the boolean ``True``.

    Returns:
        A list of protein strings collected frame by frame.
    """
    region = dna[inicio:final] if final > inicio else dna

    find_prot = []
    for frame in reading_frames(region):
        find_prot.extend(proteins(frame))

    # Historically only the string 'True' enabled sorting, so order=True was
    # silently ignored; accept both spellings.
    if order is True or order == 'True':
        find_prot.sort(key=len)
    return find_prot



In [None]:
# Alphabet of valid bases per sequence type.
NUCLEOTIDE_BASE = {
    "DNA": list("ATCG"),
    "RNA": list("AUCG"),
}

# Standard genetic code keyed by DNA codon.
# 'M' - START, '_' - STOP
DNA_Codons = {
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "TGT": "C", "TGC": "C",
    "GAT": "D", "GAC": "D",
    "GAA": "E", "GAG": "E",
    "TTT": "F", "TTC": "F",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    "CAT": "H", "CAC": "H",
    "ATA": "I", "ATT": "I", "ATC": "I",
    "AAA": "K", "AAG": "K",
    "TTA": "L", "TTG": "L", "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "ATG": "M",
    "AAT": "N", "AAC": "N",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R", "AGA": "R", "AGG": "R",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S", "AGT": "S", "AGC": "S",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "TGG": "W",
    "TAT": "Y", "TAC": "Y",
    "TAA": "_", "TAG": "_", "TGA": "_",
}

# The RNA table is the DNA table with thymine written as uracil; deriving it
# keeps both tables in step (same entries, same insertion order).
RNA_Codons = {
    dna_codon.replace("T", "U"): amino_acid
    for dna_codon, amino_acid in DNA_Codons.items()
}


In [None]:
# Demo of the module-level helpers: load a FASTA file and run every analysis
# on its first record.
# NOTE(review): expects 'fasta_samples.txt' in the working directory and the
# codon tables cell to have been executed first.
Fastadict=read_FASTA('fasta_samples.txt')
first_sequence_key = list(Fastadict.keys())[0]
first_sequence_value = Fastadict[first_sequence_key]
dna_random= first_sequence_value
print('\n Check if my Data is a DNA Sequence \n')
# validate_seg_dna returns the upper-cased sequence, or False on an invalid symbol.
print(dna_colors(validate_seg_dna(dna_random)))

print('\n Check the Number of Nucleodites I have in my DNA list \n')
dna_counts = count_nucleodites(dna_random)
formatted_counts = f"A: {dna_counts['A']}, T: {dna_counts['T']}, C: {dna_counts['C']}, G: {dna_counts['G']}"
# Colour the base letters of the summary line; digits and punctuation get the reset code.
print(dna_colors(formatted_counts))
# Same counts again as plain space-separated numbers (order A T C G).
print(f' '.join([str(val) for key,val in count_nucleodites(dna_random).items()]))

print('\n Transcription of DNA to RNA \n')
print(dna_colors(transcription(dna_random)))

print('\n Implement my DNA  Reverse Complement \n')
print(f"DNA + Reverse: \n 5' {dna_colors(dna_random)} 3'")
# One '|' per base to draw the pairing between the two strands.
print(f"    {''.join(['|' for nuc in range(len(dna_random))])}")
print(f"\n\n  {dna_colors(reverse_complement(dna_random))}  \n")

print('\n Implement my DNA complement \n')
print(f"DNA + Complement: \n 5' {dna_colors(dna_random)} 3'")
print(f"    {''.join(['|' for nuc in range(len(dna_random))])}")
print(f" 3' {dna_colors(dna_complement(dna_random))} 5'\n")

# GC-rich regions may indicate greater compaction, greater genetic density,
# greater stability and resistance to denaturation, and may also indicate
# genes that originated in foreign genomes.
print('\n Calculate my GC Content total and subsets \n')
print('\nTotal GC Content \n')
print(gc_content(dna_random))
print('\nGC Content Subsets \n')
print(gc_content_subsets(dna_random,k=5))

# GC percentage of every record in the file, keyed by FASTA header.
DictGC={key:gc_content(value) for (key,value) in Fastadict.items()}

# Header of the record with the highest GC percentage.
maxGC = (max(DictGC, key=DictGC.get))
print(f'{maxGC}\n{DictGC[maxGC]}')

print(f'\n Amino Acid Sequence \n')
print(translate_dna(dna_random))
print(f'\n Frequency of a codon and returns the amino acid \n')
# Codon usage for proline ('P').
print(codon_freq(dna_random,'P'))

print(f'\n Reading Frames \n')
for reading in reading_frames(dna_random):
  print(reading)
print(f'\n Frames Proteins \n')
for protein in frames_proteins(dna_random):
  print(protein)



 Check if my Data is a DNA Sequence 

[94mC[94mC[91mT[93mG[94mC[93mG[93mG[92mA[92mA[93mG[92mA[91mT[94mC[93mG[93mG[94mC[92mA[94mC[91mT[92mA[93mG[92mA[92mA[91mT[92mA[93mG[94mC[94mC[92mA[93mG[92mA[92mA[94mC[94mC[93mG[91mT[91mT[91mT[94mC[91mT[94mC[91mT[93mG[92mA[93mG[93mG[94mC[91mT[91mT[94mC[94mC[93mG[93mG[94mC[94mC[91mT[91mT[94mC[94mC[94mC[91mT[94mC[94mC[94mC[92mA[94mC[91mT[92mA[92mA[91mT[92mA[92mA[91mT[91mT[94mC[91mT[93mG[92mA[93mG[93mG[0;0m

 Check the Number of Nucleodites I have in my DNA list 

[92mA[0;0m:[0;0m [0;0m1[0;0m8[0;0m,[0;0m [91mT[0;0m:[0;0m [0;0m1[0;0m9[0;0m,[0;0m [94mC[0;0m:[0;0m [0;0m2[0;0m5[0;0m,[0;0m [93mG[0;0m:[0;0m [0;0m1[0;0m8[0;0m
18 19 25 18

 Transcription of DNA to RNA 

[94mC[94mC[91mU[93mG[94mC[93mG[93mG[92mA[92mA[93mG[92mA[91mU[94mC[93mG[93mG[94mC[92mA[94mC[91mU[92mA[93mG[92mA[92mA[91mU[92mA[93mG[94mC[94mC[92mA[93mG[92mA

#Applying the functions in the object-oriented model to create a package later

In [3]:
from typing_extensions import Counter
import collections


class bioinformatics_funcions:
    """A labelled DNA or RNA sequence bundled with common analysis helpers.

    The class reads the notebook-level tables ``NUCLEOTIDE_BASE``,
    ``DNA_Codons`` and ``RNA_Codons`` at call time, so the cell defining them
    must be executed before objects are created or translated.

    Attributes:
        dna: the sequence exactly as passed in.
        label: free-text label of the sequence.
        seq_type_: "DNA" or "RNA" (a key of ``NUCLEOTIDE_BASE``).
        is_valid: result of ``validate_seg_dna()`` at construction time.

    None of the methods modifies ``self.dna``.
    """

    # ANSI colour escape codes per base; _RESET restores the default colour.
    _COLORS = {
        'A': '\033[92m',
        'C': '\033[94m',
        'G': '\033[93m',
        'T': '\033[91m',
        'U': '\033[91m',
    }
    _RESET = '\033[0;0m'

    def __init__(self, dna='ATCG', seq_type='DNA', label='No Label'):
        """Store the sequence and validate it.

        Raises:
            AssertionError: if the sequence contains symbols outside the
                alphabet of ``seq_type`` (kept as an assert so existing
                callers catching AssertionError keep working).
            KeyError: if ``seq_type`` is not a key of ``NUCLEOTIDE_BASE``.
        """
        self.dna = dna
        self.label = label
        self.seq_type_ = seq_type
        self.is_valid = self.validate_seg_dna()
        assert self.is_valid, f"Invalid {self.seq_type_} sequence"

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _bars(self):
        """Return one '|' per base, used to draw strand pairing."""
        return '|' * len(self.dna)

    def _codon_table(self):
        """Return the codon table that matches the sequence type."""
        return DNA_Codons if self.seq_type_ == "DNA" else RNA_Codons

    def _complement_table(self):
        """Return the base-pairing translation table for the sequence type."""
        if self.seq_type_ == "DNA":
            return str.maketrans('ATCG', 'TAGC')
        return str.maketrans('AUCG', 'UAGC')

    def _translate(self, sequence, position_initial=0):
        """Translate ``sequence`` into amino-acid symbols from ``position_initial``."""
        table = self._codon_table()
        sequence = sequence.upper()
        return [
            table[sequence[pos:pos + 3]]
            for pos in range(position_initial, len(sequence) - 2, 3)
        ]

    def _six_frames(self, sequence):
        """Translate ``sequence`` and its reverse complement at offsets 0, 1, 2."""
        forward = sequence.upper()
        backward = forward.translate(self._complement_table())[::-1]
        return [
            self._translate(strand, offset)
            for strand in (forward, backward)
            for offset in (0, 1, 2)
        ]

    @staticmethod
    def _scan_proteins(aminoacids):
        """Collect every 'M' ... '_' stretch from a sequence of amino-acid symbols."""
        finished = []
        open_chains = []
        for amino in aminoacids:
            if amino == "_":
                # A stop codon closes every chain that is currently open.
                finished.extend(open_chains)
                open_chains = []
            else:
                if amino == "M":
                    open_chains.append("")
                open_chains = [chain + amino for chain in open_chains]
        return finished

    @staticmethod
    def _gc_percent(sequence):
        """Return the GC percentage of ``sequence`` (0 for an empty one)."""
        sequence = sequence.upper()
        if len(sequence) == 0:
            return 0
        return round((sequence.count('C') + sequence.count('G')) / len(sequence) * 100, 6)

    # ------------------------------------------------------------------
    # Presentation
    # ------------------------------------------------------------------
    def dna_colors(self):
        """Return the upper-cased sequence with an ANSI colour code before each base."""
        painted = [
            self._COLORS.get(nuc, self._RESET) + nuc
            for nuc in self.dna.upper()
        ]
        return ''.join(painted) + self._RESET

    def values(self):
        """Return ``[formatted sequence with pairing bars, label, sequence type]``."""
        return [f"DNA: \n 5' {self.dna} 3' \n" + f"    {self._bars()}", self.label, self.seq_type_]

    def validate_seg_dna(self):
        """Return True when every symbol belongs to the alphabet of the sequence type.

        The check is case-insensitive for string sequences, matching the other
        methods, which all upper-case the sequence before using it.
        """
        symbols = self.dna.upper() if isinstance(self.dna, str) else self.dna
        return set(NUCLEOTIDE_BASE[self.seq_type_]).issuperset(symbols)

    def seq_type(self):
        """Return the sequence type."""
        return self.seq_type_

    def seq_info(self):
        """Return a four-line summary: label, sequence, type and length."""
        return f"[Label]: {self.label}\n[Sequence]: {self.dna}\n[Type]: {self.seq_type_}\n[Length]: {len(self.dna)}"

    # ------------------------------------------------------------------
    # Basic sequence operations
    # ------------------------------------------------------------------
    def count_nucleodites(self):
        """Count each base of the sequence (case-insensitive).

        Returns:
            dict keyed 'A', 'T', 'C', 'G' for DNA, or 'A', 'U', 'C', 'G' for
            any other sequence type, mapped to occurrence counts.

        Raises:
            KeyError: if the sequence contains a symbol outside that alphabet.
        """
        bases = 'ATCG' if self.seq_type_ == "DNA" else 'AUCG'
        nucleodites_count = dict.fromkeys(bases, 0)
        for nuc in self.dna.upper():
            nucleodites_count[nuc] += 1
        return nucleodites_count

    def transcription(self):
        """Return the RNA transcript of a DNA sequence as a formatted string.

        For non-DNA sequences an explanatory message is returned instead.
        """
        if self.seq_type_ == "DNA":
            return f"RNA: \n 5' {self.dna.upper().replace('T','U')} 3' \n" + f"    {self._bars()}"
        return "Not a DNA sequence, Please add a DNA sequence to transcription."

    def reverse_complement(self):
        """Return the reverse complement as a formatted string with pairing bars."""
        reversed_strand = self.dna.upper().translate(self._complement_table())[::-1]
        return f"DNA: \n 3' {reversed_strand} 5' \n" + f"    {self._bars()}"

    def dna_complement(self):
        """Return the complementary strand, pairing bars and the original sequence.

        The pairing rules follow the sequence type (A-T for DNA, A-U for RNA).
        """
        complement = self.dna.upper().translate(self._complement_table())
        return (f"DNA Complement: \n 5' {complement} 3' \n" +
                f"    {self._bars()}\n" +
                f"    {self.dna}   ")

    # ------------------------------------------------------------------
    # GC content
    # ------------------------------------------------------------------
    def gc_content(self):
        """Return the GC percentage of the sequence (case-insensitive), 0 if empty."""
        return self._gc_percent(self.dna)

    def gc_content_windows(self, k=20):
        """Return the GC percentage of consecutive, non-overlapping windows of length ``k``.

        A trailing fragment shorter than ``k`` is not reported.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive window length")
        sequence = self.dna.upper()
        return [
            self._gc_percent(sequence[start:start + k])
            for start in range(0, len(sequence) - k + 1, k)
        ]

    def gc_content_subsets(self, Fastadict, k=20):
        """Return the FASTA record with the highest GC content.

        Args:
            Fastadict: dict mapping record headers to sequences.
            k: accepted for backward compatibility only; it does not affect
                the result. Use ``gc_content_windows(k)`` for per-window values.

        Returns:
            A two-line string: the header with the highest GC percentage and
            that percentage.

        Raises:
            ValueError: if ``Fastadict`` is empty.
        """
        DictGC = {key: self._gc_percent(value) for (key, value) in Fastadict.items()}
        maxGC = max(DictGC, key=DictGC.get)
        return f'{maxGC}\n{DictGC[maxGC]}'

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------
    def translate_dna(self, position_initial=0):
        """Translate the sequence into a list of one-letter amino-acid codes.

        Uses ``DNA_Codons`` for DNA and ``RNA_Codons`` otherwise. When the
        stored sequence is a list, each string element is translated and the
        results are concatenated; non-string elements are skipped.

        Args:
            position_initial: index of the first base of the first codon.

        Returns:
            A list of amino-acid symbols ('_' marks a stop codon); empty when
            the stored sequence is neither a string nor a list.
        """
        if isinstance(self.dna, str):
            return self._translate(self.dna, position_initial)
        if isinstance(self.dna, list):
            translated_dna = []
            for item in self.dna:
                if isinstance(item, str):
                    translated_dna.extend(self._translate(item, position_initial))
            return translated_dna
        return []

    def codon_freq(self, aminoacid):
        """Return the relative usage of each codon that encodes ``aminoacid``.

        The sequence is read in frame 0. Shares are rounded to two decimals;
        the result is empty when the amino acid does not occur.
        """
        table = self._codon_table()
        sequence = self.dna.upper()
        matching_codons = []
        for start in range(0, len(sequence) - 2, 3):
            codon = sequence[start:start + 3]
            if table[codon] == aminoacid:
                matching_codons.append(codon)

        # collections.Counter is the real counter class; the Counter exported
        # by typing_extensions is only a typing alias.
        codon_counts = collections.Counter(matching_codons)
        total_freq = sum(codon_counts.values())
        return {codon: round(count / total_freq, 2) for codon, count in codon_counts.items()}

    # In molecular biology, a reading frame is one of the possible ways to
    # read a nucleic-acid sequence as consecutive triplets. Triplets that map
    # to amino acids or stop signals during translation are called codons.
    def reading_frames(self):
        """Return the six reading frames: offsets 0-2 forward, then 0-2 of the reverse complement."""
        return self._six_frames(self.dna)

    def proteins(self, aminoacids=None):
        """Return every candidate protein ('M' ... '_') in a sequence of amino-acid symbols.

        Args:
            aminoacids: iterable of amino-acid symbols, e.g. one reading
                frame. When omitted, the stored sequence is translated in
                frame 0 first. (Scanning the raw nucleotides, as was done
                before, can never find 'M' or '_' and always returned [].)
        """
        if aminoacids is None:
            aminoacids = self.translate_dna()
        return self._scan_proteins(aminoacids)

    def frames_proteins(self, inicio=0, final=0, order='False'):
        """Compute all candidate proteins over the six reading frames.

        Protein search DB: https://www.ncbi.nlm.nih.gov/nuccore/NM_001185097.2
        (an API can be used to pull protein info).

        Args:
            inicio: start index of the region to scan.
            final: end index (exclusive); the slice is applied only when
                ``final > inicio``, otherwise the whole sequence is scanned.
            order: sort by increasing protein length when set to the legacy
                string ``'True'`` or the boolean ``True``.

        Returns:
            A list of protein strings collected frame by frame.
        """
        region = self.dna[inicio:final] if final > inicio else self.dna

        find_prot = []
        for frame in self._six_frames(region):
            find_prot.extend(self._scan_proteins(frame))

        if order is True or order == 'True':
            find_prot.sort(key=len)
        return find_prot


In [None]:
# Load the GC test records and keep the first one as the working sequence.
Fastadict = read_FASTA('GC_TEST.txt')
first_sequence_key, first_sequence_value = list(Fastadict.items())[0]
dna_random = first_sequence_value

In [None]:
# Demo of the object-oriented API: wrap the loaded sequence and run every
# analysis method on it. Construction asserts that the sequence is valid.
dna_obj = bioinformatics_funcions(dna=dna_random)
print('\n Check if my Data is a DNA Sequence \n')
print(dna_obj.validate_seg_dna())
print('\n My List of nucleodites \n')
# values()[0] is the sequence already formatted with 5'/3' markers and pairing bars.
print(dna_obj.values()[0])
#print(f"DNA: \n 5' {dna_obj.values()[0]} 3' \n"+f"    {''.join(['|' for nuc in range(len(dna_obj.values()[0]))])}")
print('\n Check the Sequence Type \n')
print(dna_obj.seq_type(),
dna_obj.seq_info())
print('\n Check the Number of Nucleodites I have in my DNA list \n')
print(dna_obj.count_nucleodites())
#print(f"DNA + Reverse: \n 5' {dna_colors(dna_random)} 3'")
#print(f"    {''.join(['|' for nuc in range(len(dna_random))])}")
print('\n Transcription of DNA to RNA \n')
print(dna_obj.transcription())
#print(f"RNA: \n 5' {dna_obj.transcription()} 3' \n"+f"    {''.join(['|' for nuc in range(len(dna_obj.transcription()))])}")
print('\n Implement my DNA  Reverse Complement \n')
#print(f"DNA: \n 3' {dna_obj.reverse_complement()} 5' \n"+f"    {''.join(['|' for nuc in range(len(dna_obj.reverse_complement()))])}")
print(dna_obj.reverse_complement())
print('\n Implement my DNA complement \n')
#print(f"DNA Complement: \n 5' {dna_obj.dna_complement()} 3' \n"+f"    {''.join(['|' for nuc in range(len(dna_obj.dna_complement()))])}")
print(dna_obj.dna_complement())
print('\n Calculate my GC Content total and subsets \n')
print(dna_obj.gc_content())
# Returns the header and GC percentage of the GC-richest record in Fastadict.
print(dna_obj.gc_content_subsets(Fastadict))
print(f'\n Amino Acid Sequence \n')
print(dna_obj.translate_dna())
print(f'\n Frequency of a codon and returns the amino acid \n')
# Codon usage for proline ('P').
print(dna_obj.codon_freq('P'))
print(f'\n Reading Frames \n')
for reading in dna_obj.reading_frames():
  print(reading)
print(f'\n List of Proteins \n')
for protein in dna_obj.frames_proteins():
  print(protein)

# Writing to and reading from a file (mode 'a' appends instead of overwriting):
#writeTextFile('test.txt', dna_random, 'a')
#readTextFile('test.txt')


 Check if my Data is a DNA Sequence 

True

 My List of nucleodites 

DNA: 
 5' AGCCCTCCAGGACAGGCTGCATCAGAAGAGGCCATCAAGCAGGTCTGTTCCAAGGGCCTTTGCGTCAGATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCTGCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGCAGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTACCAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAGACGCAGCCCGCAGGCAGCCCCACACCCGCCGCCTCCTGCACCGAGAGAGATGGAATAAAGCCCTTGAACCAGC 3' 
    |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [10]:
def hamming_dist(dna1, dna2):
    """Return a sentence stating the Hamming distance between two sequences.

    The Hamming distance is the number of positions at which two sequences
    of equal length differ.

    Args:
        dna1: first sequence.
        dna2: second sequence, of the same length as ``dna1``.

    Returns:
        The string "The Hamming Distance is <n> unitis".

    Raises:
        ValueError: if the sequences differ in length. zip() would otherwise
            silently ignore the surplus positions and under-report the distance.
    """
    if len(dna1) != len(dna2):
        raise ValueError(
            "Hamming distance is only defined for sequences of equal length "
            f"(got {len(dna1)} and {len(dna2)})"
        )
    mismatches = sum(1 for n1, n2 in zip(dna1, dna2) if n1 != n2)
    return f"The Hamming Distance is {mismatches} unitis"
hamming_dist('TCGAAA','TAGCAA')

'The Hamming Distance is 2 unitis'

Genomic Project

In [24]:
class genome_python:
    """K-mer utilities for genome sequences.

    A k-mer is a substring of length k of a DNA, RNA or protein sequence.
    A sequence of length n contains n - k + 1 overlapping k-mers.
    """

    def __init__(self):
        """Announce creation of the helper on standard output."""
        print('Genome class')

    def count_kmers(self, dna, kmer):
        """Count the (possibly overlapping) occurrences of ``kmer`` in ``dna``.

        Args:
            dna: the sequence to scan.
            kmer: the substring to look for.

        Returns:
            The number of start positions at which ``kmer`` occurs;
            0 for an empty ``kmer`` or one longer than ``dna``.
        """
        k = len(kmer)
        if k == 0:
            return 0
        # Valid start positions are 0 .. len(dna) - k inclusive, so the last
        # position is also checked for 1-mers.
        return sum(
            1 for start in range(len(dna) - k + 1)
            if dna[start:start + k] == kmer
        )

    def find_frequent_kmers(self, dna, k_len):
        """Find the most frequent k-mers of length ``k_len`` in ``dna``.

        Returns:
            All k-mers that share the highest count, in order of first
            appearance; an empty list when ``k_len`` is smaller than 1 or
            larger than the sequence.
        """
        if k_len < 1 or k_len > len(dna):
            return []

        kmer_freq = {}
        for start in range(len(dna) - k_len + 1):
            kmer = dna[start:start + k_len]
            kmer_freq[kmer] = kmer_freq.get(kmer, 0) + 1

        highest_frequency = max(kmer_freq.values())
        return [
            kmer for kmer, frequency in kmer_freq.items()
            if frequency == highest_frequency
        ]


In [29]:
# Demo of the k-mer helpers on a short hand-written sequence.
dna='ATCGATCGATCGATCGACTGACTAG'
kmer='TC'
# Length of the k-mers to rank by frequency ("tamanho" is Portuguese for "size").
tamanho=3
genome=genome_python()
print(f'\n The total of {kmer} repetions \n')
print(genome.count_kmers(dna,kmer))
print(f'\n The most Frequent kmers with lengh {tamanho} \n')
print(genome.find_frequent_kmers(dna,tamanho))

Genome class

 The total of TC repetions 

4

 The most Frequent kmers with lengh 3 

['ATC', 'TCG', 'CGA']
