<a href="https://colab.research.google.com/github/Baek-Donghyeon/Bioinformatics-Algorithms/blob/main/Bioinformatics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bioinformatics

In [None]:
import numpy
import random
import re
import sys
sys.setrecursionlimit(3000)

# k-mer counting and frequent-word helpers
def PatternCount(Text, Pattern):
  """Count the occurrences of Pattern in Text, overlapping ones included."""
  width = len(Pattern)
  windows = (Text[start:start+width] for start in range(len(Text)-width+1))
  return sum(1 for window in windows if window == Pattern)


def FrequentWords(Text, k):
  """Find the most frequent k-mers in a string.

  A most frequent k-mer maximizes its (overlapping) occurrence count
  among all k-mers of Text.

  Returns a set of the k-mers that reach the maximum count; the set is
  empty when Text holds no k-mer at all (len(Text) < k).
  """
  from collections import Counter

  # +1 so that the k-mer ending at the last character is counted as well.
  Count = Counter(Text[i:i+k] for i in range(len(Text)-k+1))
  if not Count:
    return set()
  maxCount = max(Count.values())
  return {Pattern for Pattern, hits in Count.items() if hits == maxCount}


def PatternToNumber(Pattern):
  """Convert a DNA string to a number.

  Symbols are read left to right as base-4 digits, each digit being
  SymbolToNumber(symbol); the empty pattern maps to 0.
  """
  number = 0
  for Symbol in Pattern:
    number = 4*number + SymbolToNumber(Symbol)
  return number


def SymbolToNumber(Symbol):
  """Map a nucleotide to its rank: 'A' -> 0, 'C' -> 1, 'G' -> 2, 'T' -> 3.

  Raises KeyError for any other symbol.
  """
  ranks = dict(zip('ACGT', range(4)))
  return ranks[Symbol]


def NumberToSymbol(r):
  """Map a rank back to its nucleotide: 0 -> 'A', 1 -> 'C', 2 -> 'G', 3 -> 'T'.

  Raises KeyError for any other value.
  """
  symbols = dict(enumerate('ACGT'))
  return symbols[r]


def NumberToPattern(index, k):
  """Convert an integer to its corresponding DNA string of length k.

  The last symbol encodes index % 4 and the prefix is built recursively
  from index // 4 with length k-1; the recursion bottoms out at k == 1.
  """
  if k != 1:
    prefixIndex, r = divmod(index, 4)
    return NumberToPattern(prefixIndex, k-1) + NumberToSymbol(r)
  return NumberToSymbol(index)


def ComputingAFrequencyArray(Text, k):
  """Generate the frequency array of a DNA string.

  The result is a list of length 4**k whose i-th entry is the number of
  times the k-mer with index i (as given by PatternToNumber) occurs in
  Text.
  """
  FrequencyArray = [0 for _ in range(4**k)]
  last_start = len(Text) - k
  start = 0
  while start <= last_start:
    FrequencyArray[PatternToNumber(Text[start:start+k])] += 1
    start += 1
  return FrequencyArray


def ReverseCompliment(Text):
  """Find the reverse complement of a DNA string.

  Each base is swapped with its partner (A<->T, G<->C) and the result is
  reversed. Raises KeyError on any symbol outside A, C, G, T.
  """
  pairing = dict(zip('ATGC', 'TACG'))
  complemented = [pairing[base] for base in Text]
  return ''.join(reversed(complemented))


def PatternMatching(Genome, Pattern):
  """Find all occurrences of a pattern in a string.

  Returns the 0-based start positions, in increasing order, of every
  (possibly overlapping) occurrence of Pattern in Genome.
  """
  width = len(Pattern)
  # +1 so that an occurrence ending at the last character is reported too.
  return [i for i in range(len(Genome)-width+1) if Genome[i:i+width] == Pattern]


def ClumpFinding(Genome, k, L, t):
  """Find patterns forming clumps in a string.

  A k-mer forms an (L, t)-clump when some interval of Genome of length L
  contains at least t occurrences of it.

  Returns the clump-forming k-mers in lexicographic order.
  """
  from collections import defaultdict

  # Record every start position of every k-mer in one pass over Genome.
  Positions = defaultdict(list)
  for start in range(len(Genome)-k+1):
    Positions[Genome[start:start+k]].append(start)

  Clumps = []
  for Pattern in sorted(Positions):
    Position = Positions[Pattern]
    # t occurrences fit in a window of length L when the first and the
    # t-th start positions are at most L-k apart.
    if any(Position[j+t-1]-Position[j] <= L-k for j in range(len(Position)-t+1)):
      Clumps.append(Pattern)
  return Clumps


def MinimumSkew(Genome):
  """Find the positions in a genome minimizing the skew.

  The skew at position i is (#G - #C) over the first i symbols of
  Genome; position 0 has skew 0. Returns every position (0..len(Genome))
  at which the skew reaches its minimum.
  """
  step = {'G': 1, 'C': -1}
  Skew = [0]
  for base in Genome:
    Skew.append(Skew[-1] + step.get(base, 0))
  lowest = min(Skew)
  return [position for position, value in enumerate(Skew) if value == lowest]


def HammingDistance(p, q):
  """Compute the Hamming distance between two DNA strings.

  Counts the positions i < len(p) at which p[i] differs from q[i].
  """
  return sum(1 for i in range(len(p)) if p[i] != q[i])


def ApproximatePatternMatching(Text, Pattern, d):
  """Find all approximate occurrences of a pattern in a string.

  Returns the start positions of every window of Text whose Hamming
  distance to Pattern is at most d.
  """
  width = len(Pattern)
  return [start for start in range(len(Text)-width+1)
          if HammingDistance(Pattern, Text[start:start+width]) <= d]


def Neighbors(Pattern, d):
  """Find all the neighbors of a pattern.

  Returns a list of every string of the same length as Pattern over
  A, C, G, T that differs from Pattern in at most d positions.
  """
  # Always return a list: callers extend()/update() with the result, and
  # a bare string would be consumed character by character.
  if d == 0 or not Pattern:
    return [Pattern]
  if len(Pattern) == 1:
    return ['A', 'C', 'G', 'T']
  Suffix = Pattern[1:]
  Neighborhood = []
  for Text in Neighbors(Suffix, d):
    mismatches = sum(a != b for a, b in zip(Suffix, Text))
    if mismatches < d:
      # Budget left: the first symbol may be anything.
      Neighborhood.extend(x+Text for x in 'ACGT')
    else:
      # Budget spent on the suffix: the first symbol must match.
      Neighborhood.append(Pattern[0]+Text)
  return Neighborhood


def FrequantWordsWithMismatches(Text, k, d):
  """Find the most frequent k-mers with mismatches in a string.

  Every k-mer of Text votes for each member of its d-neighborhood; the
  k-mers with the highest vote count are returned in index order.
  """
  FreqArrays = [0] * (4**k)
  for start in range(len(Text)-k+1):
    for neighbor in Neighbors(Text[start:start+k], d):
      FreqArrays[PatternToNumber(neighbor)] += 1
  Maximum = max(FreqArrays)
  return [NumberToPattern(code, k)
          for code, votes in enumerate(FreqArrays) if votes == Maximum]


def FrequantWordsWithMismatchesAndReverseComplements(Text, k, d):
  """Find the most frequent k-mers (with mismatches and reverse complements).

  Every k-mer of Text and of its reverse complement votes for each
  member of its d-neighborhood; the k-mers with the highest vote count
  are returned in index order.
  """
  FreqArrays = [0] * (4**k)
  RC = ReverseCompliment(Text)
  for start in range(len(Text)-k+1):
    for strand in (Text, RC):
      for neighbor in Neighbors(strand[start:start+k], d):
        FreqArrays[PatternToNumber(neighbor)] += 1
  Maximum = max(FreqArrays)
  return [NumberToPattern(code, k)
          for code, votes in enumerate(FreqArrays) if votes == Maximum]


def MotifEnumeration(Dna, k, d):
  """Find all (k, d)-motifs in a collection of strings.

  A k-mer is a (k, d)-motif if it appears in every string of Dna with at
  most d mismatches. Returns the set of such k-mers.
  """
  Patterns = []
  for string in Dna:
    reachable = set()
    for start in range(len(string)-k+1):
      reachable |= set(Neighbors(string[start:start+k], d))
    Patterns.append(reachable)
  Motif = Patterns[0]
  for candidates in Patterns:
    Motif = Motif.intersection(candidates)
  return Motif


def DistanceBetweenPatternAndStrings(Pattern, Dna):
  """Find the distance between a pattern and a set of strings.

  For each string the contribution is the smallest Hamming distance
  between Pattern and any window of the string, capped at len(Pattern);
  the contributions are summed.
  """
  k = len(Pattern)
  distance = 0
  for string in Dna:
    window_distances = [HammingDistance(Pattern, string[i:i+k])
                        for i in range(len(string)-k+1)]
    distance += min([k] + window_distances)
  return distance


def MedianString(Dna, k):
  """Find a median string.

  A median string minimizes DistanceBetweenPatternAndStrings(Pattern, Dna)
  over all k-mers. The first k-mer (in index order) whose distance is
  strictly below len(Dna)*k and below every earlier one wins; '' is
  returned if no k-mer beats that initial bound.
  """
  best = (len(Dna)*k, '')
  for code in range(4**k):
    Pattern = NumberToPattern(code, k)
    CurrentDistance = DistanceBetweenPatternAndStrings(Pattern, Dna)
    if CurrentDistance < best[0]:
      best = (CurrentDistance, Pattern)
  return best[1]


def ProfileMostProbableKmer(Text, k, Profile):
  """Find a Profile-most probable k-mer in a string.

  The probability of a k-mer is the product of Profile[symbol][column]
  over its columns. Returns the first k-mer of Text with the highest
  probability, or '' when Text holds no k-mer (len(Text) < k).

  Text may be any sliceable sequence whose items index Profile.
  """
  # Start below any possible probability so that the first k-mer wins
  # even when every k-mer has probability 0.
  BestScore = -1
  Kmer = ''
  # +1 so that the k-mer ending at the last symbol is examined too.
  for i in range(len(Text)-k+1):
    kmer = Text[i:i+k]
    CurrentScore = 1
    for j in range(k):
      CurrentScore *= Profile[kmer[j]][j]
    if CurrentScore > BestScore:
      BestScore = CurrentScore
      Kmer = kmer
  return Kmer


def Score(Motifs, Profile):
  """Score a set of motifs against a profile (higher is better).

  Each motif contributes the product of Profile[symbol][column] over
  its first k columns, where k is the length of the first motif; the
  contributions are summed. Returns 0 for an empty motif list.
  """
  if not Motifs:
    return 0
  k = len(Motifs[0])
  Total = 0
  for motif in Motifs:
    CurrentScore = 1
    for i in range(k):
      # Accumulate into the per-motif product, not the running total.
      CurrentScore *= Profile[motif[i]][i]
    Total += CurrentScore
  return Total


def ProfileFormation(Motifs, k):
  """Build a 4-by-k profile matrix from motifs in number representation.

  Every cell starts at 1 and is incremented once per motif whose symbol
  at that column indexes the row; the counts are then divided by
  len(Motifs)+4. Returns a numpy array.
  """
  counts = [[1] * k for _ in range(4)]
  for column in range(k):
    for motif in Motifs:
      row = motif[column]
      counts[row][column] += 1
  denominator = len(Motifs) + 4
  return numpy.array(counts) / denominator


def GreedyMotifSearch(Dna, k, t):
  """Greedy motif search with pseudocount profiles.

  For every k-mer of the first string, motifs are picked greedily from
  strings 1..t-1 as the most probable k-mer under the profile of the
  motifs chosen so far. The motif set with the highest Score against
  its own profile is kept.

  Dna is a list of DNA strings; returns a list of k-mer strings.
  """
  # ProfileFormation indexes its rows by number, so work on the number
  # representation and convert back at the end.
  NumDna = [[SymbolToNumber(Symbol) for Symbol in line] for line in Dna]
  BestMotifs = [line[:k] for line in NumDna]
  # Each motif set is scored against its own profile.
  BestScore = Score(BestMotifs, ProfileFormation(BestMotifs, k))
  w = len(NumDna[0])
  for x in range(w-k+1):
    Motifs = [NumDna[0][x:x+k]]
    for y in range(1, t):
      Profile = ProfileFormation(Motifs, k)
      Motifs.append(ProfileMostProbableKmer(NumDna[y], k, Profile))
    CurrentScore = Score(Motifs, ProfileFormation(Motifs, k))
    if CurrentScore > BestScore:
      BestMotifs = Motifs
      BestScore = CurrentScore
  return [''.join([NumberToSymbol(Number) for Number in line]) for line in BestMotifs]


def RandomizedMotifSearch(Dna, k, t):
  """Randomized motif search, restarted 1000 times.

  Each restart begins from random k-mers (one per string) and repeatedly
  replaces the motifs by the profile-most probable k-mers of every
  string while that improves on the best motifs seen so far.

  Dna is a list of DNA strings; returns a list of k-mer strings.
  """
  w = len(Dna[0])
  # from Symbol to Number representation
  NumDna = [[SymbolToNumber(Symbol) for Symbol in list(line)] for line in Dna]

  def RandomMotifs():
    # One uniformly chosen k-mer from each of the first t strings.
    Chosen = []
    for i in range(t):
      j = random.randrange(w-k+1)
      Chosen.append(NumDna[i][j:j+k])
    return Chosen

  Motifs = RandomMotifs()
  BestMotifs = Motifs
  for x in range(1000):
    while True:
      Profile = ProfileFormation(Motifs, k)
      Motifs = [ProfileMostProbableKmer(Text, k, Profile) for Text in NumDna]
      if Score(Motifs, Profile) > Score(BestMotifs, Profile):
        BestMotifs = Motifs
      else:
        break
    Motifs = RandomMotifs()
  # from Number to Symbol representation
  return [''.join([NumberToSymbol(Number) for Number in line]) for line in BestMotifs]


def GibbsSampler(Dna, k, t, N):
  """Iteratively refine motifs by re-choosing one motif at a time.

  Starting from random k-mers (one per string), each of the N steps
  removes one randomly selected motif, builds a profile from the
  others, and re-inserts a k-mer of that string chosen under the
  profile. The best-scoring motif set seen is returned.

  Dna is a list of DNA strings; returns a list of k-mer strings.
  """
  w = len(Dna[0])
  # from Symbol to Number representation
  NumDna = [[SymbolToNumber(Symbol) for Symbol in list(line)] for line in Dna]
  Motifs = []
  for i in range(t):
    j = random.randrange(w-k+1)
    Motifs.append(NumDna[i][j:j+k])
  # Snapshot: Motifs is mutated in place below, so a plain alias would
  # make BestMotifs silently follow every change.
  BestMotifs = list(Motifs)
  for j in range(N):
    i = random.randrange(t)
    Motifs.pop(i)
    Profile = ProfileFormation(Motifs, k)
    # NOTE(review): the original comment asks for a profile-randomly
    # generated k-mer, but the most probable k-mer is used - confirm
    # whether weighted random sampling was intended.
    Motifs.insert(i, ProfileMostProbableKmer(NumDna[i], k, Profile))
    if Score(Motifs, Profile) > Score(BestMotifs, Profile):
      BestMotifs = list(Motifs)
  return [''.join([NumberToSymbol(Number) for Number in line]) for line in BestMotifs]


def Composition(Text, k):
  """Generate the k-mer composition of a string.

  Returns every k-mer substring of Text (repeats included) in
  lexicographic order.
  """
  return sorted(Text[start:start+k] for start in range(len(Text)-k+1))


def Reconstruct(Patterns):
  """Find the string spelled by a genome path.

  The result is the first pattern followed by the last symbol of every
  later pattern.
  """
  head = Patterns[0]
  tail = [Patterns[i][-1] for i in range(1, len(Patterns))]
  return head + ''.join(tail)


def Overlap(Patterns):
  """Construct the overlap graph of a collection of k-mers.

  The overlap graph has a node for each k-mer in Patterns and a directed
  edge from Pattern to Pattern' whenever Suffix(Pattern) equals
  Prefix(Pattern').

  Returns the edges as [source, target] pairs sorted by source; edges of
  one source keep the order in which the targets appear in Patterns.
  Patterns itself is left untouched.
  """
  from collections import defaultdict

  # Index the k-mers by prefix so each suffix lookup is a dict access.
  ByPrefix = defaultdict(list)
  for kmer in Patterns:
    ByPrefix[kmer[:-1]].append(kmer)

  AdjList = []
  for kmer in Patterns:
    for successor in ByPrefix.get(kmer[1:], []):
      AdjList.append([kmer, successor])
  AdjList.sort(key = lambda x: x[0])
  return AdjList

#The de Bruijn graph DeBruijnk(Text) is formed by gluing identically labeled nodes in PathGraphk(Text).
def DeBrujinText(Text, k):
  """Construct the de Bruijn graph of a string.

  Every k-mer of Text contributes an edge from its (k-1)-prefix to its
  (k-1)-suffix; edges leaving the same node are glued together.

  Returns a list of [node, successor, ...] rows sorted by node, with
  successors in the order their edges occur in Text. The list is empty
  when Text holds no k-mer.
  """
  Successors = {}
  for i in range(len(Text)-k+1):
    Successors.setdefault(Text[i:i+k-1], []).append(Text[i+1:i+k])
  return [[node] + Successors[node] for node in sorted(Successors)]


def DeBrujinPatterns(Patterns, k):
  """Construct the de Bruijn graph from a collection of k-mers.

  Every k-mer contributes an edge from its prefix (all but the last
  symbol) to its suffix (all but the first symbol); edges leaving the
  same node are glued together. The k argument is not used.

  Returns a list of [node, successor, ...] rows sorted by node, with
  successors in the order their k-mers occur in Patterns. The list is
  empty when Patterns is empty.
  """
  Successors = {}
  for Kmer in Patterns:
    Successors.setdefault(Kmer[:-1], []).append(Kmer[1:])
  return [[node] + Successors[node] for node in sorted(Successors)]


def DeBrujinPaired(PairedComposition, k):
  """Construct the de Bruijn graph from a collection of paired k-mers.

  Every pair (a, b) contributes an edge from [prefix(a), prefix(b)] to
  [suffix(a), suffix(b)]; edges leaving the same node are glued
  together. The k argument is not used.

  Returns a list of [node, successor, ...] rows sorted by node, where
  node and each successor are two-element lists. The list is empty when
  PairedComposition is empty.
  """
  # Lists are unhashable, so nodes are keyed by tuple while grouping.
  Successors = {}
  for Kmer in PairedComposition:
    source = (Kmer[0][:-1], Kmer[1][:-1])
    Successors.setdefault(source, []).append([Kmer[0][1:], Kmer[1][1:]])
  return [[list(node)] + Successors[node] for node in sorted(Successors)]


def EulerianPath(PathGraph):
  """Find an Eulerian path in a graph given as adjacency rows.

  PathGraph is a list of [node, successor, ...] rows. The walk starts at
  the first node whose out-degree exceeds its in-degree by one; when the
  graph has no such node (every node balanced), it starts at the first
  node that has an outgoing edge, yielding an Eulerian cycle.

  Returns the visited nodes in path order, or [] for a graph without
  edges. PathGraph is not modified.
  """
  AdjDict = {}
  DegreeDict = {}  # out-degree minus in-degree
  for Nodes in PathGraph:
    for Node in Nodes:
      AdjDict.setdefault(Node, [])
      DegreeDict.setdefault(Node, 0)
  for Nodes in PathGraph:
    AdjDict[Nodes[0]] = list(Nodes[1:])
    DegreeDict[Nodes[0]] = len(Nodes) - 1
  for EndNodes in AdjDict.values():
    for EndNode in EndNodes:
      DegreeDict[EndNode] -= 1

  # Choosing a start point
  Start = None
  for index in DegreeDict:
    if DegreeDict[index] == 1:
      Start = index
      break
  else:
    for index in AdjDict:
      if AdjDict[index]:
        Start = index
        break
    else:
      return []

  # Explicit-stack version of the depth-first edge walk: a node is
  # emitted once all of its outgoing edges are used up.
  Graph = []
  Stack = [Start]
  while Stack:
    Top = Stack[-1]
    if AdjDict[Top]:
      Stack.append(AdjDict[Top].pop())
    else:
      Graph.append(Stack.pop())
  Graph.reverse()
  return Graph


def DecimalToBinary(Number, k):
  """Render Number as a k-character binary string.

  The last character is Number % 2 and the prefix is built recursively
  from Number // 2 with length k-1; at k == 1 the remaining value is
  written out with str().
  """
  if k != 1:
    prefix, r = divmod(Number, 2)
    return DecimalToBinary(prefix, k-1) + str(r)
  return str(Number)


def KUnivercialCircularRing(k):
  """Walk the binary de Bruijn graph on (k-1)-bit nodes.

  A k-universal circular string is a circular string that contains
  every possible k-mer constructed over a given alphabet. Each
  (k-1)-bit node has edges to its two one-bit shifts; all edges are
  walked starting from the all-zero node.

  Returns the list of visited nodes with the last k-1 entries removed.
  """
  AdjDict = {}
  for i in range(2**(k-1)):
    Binary = DecimalToBinary(i, k-1)
    shifted = Binary[1:]
    AdjDict.setdefault(Binary, [shifted+'1', shifted+'0'])

  # Explicit-stack edge walk: a node is emitted once it has no unused
  # outgoing edge, and the emission order is reversed at the end.
  Graph = []
  pending = ['0'*(k-1)]
  while pending:
    current = pending[-1]
    if AdjDict[current]:
      pending.append(AdjDict[current].pop())
    else:
      Graph.append(pending.pop())
  Graph.reverse()
  return Graph[:-(k-1)]


def StringReconsturctionFromReadPairs(PathGraph, k, d):
  """Reconstruct a string from its paired composition.

  A (k,d)-mer is a pair of k-mers in Text separated by distance d.
  PathGraph holds rows of [node, successor, ...] where every node is a
  two-element sequence. The walk starts at the first node whose
  out-degree exceeds its in-degree by one.

  Returns the first components of all nodes on the path, followed by
  the second components of the last k+d nodes.
  """
  AdjDict = {}
  DegreeDict = {}
  for Pair in PathGraph:
    for Node in Pair:
      key = tuple(Node)
      AdjDict.setdefault(key)
      DegreeDict.setdefault(key, 0)

  for Pair in PathGraph:
    source, targets = tuple(Pair[0]), Pair[1:]
    AdjDict[source] = targets
    DegreeDict[source] = len(targets)
  for targets in AdjDict.values():
    # Nodes without outgoing edges are stored as None.
    for target in targets or []:
      DegreeDict[tuple(target)] -= 1

  PairedGraph = []
  for start in DegreeDict:
    if DegreeDict[start] == 1:
      # Explicit-stack edge walk; nodes are emitted when exhausted and
      # the emission order is reversed afterwards.
      pending = [start]
      while pending:
        current = pending[-1]
        if AdjDict[current]:
          pending.append(tuple(AdjDict[current].pop()))
        else:
          PairedGraph.append(list(pending.pop()))
      PairedGraph.reverse()
      break

  Graph = [Pair[0] for Pair in PairedGraph]
  Graph.extend(PairedGraph[i][1] for i in range(-k-d, 0))
  return Graph


def ContigGeneration(PathGraph):
  """Generate the contigs from a collection of reads (with imperfect coverage).

  PathGraph holds rows of [node, successor, ...]. A path is
  non-branching if in(v) = out(v) = 1 for each intermediate node v.
  Starting from every node that still has an outgoing edge and is not a
  1-in-1-out node, edges are followed until a branching node or a node
  without unused edges is reached; each such path is spelled with
  Reconstruct.
  """
  AdjDict = {}
  DegreeDict = {} # Degree : # of nodes [In, Out]
  for Nodes in PathGraph:
    for Node in Nodes:
      AdjDict.setdefault(Node)
      DegreeDict.setdefault(Node, [0,0])
  for Nodes in PathGraph:
    successors = Nodes[1:]
    AdjDict[Nodes[0]] = successors
    DegreeDict[Nodes[0]][1] = len(successors)

  for successors in AdjDict.values():
    # Nodes without outgoing edges are stored as None.
    for successor in successors or []:
      DegreeDict[successor][0] += 1

  def Search(key, contig):
    node = key
    while True:
      node = AdjDict[node].pop()
      contig.append(node)
      # Finish Search if node is a branch point or has no out
      if DegreeDict[node] != [1,1] or not AdjDict[node]:
        return Reconstruct(contig)

  Contigs = []
  progressed = True
  while progressed:
    progressed = False
    for key in AdjDict:
      # key is a branch point with an unused outgoing edge
      if AdjDict[key] and DegreeDict[key] != [1,1]:
        Contigs.append(Search(key, [key]))
        progressed = True

  return Contigs


def ProteinTranslation(Pattern):
  """Translate an RNA string into an amino acid string.

  Codons are read in steps of three from the start; translation stops
  at the first UAA, UAG or UGA codon, and trailing symbols that do not
  fill a codon are ignored.
  """
  StopCodons = ('UAA', 'UAG', 'UGA')
  Protein = []
  for start in range(0, 3*(len(Pattern)//3), 3):
    Codon = Pattern[start:start+3]
    if Codon in StopCodons:
      break
    Protein.append(GeneticCode[Codon])
  return ''.join(Protein)


def DnaToRna(Dna):
  """Transcribe a DNA string by replacing every 'T' with 'U'."""
  return Dna.replace('T', 'U')


def PeptideEncoding(Text, Peptide):
  """Find substrings of a genome encoding a given amino acid sequence.

  A substring of length 3*len(Peptide) is reported when it, or its
  reverse complement, transcribes and translates to Peptide.
  """
  span = 3*len(Peptide)
  Substrings = []
  for start in range(len(Text)-span+1):
    substring = Text[start:start+span]
    if ProteinTranslation(DnaToRna(substring)) == Peptide:
      Substrings.append(substring)
      continue
    # The reverse strand is only examined when the forward strand fails.
    if ProteinTranslation(DnaToRna(ReverseCompliment(substring))) == Peptide:
      Substrings.append(substring)
  return Substrings


def Cyclospectrum(Peptide):
  """Generate the theoretical spectrum of a cyclic peptide.

  The spectrum holds 0, the mass of the whole peptide, and the mass of
  every subpeptide of length 1..len(Peptide)-1 starting at each
  position and wrapping around the end. Returned sorted ascending.
  """
  PeptideLen = len(Peptide)
  TheoreticalSpectrum = [0, Mass(Peptide)]

  # Doubling the peptide turns wrap-around subpeptides into plain slices.
  Peptide += Peptide
  TheoreticalSpectrum.extend(
      Mass(Peptide[begin:begin+length])
      for length in range(1, PeptideLen)
      for begin in range(PeptideLen))

  TheoreticalSpectrum.sort()
  return TheoreticalSpectrum


def Mass(Peptide):
  """Return the sum of IntegerMass over the amino acids of Peptide."""
  return sum(IntegerMass[AminoAcid] for AminoAcid in Peptide)


def BFCyclopeptideSequencing(Mass):
  """Compute the number of peptides of given total mass.

  Counts the ordered sequences of ReducedMass values that sum to Mass.
  The count is added to the module-level counter cnt (treated as 0 if
  it does not exist yet) and also returned.
  """
  global cnt
  AminoAcidMasses = list(ReducedMass.values())
  Total = 0
  if Mass > 0:
    # Ways[m] = number of ordered mass sequences summing to m. A mass
    # that equals a single amino acid is still decomposed further
    # (e.g. 114 is both N and G+G).
    Ways = [1] + [0] * Mass
    for m in range(1, Mass+1):
      Ways[m] = sum(Ways[m-AminoAcid] for AminoAcid in AminoAcidMasses if AminoAcid <= m)
    Total = Ways[Mass]
  cnt = globals().get('cnt', 0) + Total
  return Total


def CyclopeptideSequencing(Spectrum):
  """Find the cyclic peptides whose theoretical spectrum equals Spectrum.

  Candidates are grown one amino acid (from ReducedMass) at a time. A
  candidate whose total mass equals the largest mass in Spectrum is
  accepted if its Cyclospectrum equals the sorted Spectrum and dropped
  otherwise; a lighter candidate survives only while its linear
  spectrum is contained in Spectrum, multiplicities included.

  Prints the surviving candidates of every round and the final result;
  returns the list of accepted peptides.
  """
  from collections import Counter

  TargetCounts = Counter(Spectrum)
  SortedSpectrum = sorted(Spectrum)
  ParentMass = SortedSpectrum[-1] if SortedSpectrum else 0

  def Expand(CandidatePeptides):
    return [Peptide+AminoAcid for Peptide in CandidatePeptides for AminoAcid in ReducedMass.keys()]

  def LinearSpectrum(Peptide):
    # Prefix masses make every contiguous fragment a single subtraction.
    PrefixMass = [0]
    for AminoAcid in Peptide:
      PrefixMass.append(PrefixMass[-1] + ReducedMass[AminoAcid])
    Fragments = [0]
    for i in range(len(Peptide)):
      for j in range(i+1, len(Peptide)+1):
        Fragments.append(PrefixMass[j] - PrefixMass[i])
    return Fragments

  def IsConsistent(Fragments):
    # A partial peptide is linear, so wrap-around fragments must not be
    # required of it.
    Needed = Counter(Fragments)
    return all(TargetCounts[mass] >= times for mass, times in Needed.items())

  CandidatePeptides = list(ReducedMass.keys())
  FinalPeptides = []
  while True:
    Survivors = []
    for Peptide in CandidatePeptides:
      Fragments = LinearSpectrum(Peptide)
      if max(Fragments) == ParentMass:
        # Full-mass candidates are decided here and never expanded.
        if Cyclospectrum(Peptide) == SortedSpectrum:
          FinalPeptides.append(Peptide)
      elif IsConsistent(Fragments):
        Survivors.append(Peptide)
    CandidatePeptides = Survivors
    if not CandidatePeptides:
      break
    print(CandidatePeptides)
    CandidatePeptides = Expand(CandidatePeptides)
  print(FinalPeptides)
  return FinalPeptides


# RNA codon -> one-letter amino acid code; the three codons mapped to '*'
# (UAA, UAG, UGA) are the ones ProteinTranslation stops at.
GeneticCode = {'AAA':'K','AAC':'N','AAG':'K','AAU':'N','ACA':'T','ACC':'T','ACG':'T','ACU':'T',
               'AGA':'R','AGC':'S','AGG':'R','AGU':'S','AUA':'I','AUC':'I','AUG':'M','AUU':'I',
               'CAA':'Q','CAC':'H','CAG':'Q','CAU':'H','CCA':'P','CCC':'P','CCG':'P','CCU':'P',
               'CGA':'R','CGC':'R','CGG':'R','CGU':'R','CUA':'L','CUC':'L','CUG':'L','CUU':'L',
               'GAA':'E','GAC':'D','GAG':'E','GAU':'D','GCA':'A','GCC':'A','GCG':'A','GCU':'A',
               'GGA':'G','GGC':'G','GGG':'G','GGU':'G','GUA':'V','GUC':'V','GUG':'V','GUU':'V',
               'UAA':'*','UAC':'Y','UAG':'*','UAU':'Y','UCA':'S','UCC':'S','UCG':'S','UCU':'S',
               'UGA':'*','UGC':'C','UGG':'W','UGU':'C','UUA':'L','UUC':'F','UUG':'L','UUU':'F'}

# One-letter amino acid code -> integer mass, as summed by Mass().
IntegerMass = {'G':57,'A':71,'S':87,'P':97,'V':99,'T':101,'C':103,'I':113,'L':113,'N':114,
               'D':115,'K':128,'Q':128,'E':129,'M':131,'H':137,'F':147,'R':156,'Y':163,'W':186}

# IntegerMass without 'L' and 'Q', whose masses (113, 128) duplicate those
# of 'I' and 'K'; every value in this table is therefore distinct.
ReducedMass = {'G':57,'A':71,'S':87,'P':97,'V':99,'T':101,'C':103,'I':113,'N':114,
               'D':115,'K':128,'E':129,'M':131,'H':137,'F':147,'R':156,'Y':163,'W':186}

# Read the spectrum (whitespace-separated integers on the first line of the
# input file) and print the peptides found for it.
with open('/content/drive/My Drive/Colab Notebooks/input.txt', 'r') as f:
  # split() without an argument tolerates repeated or trailing whitespace.
  Spectrum = list(map(int, f.readline().split()))
print(CyclopeptideSequencing(Spectrum))

['A', 'P', 'V', 'C', 'I', 'N', 'D', 'M', 'H']
['AM', 'AH', 'PV', 'PC', 'PM', 'VP', 'VC', 'VD', 'CP', 'CV', 'CH', 'II', 'IN', 'ID', 'NI', 'NN', 'NM', 'DV', 'DI', 'MA', 'MP', 'MN', 'HA', 'HC']
['PVC', 'PCV', 'VPC', 'VCP', 'CPV', 'CVP', 'III', 'IIN', 'IID', 'INI', 'INN', 'IDI', 'NII', 'NIN', 'NNI', 'DII']
['IIIN', 'IINI', 'INII', 'NIII']
[]
[]


In [None]:
# Mount Google Drive at /content/drive, the directory under which the
# input file opened by the driver cell is located.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
