In [1]:
from collections import Counter
from functools import reduce
from operator import ne
from typing import List
import re
from math import ceil
import itertools as it

## Check whether string is valide nucleotide sequence

In [9]:
bases = "acgt"
bases

'acgt'

In [10]:
null = "agctACGTagagagatatatcatcatcactcatctaccaggctgtcgcagcgac" * 200
alt = "asdasdfjsda;fljsdaflk;fjl;asjdf;aksldjf;asljkdfsd;nf;askflj;sdfj" * 200

In [11]:
# Attempt 1 broken
def validate(seqs: str):
    for seq in seqs:
        if seq not in bases:
            return False
    return True

In [12]:

print(validate(null))
print(validate(alt))

False
False


In [13]:
%timeit validate(null)

233 ns ± 9.67 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [14]:
%timeit validate(alt)

151 ns ± 6.56 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [15]:
# Fastest
%timeit not re.findall(r"[^acgt]", null.lower())

49.5 µs ± 1.71 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [16]:
%timeit not re.findall(r"[^acgt]", alt.lower())

825 µs ± 39.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [17]:
# Fixed with short circuit
def validate(seqs: str) -> bool:
    for seq in seqs.upper():
        if seq not in bases:
            return False
    return True

In [18]:
# Slower than regex
%timeit validate(null)

3.84 µs ± 211 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [19]:
%timeit not next(re.finditer(r"[^acgt]", null.lower()), False)

50 µs ± 288 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [20]:
%timeit not next(re.finditer(r"[^acgt]", alt.lower()), False)

5.55 µs ± 73.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### Answer

In [21]:
def valid(seq: str) -> bool:
    """
    seq[str]: String to check for valid nucleotide bases

    returns: Sequence if valid nucleotide sequence else ''

    >>>valid(null)
    True
    >>>valid(alt)
    False
    """
    seq = seq.lower()
    return seq if not next(re.finditer(r"[^acgt]", seq.lower()), False) else ''

In [22]:
def clean(seq: str, bases=bases) -> str:
    """
    seq: Any string

    returns: a valid nucleotide sequence
    >>>valid("acgt")
    acgt
    >>>valid("ACgt")
    acgt
    >>>valid("addt")
    at
    """
    return "".join(re.findall(re.compile(f"[{bases}]"), seq.lower()))

## Count number of bases in sequence

In [23]:
{base: null.count(base) for base in bases}

{'a': 3000, 'c': 2800, 'g': 2000, 't': 2200}

In [24]:
{base: clean(alt).count(base) for base in bases}

{'a': 1600, 'c': 0, 'g': 0, 't': 0}

In [25]:
def count(seq: str) -> dict:
    """
    seq: Any string
    returns: Count of bases a, c, g, t on cleaned string
    """
    return {base: clean(alt).count(base) for base in bases}

In [26]:
%timeit count(null)

697 µs ± 27 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [27]:
%timeit count(alt)

666 µs ± 58.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Answer

In [28]:
def count(seq: str) -> dict:
    """
    seq: Any string
    returns: Count of bases a, c, g, t on cleaned string
    """
    return dict(Counter(clean(seq)))

In [29]:
%timeit count(null)

985 µs ± 17.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [30]:
%timeit count(alt)

201 µs ± 8.42 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Find reverse complement

### Answer

In [31]:
def reverse_complement(seq: str) -> str:
    complements = str.maketrans('acgt', 'tgca')
    return clean(seq).translate(complements)[::-1]

In [32]:
reverse_complement('acgttcga')

'tcgaacgt'

## Find gc content

### Answer

In [33]:
def gc(seq: str, bases='gc') -> float:
    return round(len(clean(seq, bases=bases)) / len(clean(seq)) * 100, ndigits=3)

In [34]:
gc(null)

48.148

In [35]:
gc(alt)

0.0

## Find gc in kmer

In [36]:
kmer=6
stop = ceil(len(null) / kmer)

In [37]:
from math import ceil

In [38]:
ceil(len(null + 'a') / 6)

1801

In [39]:
len(null + 'l') // 6

1800

In [40]:
range(0, stop, kmer)

range(0, 1800, 6)

In [41]:
list(range(0, stop, kmer))

[0,
 6,
 12,
 18,
 24,
 30,
 36,
 42,
 48,
 54,
 60,
 66,
 72,
 78,
 84,
 90,
 96,
 102,
 108,
 114,
 120,
 126,
 132,
 138,
 144,
 150,
 156,
 162,
 168,
 174,
 180,
 186,
 192,
 198,
 204,
 210,
 216,
 222,
 228,
 234,
 240,
 246,
 252,
 258,
 264,
 270,
 276,
 282,
 288,
 294,
 300,
 306,
 312,
 318,
 324,
 330,
 336,
 342,
 348,
 354,
 360,
 366,
 372,
 378,
 384,
 390,
 396,
 402,
 408,
 414,
 420,
 426,
 432,
 438,
 444,
 450,
 456,
 462,
 468,
 474,
 480,
 486,
 492,
 498,
 504,
 510,
 516,
 522,
 528,
 534,
 540,
 546,
 552,
 558,
 564,
 570,
 576,
 582,
 588,
 594,
 600,
 606,
 612,
 618,
 624,
 630,
 636,
 642,
 648,
 654,
 660,
 666,
 672,
 678,
 684,
 690,
 696,
 702,
 708,
 714,
 720,
 726,
 732,
 738,
 744,
 750,
 756,
 762,
 768,
 774,
 780,
 786,
 792,
 798,
 804,
 810,
 816,
 822,
 828,
 834,
 840,
 846,
 852,
 858,
 864,
 870,
 876,
 882,
 888,
 894,
 900,
 906,
 912,
 918,
 924,
 930,
 936,
 942,
 948,
 954,
 960,
 966,
 972,
 978,
 984,
 990,
 996,
 1002,
 1008,
 10

### Answer

In [42]:
def kmers(seq: str, n: int = 6):
    stop = ceil(len(seq) / n)
    slices = range(0, stop, n)
    subsets = zip(slices, slices[1:])
    return [clean(seq)[start: stop] for start, stop in subsets]

In [43]:
kmers(null)

['agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',
 'gtagag',
 'agatat',
 'atcatc',
 'atcact',
 'catcta',
 'ccaggc',
 'tgtcgc',
 'agcgac',
 'agctac',

In [44]:
def kgc(seq: str, n: int = 6):
    return [gc(kmer) for kmer in kmers(seq, n)]

In [45]:
kgc(null)

[50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,
 50.0,
 50.0,
 16.667,
 33.333,
 33.333,
 33.333,
 83.333,
 66.667,
 66.667,

## Find gc content of fasta

In [46]:
sample= """
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
"""


In [47]:
sample.split(">")[1:]

['Rosalind_6404\nCCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC\nTCCCACTAATAATTCTGAGG\n',
 'Rosalind_5959\nCCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT\nATATCCATTTGTCAGCAGACACGC\n',
 'Rosalind_0808\nCCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC\nTGGGAACCTGCGGGCAGTAGGTGGAAT\n']

In [48]:
def fasta_gc(fasta: str=sample) -> dict:
    return { gc(seq): desc for desc, seq in [read.split("\n", 1) for read in fasta.split(">")[1:]] }

In [49]:
max(fasta_gc())

60.92

In [50]:
def max_gc(fasta: str=sample) -> str:
    gcs = fasta_gc(fasta)
    most = max(gcs)
    print(f"""{gcs[most]}\n{most}""")
    return {gcs[most]: most}

In [51]:
max_gc()

Rosalind_0808
60.92


{'Rosalind_0808': 60.92}

In [52]:
eg = """>Rosalind_0310
TTTAGGCTCTATAGAATATGCCCTGACTAGGAAGAAATAGGGGCTGATCTGATTGATATC
GATAGGATGGAATGGCTTGATAATAGGGATTAAGTTTAATAAGGTCCACAAGCCAGGCAG
TTCTGGGCCGGGAGTCTTTGAGTATGCATCCCGGCTATGCAGTTTTATTACACTTATGCA
TCAACAATTATTCGCGATTCCTTTGGGCTGCATGGCTGGGGGTCCTCATAAATCGACGGA
AAATCGAAATCATACTGTCAAGGAGAAACGGGACAACTTTAACATACTCGATTACCCCAC
GCCCCTCAGGAAGCCGGGACTCAGGCATGAGGCTTCTATATTCAGCAAACACGGGCGCCG
TGTCAGGTGCGGACCAACTGCTGCGCATACATGCAGCCTCCCGGATCCCCGCGTTTGTTT
TAATACGCCTGTCTTCCACGGTACCCGGGTATATTACTTCGTTGCGTTCCAGGGGTTAAA
CGGCCCTGGTAGGAATTTGCAACCTCTATCGCGAATTCCCGACAACGTGGTCATGTTACG
GCTCATTGATCCCCAACTGAAGGGTTAGAGTAAGTAGGCCGCCATTGTTTGACTACACCG
CAGCTCTGTGGTTTGTGGCAAGGCTAAGGCCGCGAATCGCAGGTTTCTTCAGGTCACGTA
GCCCGAGTCCCCGGATCTATCGTGCTTATACATCAAAATCCCGGTTGGGCAGTCTTAGGC
TTAATAGTTTATCACGTCGAGAGTGAGACGATCGGAAACCATCCAATAAGCCGAAGAGAT
GACAAAGCGTAAGCATAGTACTATGGGACAATGACTCACAACGTGCTTATGGAAGCACTA
GGACGTTCCGGGTTGGTAAGATCTCGATGCTTCAAAAGGTCGTACAAGGGAAGTTAAGTA
GCTACCGTCCATAAGGGACCCCAGGCCGCGATTGGAAAGGTCGTATTGCAAAGAAAAC
>Rosalind_8377
GTGGTATGACCACAGGGACTTGGTTTGGCGAAAAGGGCAAGGATATTGTGGGACATCTCT
GGAAAGCTGAGCCGTACTGCTACCTGGACATTGGCAGTGTTGGGTCGCTGGCTACTGGTA
AGTCTGCAGAAGATACCAAGCGCTATCGGTCCGGATGAGGCCCGGACGTACCTCATCCCG
CAATCTTCTCTCTTTGGGCTGGACGTGGACTGTCGCGCCTCCAATGTTGGATGGGGCAAT
AGGACGTAGTACACGGCACACCACTTATATTATTCTCAGGGCATCTGATGGATTCCGTTG
TCCCCATCGAGACTACGAAGCTTAGAAGACCCGACCCTAGCCACAATTCTGAAGGTCAGG
GGTCTCAGTCCTTAAACGACTCAGAGGCCCAGCCGCCCATAATTTGGACCATAAACTTGT
CGAGACAGGCGCATGTTTTCGCTGTCAACTCAAAACCAACCGTTCAGACCCACGAGGCTT
CGTACGGCCAGGTCTACTTCGAGTCGAAAGAGTAACTCATTGATTTCAGCTTTACACCAT
GCATTTACCAGGTTAGAGCCCGCCTGGCTTAGAATACAAGATTCTATATGGGAAAAAGCG
GGCCACCCCTTCTCGAATGTTACAACGCGCCATGCTGGTGAAAACATTCCCTCGTTAATC
GCAATGGGTTCTTCGGTATGCGCACAAAAGTTGTGCGCCTGTAAGGGAAGTACCAGTTAT
GTGTGGTCTGGTCTGAGTCCTCGGATCGATGAAATCACTTATATTGTGGTGCTGAAGCGG
AGGACGCCACAATTACTTCATATGTAAGGAGACC
>Rosalind_2099
TCTTTCCACAAGACCCCCGTAGCAAGTCCGTGAACATTATTGTTAGACGTAACCCCGTTG
CAATCAGATCTGGGCCCGGGCGTACACAGCCGCGCTCAAATAGCTTGAATATATGGGAAC
GGAGACGTACGCTTGTACAATAGTTCAGAGAGGTCGATCGACTCTACCATTGCAATGGGG
CGGTCGGGGACCCGATAGCGTACTTTCTTTTCATATCGGAATGATCCGACGTCCAGTGTT
CACCGAAGGTTAAGAAACCAGTAGCTCAAGAGGTCTTGTTTTAGGACTCCCGTGGACCCC
ATTCTCGGAAAGTCTCTTTAGACTTATTGTTTCACAGAGTATATCCCACGGCGGAACATA
GCTTACTTTGGATAAACCTCTTCGTTGACGCTTCCTTGGCTCTCCGCAGCGGCAGTCAGG
TAAATTTCTTGCGACTTTTGGCTTGACCGAAGCGGATTCGGTGAAATCACCATGCCCCAT
TTACACTCGACTCGTAGTCTAGGATTCAATTAGTGATAGGCAGCCTCTGTTCTCAATAGA
GGATTATGTTCCAATACTATACTTAGTGCAAAATGGAATAATGACGTGGAATATGTTCGC
AGGTGCCACGTTCACCTGTACTTTATCCAAAGGTTCGGTAAAGGTAATGTTTTCTATCTG
GGTGTGGCTCCAACGTACCAGTTACCGATATATGGGCAAGGACCAAGCATATAGTTATCC
ACAAGTGACATGACAAGTGTGACCCCGCCAATACCACTAGCCACTCACACTCTATTCGCG
AGGGAATCTGTATCTGTCGCACTAGATATTGCTACCCACGTGTTGACCATGGAGCAGGCT
CGTTCTGTC
>Rosalind_2387
CCTCCAATCAGTTCGGACTTTTCGACTCATTAGCCGAAACATTAACATTCGAGACGTGAT
GTGTTCCGACAAAGCCAGATGGCCATCAATAACTGGATTCACATCAACCCTATTCTAGAC
CTTAATCAGACCGAAAACTGGCGATATCATTATGTATGTGTCAGCGACAGACCGTGTTGC
ACTTCAAGGAGGCACTTCCATTCTACACTACTCGGTGTCACCAACTGGCCTCATCGGGAA
GTGAACCACGCACTTGCTTACTTATCAATGCCTTGTGTTTTCCTAGGGCGAAGAGAGAGA
TTGCGAAAGGGAGCCGCTCACCGCCATGAAGTAGACTGTAGCTTCCACTGGAACGGCATA
GGCGATACTCTCGTTATGAACGGATTCGAGGGTTCTTGGATTGATTGTCGTCCGTGGGTC
AACTTAAGTCTTATCTTACATGATGGACGGATGCCGACCCTGCTCCGACGACGAGCGATA
TGGACATCTGACACCTCACCTCGATACATCCGATCTGTATGGTACGCCCGCGTTTCCACG
TGGTTAGGTTGCTTATATTCAGCCTGTTTAGGAGTTGCTCGGCAATACGCCGGTATTGTC
GGAGTCCCCAACTACTCATCCCCAAGTAATTACGCTACCCATGCCAGGCCGCACTTAACG
CTAGTCCTACATGGATGGTGACGTGGTTGGCCCGGCGGCCAGGGAAGCGAACTCGCTCCG
TCACCAGACATACCTTAACACCCGCGCAGCGATTGAAGAGATCCGGCTCATCGCTTTCGA
GTTTCTAATCTGGCCTAGTTGTCGTGCGCCCTTTTCAGTTCACTCGCTACCTACTGGGAC
TGTCAGTTTAG
>Rosalind_3276
ATCGGGACAGTTCCTAGCAATTCGCGGGTTATGTTGGTAGACAACAAGGCAACCGCACAA
CATCTGTAGCCGGTCTACCTCGCGTCTCCTTACATCCAACCGTTGCGAAGGACACGAGCA
GGGGATATTACGTTCCATCCCGGCTAGTCGGGAAGGCCTCCGGATTGTACCCGTAAGTAA
TTGGGTAACAAAGGCTCAGTAGCGTATCCCCGTAGACTGCCTGCCCCCACACTATTCTTT
ATCGTGAACCTAAGTTGACAACACAAAACGAGCGTTCATTTGAATAATAAAAGCACACGA
ACAACTTCAAGCAGCTCGCAACTTTAGACCCCTTCCTTCGGCTTTATACTTCTGTACGTT
TAGTAATAGCGCGAAACTCCGTTACACGGAGTTCTGTCCTGGCTTGAAAGCCGGCAACCT
GAGGTACTTACAATCGAGCGGCAAATTGGTGATTATAAACTCCTAACCCTACGCCCCTTG
TAAATTGTCCCGCTAAACCCTTAAAGGGCACAGATACCGTTTCCTGCGTGCTCGGCCAGA
TTCTTGGATCATGTTTAAGTCTTAGTCCCCAGAGGTAGTCCGCCCTCGCGCGGTCCGTTC
GCCTTGAGGTACTTAACCTCAATTAGATAGGGACACGCTCATACAAGATTTGCGACGCGG
GGACATAATCATCTACTTTGAGCGAGGCCCTACGTAAAGGAAGACTCTTAGCTAACAAGT
AATCGACACGGGGCTGTGTAGCTCCCGCTGGCGGGGCGCCGATCCTTTTCTGTATCTCGA
ACGCGGACAGTGAGGTCCAGCTCCATCGGATAAGCTCATGTACACTCTCCCATAATATCT
ATTGCGGTGCTCCTTCACACACACTCCTTAACGACTACACAACTGCCCCGGTCGGTTTTC
ATCGCCCAACTTCGAAAATAGAAGGATTCGGACCCGAGGGGTCTTTGAAGAGTGTTCTCT
GAGTCAAGATGAATCTTAATGAT
>Rosalind_7396
TAAATAAGAACTCGTGCCCGGCCCTTTCATCCCATTCCTTGAAGTGTGTACTGTGGCCCT
AAGACCCACTGGCGTTGACTTATATTAGAAGCCACATGAATCATCGGTATAATTATTGAT
ATCGATTGTCTCTCACGGAATATTGACGGGAGTCCCGCTAATAGGTCGACTGACGCACCA
TTTTCCAGTTTGAGTACGCACATACCACGATTCACGGCATCGAGTCGGTCTTAACAAGCT
AGTTCGGTCTTATCATGTTCGACTACATAAGTATCCAGGGAACGTACCCACCGGTACTGT
TTAGCCTGCTTTGGCGCTTACTAATATAAATCATGATTCACTTCACACCGTTATTCCCCG
TCGATTGTGCAAACTACGAAACACGTGCTTGAAAATTTCCTTAGTAGCCCCTAGAGACTG
CCAGCTATGGGGATGCTTCCACGTTAGACCATGAGAGCACTAGGGGGCTGGTATTTTAGC
GAGCAGTTAACTAACGGATAGCCGCTACGGGTTGACAGGCAGGCGGCTAAACGATCCAGA
GACCCGGACACTGGCTAAAGGAACCGTAGGGTACGTGTTAAACAAACACAGATGCAGACG
TCGGTCATTATGGGGAGCCACTCGTAACCCAGGATTATGTGACTCAAACCGGGAAGACGG
TTTGAGGGTGTAATGTTGAGTGCTGCAGCCAGTCTAAAGCGTCGCTGGTAGCGAGCCAGA
CCAAGTTTTAAAATATCCGACTATATGAACGGCTTAGCATGGAAGTCTGAGTAGCTAATG
GTGACCTTGCACTCTCTTGTTTTCGGGGCGTAGGTGGTATGATTGCCAGACCCGTGACAT
ACACGTTGACAGCCTCAGACTGGTGTGAATTGTTGGTGCGCGGGGCTCGCCTATAG
>Rosalind_2351
CATGCCGTTCCTCCCTGTGCATTAAGATACGACTGGGTCGTTGGGGTACCAAGCTCCAAG
CACGCCCTGACGCGTGCCGGGTACATTCCCGTGCCCATAGTAAGGACCTGGGACTAGGGT
CAGCAATCCAACCCCGGGCTTGACAGTTCGTTGGAAGTACTCGTGCCGCTGCACGACATC
CGATTGACTAATGAGAAATCTAGTCAGGTTCTACCCGAGGTTCTCCTACGCGTGCACACT
TACGTCCCCTGTGAAAAGGGGGCCGTCGGGCACACCATCGCCCATATTTTCGTTGACGCA
CGGATCCTTAAAAGTCACGGAGTTACGGCACAATCTTTTCGGCATGTTCTTCCTACCGAT
GGCGTACATGTGGGGAGGCATGCTGACTCCTAGCGTAAACTTGTCTAAAATCAGCTTGTC
GAACCTCATCTATTGTACACCTTTGGAGGGGCGAATAACCCGTCTATAAATCCTTTTTAT
TAGTTAGAAATCTGCACTCGTGGGGGACGTGACGAGGGATGTGGTACCCAACAACGCTAC
CTTTGCGAGCGGAAACCGGAATACCAGTCGCAGCGGTGGACCTCTAGTCACCGATAGCAG
TTTCACCCTAAGATGGTTTGATAAACATCTAGCCTATCTCAAAATGCCGGCGGCCTCCGC
GGTACAGACCGGGGGCCGGGCTCGGTACCGGACACATCCTCTCACTTTCGGAAGCAATTA
ATGCGTATTGCTCCCGTAACGATTAACACGAGCAAGGGTGGAACGATTGAGTAGCTGCTT
AATTGCATCGCGCCGTCCCGAAC
>Rosalind_7908
AAGGGTACTATCGCTTAAAGTGACTCTGCGCTGTTGTATTTTGTGCGCGGAACGGCGTTT
ACCAAAAACTTCGTATGGACCGTAATGTTAGTACGACCAGCGTTCCTTGTGGCATCCCCA
TGAGGCGGACGCTATATGGATCAATAATGTTACCAGCATGAACTATTTTGCAACGCGCCT
GATGCCTCCGTCACCGTGTCGGCCTGGACCAGACAAAGAGGCCCCGGGCAAGCTTGAAAC
CTCGGGGTTATCTGTGCGTCCGTTTTTTATGTGAGGAGGAGGGCCCACCAGTAATCCTGG
CTTATGTAAGCGATGTTTGCAGAAACAATAATTAGGCCCGGTTAACACGCCTAGTCAAAC
CAATTGGGTATCTAAGCTTGTATCATCAGTACCGCATCGAAGTGAACAATCGACGTTTTA
AGAAGGGAGGCGACTTCGGAATTTCCTACCCCCAACTGCTTTGTGGAATTGATATCCAAT
TTTACACCCCCATAGATACCCCTCGCGCCTGAACGACACGGCCAAACTACCTGGCCAAAA
AACAGGACACAGCGTAAGAGGAGACATTACGCCATAAAACGCGTTAGGGAGGATCGGAGA
AGCATCTCCCGCTAAGACATCGGTCACTGATCACACTTCACTCTCGAGGCATAGTAACAC
AGCCAGGGTAGGAATGTCTCCCCTTTGCTTTAGCAACGTAATACATACAATGTTCACCGT
GTCATTCAAGGTTTGGGTACCTGTAAGGTGGCATCGGCTCTATTGGTCACCTATCAGTAC
GCATACCTGTCCAATGACGCTTGACCGTGCTCTTGTCGTTAGCCGTCAGAT
>Rosalind_8120
TAAATCTATGTTCCGGCGCCGCCTTGTCCCGCAGAGGTCAGTAGGATCCTCTTGACCCGT
CCCAGGTCTACCAGTATTCATAGCAATTATTCTCGTCTCCCCTCACATCTCATTTTAGGG
ACAGTAATTAACCCTACTTTACCTACCGGCAATAGATCCATGAAGTCGGGTCAAGATCGA
TAGAGGTTTTTTATCGATCGTGGTATACCCTCAGCTCCTAAGATTCGTTACTATCACTTT
TAATCCCACCCCCATGGGAAGGGACGGTAGGCCCGTTAGTACGGGGATCAGTGCATGGCA
ACCTACGCCCAAATTGCCATGTTTAGTCACTACTAAAACAGTTAAGGACTAGTAAATCCA
CTGCTTGTCCGATCTCGATCAGTCCCGTAGCGATGGTTGCCGTTGGTGGGTGAAGATGTA
ATGGACTTCTTAGAAGCTGCACTACTGACCATATAGGAACATTAGTTCGCATCTCCCCTT
ACCTTCGCTCCCAGGGATTCCGCACGCCCGCCTCATTCGCCGATTCCATCGACCTGTCAG
AGAGACCCTAGCGCCGTACGCTGGCGTGCCGTATGACCCGTTGGTTGCAAGCTTCTATAT
AAGGGACGTTATGGGGTTGTTTGCCTCCGCATTTAGATAACGTGAGTCTATCAACTGTTA
CGATAGGATGCGCAACTATTAAGCTGCGCGAGTTGCCCCCTTGCCGCCATGCGTAAGGCA
GTGAACTGAGACTATGCCATCGTACCTTCGAATCCCCTTCGCTACCATCTTCCCACCGCA
TCGATCTAAGCCTCTTATTTATCCACAAAAGACTGGCCCCACTGCTCGGGATGTTACCCA
GCAGC
>Rosalind_2490
CCCGCAGCTTTCTGTCTTGTTATGGTGAATGGATACCTACCAGCTCAGAGGCTATATTTC
AAACATCATTAGTTACGCTAGATCTAAGTGCTTGCTAACTTCAGGACCTCGGTGACCAGT
GTTTGCCTACAACGGGGATTGCCATTGCGGCCACTCCGTCCTGGGATTAACCCTCTAGCG
AGCAACAGCGAGGATTCCCTTACTCGGGCACCGGTGCTATCTCAAGCCATGTCGTGAGCT
GTGATAGTACGTTCCGCTGAATGCTGGCCCATATAAGACTGGTTTTATTATACCTATGTC
AGGGTCCAGAATTCTTACTAGGTAAGATAACCACTATATTACACAAAAGGCAAATCCGGT
CTGCCCGAATTGATTCGTGGTTAGTTCATCAACGATGCCCACACAACTATTAGAGTACAT
GTTACGAGTATATTCTTGCGCCAGTATAGCCTGGCCGTTTGCGGATAATCAGTGGTCTAG
GCTGACGACTAGGATATGAATCAGTAACGTACGCGACAGCATTAGCCAGGGTCGTATCCA
TCTGTTGGTATCCTCTCCAAACCGTCGCGCTGCCAAGAGACGCTAAAATATCCGTGGGCC
TTACTATGGCTGGGGATTTGCTTCACTAAGACCGGTTTCGGGACTGACGTCCCCAAGGGA
CCGCGCCGGCTTGCGATATCTTGAAGCCCCTAAGGAGCATGAGGTACTTCCAGGGGGTCC
GAAATGGAAATCTGCGCATCAATAACCGTGCGAAGGTCCCTCGGTAACGATCGACCTGGA
GGTAATCCTGGACGAAGGAACTGCGGAATTCGTTACACTTCTTTTCGTCGAAGTCACACT
AGGTGTCACTGGTAGTCATGGCTCGTTTTGACCATCAGGTCGTAGGAAGCTTGACAGCAT
TTGCGACAATAATACAATAACCCTTAGTCGAACTGCTTGCTCGGCGAACACTATAAATAC
CTTGACTGGCTTCTTTTAGGAT
"""

In [53]:
max_gc(eg)

Rosalind_2351
53.176


{'Rosalind_2351': 53.176}

## Generate codons

In [132]:
codons_tsv="""
name	slc	codon
Isoleucine 	I 	ATT, ATC, ATA
Leucine 	L 	CTT, CTC, CTA, CTG, TTA, TTG
Valine 	V 	GTT, GTC, GTA, GTG
Phenylalanine 	F 	TTT, TTC
Methionine 	M 	ATG
Cysteine 	C 	TGT, TGC
Alanine 	A 	GCT, GCC, GCA, GCG
Glycine 	G 	GGT, GGC, GGA, GGG
Proline 	P 	CCT, CCC, CCA, CCG
Threonine 	T 	ACT, ACC, ACA, ACG
Serine 	S 	TCT, TCC, TCA, TCG, AGT, AGC
Tyrosine 	Y 	TAT, TAC
Tryptophan 	W 	TGG
Glutamine 	Q 	CAA, CAG
Asparagine 	N 	AAT, AAC
Histidine 	H 	CAT, CAC
Glutamic acid 	E 	GAA, GAG
Aspartic acid 	D 	GAT, GAC
Lysine 	K 	AAA, AAG
Arginine 	R 	CGT, CGC, CGA, CGG, AGA, AGG
Stop codons 	* 	TAA, TAG, TGA
"""

In [133]:
slc = {
    slc.upper().strip(): codon.upper().replace(" ", "").split(",")
    for _, slc, codon in [
        row.split("\t") for row in codons_tsv.splitlines()[2:]
    ]
}

In [134]:
list(zip(slc['I'], it.repeat('I', len(slc['I']))))

[('ATT', 'I'), ('ATC', 'I'), ('ATA', 'I')]

In [135]:
help(zip)

Help on class zip in module builtins:

class zip(object)
 |  zip(*iterables) --> zip object
 |  
 |  Return a zip object whose .__next__() method returns a tuple where
 |  the i-th element comes from the i-th iterable argument.  The .__next__()
 |  method continues until the shortest iterable in the argument sequence
 |  is exhausted and then it raises StopIteration.
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.



In [136]:
import itertools

In [137]:
help(itertools)

Help on built-in module itertools:

NAME
    itertools - Functional tools for creating and using iterators.

DESCRIPTION
    Infinite iterators:
    count(start=0, step=1) --> start, start+step, start+2*step, ...
    cycle(p) --> p0, p1, ... plast, p0, p1, ...
    repeat(elem [,n]) --> elem, elem, elem, ... endlessly or up to n times
    
    Iterators terminating on the shortest input sequence:
    accumulate(p[, func]) --> p0, p0+p1, p0+p1+p2
    chain(p, q, ...) --> p0, p1, ... plast, q0, q1, ... 
    chain.from_iterable([p, q, ...]) --> p0, p1, ... plast, q0, q1, ... 
    compress(data, selectors) --> (d[0] if s[0]), (d[1] if s[1]), ...
    dropwhile(pred, seq) --> seq[n], seq[n+1], starting when pred fails
    groupby(iterable[, keyfunc]) --> sub-iterators grouped by value of keyfunc(v)
    filterfalse(pred, seq) --> elements of seq where pred(elem) is False
    islice(seq, [start,] stop [, step]) --> elements from
           seq[start:stop:step]
    starmap(fun, seq) --> fun(*seq

In [138]:
list(zip(slc['I'], it.cycle('I')))

[('ATT', 'I'), ('ATC', 'I'), ('ATA', 'I')]

In [139]:
slc

{'I': ['ATT', 'ATC', 'ATA'],
 'L': ['CTT', 'CTC', 'CTA', 'CTG', 'TTA', 'TTG'],
 'V': ['GTT', 'GTC', 'GTA', 'GTG'],
 'F': ['TTT', 'TTC'],
 'M': ['ATG'],
 'C': ['TGT', 'TGC'],
 'A': ['GCT', 'GCC', 'GCA', 'GCG'],
 'G': ['GGT', 'GGC', 'GGA', 'GGG'],
 'P': ['CCT', 'CCC', 'CCA', 'CCG'],
 'T': ['ACT', 'ACC', 'ACA', 'ACG'],
 'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
 'Y': ['TAT', 'TAC'],
 'W': ['TGG'],
 'Q': ['CAA', 'CAG'],
 'N': ['AAT', 'AAC'],
 'H': ['CAT', 'CAC'],
 'E': ['GAA', 'GAG'],
 'D': ['GAT', 'GAC'],
 'K': ['AAA', 'AAG'],
 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
 '*': ['TAA', 'TAG', 'TGA']}

In [140]:
help(reduce)

Help on built-in function reduce in module _functools:

reduce(...)
    reduce(function, sequence[, initial]) -> value
    
    Apply a function of two arguments cumulatively to the items of a sequence,
    from left to right, so as to reduce the sequence to a single value.
    For example, reduce(lambda x, y: x+y, [1, 2, 3, 4, 5]) calculates
    ((((1+2)+3)+4)+5).  If initial is present, it is placed before the items
    of the sequence in the calculation, and serves as a default when the
    sequence is empty.



In [141]:
dict(reduce(lambda prev, next: prev + list(zip(next[1], it.cycle(next[0]))),
    slc.items(),[]))

{'ATT': 'I',
 'ATC': 'I',
 'ATA': 'I',
 'CTT': 'L',
 'CTC': 'L',
 'CTA': 'L',
 'CTG': 'L',
 'TTA': 'L',
 'TTG': 'L',
 'GTT': 'V',
 'GTC': 'V',
 'GTA': 'V',
 'GTG': 'V',
 'TTT': 'F',
 'TTC': 'F',
 'ATG': 'M',
 'TGT': 'C',
 'TGC': 'C',
 'GCT': 'A',
 'GCC': 'A',
 'GCA': 'A',
 'GCG': 'A',
 'GGT': 'G',
 'GGC': 'G',
 'GGA': 'G',
 'GGG': 'G',
 'CCT': 'P',
 'CCC': 'P',
 'CCA': 'P',
 'CCG': 'P',
 'ACT': 'T',
 'ACC': 'T',
 'ACA': 'T',
 'ACG': 'T',
 'TCT': 'S',
 'TCC': 'S',
 'TCA': 'S',
 'TCG': 'S',
 'AGT': 'S',
 'AGC': 'S',
 'TAT': 'Y',
 'TAC': 'Y',
 'TGG': 'W',
 'CAA': 'Q',
 'CAG': 'Q',
 'AAT': 'N',
 'AAC': 'N',
 'CAT': 'H',
 'CAC': 'H',
 'GAA': 'E',
 'GAG': 'E',
 'GAT': 'D',
 'GAC': 'D',
 'AAA': 'K',
 'AAG': 'K',
 'CGT': 'R',
 'CGC': 'R',
 'CGA': 'R',
 'CGG': 'R',
 'AGA': 'R',
 'AGG': 'R',
 'TAA': '*',
 'TAG': '*',
 'TGA': '*'}

In [142]:
slc.items()

dict_items([('I', ['ATT', 'ATC', 'ATA']), ('L', ['CTT', 'CTC', 'CTA', 'CTG', 'TTA', 'TTG']), ('V', ['GTT', 'GTC', 'GTA', 'GTG']), ('F', ['TTT', 'TTC']), ('M', ['ATG']), ('C', ['TGT', 'TGC']), ('A', ['GCT', 'GCC', 'GCA', 'GCG']), ('G', ['GGT', 'GGC', 'GGA', 'GGG']), ('P', ['CCT', 'CCC', 'CCA', 'CCG']), ('T', ['ACT', 'ACC', 'ACA', 'ACG']), ('S', ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC']), ('Y', ['TAT', 'TAC']), ('W', ['TGG']), ('Q', ['CAA', 'CAG']), ('N', ['AAT', 'AAC']), ('H', ['CAT', 'CAC']), ('E', ['GAA', 'GAG']), ('D', ['GAT', 'GAC']), ('K', ['AAA', 'AAG']), ('R', ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']), ('*', ['TAA', 'TAG', 'TGA'])])

In [143]:
# [] + zip(1,1)

TypeError: zip argument #1 must support iteration

In [73]:
# [(2, 1)] + zip([1],[1])

TypeError: can only concatenate list (not "zip") to list

In [144]:
dict(
    reduce(
        lambda prev, next: prev + [(codon, next[0]) for codon in next[1]],
        slc.items(),
        []
    )
)

{'ATT': 'I',
 'ATC': 'I',
 'ATA': 'I',
 'CTT': 'L',
 'CTC': 'L',
 'CTA': 'L',
 'CTG': 'L',
 'TTA': 'L',
 'TTG': 'L',
 'GTT': 'V',
 'GTC': 'V',
 'GTA': 'V',
 'GTG': 'V',
 'TTT': 'F',
 'TTC': 'F',
 'ATG': 'M',
 'TGT': 'C',
 'TGC': 'C',
 'GCT': 'A',
 'GCC': 'A',
 'GCA': 'A',
 'GCG': 'A',
 'GGT': 'G',
 'GGC': 'G',
 'GGA': 'G',
 'GGG': 'G',
 'CCT': 'P',
 'CCC': 'P',
 'CCA': 'P',
 'CCG': 'P',
 'ACT': 'T',
 'ACC': 'T',
 'ACA': 'T',
 'ACG': 'T',
 'TCT': 'S',
 'TCC': 'S',
 'TCA': 'S',
 'TCG': 'S',
 'AGT': 'S',
 'AGC': 'S',
 'TAT': 'Y',
 'TAC': 'Y',
 'TGG': 'W',
 'CAA': 'Q',
 'CAG': 'Q',
 'AAT': 'N',
 'AAC': 'N',
 'CAT': 'H',
 'CAC': 'H',
 'GAA': 'E',
 'GAG': 'E',
 'GAT': 'D',
 'GAC': 'D',
 'AAA': 'K',
 'AAG': 'K',
 'CGT': 'R',
 'CGC': 'R',
 'CGA': 'R',
 'CGG': 'R',
 'AGA': 'R',
 'AGG': 'R',
 'TAA': '*',
 'TAG': '*',
 'TGA': '*'}

In [77]:
dict(
    [(codon, amino) for amino, codons in slc.items() for codon in codons]
)

{'ATT': 'I',
 'ATC': 'I',
 'ATA': 'I',
 'CTT': 'L',
 'CTC': 'L',
 'CTA': 'L',
 'CTG': 'L',
 'TTA': 'L',
 'TTG': 'L',
 'GTT': 'V',
 'GTC': 'V',
 'GTA': 'V',
 'GTG': 'V',
 'TTT': 'F',
 'TTC': 'F',
 'ATG': 'M',
 'TGT': 'C',
 'TGC': 'C',
 'GCT': 'A',
 'GCC': 'A',
 'GCA': 'A',
 'GCG': 'A',
 'GGT': 'G',
 'GGC': 'G',
 'GGA': 'G',
 'GGG': 'G',
 'CCT': 'P',
 'CCC': 'P',
 'CCA': 'P',
 'CCG': 'P',
 'ACT': 'T',
 'ACC': 'T',
 'ACA': 'T',
 'ACG': 'T',
 'TCT': 'S',
 'TCC': 'S',
 'TCA': 'S',
 'TCG': 'S',
 'AGT': 'S',
 'AGC': 'S',
 'TAT': 'Y',
 'TAC': 'Y',
 'TGG': 'W',
 'CAA': 'Q',
 'CAG': 'Q',
 'AAT': 'N',
 'AAC': 'N',
 'CAT': 'H',
 'CAC': 'H',
 'GAA': 'E',
 'GAG': 'E',
 'GAT': 'D',
 'GAC': 'D',
 'AAA': 'K',
 'AAG': 'K',
 'CGT': 'R',
 'CGC': 'R',
 'CGA': 'R',
 'CGG': 'R',
 'AGA': 'R',
 'AGG': 'R',
 'TAA': 'STOP',
 'TAG': 'STOP',
 'TGA': 'STOP'}

### Answer

In [145]:
codons = dict(
    [(codon, amino) for amino, codons in slc.items() for codon in codons]
)


## Translate DNA sequence to protein

In [146]:
def clean(seq: str, bases=bases, case=str.lower) -> str:
    """
    seq: Any string

    returns: a valid nucleotide sequence
    >>>valid("acgt")
    acgt
    >>>valid("ACgt")
    acgt
    >>>valid("addt")
    at
    """
    return "".join(re.findall(re.compile(f"[{case(bases)}]"), case(seq)))


cleaned = clean(null, case=str.upper)


In [147]:
def test(x=str.upper):
    return x(null)

In [148]:
cleaned[:10]

'AGCTACGTAG'

### Answer

In [160]:
def translate(seq, start=0):
    return ''.join([codons.get(seq[k-3: k], '-') for k in range(start + 3, len(seq))])

## Codon content

In [166]:

prots = translate(cleaned)
prots_1 = translate(cleaned, 1)
prots_2 = translate(cleaned, 2)
ports_3 = translate(cleaned, 3)

In [168]:
print(cleaned[:30])
print(prots[:10])
print(prots_1[:10])
print(prots_2[:10])

AGCTACGTAGAGAGATATATCATCATCACT
SALYTRV*RE
ALYTRV*RER
LYTRV*RERE


In [170]:
xylo = """ATAACGTTCAAGCTTGTGTTTCAGATAGTGGGGAGGAAGGAGAAAATAATTACATCATAGAGAAATAATT
TGGTATTGTTTTTCGGATTAACTTTCCCCCAGAGATTTCAGAACAAGTAGTTGCTCCGCTTTAACAATGA
TCGAGATCTATTATATAATCGCAGCAATATAATCTAAGACCTTTTCTTAACTTGAATAGCTACATTTAGT
GCATATCGTTTTGAAAAGAAATTGCTAATAGCAAGTTATTGCCCCCAAAGAAAAAATCCCACAACTCGGA
AAGACTTTCTATCTGCCTTGCGAAAAATCTCCCCCCTGAAATAATGTTTAAATTTTTCACTTCTCCAACA
ACTATAAAGAGATCGACATTTCTCTTCAATTATTCTAGTGGTTTGTTTTCAAAATCATCAATTACATACT
ATAGAACCATGTCTACTACTCCTACTATTCCTACCATTAAATTAAACTCTGGTTATGAAATGCCATTAGT
TGGTTTCGGATGTTGGAAAGTCAATAATGAAACTGCTGCTGACCAAATCTACAATGCTATCAAAACTGGT
TACAGATTATTTGATGGTGCTGAAGATTACGGTAATGAAAAAGAAGTTGGTGAAGGTATTAACAGAGCCA
TTAAAGAAGGATTAGTTAAAAGAGAAGAATTATTCATCACTTCTAAATTATGGAACAATTTCCATGATCC
AAAGAATGTTGAAACTGCTTTAAACAAAACTTTAAGTGACTTGAACTTGGACTATGTTGATTTATTCTTG
ATTCATTTTCCAATTGCTTTTAAATTTGTTCCAATTGAAGAAAAATACCCACCTGGTTTCTACTGTGGTG
ATGGTGATAACTTCCACTATGAAGATGTTCCATTATTAGATACTTGGAAAGCTTTGGAAAAATTGGTTGA
AGCTGGTAAGATCAAATCTATTGGTATTTCCAATTTTACTGGTGCTTTGATTTACGATTTGATCAGAGGT
GCTACTATCAAACCAGCTGTTTTACAAATTGAACATCACCCATACTTGCAACAACCAAAATTGATTGAAT
ATGTTCAAAAAGCTGGTATTGCCATTACTGGTTACTCTTCATTTGGTCCACAATCATTCTTGGAATTGGA
ATCCAAGAGAGCTTTGAACACCCCAACTTTATTTGAACATGAAACTATTAAATCAATTGCTGATAAACAT
GGTAAATCTCCAGCTCAAGTTTTATTAAGATGGGCTACTCAAAGAAATATTGCTGTTATTCCAAAATCAA
ACAATCCAGAAAGATTAGCTCAAAACTTGTCTGTTGTTGACTTTGACTTGACTAAGGATGATTTGGACAA
TATTGCTAAATTGGACATTGGTTTGAGATTCAATGATCCATGGGACTGGGACAACATTCCAATCTTTGTT
TAAATTCAATAAGATCTGTGTTTGTTTATTAATTAATAGATAGAAAATTTTTTTATATGGCATCTAAAAC
AGTTTGTCTACCTAAAACTAATAATTTTAATCTACTTACAAAATCATCTCTTTCGATATAAGCACCACAT
AATCTTCTTTTACCTGTAAATTCAGTTCTCAGGTGGAAACATCTCCAATTATCAAAAATCAAGCATTGGC
CAGGCTTCAATTGATAAAAGATTTCATTTTCTGGAGAACTAATAATACTAACCCAATTACGAATAGCTTC
ATAAAATTTTGGTACATCATTTGGATTCTCCCAATTATCCATTGTTGATCTATCACTTTGATTCCATCTA
ACT
"""

In [171]:
translate(xylo)

'I*NTRVFSQKSALLCVCVFFSQRDI*SVWGGGERGEKRGEREKKNI*NILYTHISHI*REREKNI*NI---WGVYILCVFFFFSRGDIL*NTLFFSPPPPQRERDIFFSQRENTQKSV*SVLCALSPRALFL*NTQNM*---SRERDISLYILYIYI*NISRAQSAQNIYI*NISL*KRDTPLFFFSLL*NTLL*ENI*SALYTHIFL*S---AHIYISRVFFL*EKKKREKNILCAL*NI*SAQKSVLYILCAPPPPQKKREKKKKNISPPHTQNTLSRG---KRDTLFFSLYISLCAPLLCAREKKKNISLSPPPPPL*EKNI*NMCVFL*KNIFFFFSHTLFSLSPQNT---TLYI*KKRERDISRDTHIFFSLSLFSQNILYIFSL*SVWGVFLCVFFFSQKKNISHISQNILYTHIYT---I*RENTPHMCVSLYTLYTLSPLYTLYIFSPLYTPHIL*KNIL*KNTLSLWGVLYM*EKNMCAPHIL*S---WGVFFSRGDMCVLWGEKKSVSQNI*NM*EKNTLCALCAL*DTPQKNISLYTQNMCALYISQKKNTLWG---YTQRDILYIFL*DMWGVCAL*EKRDILYTRGV*NM*EKKKKREKSVLWGV*EKRGVYIL*NTQRESAP---L*KKREKRGDIL*SVL*KKKREREKRENILYIFSHISHTLFSL*KNILYMWGENTQNIFFSPHM*DIS---KKRENMCVL*EKNTLCALFL*KNTQKKNTLFL*KSV*DTLL*ENTLLWGDTLYMCVL*DIFLYIFSLL---IFSHIFFFSPQNILCALFFL*KNIFLCVFSPQNIL*EKREKKKNIYTPPHTPLWGVFFSLYTLCVWGV---MWGV*DI*NTLFSPHTLYM*EKRDMCVFSPHILYIL*RDIYTLLWGEKKSALFLWGEKKKNILWGVL*---SALWGV*KRDISQKNISLYILWGVYIFFSPQNIFFLYTLWGVCALFL*DIFLYTRDIFL*DISQRERG---ALYTL

In [173]:
translate(xylo, 2)

'NTRVFSQKSALLCVCVFFSQRDI*SVWGGGERGEKRGEREKKNI*NILYTHISHI*REREKNI*NI---WGVYILCVFFFFSRGDIL*NTLFFSPPPPQRERDIFFSQRENTQKSV*SVLCALSPRALFL*NTQNM*---SRERDISLYILYIYI*NISRAQSAQNIYI*NISL*KRDTPLFFFSLL*NTLL*ENI*SALYTHIFL*S---AHIYISRVFFL*EKKKREKNILCAL*NI*SAQKSVLYILCAPPPPQKKREKKKKNISPPHTQNTLSRG---KRDTLFFSLYISLCAPLLCAREKKKNISLSPPPPPL*EKNI*NMCVFL*KNIFFFFSHTLFSLSPQNT---TLYI*KKRERDISRDTHIFFSLSLFSQNILYIFSL*SVWGVFLCVFFFSQKKNISHISQNILYTHIYT---I*RENTPHMCVSLYTLYTLSPLYTLYIFSPLYTPHIL*KNIL*KNTLSLWGVLYM*EKNMCAPHIL*S---WGVFFSRGDMCVLWGEKKSVSQNI*NM*EKNTLCALCAL*DTPQKNISLYTQNMCALYISQKKNTLWG---YTQRDILYIFL*DMWGVCAL*EKRDILYTRGV*NM*EKKKKREKSVLWGV*EKRGVYIL*NTQRESAP---L*KKREKRGDIL*SVL*KKKREREKRENILYIFSHISHTLFSL*KNILYMWGENTQNIFFSPHM*DIS---KKRENMCVL*EKNTLCALFL*KNTQKKNTLFL*KSV*DTLL*ENTLLWGDTLYMCVL*DIFLYIFSLL---IFSHIFFFSPQNILCALFFL*KNIFLCVFSPQNIL*EKREKKKNIYTPPHTPLWGVFFSLYTLCVWGV---MWGV*DI*NTLFSPHTLYM*EKRDMCVFSPHILYIL*RDIYTLLWGEKKSALFLWGEKKKNILWGVL*---SALWGV*KRDISQKNISLYILWGVYIFFSPQNIFFLYTLWGVCALFL*DIFLYTRDIFL*DISQRERG---ALYTLYI

In [154]:
prots.count('S')

1600

In [155]:
def cc(seq, *codons):
        return {codon: seq.count(codon) for codon in codons} if codons else Counter(seq)

In [156]:
cc(prots)

Counter({'S': 1600,
         'A': 800,
         'L': 800,
         'Y': 800,
         'T': 799,
         'R': 1400,
         'V': 400,
         '*': 200,
         'E': 400,
         'D': 399,
         'I': 1200,
         'H': 800,
         'P': 200,
         'Q': 599,
         'G': 200,
         'C': 200})

In [180]:
pattern = re.compile(r'''
    (?=          # Positive lookahead assertion
        (ATG     # Start codon "ATG"
        (?:...)*? # Non-greedy matching of any three characters (nucleotides)
        )         # End of capturing group for start codon and sequence
        (?=TAG|TGA|TAA)  # Positive lookahead assertion for stop codons
    )             # End of positive lookahead assertion
''')

In [216]:
pattern = re.compile(r'(?=(ATG(?:...)*?)(?=TAG|TGA|TAA))')

def revcomp(dna_seq):
    return dna_seq[::-1].translate(str.maketrans("ATGC","TACG"))

def orfs(dna):
    return set(pattern.findall(dna) + pattern.findall(revcomp(dna)))

print(orfs(xylo))

{'ATGTTGGAAAGTCAA', 'ATGCTTGATTTT', 'ATGAAATGCCAT', 'ATGGCAATACCAGCTTTT', 'ATGGAATCAAAG', 'ATGGAACAATTTCCA', 'ATGATTGTGGACCAAATGAAGAGTAACCAG', 'ATGATTTTG', 'ATGAAGAGTAACCAG', 'ATGTTCCATTAT', 'ATG', 'ATGGATCAT', 'ATGAAGATGTTCCATTAT', 'ATGTTGAAACTGCTT', 'ATGTTTATCAGCAAT', 'ATGGGTGATGTTCAATTTGTAAAACAGCTGGTT', 'ATGTTCAAA', 'ATGATG', 'ATGTTTCCACCTGAGAAC', 'ATGTTCAATTTG', 'ATGGTGATAACTTCCACTATGAAGATGTTCCATTAT', 'ATGTCGATCTCTTTA', 'ATGGCATTTCATAACCAGAGTTTAATT', 'ATGTGGTGCTTATATCGAAAGAGA', 'ATGATTTTGAAAACAAACCAC', 'ATGGAAATTGTTCCA', 'ATGAAATCTTTTATCAAT'}


In [217]:
[translate(orf) for orf in orfs(xylo)]

['MCVLWGEKKSVS',
 'MCALL*DIF',
 'M*EKNMCAP',
 'MWGAQNIYTPQSALF',
 'MWGENISQK',
 'MWGENTQNIFFS',
 'M*DILCVWGDTPQKNM*EKRESV*NTP',
 'M*DIFF',
 'M*EKRESV*NTP',
 'MCVFSPHIL',
 '',
 'MWGDIS',
 'M*EKRDMCVFSPHIL',
 'MCVL*EKNTLCA',
 'MCVFLYISQSAQ',
 'MWGGV*DMCVFSQNIFLCV*KKNTQSALWG',
 'MCVFSQ',
 'M*D',
 'MCVFFSPHTPL*ERE',
 'MCVFSQNIF',
 'MWGV*DI*NTLFSPHTLYM*EKRDMCVFSPHIL',
 'MCVSRDISLSLF',
 'MWGAHIFFSHI*NTPQRESVFL*N',
 'MCVWGVCALLYIYISREKKRE',
 'M*DIFFL*EKKNTQKNTP',
 'MWGEKNILCVFS',
 'M*EKNISLFFLYISQ']

In [215]:
clean(xylo, case=str.upper)

'ATAACGTTCAAGCTTGTGTTTCAGATAGTGGGGAGGAAGGAGAAAATAATTACATCATAGAGAAATAATTTGGTATTGTTTTTCGGATTAACTTTCCCCCAGAGATTTCAGAACAAGTAGTTGCTCCGCTTTAACAATGATCGAGATCTATTATATAATCGCAGCAATATAATCTAAGACCTTTTCTTAACTTGAATAGCTACATTTAGTGCATATCGTTTTGAAAAGAAATTGCTAATAGCAAGTTATTGCCCCCAAAGAAAAAATCCCACAACTCGGAAAGACTTTCTATCTGCCTTGCGAAAAATCTCCCCCCTGAAATAATGTTTAAATTTTTCACTTCTCCAACAACTATAAAGAGATCGACATTTCTCTTCAATTATTCTAGTGGTTTGTTTTCAAAATCATCAATTACATACTATAGAACCATGTCTACTACTCCTACTATTCCTACCATTAAATTAAACTCTGGTTATGAAATGCCATTAGTTGGTTTCGGATGTTGGAAAGTCAATAATGAAACTGCTGCTGACCAAATCTACAATGCTATCAAAACTGGTTACAGATTATTTGATGGTGCTGAAGATTACGGTAATGAAAAAGAAGTTGGTGAAGGTATTAACAGAGCCATTAAAGAAGGATTAGTTAAAAGAGAAGAATTATTCATCACTTCTAAATTATGGAACAATTTCCATGATCCAAAGAATGTTGAAACTGCTTTAAACAAAACTTTAAGTGACTTGAACTTGGACTATGTTGATTTATTCTTGATTCATTTTCCAATTGCTTTTAAATTTGTTCCAATTGAAGAAAAATACCCACCTGGTTTCTACTGTGGTGATGGTGATAACTTCCACTATGAAGATGTTCCATTATTAGATACTTGGAAAGCTTTGGAAAAATTGGTTGAAGCTGGTAAGATCAAATCTATTGGTATTTCCAATTTTACTGGTGCTTTGATTTACGATTTGATCAGAGGTGCTACTATCAAACCAGCTG

## Fibonnacci population growth

In [211]:
def growth(months:int, offspring:int) -> int:
    parent, child = 1, 1
    for month in range(months - 1):
        child, parent = parent, parent + (child * offspring)
        print(f"{child} - {parent}")
    return child

In [212]:
for i in range(1, 6):
        print(f"{i}: {growth(i, 2)}")

1: 1
1 - 3
2: 1
1 - 3
3 - 5
3: 3
1 - 3
3 - 5
5 - 11
4: 5
1 - 3
3 - 5
5 - 11
11 - 21
5: 11


## Calculate Hamming distance

In [224]:
def hamming(obs: str, act: str) -> int:
    return len(set(enumerate(obs)).difference(set(enumerate(act))))

In [218]:
obs = "GAGCCTACTAACGGGAT"
act = "CATCGTAATGACGGCCT"

In [219]:
set.difference(enumerate(obs), enumerate(act))

TypeError: descriptor 'difference' requires a 'set' object but received a 'enumerate'

In [222]:
len(set(enumerate(obs)).difference(set(enumerate(act))))

7

In [225]:
obs = "GCGTGAAACTTGGGTCTCGCCCCGAAACTACCAACAATCCTTTAGCCTAAATATTATCTTTGATCAGAGCAAACATGTACTCTCTGTATCAACGGCGTCCCCTCTCGAATTCCCGCGGCGTTAGGGCTCCCCCTCCCAAGGTGTAGTTAATAATTGCTTTAGTGCGATCCTTCGTTACCTCCAATCTTTCGGTCTTAAAAGTGATCAACATGAATCGGCGATTTCCCAGCTTACGGGATTCAAACTCTGATAAGGGGGAGTCGATTTCTGAACTGGAACCTGTCCGAGGAGGGGGTAGGACCAAGGGAGGATCTGCCTCATGTGACTTCGCGGTGGTGGCCGATTTAGTGTCGCGAGTTTCGAATCTCAGCCGTTTCTGATACGCTTGGATATATGTGTATTCGGCTGCGTGCTGCAAGCAACTAGACCGAAGTCGGCGGAGAGTTATAACGTTGGAAGATTGCCGAGGCCTTGATTCGGTTGACCAAATGGAGTGAGGCCTCATGCAATGAACCGAGATTGGACGTCTATAACCGTACCGATGGTCCCCCCCATTGCCACGAGTCGGATTGTCTCGACGCAATCGAGGCAGAGTTCCAGTAACTGGGATCACCGACGAGATGTTACCTGATCCGGTATGTCGAAATCGTTGCATACTACCCCCTCTCACTTCAAGACGCTTATCTCGTGTGCGCAGCGGAATAGCGTACAACCGTTGGATCCCTTCTTGGTATCCCTGGAGGAGCAATGAGCGTGTCCTTTACTAGACTGGTGGATGCACACACAGTAGACGTGGCGTGTCGAATCACTAAATATAGACTTTAAGAGGTCTTCAGTACGGTAGTTTTCTGGTTCTCCACACCTTGCCTAGAAGGGACCACCTCGCCTAGTTCAAGCTAGCCAGGGCTTAAGTCTAGATAGGTGCTTGTCACGGT"
act = "GTGGATACAATTTGTCTAGGCGCGCGTGTGCTCACATGCCTTGACGGGAAGGACCATCGATTATAACCACAACGATATTGGCTCCCTCTCAACCCATTCCGCTCACTCATGACGCCGCAGTGAGGATTACCCCACGCCCTAAGTAACTAAAACACTACTTAGGGCGTTTATTAATTTAGCCGCCTCCCACGGCCGTAAGTGAACTCAGTATGTACGGTTGAGGAGATAGCTAGCGGGAAAAAAAATCTTAGAAAGGAGAGCGCCATTCTGAGTGGGCGGTACGCAAAGCAGGAGCTAGCACGCGGGGTGCATGTAACTGATGTTCGTTTTTCGACGGGTAGGGGTAATAGGCGCAAATGTTTAATCTCACAAGTTTGATAGAAGCGCCTTTGCATTCCTAGCAGCCTAAGTGCGGCAAAGAAATACAGTCGATACCGCCGAGACATTTACCTTGTCACTATTATCTAATGCGGCACGCGATATATCAAGCTCAGGGTGCCACCTGGCCACGATTTAATGTGGAACTTCTCCCTTCCTTACGAGAAGTACACGTTTCGGCCGGCGTCCACAACACTACCCGCAACACTGGGATAGGAACTGAAGGAACGATCCCAGGCAGGATGATCCCGGAACGGTTATCTCGTAATCCAAGGTAGTTACGCTCTCTGATTTCAATACGGTTGTCTATACTGCAGATCTGGAGATCGTTCTGTCTATAGATCCGCTATCTATTGCCGTTATGATGCCATGAGACTATAGACGAATGGACTCTACAAGGTAACCCCAGCAGCGTACGGGGTAGGGGTCAGGGCCTATGATCTGTATTTTATTTTCACAAAAGATGTCGACTTTTTGGATGCTGCTTTCATAGGACGTGTGGCTTGGTACACTCCCAGCTATCTGTCACTGGCCTACTGACATATTTCTAGCACTGT"

In [226]:
hamming(obs, act)

479

In [228]:
def h2(obs,act):
    return sum(map(ne, obs, act))

In [229]:
h2(obs, act)

479

In [232]:
%timeit hamming(obs, act)

290 µs ± 6.86 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [231]:
%timeit h2(obs, act)

112 µs ± 4.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [233]:
codons

{'ATT': 'I',
 'ATC': 'I',
 'ATA': 'I',
 'CTT': 'L',
 'CTC': 'L',
 'CTA': 'L',
 'CTG': 'L',
 'TTA': 'L',
 'TTG': 'L',
 'GTT': 'V',
 'GTC': 'V',
 'GTA': 'V',
 'GTG': 'V',
 'TTT': 'F',
 'TTC': 'F',
 'ATG': 'M',
 'TGT': 'C',
 'TGC': 'C',
 'GCT': 'A',
 'GCC': 'A',
 'GCA': 'A',
 'GCG': 'A',
 'GGT': 'G',
 'GGC': 'G',
 'GGA': 'G',
 'GGG': 'G',
 'CCT': 'P',
 'CCC': 'P',
 'CCA': 'P',
 'CCG': 'P',
 'ACT': 'T',
 'ACC': 'T',
 'ACA': 'T',
 'ACG': 'T',
 'TCT': 'S',
 'TCC': 'S',
 'TCA': 'S',
 'TCG': 'S',
 'AGT': 'S',
 'AGC': 'S',
 'TAT': 'Y',
 'TAC': 'Y',
 'TGG': 'W',
 'CAA': 'Q',
 'CAG': 'Q',
 'AAT': 'N',
 'AAC': 'N',
 'CAT': 'H',
 'CAC': 'H',
 'GAA': 'E',
 'GAG': 'E',
 'GAT': 'D',
 'GAC': 'D',
 'AAA': 'K',
 'AAG': 'K',
 'CGT': 'R',
 'CGC': 'R',
 'CGA': 'R',
 'CGG': 'R',
 'AGA': 'R',
 'AGG': 'R',
 'TAA': '*',
 'TAG': '*',
 'TGA': '*'}

In [234]:
seq = "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"

In [235]:
seq.replace("U", "T")

'ATGGCCATGGCGCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA'

In [236]:
translate(seq.replace("U", "T"))

'MWGAPHMWGARAPPQRENTL*ERDISQNI*SVYTPPRVYIL*NTRGGV'

In [238]:
codons

{'ATT': 'I',
 'ATC': 'I',
 'ATA': 'I',
 'CTT': 'L',
 'CTC': 'L',
 'CTA': 'L',
 'CTG': 'L',
 'TTA': 'L',
 'TTG': 'L',
 'GTT': 'V',
 'GTC': 'V',
 'GTA': 'V',
 'GTG': 'V',
 'TTT': 'F',
 'TTC': 'F',
 'ATG': 'M',
 'TGT': 'C',
 'TGC': 'C',
 'GCT': 'A',
 'GCC': 'A',
 'GCA': 'A',
 'GCG': 'A',
 'GGT': 'G',
 'GGC': 'G',
 'GGA': 'G',
 'GGG': 'G',
 'CCT': 'P',
 'CCC': 'P',
 'CCA': 'P',
 'CCG': 'P',
 'ACT': 'T',
 'ACC': 'T',
 'ACA': 'T',
 'ACG': 'T',
 'TCT': 'S',
 'TCC': 'S',
 'TCA': 'S',
 'TCG': 'S',
 'AGT': 'S',
 'AGC': 'S',
 'TAT': 'Y',
 'TAC': 'Y',
 'TGG': 'W',
 'CAA': 'Q',
 'CAG': 'Q',
 'AAT': 'N',
 'AAC': 'N',
 'CAT': 'H',
 'CAC': 'H',
 'GAA': 'E',
 'GAG': 'E',
 'GAT': 'D',
 'GAC': 'D',
 'AAA': 'K',
 'AAG': 'K',
 'CGT': 'R',
 'CGC': 'R',
 'CGA': 'R',
 'CGG': 'R',
 'AGA': 'R',
 'AGG': 'R',
 'TAA': '*',
 'TAG': '*',
 'TGA': '*'}

In [239]:
codons['GCC']

'A'

In [245]:
dna = seq.replace("U", "T")
"".join([codons[dna[k:k+3]] for k in range(0, len(dna) - 3, 3)])

'MAMAPRTEINSTRING'

In [251]:
def translate(seq: str):
    return "".join([codons[seq[k:k+3]] for k in range(0, len(seq) - 3, 3)])

In [252]:
seq = "AUGUCUUCGUCACAUUUGCCCUCCAAGCUCCCUGGUGCGCGCCCCCUUUACCCUUGGAUUACCCAGAUGUUCUCCUAUCUGGCAAACCCACUACCAGUGCGAUCCUUUAGCUCCCGCGGUUGUUGCUUGUCCCAGAAACGUCACUAUACAUCACACGGCAUGGGAGCCCAUCGUGAUCCCUCGACACGUGAGGCGAGGACCCUGGUUCGACCCUGUGAUCCAACGACACAAGGGUGCUUUCAAGAGAAUAUUUUCGCAGUAACGAUCCUGGUCGCAUCGAGCAGGAGCGUCCGACUAUACACACCGACGACUCCGUCGCAUUCAUGUAUGCACUGGGUGCUGUGGUAUUCAAAAGCGACUUAUCGUUGGUGCUUGUGUCAUCCGUUUUCAUACCUCAACCUGGACGAACUGCUACUGGAUUUACUGCGGGGCUCAAUCCGUAAUCGAAUGCACCAAAUCGUCGUCUGUAGAGGGAGUGCUUUUGUUGUUGGCCUCAAAAUCUACACAGCCCGCGCUACCCUGCGGCUCCCACCCCGAGCUCCUCCCUGUUCUACUGACUUACGAGUCGGGGGCAGCGACAAGACGAGAUCUGUUCUGUCUUUGUGGAUCGAUGGAAGCUUGAUGAUGUUAAUUUCAACUUUGUCAUACUAUCGUUGUCUAGCAAAAUAUCCCGAUACUCAGAAUGAUCGGAGCUCAGUUUCCAGUGCCCCGCGCGUAGCCACCGAUCUAACAAAGGUUGAAACACCGGGGUUAGGUUCGGUCGGUCCCUCCCGAGCGGCUUGUAUCUGCGGAAAAAGAUUGUCGAACCCGCACACGAUCGUUUCUGGUUUACCCGUAGCCGAACUGGACCUUGACUGUCCGAGGACUAAACGACUCGACUGUCAAAGGAGCUCCUGUGGCAUUCACCGGAUAGCUGUUACAGGCUUUCCACUUGAUCGUCUCUACGUUCUACAUUUGGAUUACAAAUAUACGAUAUUGCGCCUCGAGCAGACUAUGCCACCUAGAUGUCUCCACGCCGGUCGGAGAUCGUUUGUCCUCUACGCUCAGGGGAGCACCAGUCCUCACUUUAACUCCUCCAUAGCUCUGGGGACAGUCAUGCCUCUGCCGGGAUUUCUGAGGUGCCGUGGCUACGACUCUCGGUAUUAUCCCAUCGUCGUCCCAGCGUUGCUCCGAUCACCUCCGCGUCGCAUUUGGUGGUCACUCAAUUACGCAACCUGCAUUUUGCCAUUUGUUCCGACCAUUCGCAGCGCAUUGACUCACUUGAUUCCUGAUUCAGCACGGAUGGUUAAGACGAGGCCUCACCGAAACGCGGUAUUUCACGUAGGAACAGGACAUCGCAAUCAGCGAUCGGCCCAAACACAGAUUGAAGUAGUACUUCACUGCACACACGCUCGGGCCUAUACCGUUUACGGCUUUAAUAGCUCAACUCUUAAUGGGGAGACCGUGGACCCAUCAUCUCAAGUGUCUUACAAUUCGGGAACUGUGAUCUUUUGCUGCCCCCACCCGAGAAACUGCCUAUAUGUUCGUCUAUCUUCGAUAAUCCAUGGUAUCCGUAUCGCUGAGCCGGUAUGGAGAAGUAUAUACAGAGUACAAUUGCGUUGCUCUAUUAGAGUAGCAGUUGACGAGAGAUGUCCCUAUAGCCAAGACACGAUUGGUGUACCUUGCCUACGGCGAGAUAGCUUAAACACAAAUAAACCUUUGAUAGAUCGGAGUCUUCUAAAAUGCCACUAUGUUCCUGAGCUAAGGCUGAACCUUUGUCCGGUUGACAGGGUCGUAAACGCACGAGUCCACGUAAGGGAAAUAAAUGAAGGCCCAAUUGCCGUCGGCGUCGCGUUCCUAGGACCAAAGGGGAUCAGACCCGUCAUGUCGCAUAGGAGAGUAGUUUUUAUACCUCGGUCUUAUCUUAUACUCUACAUCUGCAUUCAGUGGGUGUGUAUUGGGAGGUGUUCGACAUGUCACUGGUUCUGGACACUUGGUCUCUUUGUAUGUAACGUGUAUGGAACCUCAACCCGACUGCGGGCACACGGCAGCAAUGGCGAGAUGACGGUGUAUAUGGGCAUUGGGUGCCGCAACAACAUAAUCAUGGAUGUGUUAUCUCUUAGAGGAGAGCUGACUACCCCGGCGACGGUGAUCUCCGCACUUGAAACGGGGGACACGUGUUUGAAAAACCGGAUACGGGCCCCCUUUUCAGGUGCACCAAGACGCAGGCUGUCCCAAAACGAAGUAAACCAUCACCUCUCCUGCAAUAUGAGGGUGACCGAUAGCCAUCUCAUCAACAAUAAUGCCUGUCUUCGAUGGAAAACAUCUCCAGUUGCCCGAUGUAGUUCAACACCUAUGGUUCGAAUAGUAAAAAAAACGCCCUCCCGAGAUAUGUUCUAUGCCCCGGAGCCCGAUCACAACGAGAGUGAACGUGGAUUUACCACGCCUGUGAGCUAUCCUGCCUUACUGGCGCAACCCUGUUGGUUUUGUAUAUCGAUUUUGUUGGAACAUAUUUCUAACAGGCCCCGAGCGGGGGAAUCCGAAUUAUACCACGAAUUUAUUACCCAUCGCGGCGUGUCGGCCCGGCAGCGAUGUACAGCGGAUCAAUUCGAUAGUGAGUCUCUCCAUCCGUUGGCGACAAUAGCGCACUCAGCUACGUGCGUCUCAUAUGUCGCGGAGCUCCUUACGCCUCAAUCAGAUAAGAGUACUCACGGUCGCCGUCAUAAAUCAGCAACUUCCAUAAGUAUUGGGGGGACUUCCAACUCCGCGUAUAGGCCGGAAGUAGGUAGCACAUCAACAGUCGGUGCGAUGGUCCUCGACUUGUUGAUCGUCCGAGACGCCUCGUGUCCGGUUGCACUGGCCGUACAAGGACAGCAGAGCCAAGUUUUCGGCGUGCAACGCGUUGCACCUUCGCUAGUUAGAAAUGGAGGCUGGUGUUGCUCAAGCAAAAGGGAUACCGGUAGACGCCCGACCAUAUCCACCCUUAGAGACCGCGUCGAAGACGUGUACUUUUCCACCCUUGUGCUAAACGUCAUACGGCGCGAUGUAUUCGUGAGCAGCAAGGCCGGAGGAGUCCGUGUACACUCUGUGCCCGCAACGGGUUGUCCAAUAAAUACCCCCAUAGUCAUUGGUCUAAGUUGGGAAAGUUUACAUAAACCGUUCGAUGUACGAAGCAAUUGUAUGUCUCCUUGCGAAUUGUCGAGGUUUCAGAAACUUUUCAGAUGGUCGAUCGUGAGACGACAGGAUCGAGAGUUCAUUCUGGGACAGUGUACCCACCCACCGCCUGACAUCCUUAGGGGAUACUAUCGCCCGCUGGCGUUUCUCAAAACAAAGUCAUUCAAUAUAAAUUCAAACCGUGCCGGAGUCCAACUGGUUAAAAAUAUUGGCCGCCAUUUACGUGUAACUCUGAGAGCCAUCGAUCUAUCAGAUUCCAGAGGCUCUAUUGGGCCCUCAGGAGCGAUAAUGCAAAAAUUCAACUGCAAUAGUAAUUACCGCGGAUUCAUUCAUGAGCAGUCAUUCGCGUGUCUCAUCAUGAACCGCGUGCCCAAGCAACUUCCAUUAAGGAGAUCGUCGCACGAUGAACCUGGUUGGAGUGUGUCACCGACCCGCUCAACGCCUCCAGUCCUCAUGCUGCCUCGCUUACUUCUUUGGUGCCCACAGUGCUUCCUUCGACACUCAGCUAUCCCACUUGUUCGGCAGAUACUACAGACAGGAAACAUAGGGGGUGCCUAUAAAUCAGUCUGUAUGUCCUACAUUCCACGGGGCUACGGUCAUCACCAUAGAGAACUGUACCGAAAUGCCUAUUGGGGACAGUUUAGGAAAUACGUCACUAGCUACGGUCGCGUAGAGCUCAAGACUGGUGUCUUGGGAUUAGCAAUUGCGAUCUUCUGCGAAUAUGAUGACGUGCCCGUCAUCAGACCCCCUAACAGCACGACGCCCUUCACACAUUCGUCUAAACAUUGCAUGCGCCUGUCAUUCCCUUCUACUGAAGAUGCUCAAUUGAAACAUUACUUACGUGUCUGUCCAUUGAGCUUCCUUCGAACAGCGAUGGUUCAGGGUAAUCGCCCCUCAAAACUGUCUCGUAGCUCGCGGAAACUAGUCACGAGCUUUCGCCAGACUCGGCAGCAGAGUGAAGUCCAACGAAUACAUGGCGUUGUACUCGGAUUUGAAGGUGUCGUGAUCUGGCUAGAAUCUCUGGCGGGUUCCUUAUUGGCAUCAACCAGCUAUUGGCGUCGAGGACUCGAUGCUAUAACGAGAAAGCAGAUGGAUGGGCUCUCUAUGCAUAGCAGUAGAGCCAUAGGGUGUCCAUUAUACCAAUGGCCGACAUGUAUCCCCAGGUCCUACCGCGAAGCGUGCGUCACGUGGCGCAGUAACUCAGUGAGGCCAACAAGAACCGAUGUUUCAAUGAAGUCUGUCGGCCACCGGGGCCUCCUACCCCGCCAGUUAUCGAGAAAAUCCUCAAUUGGGAACAUAAGACCACAACCGCAGACCUGGUCGACGACGCAUGUUGCUAGACCCGCACAACGCGGAGGCAAAUCGCGCACUGGACAACUCAUCCACCUCAGGGCUCCCCUAAACCGGACGUUCGGCUCCGAUUUGCAGCUGUUAGACCAGCGUCGAAAUGUGGAACGUGGUGGGUCUAGUGCAGCAUUAUACUGGGGUUCAGUACGCGGCACAGUGGUUACGAGGAUUCCCGCAACUCGGAUCUCCUGGCACCGCCCAAAGAAUCCGUUUUUAGAAUUGCCAACUGCCCAGAGUUCGGGGGCGGAUAGAGAACAGCAAGCCGAGGGUGCAGGGUACAGCUUACAUCUAGUAGUGCGCAUCACUCGGAAUGUCGAUACGCCAAUAGCUUGUGGCCUCCUCCUCCCGUUGCUCACGCCUCUCGUCUCCCAUGUUAGCUCAUGUACGUCAUUCAGGGUUCUCAAGCCGUUGGUGCCUUACAUUAUGAUGUCAACCCUCGUCUCCAGGAAAGGUGGCUCAUUUGUGUUCCACACAACUACUUUUGGUAUGGGUGCAGACAUGCGAGGCUUAAAUAGCCGAUUACAGACCUGGCCCACUUUUUGUGGAGUUUCGAUCUCUUGCGAAUCCAAUAUUACAAUUUGUCAACGCGUGCUGUCCACUGCAAGUACCAAUACCCUACUAAUUUUUCAUACUUGCCAUCCUAGCCGCGCACAGAAACCAUCGCCUGACGCAGAUCCCGCAAAAGAACUGGUUAUGGCCAGAGUAUCCCUCUCAUUACGCAAGAGCUCAAGACCGAGAUUCGUGGAUCCAAAGGGCUCGACAGUUGCUAAACAUUCCGUAACCCUAAGGGGAGUACCUCCCAACCCACCUGCUAGACUGUACGAAUGUAUGCACGCGCGACGGUUGACUGCAAUAAUGAACAUCCGUGAAUUUCGAAGAUGCUGGCAUUCGACGAUCUUAUUUAGAGAUGGAGGUAUUAGUCAAGGUCUCAGGUACCAGCUACCAGAAAAACCAGGAUCGAGCGUGAAUUUAACCGGCUAUGGACGGCAUGAGGCCACCUUGCAUGCGAAUGUAUCCCGAUAUCGGGUGGUUGGCCAAAUGAUCUCAGUGUUCUCAUUAUUCAUCCGGGGUGGUCGACGCUUUUCCUUGUCUGAUAGUGCGGAUUGUCAUGAGGACGGGCCUUGUAAAAAUAACAAGGCUCGUAGGGCACUGUCGGCGGAUUCGGAUCGUCCGGGGGUGGUCUACCUAUCAUGCAGAAAUUGGCCUUCUCUCGCUAGAUGCAACAUGGCUUCACUCUAUUCUACAUCAGCAACUAGGAGUCCUGGACAAAUAGACUGCGUUAUGCACAUUCAACCCUUGUUACAUUCCGAGUUGGGGCCGCUUGCGCUUGUGUACAAGGUCGCCGUGGGAGAGCAUACACGUACCUCUAAAGGAUCCGAAGCGAAUCUGCGCAGUGUUGAGCGACGCCCAUGCUUAGUAGUGGCGGGUGUAAUUUCGACCGAGGACCGGCGGAUGUUCUCACGUGGGGAUAGCCCCGAUCAGCCUCAGAGUUUAAGGCAGCCCUUACUGCACGCGCACUGUGGCCUAACACAGGCAGUUUCGCACCACGAUGAGAUAGAAUCAGUGCGUAGCCGCAGUGUUUUUUUUAAGUCGCUACGUCAGGCUGAGAUAUUUUUAUUUGAAGGGUCGGGCCUUGGCGCAGGUGCUCCACAUGCGCUUCAUGUACAGGGUGGGCUUCAACAUGAGUUACGCAGCCAGGCCCCGUUCGAGCCCCAGUUACAGGAGGGGGUCUCGUGGAGACGAAGAAAGGAUCAUCCUUCUGGUAGCAUUUCCGGAAUUAACGCGGCUGAUGAGUCGAGACUUUGGGGUGCGUACGUAGGAUCGAUAAUUCCGCUACUUCGUACUCGGAUUAGCAGUCCUUGCACGGUUGAUGCCCGAGGAACCCGGCGGUACGUACCCAUCGUCUCCUAUUACAGCGAAUCAGCGCGCCUUCUAGCACGCGGUUUGCAGCAAAACUGCCCAAAACGCCGCCUAAACGCAGCACUCGGCUUAGCGGUACCCGCGUUACCAACCGUGACGAGUAGUACCAGUAUGGAGCUCAAUUUUAAACGUACUCAUUGUUAUGGGGCGGUGGUGCGUCAAGUCGGGAGGCGCCCCCGGGAUCUGAACAUUCGUCAUCGGCGACUUAUGUGCCUAGAUCUGUGUAUACCACGUGGCAACCAUGCUCCGAGUGCGGUCGUCGGUGAGUUAAGGCGAAGUGAUCCACCGCACGCCUUAAAGUUAGCUUGUAAGUAUGGGGUCAGUUAUGCGGAUCGUAUUCUGGCAGGUUUAGUACCACCACCGCCGCCCGCUCAAUUAGCACAGCGAUGGGCAAUAAAUAGAGUCUGUUGCGACAAAAAGGCACCGCUAUUGGGCUUUAGACUGGACUACUUGAGGUGUUGCAUAAAAGGUAGGUCUCAGUCGAGGCGACGACUGAAACACCAUAACGUUCGCCCAACAACGUUUUCCUCCAGAGGUACAAAUUUUGAGCACCAUAGAACUUGUAGCUCCGUCACACUUGGGUGGACACGAACAUCGUUGCGCCAGUUUUAUCAACGAUAUAGGAUCCAGGGGUGUGGCCAUAAUCGCCCGGGGGCCUGUAAUACUUGGUCACGGGAAAGCCAUAAGCCAGUGACCAGGUCUCAACUUUGGCACUACAAGAUGGAUAUGCCCGUUGUAAAUGGUUCUGAUCAUCCGCGAGCCAAAACGACGGCAAGUACGCCGUACUCCAUGUACGACUGGUUGAGUCCAUGGCGGAAGCUUCUUGAGCGGUGGGAGUUCGGUCAUUAUUUGGCGGAGAGAGACAUUCGGGGCACGAUCAGUCACCUGACAUCCAAUGCGAGGUCAAUUGAUCAGCUUAUGUCGACCUCGUCCCUUGUAUCAGGGCUCGCUUCCACGGUAUACGACCUCUCGGUCUCUGCGCAGGUCCGUGCUUUGAUUUCUCCUCUAGGCUUAGCAAGCGCUCAAUGCUUAGUCGCGGCGAUCUAUCAUUGGCCUUCCGACGGCGUCUCGCUACUCGAAUACCAACACGGGCGAAAAUCGUGCAAACGGUGCACAGUCUGCUCGGUAAUGAUCAAGUGUAGGGAACUUUUCUUUGGGAAAUACCCGGCGUGGACCGACAUUAUUCACCUGGGCCGGGUGUGUACCACACGGAAUCAUACUAGCUUGUCAAGUAGGCGAUUGAUCCUAUUCCGGGGAGCAACUAAAACGCAGCAGAAGAGACGGGCUCUGUUAGUGAGAACUCUCCGUACUUGCUCACGGAUCAACCGUUUUAAGACCAUACUCCGUCACUUUAGUGUUCGAAUAGUCACUGUCAAUAGCAGGGCCAGAAUUAAGGGCGCGGAAGAGAUACAUGAGGAGGCUGGCGGAUCAAGACUGCCCUAUUUCAUGAUCGAGCCCAUCAUAAGCAUCAAACCCCCCACAUGUGCAGCGCGUCUGUCAGAACGUUAUCCAGUUUUCUACCCUAUACCUCAGGAAUGGGUCUCUAUCGUAAUCAUGUAUGCCGUAACCUCUCAUAAGUCUAUUAGUCAAUCAAACCCGCUCGCAGGACUAAAUGUGCGAGAGUUGACAGGCAAAUCAGGUAGGCUGCGAGGGCGCUCUGUCCCCACACCACCGAGCCGUGCCAGGGACUCUUUUUCAAGGCUAAUCAAAUGUCUCAGUGAGCGAGCAGCGGUUGGAGUCAUGUCUCAGAUAAGUGAUCACGUCACGGAAAGAGGCAAGACAGUAACCCCAUCUAGAUACCCUAAAAUUGCAAUUCGCAAACUUUGGGCGCCCUGUACCCUCAAACUCGUUGUAACAUCGGCGAGGGGUUUGGAAUCUCCCACCUCGAGGCUACUCUGGAGGGCACCAUUGAAUCCCAGGCUAAAAGGGACUACUGAAGACCUUAGAUUGAUCGAAUGCCCUCAAUCGAAUAUUUUCCCACGUCCCCUGAUGCCGCUUUGUAUCAUCCACGAGGGCACUUUCCAGUGCUCUAUCCAGUGGAAGUCAGGCGCGGGCACUGGGGCAGCUCCUCCCACCAAUUUGGACUCACUUGUUUGUCAUGACGUACGUAAUAGUGUGCGCUCCAAAGACAGCCCUGGCGUUUGUAAACGCCCUGGCUGUAACAUAGAAGUGCUACCCGGUCCGCAGAGUAGAUCGUUACCAGGAACACGGGACAUGAGAUAUCCGUCACGCCGGGAACUGAACAUGACCUCUGCCGAGAUUCAAAGAGGUAUUGACGUGUCCACCAUACGAUUUACAUUCGCCGAACGCCGCCCGAAUGUCUGCCCCGGAUUCGGUAUUGCCAUCUUAGUGCGUAGAACUGAAAAAGUGGUAGGUUCGGGAAAGAGAUGGGCGCCCCUUGUGGUCUGGGCGUCAAGAGAACAGUUGGUGUGGAGUAACAAAGUUCGCGUUAUUUACUUCGCUUAUGGGCAAUGUGUGGAACCACGCCGUGCAGCCGGUCGCUUGUGCUUCCUACAAAGGGAUCUACGCCGCAAACCCGCCUACAUAAAGCUUGCACUCCAGACCCGUGGGCCCAUGGCGGACCAUCCCUAUAUCGGUCGAAUCUUCCUGCUCAACUGCAGACACUGCAGGAGUCCUACUUUAUUGCAGGACUCCGCAGCCCGGCUGUGCGAUAAGCAUAGCUACGUUUCGACGGCUCUGCUAUGGGUUCGACGAAGCACCAUACUGACCCUCCAAACGAACUCCCUGUGCUAUCCUCGUUCCCGCUCUGCAGGGGAACCUCAGGAAAGAGCGUCCAUGCUCGCGCUACGCUUACGUAAGGCCCCCCAUAUUUGUGCGUAG"

In [253]:
translate(seq.replace("U", "T"))

'MSSSHLPSKLPGARPLYPWITQMFSYLANPLPVRSFSSRGCCLSQKRHYTSHGMGAHRDPSTREARTLVRPCDPTTQGCFQENIFAVTILVASSRSVRLYTPTTPSHSCMHWVLWYSKATYRWCLCHPFSYLNLDELLLDLLRGSIRNRMHQIVVCRGSAFVVGLKIYTARATLRLPPRAPPCSTDLRVGGSDKTRSVLSLWIDGSLMMLISTLSYYRCLAKYPDTQNDRSSVSSAPRVATDLTKVETPGLGSVGPSRAACICGKRLSNPHTIVSGLPVAELDLDCPRTKRLDCQRSSCGIHRIAVTGFPLDRLYVLHLDYKYTILRLEQTMPPRCLHAGRRSFVLYAQGSTSPHFNSSIALGTVMPLPGFLRCRGYDSRYYPIVVPALLRSPPRRIWWSLNYATCILPFVPTIRSALTHLIPDSARMVKTRPHRNAVFHVGTGHRNQRSAQTQIEVVLHCTHARAYTVYGFNSSTLNGETVDPSSQVSYNSGTVIFCCPHPRNCLYVRLSSIIHGIRIAEPVWRSIYRVQLRCSIRVAVDERCPYSQDTIGVPCLRRDSLNTNKPLIDRSLLKCHYVPELRLNLCPVDRVVNARVHVREINEGPIAVGVAFLGPKGIRPVMSHRRVVFIPRSYLILYICIQWVCIGRCSTCHWFWTLGLFVCNVYGTSTRLRAHGSNGEMTVYMGIGCRNNIIMDVLSLRGELTTPATVISALETGDTCLKNRIRAPFSGAPRRRLSQNEVNHHLSCNMRVTDSHLINNNACLRWKTSPVARCSSTPMVRIVKKTPSRDMFYAPEPDHNESERGFTTPVSYPALLAQPCWFCISILLEHISNRPRAGESELYHEFITHRGVSARQRCTADQFDSESLHPLATIAHSATCVSYVAELLTPQSDKSTHGRRHKSATSISIGGTSNSAYRPEVGSTSTVGAMVLDLLIVRDASCPVALAVQGQQSQVFGVQRVAPSLVRNGGWCCSSKRDTGRRPTISTLRDRVEDVYFST

## Haystack

In [255]:
h = "GATATATGCATATACTT"
n = "ATAT"

In [266]:
def kmers(seq, n=4):
    return [(k + 1, seq[k:k+n]) for k in range(0, len(seq) - n)]

In [277]:
def find(hay: str, pin: str) -> List[int]:
    pos = [pos for pos, kmer in kmers(hay, len(pin)) if kmer == pin]
    print(" ".join([str(loc) for loc in pos]))
    return pos

In [278]:
find(h, n)

2 4 10


[2, 4, 10]

In [279]:
h = "TACTACAGTAGACTACAGTGCATCCAACGCTACAGTCTACAGTGGCCTACAGTCGCGCTACAGTGGCCACTACAGTGACCTACAGTACGAGCTACAGTCTACAGTCTACAGTGCACTACAGTTCTACAGTCAGCTACAGTACTACAGTTATAGACACTACAGTTTGCGTTTCTACAGTTCTCTACAGTTAGGCTACAGTCCTACAGTCTACAGTGTTTACTACAGTTTCTACAGTCTACAGTGCTCGGGCTACAGTCTACAGTATGTCTACAGTCGCTACAGTTCAGCTACAGTTCCTACAGTTCCCGCTACAGTTCTACAGTATGCCCCTACAGTGCATACTACAGTTCTACAGTTGCTACAGTCTACAGTCTACAGTGCTACAGTCTACAGTCTACAGTCGTCTACAGTGCTCTACAGTCTACAGTCTACAGTTCCCTACAGTCGACCTACAGTGAAACGATACTACAGTACTACAGTACCTACAGTCCATAGCCTACAGTACTACAGTTCCAAACTTAATCAATGCTACAGTTACGCTACAGTCTACAGTCTACAGTTCTACAGTTACGCTACAGTTCTACAGTCGCCCTACAGTACTACAGTCTACAGTCAACTACAGTCTACAGTTCTACAGTCGCTACAGTCTCTACAGTCTACAGTGCCTACAGTGAATCTACAGTCTACAGTCTACAGTTCTCTACAGTCCCTACAGTCCTACAGTGCCTGACTGGTTCTACAGTACGCCTACAGTCTACAGTTGACTACAGTGACTACAGTCTACAGTCTACAGTAGGGG"
n = "CTACAGTCT"


In [280]:
find(h, n)

30 92 99 201 229 250 359 366 381 388 415 422 540 547 600 617 641 650 677 684 748 774 781


[30,
 92,
 99,
 201,
 229,
 250,
 359,
 366,
 381,
 388,
 415,
 422,
 540,
 547,
 600,
 617,
 641,
 650,
 677,
 684,
 748,
 774,
 781]

## Calculate consensus

In [3]:
fa = """>Rosalind_1
ATCCAGCT
>Rosalind_2
GGGCAACT
>Rosalind_3
ATGGATCT
>Rosalind_4
AAGCAACC
>Rosalind_5
TTGGAACT
>Rosalind_6
ATGCCATT
>Rosalind_7
ATGGCACT
"""

In [66]:
def fa2seq(fa: str) -> str:
    return [seq.replace("\n", "") for read, seq in [read.split("\n", 1) for read in fa.split(">")[1:]]]

In [5]:
seq = fa2seq(fa)

In [6]:
list(zip(*seq))

[('A', 'G', 'A', 'A', 'T', 'A', 'A'),
 ('T', 'G', 'T', 'A', 'T', 'T', 'T'),
 ('C', 'G', 'G', 'G', 'G', 'G', 'G'),
 ('C', 'C', 'G', 'C', 'G', 'C', 'G'),
 ('A', 'A', 'A', 'A', 'A', 'C', 'C'),
 ('G', 'A', 'T', 'A', 'A', 'A', 'A'),
 ('C', 'C', 'C', 'C', 'C', 'T', 'C'),
 ('T', 'T', 'T', 'C', 'T', 'T', 'T')]

In [7]:
[ zip(*seq)]

[<zip at 0x7f7320f6f040>]

In [37]:
def consmat(seqs: List[str]):
    return { base: [seq.count(base) for seq in zip(*seqs)] for base in "ACGT"}

In [27]:
def cons(seqs: List[str]) -> str:
    return "".join([Counter(seq).most_common(1)[0][0] for seq in zip(*seqs)])

In [63]:
fa = """>Rosalind_1
ATCCAGCT
>Rosalind_2
GGGCAACT
>Rosalind_3
ATGGATCT
>Rosalind_4
AAGCAACC
>Rosalind_5
TTGGAACT
>Rosalind_6
ATGCCATT
>Rosalind_7
ATGGCACT
"""

seqs = fa2seq(fa)

mat = consmat(seqs)

string = cons(seqs)

print(string)
print("\n".join([(f"{base}: {' '.join(map(str, counts))}") for base, counts in mat.items()]))

ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6


In [62]:
print("\n".join([(f"{base}: {' '.join(map(str, counts))}") for base, counts in mat.items()]))

A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6


In [26]:
%timeit "".join([Counter(seq).most_common(1)[0][0] for seq in zip(*seqs)])

22.5 µs ± 2.13 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [23]:
f"""
{cons(seqs)}

"""

Help on class Counter in module collections:

class Counter(builtins.dict)
 |  Counter(iterable=None, /, **kwds)
 |  
 |  Dict subclass for counting hashable items.  Sometimes called a bag
 |  or multiset.  Elements are stored as dictionary keys and their counts
 |  are stored as dictionary values.
 |  
 |  >>> c = Counter('abcdeabcdabcaba')  # count elements from a string
 |  
 |  >>> c.most_common(3)                # three most common elements
 |  [('a', 5), ('b', 4), ('c', 3)]
 |  >>> sorted(c)                       # list all unique elements
 |  ['a', 'b', 'c', 'd', 'e']
 |  >>> ''.join(sorted(c.elements()))   # list elements with repetitions
 |  'aaaaabbbbcccdde'
 |  >>> sum(c.values())                 # total of all counts
 |  15
 |  
 |  >>> c['a']                          # count of letter 'a'
 |  5
 |  >>> for elem in 'shazam':           # update counts from an iterable
 |  ...     c[elem] += 1                # by adding 1 to each element's count
 |  >>> c['a']                

In [28]:
mat

{'A': [5, 1, 0, 0, 5, 5, 0, 0],
 'C': [0, 0, 1, 4, 2, 0, 6, 1],
 'G': [1, 1, 6, 3, 0, 1, 0, 0],
 'T': [1, 5, 0, 0, 0, 1, 1, 6]}

TypeError: 'int' object is not iterable

In [59]:
" ".join(map(str, [5, 1, 0, 0, 5, 5, 0, 0]))

'5 1 0 0 5 5 0 0'

In [68]:
fa = """>Rosalind_4135
ACTAGAGTAGTCGAATACACGGGGACGGCCTAGAGTCCGCTACTTCAATTATGAAAACCT
CCCGTCAACTAATGGGAGGCGCCTACTTGGGTCGGAGATCGACGCCTTCGTTGGCGCTCG
TAGGCCATGGATGAAACCGGTCGTTTAACCGCTCATTGAGTCCATGGTTAGAGAGGCACA
ACTATGGCCGTTGCTAAGCAATGCGTATTTCGGGGGGATGTTCCCTATGGACCTCCGCTG
AACCAAGTGGAGAATGACATATCGCGGAGGTCGACTATTAACTGAACTGGGAATATTGTA
GCACAACAAGCCACTCAGCTCCTATATTGATTCGGAGCGCAAGATACCGCACGGGAGTCC
GTGTACGTATGGCACAAAGTGGTGACAAGACGCCTGTACCCCCAAACTAGTCTGGGCCAA
CCATAGCTGTAACTCCCTTTGATCATATAATTTGTGGAAGTGTTGCATTTCTGATAGGCA
AACATTCTGCTGCAGTTGGCGGATTCCTCAAAAGAATAGGAGCTGGCTAAAGCTTCGCGA
TCAGCCGCCTTCGATATTGATGGCCATTCTGGCCGGTATGTAGTGGTAATCTTGAGGCAC
GGATATCCCCAGCGACCCTTTTTTCAGCTACTCTATGCTGTGGTTGGAAACGAGGGCGGC
AAAGGGTGCGGGCAAAGTACCCGGGTTAACTCTCTCGAGAATGTCCAGCCTTGAGATATA
TGCTAAACCCAGTCCATGTAGGCTACTTCAGTCAGTAGTTTGCGAATGAAGATGTCATCC
TGAGTTAGGGGTGATTGGGGTTATTATCAGAGCATGTACTGACGAGAATCCATACTACGA
CCGAAGCGTATGCTGGGCAAAAGAAAGTCCTGCGCCAATGGCCCCGGCGCCCCGGTAAAA
GCAAGAGATGGTATGCTGGGGAACAGGGACACGCGCGCTGTCTTCATTGCTGTGGAATTA
ACCGCGAAGGGTGATTGAATGTG
>Rosalind_1266
CTCCAATGAAACTGTTCTGCTAAATAGTTTTTAGTTTTGGCGTGGAACTCGGGAAGCGTA
CTGTATCAACGCAGCCTTATGCATTCATCCTGGTGTTGGAGTCCATACAAGGGTAACCGG
GGTGGCAAGAGAGGGGACCATCGGCCATAACCATTGGACACTACTAACGCCTGGGTTTAT
ATTACTTAACAAGTTGTGAAGCGACACTCGGTAATTTCGCGTTGTGTGCGCTGGGGATAG
GCGGACTGTCCGAGGGACATTTGCGTCGTACAACGAATTTATACCCCCTTCGTTAGCGAC
ACCACAGCTTTATGGTAAGTCTTTAACAGGCTGTGGTCTGAGCGATCTAACCCGTATGCA
GCAGAGCTGAAAGGAAATTCAACAAAGGTTGCAAGGCCTGCGGGATGGCACATCAGGCTC
CCGGCTCCCATCGCGAATAATTCTATGACCTAGACGCAAGCCAATAGCACACGGGTTAAG
GAAACGTGTAAACCTCATCCCGCCTGGTGGAAAACACCAAGGTCAATATTTTGCCTGACG
ATCGTTTGCGTGAGATACACCCGGACTACCGGATCGGCTCGTTGAAATAGGTGAGACGGC
GTCTAGGTAAAATGGGCACTATACGTGACCGTATACCGGTGCCGATCGCCATACGTGTTC
CTAAGATCAACATACTTTACAAATGGAGACTACACCGCGACTTTCAGCCACAGGTTAGGC
TACGGTCCTGCAATCCAGCCTGTCGTATTAGAACGCGACGTGCTTGGGTTCGTAACAGTG
TACGATCTCGCACTACAGCTACCCAACGGCTTTGAACAACCACGTCTCCTTAACTTCTTT
CTAGTCCTGTCGACATGTCTCTCGGTGAATACAGGTCCCCCGTTCTTGTGCCGATCGAGG
CCCTGTTACCAAAATGATGGTGTGATTATCCGATGAAAATGCCCCGATCTACAAAATGTT
TATAATGGGTAGTCGGCGGGCGC
>Rosalind_6660
GCGGGGCAAGGAGGTTGCGATAGGAAGCAGCAAGCTTCGGTAGTCTATCAGCTAAGCAGT
TCTTTTCTCGCTCACCCGATGGAAACGAACTTCAAGCTAGACTTGCCCGGACGTTATTCA
GCTTCGACGTCGGATCGGGTTTTTCCAAATTGCTACGTCTTGAGGTCAAAAACTCCGCAC
GCCTTTCCGGCTCTGGACGATTACGCCTGCACCACATCATTTCCGAGCTGGCGGGGCTCG
TGCACGTACATTTGATGCTACATTTAACGGTCCTGTATCTCTGCATCGTCCCGAACGAAT
CCGCAGGGATGAGGCCCTGCATCCTAAAGTGTCTCGTACTTGCCCTTAGAGTGTGGGAGC
CATCTCCCAATGATCGCGTCATCAGATTCCGTGTCCGGGGAATGTGTGCTCCAATAACTT
GCAAAAACAGGTAAAGTGCTGGCTATGTATTGAGCGGCCCCTGTGATTACCGGCTCGCTT
CAGAAGCGAGAATTAGGTGATCGATCTGGGTTGAACCCCGAAATTCATACAGATGGCTCC
TTCGCCGATGCGAGACCTTGCTCTGTAGACATCAAGCTTGTCGTCCGATTGCTTGCTTGA
TTGAACCGTATCTATTGCTACACGGCATTCCTGACGGTTCTTGGCCGGTCAACCCGATTG
AGAGGCCTGCGCAAGTTGGGATGCTCCATGCAAGCCCTCGTTGGTCCAATACCTGGTCCG
ATTATGACGGGACCGGCTTCAAGAGGTAAGGGCAATTCAACGTAAGAAAAGCCTCTTCGA
CTCATCGTGAAGATGATCGCACTGTTCCGTAAATGGGTATCCTTGTATTTCTCGTTACGG
TTATGTGCAGCTCAGAATCAGTGCCCGCCCGGGGCTAAACCGGAGGCACCCCTCGCGCAC
TAGCGCTTAGCTGCCACTTGTTCTTGTCTTTGACGACACAATAGATAGCCTGAACCAGTC
CTCGCGCATACACTCGATACCCG
>Rosalind_1157
GGACTGAGAATATTCTCACTGACTAAAATCTAGGTCCTCGGTAATCCTATCCGAACATGA
CCGGAAGTGTGAGAAGACTTCCCGAGCTGACAATGAGGGCCTACTCTCACTACTTATGCG
TCTGGCCGCCTCCAGTTCGCTGTGTGCCTTTCCCATTTTCTCCAAGCATCGTAGCATCCA
CATACGAGAGCTGGCCGCGTCGCGACTAATTGAGTGTCCTTGGTTCTGCGACTTGCTTCC
CCCAAATGGAGACCGTAGTTACGAAGGAATCAAGCGCATAGGGTGCAGCAGAGGGACTCA
TAGCGGGCAGAGTGAGCGTCCCACGCAGGGGGCGTTAAGCCAAGAGCAAATAGTGGATTA
GCGAAAGGAAGTATAGACTACAAGCGTTTGCCAAAGAAGAGTAGGTTACCTGACAACCGG
CTTAACTCTCTTTTACGGACTTCGTTCATATCCACCGTATTATTTTGTCAATACGATGTT
CAGTTTGTTTTGATAGACGTTGTCTTTATACAGCAGAGTCTTACAGCAACCATCATAAAC
TAGGCGACCATTATTCCATCTGTCCACGGTAAACAGGGTTCCTAGGACCGAAAGATGTGG
GGCGTAAGTTCTGCCGACCTGACCTGCCACGAGAGCTTACAGCGTGGGAGTTAAACCCAA
CAAGCGTGCAAGAACCGCCACGCGTTGGCAGAACAGTACAACGCTAGCACCAGCGTGCAT
TGGTACAACTACATAACCTCTCGTTCAACCATGTTAAGCCACACTGAAAACCAGGTTGCG
CATTACAGCGAATATGAAAGGAGTTTTCGATGGCATGTAACTTACTAGACGTAATACCGC
TACGTTCTCGGAAGGCTTAGTATCTCATGCACCGAACTCGGTAGTCCAAAGGTGAATGAG
CCTCGAAACGCGCCAGACCTCGCTACCGCGTCCCGCGATGGGCGCAATTTCGATATGTTT
CGCGGTTCTACTACCGATGTGAG
>Rosalind_3528
CTCTGGTCCGAACCTTCGATGGAGGTTTCTGCCCTCCGGACAGCTTAGGAAACCACGTCC
CACAACAGGGGATATTACCCCACTAGATGCGCGTAGAACACTAACTCCTTCGCACATCGG
ATATCAGCGTAATTTCTGGTTCGCTAACCGCACCTGTTTAAGACAAATTTTCTCGCATAT
TCATTTTGAAAGGGGTTGTGTTTAAAACTAGTCAGATAGAAGACTGATTTTCTAATGCAT
GGAACCGTGTTGGTTAATGTACCTTTCTCGTGATAGACATGGTCGTAATTCTTATCTAGC
CAGTCCGAGGATGAATTGTCAACCTGACCGCACCTTTATCATTAGTGTCTTATACCGGAG
GGTGTAATTTATCTAGTAAATCTCGCATCTTCCTCGAGCAAACCCGATTAACCCGCTGCA
GAGATCAACCCAATATGCGTTGTAGCTCTACTGGTAGTGTGTGTCTGGAGCCTACCCCGT
CGACCAGGGGCGATTTTTTTGATGTAGATCAGATGTAAGGCTATAGCTTAACTGGTCTCA
CAAATAATCATTGGTAGGCCACACCATGATTGGGAAGACCAAACATTACCAAGGCCCTAC
GGCGGAAATGACGTCATTACGATACTCTATGATACATGATGTTTCGCTACCAGGTCCGTT
CCTCGCGTACGGAAATTAGGAGGCCCGCAGTGTTTCACGTAAGCTAATCATTATCTTCAT
AATTATATGTATGTGAATGGCCCCGGGCGCTCTAATAAGTGACCAACCCGAACTCCTTTT
CCTTTCAAACTCTGCCCTTCCCGGCGAGTTACTACGCCCAGGTTGTGAATGACACCGCGA
GCCTGCACCCGTGCTCCGCAAAACCAATGCCCGCGGCTGACTGCACAATCGGCGATTACC
CCAAGACGATCGAACTGGGCCCGAAACGACTCGAAGTCCGAATTGGGGACAGCGATTGGC
AGCAGAGCTTAGCCAAAACGGCA
>Rosalind_1703
ATTTTTATGGCTGCTGCATAATATTCACACCTTATGCCAATTCGCCCTTAATATAATGGA
AGCATAATTCGGAGATGAATCAAGATGGCGCCGGTAGTTCCGCAATGAGCACTTGGCTCA
ATCGTCCAGGCAACGTGGTCTATCCCAGGGAATAAAGGGCAAGAAGTTGTGTTAGTGTGA
CGACTCAATAACATATGCCTGAGAATTTGGGCCCGACGCCGTGACTGACCAGGAAAAGTC
GCGTGTGCATTCTGATAATAGTTAGAAAAAGCCTTAAAGATTGGTAGCGCACACTTCCTT
CATGGGGCACGGGCGCGATTGCTGGGTAACGCCCAAGGTCGTCCAGGCGTCAGCACCTGA
TCGACGATAGGATCATTCAGCCTTTTCTCAGTCTGGGTCATAAGAAAGCCCAACCTCCTC
ATGCGTGTAAAACTCTCGAAAGTCATCAGTGTACTGTGGCACGTGTAGAGTTGACAATAA
TGCACCAGTCGGTGGATGCAGTGTATGGTGTTTTTTACAACTTTATACTCCAATCATCCC
CGGGATATCGAGTTATTGGGAGCGGGCTGCTCGAGCACGCGTCCCGTATCGCAGTCCCAC
TTGCCGGCTTTGGCAAGTTGTCCTAAACGTGAACTCCACGAGCAGAGCCAGTACCTTTAT
CGCACCAGTACTCCTGTCAAGGCTCGTGAGACCCAGTAGCATTCCCAAGGTAGGCGGGGA
TAGTGGGCGCGAAGTAGTCCGATGGGATTCGTCCAAGGCATCTTTACCTACATGTAGTCC
CGACAAGACTATTAAGCACATCCAAATGTTCCTTCAGCGGATTAACCGGGTTCGCTAACC
AATGGGGTCCAGTTTACCGTTATGAGCTGTGCTTCGGATAAACGGTAATCGGATCTCCAA
AGCTCTCCAGTCTATCCGTGACGGAATTAGACATACTCTCGGTTACTCACGTATCTTCAG
CTCAAGAATGCACACGTTTCGGC
>Rosalind_6778
CATGTGGTCGCTAAAGACTTGCGCTCACATATGTATGGGAGGCCGTGTTGAACTACTCTT
TACTTATTTGGGGTGTCTCCGGTATGCTATTGTTTTGGCGACAGGTTAGTCCCTCCCCTT
CATCGGACATAAAGTCCACGACTGCCCCTTCGTATGGAATTGGCTGTTGGTAGACGCTAC
CGCGTTAGGCCCGTTTTCATGAGTGTGGGACTACCTACTGCATTTTAACATGCCCTAACG
AGCAAAGGGGAGAGGCGAGTGAGGTCTGAAAATAGGTCTTCGAGACGTGCACCCGCCACC
CACAGGGTATTCCGGCTGAGAACATAGTGGTGTTTTGTGATTCCTCCCTGGCTCTAATAC
TTTAGTAGGGTCTCGTCCGAACAACGGGCCTTATAATCAGTCTGATAATTCAGCCCTCTG
GATGCGCGACATAAAGACGGCCAAAGGATTTCTAAGATGCTGTAATACTGGTTAGTTAGC
AGAAGGACGCTTTGTCACGCTATACAGGCCTGATTGTTGAGTAAATAGTCAGTCACTCCG
GCAGGTTGGGAGAGCCTTTAGTATCGATAGTCCGTCAGTTAACCGTTCACCCTTACAGTT
GTCCGAGTGGGTCTGTCTCTTATTAGTTACAAGGCGGGCGCTGAACATGCAGCTGCAACA
TCTATTTCTAGAAATTCAATCTCGAACGTTGTGGGAGGCCCTGACGGATATGACGTGTCA
GAAGGCCCGTCTGTAACAAGACAGGAGGATAAAAAACCGATGAACGTGCCCGGGAAACAT
CTGTTCCTTCTTGTGGGTACCATGCCTCAATATAGAGCCTTACTTATTTGTATCCATTAT
CAGGAGACCTTGCTTCCAGTACTCGGCTCCATGCCTTATTGTATCGCGCCCACCACCTTT
GGTTCAAACGCACGTCACTTATGAGTGCAACCACACAAGCGGCCTGGGAGTAAAGATGTT
TTTGCGCTACGTACCAACGTGCG
>Rosalind_5056
CCGACCCGTCTGGATGCAATCTGCTATGCAACCGCAGTCCTTATGGGCCTCTGTCCGAGC
GTTTCCAGGTGCAACGTAACGATGTGCCCTGAACGGCGGCTTTTAATGGTCTCAAGCCTG
GCATCTACACTCAAGCGAGCGATCAGTACCGAATACGGGGTCTTCCCTCGTGTAGGCCCG
TCAAAAGTCTATATAAGCAAAGCGATCACATGAAGAAGTAAGAGGAGGTGATCAGGCGAC
GATCCAATATCTAAGAGAAGCACGCCTCAGAGTCGCATGGGGATGAAGGTGCTTGATCGC
GAGAAGAAGGCGGCGGTATGGGTGGGATCGGGATAACTCGGGCCGATCCTAGATCAGCAT
GGCGTCCTCAATATAAAAGTCGCTGACTATCACGGAGTTACGTCCTGCGTAGGCACTAGA
TCGACACATCGACGCACGCCTCGAAACAGCGACTTGGAGGCGGAGATATAGTCGGTAAAT
GAAATTCGCCTTTCGATACCGCGTACGAGGTAGGCTTCGATATGAATATTGTACTCTAGG
CCCACACAAGGATTAAGCCGATATTGTCTAGTCATTATCTTATGGTCTATTTGCCAAGAT
CGGTCTTATCAAACTGCTATATGAGGCAGATAATCACTATGTATCGGGAATCAATTTCTG
CGACGGCGGCACGTGTGGAGTGGCACGCTCGTTGAACCGGGCAGGTCTGGTTGGTGGTAT
GGTGCGTTTGGTGTAAGGTCGATAGCACGCGACTCTCGCGCTAGAGGTCCGCCCAAACCG
GAGAGTCAACGGCTCCCCTGTAAAAATCAGCATTCGTGGACCCTCTGGACAAGGTTGATA
AGGCGCTTGGGGCACACAGTTAGTTCGGTGCATCCGCCTATGCGTTATACAGTGGCGGTA
GAGGCGACGGATATACCATTGGATAAGTGAATAATTCATGTAGGGGAAGAGTGCCAGCTG
ACGCCATCACAGCATGAATAGAA
>Rosalind_0448
TGGGTATGTTTGCCGATACGCGTAAGGACGACTGGGATTGGCAGATATAGCTAAGTCCCC
ACTACGGAAAGCCAACTTCGTTTTGCTCATGTATAAAAGACTCGGAGACTACAATCTACA
GAAGACCGGACCGAATGTAGTACCGACTGCGAAGGGACTGATGCAACTTGATCCATGAAG
AGTGACAAATTCATTGGACAAGGTAGACGTTTACCTTTACTCATAGCCTGCTTTAAACGT
ATTCTGGCGGCTCATACGTACCGAATACATCATAACTGCTCTGCCATCGCCCCAGTGACA
TAGACAAAAGGCGATCCATCGTGTAGCGTGTTATAGGTTAGCCAAGCGCTTGCGATTAGC
TTTACGTATCGGTAATGTAAAAGTATTTCGTTAAGGCTAGTGAGTTCCATGCCATACTGG
ACACACACCGGCACTTACTGAATGATGCATGGGCACACAAGATTCGTGCGGTTTCGGCTC
TGCTCGCGGTAATAACAGGGGTTACATGACGCCCGACCGGAGAGCTCCGGGTGCCGGATT
GTTGGGTGTAGTATTGACGAAGTCTAGATCCTGGTCTGACGCTAGCTTTTTCATCTAATT
GAATCATCACCAGCATCAATCGACCTGATCGTCCGTTGTCGTGTAACGGTCACTACCATT
GTAAGAAAACACAAGTCTGAAGCTTAACGGAACTGGGGTTGTTGAGCAACAGGGATTTGC
GGCGTAGGAGACCGCGGTGTTCGAGCAAAACATTTACGATCGTAGTAGTAGCCGTCCATA
AGATCTTTACGGACAAACCTCTCTGTCTGCGGAATGTAAAAGTAAAAGCAAAGGGAATAA
GTAATAAGATTTACGTCTTGACTAAATCTGCATTTTTAAAGTAACGAGCGTAAGGCGCAG
GTGTCAAGCAGGTAGCGCTGGCCCAGAAATCCCACGGCTACCACTTTACAGCCGTAGCGC
AAGAAGAAGGCAGCTAGACGATG
>Rosalind_0713
CGAAGCCTATTTAAGTAGAGAGAGCGCCGTATCAAACGTACCGGAAGTCGCACTCGTATG
TTGACCATATGGATTCTAGCGCTACCTACGCCGGCCTACTGAATGGCAGTTGCCCACTTG
GCCATAGGGTGGACCACACGGGGGCGATTCCGGGCGCCATGGTAGCGTCCAGCCTGCTAC
ACTCGCCTGAATTTTTAGTAGCACATTTTCAACCTCGCACAGAGGTCACCGCGAGCTGCG
GCACAGTATCAGGATTCCCAGATTGTACAGGTTCTGTCCGACGGCATCGTGCGGAAATGA
TGGTAGGGTCTAACCATACACGGGCGCCTACGGGCCAGACTTTTACTCTGCCACAAAGAG
GCATCCTCAGATCGTGAACAAGCGTTGCCGCAGCAAGCTATCCTCCTAAGACGCCTCCCG
TAGTCACCATGAGAAACGGAAGGTCCCTAGATTCATAAAATAACTAATAACGAGCCGAAT
GTGTAGGCACCGACCAGCGAGTTTATCTTACTGCGAGGGTACCAGTTACAATATAAGCTG
AGCCAGCGTTAGACGTGACCCGGAGGGCATACGTACAATTGTCTCGCGACCGCTAAGCGC
TACAAGTCGAATGGAATTTGTGTTACACAGTCTGTCCTAATACCTGAAGTTCACTCTCGA
GCTAACAATAGGGTGCTGACCTGGGATTAATTCGGACAAAATCGTGACCCACAGGAGCCG
TCTGGGTAGCCCAGTAACGGATTACTTAATAGGATCGTTGCCCAATTATATTAAGATAAA
TTCCGGTCTGGGATGTAAATCCGTGGCATTGTTTACTAGGTGCTGTCTGTGCTGACTAGT
GAGGTATCAGGGGGACACCCTGAATATGGCACGTGTCCGCGGGGTCATTAAAGCCGACCG
CCTCCCAGGGGCGGTCATTTAAAAATGGTCACCGGTCGGCGCGAAACGACACCTTGATGT
CCCATCGCTCTTCCACGTCTGCG
"""

seqs = fa2seq(fa)

mat = consmat(seqs)

string = cons(seqs)

print(string)
print("\n".join([(f"{base}: {' '.join(map(str, counts))}") for base, counts in mat.items()]))

CCTAGGTTAGTAGATTCAATGGGGAAGCCTATGGTTCTGGTACGTTATTTATGAACCCGTCCCTTCATATGAAACCTTACGCTTACTTCGGTGTGAGAGCCTCGGTTCGTTCCTCACTCGGCTGCCACGTAAGAGCGCGGTCTGCCAACCCATCAGGGAGTGAAAGCTTCGTGAGGCTAAACTATTAAAGATGTTTGCCAGTGCATATGTGTAAGATCTCTTACTTAGCGACGAGCACCGGCCCAAGTGGAGAATTACTTAAGGGTACAGTATAGGATTTATGGAACCGTGCTTATCACACAGAAGGAAGTCGCGCTATCCCTCTGATGGGTCTTAGATCATCCATCCGTCCGGGAGTACGCTAACCTAAGTCTAGAAGAAGCGAAGTCTCTCTGGGCCATCCGATTGCTCCACAACCTGCCGAAACCACGAATAACGATTGTTATGAATTTTGTGGAAGTGTTGAATAGCTGAGAGAATGAAACGCGGCTGTCGCTGGCGGTTTCGGTGTAACAATCGGATATATCATCATACCCGCCGTCCGCTTGCGTGAGTATTGCAGGCCATTATGGCAAGAATCGATTGGTAATCCTGACGCACGGCTAAGCTCAAGCAGCTTTTATTCTGCACGAATCCCGAGGTGTTGGGACCTACGCCTTACGAAGCTGAAGGAAGTTTACCGGGGCTGAGTACGACGAGAATGGCCAACCTTGGGTGCGATGTGGGACGGAAATCAATTCGCTAGCAAACGACAATAGCTTGCAAGTGTAGCCGTCATCGCGATTTATCGGGATACACCGCCCTTATCGTAGTAAGTAAACACTATAGTTTATGCTACGACAGGTGCTCGGGCTGCCTCTAATCAAGTGCACGGCTCATAGGCGCGAATCCGCGGCGCAGGCGTGAAACGCTAATCAGTGGGATAGGGACACACGCGATGGCTTCGAGACTGAGAATGTTACCACGAATGCTCCCGAAGTGCG
A: 2 1 2 3 1 3 2

In [65]:
seqs

['TCAGGGCGTTCACTTATAAGGAGTTAAACCCTAAACGTGGGATTGTTGGCCGCGCGCGGT\nGCTTTGCCTATTCTTAGTCTGGCAGCGTCCGTTAGCCCTAAACATGTCTGGTGATTGAGA\nCGAACCTCCCCCTACAGTCCCGAAATACGCAGATATAGGTTTGCATATTACTCTTCCAAA\nTGCGGCTCTTACAGGTCAGCCTGCTCCTCTATTAATGCTTTAGCACGTTCTGGTGTTAGT\nGTGTTGTCTGATGTTACTGTCGCGGTGCGCAATAAGGATCACGACAACCCGGGCCATTAG\nCGTATCCTCTCCGTGGCGCTCCTAACCGCCATTAGGACCGTACAACAAAATAATCGAGCT\nGACTCTGCATGTGAAATCCGAGATGATGGACCTAGCGTCCTGCGCGCTGCCTCCGAATAG\nAGTGTTATTGTCAGGCGTAGGTTAGGGGTTCCCGACCACAGCCCTCATCTAGTCCGACGA\nGAAGACCACACCGAGCACAGGACATTTTGCTTCAAAATAGTCTGCATTGCCGCGTGAACG\nAACTTCGTGTGCCCCCAATGATGGACGATGGTAATAGCTTCTGTGCCTTTCATAGTCCTG\nGCAATTGAAGCTTCTCCTGGCTTGCAGCAGAAGGCGCTACATTGCTGTAGGATCTCAATC\nTTCCATGCACGTTAACCATACAAAGACGGTGAACTTTCATGTTGTGAGTCCACCGAAAGT\nGTGGTCAACACAGGCTCCAGGATACCCAGACGACCTCGATAAAGACCGACGACTTCTGGA\nCCGTTTTCAATATTGATTCGCGATACTCCACTTCGAGCCGGCAGACTGGCTTACGCACGC\nAACCTAGTACATTTCGAATTTGCGCCCGCTTGTCCTGAACGGTCGCTGCTGGCTCACTGC\nAGTACATCCCTTCCCAGTCTCTACACGTAGACTGTAAAGGCCCGCGCAGCCACACAAGGG\nGGCGAT