In [4]:
# import Biopython functions
from Bio import SeqIO

# Loading and viewing sequences
In this example both a transcript (Thy1 in mouse), and the coding sequence, are read.

In [5]:
# load the whole transcipt
transcript = next(SeqIO.parse("complete.fasta", "fasta"))

# load the coding sequence
coding = next(SeqIO.parse("coding.fasta", "fasta"))

In [6]:
print(transcript.description)

NM_001105.4 Homo sapiens activin A receptor type 1 (ACVR1), transcript variant 1, mRNA


In [7]:
print(coding.description)

lcl|NM_001105.4_cds_NP_001096.1_1 [gene=ACVR1] [db_xref=CCDS:CCDS2206.1] [protein=activin receptor type-1 precursor] [protein_id=NP_001096.1] [location=431..1960] [gbkey=CDS]


In [8]:
print(transcript.seq)

GAAGAGATGTGGGCCTCTGGGGCCGCTGGATTCAGTAACTTCCGTCGGGTTCTAGACTGGCTCGGCTCTGTCCAGTTTGTGCCAGATAGTCTCCCACCCCCTCCCCACCCCTCCTTTCCCCTGGAGATTTGAACGCTGCTTGCATGGGAGAAAAGCTACTTAGAGAAGAAAACGTTCCACTTAGTAACAGAAGAAAAGTCTTGGTTAAAAAGTTGTCATGAATTTGGCTTTTGGAGAGAGGCAGCAAGCCTGGAGCATTGGTAAGCGTCACACTGCCAAAGTGAGAGCTGCTGGAGAACTCATAATCCCAGGAACGCCTCTTCTACTCTCCGAGTACCCCAGTGACCAGAGTGAGAGAAGCTCTGAACGAGGGCACGCGGCTTGAAGGACTGTGGGCAGATGTGACCAAGAGCCTGCATTAAGTTGTACAATGGTAGATGGAGTGATGATTCTTCCTGTGCTTATCATGATTGCTCTCCCCTCCCCTAGTATGGAAGATGAGAAGCCCAAGGTCAACCCCAAACTCTACATGTGTGTGTGTGAAGGTCTCTCCTGCGGTAATGAGGACCACTGTGAAGGCCAGCAGTGCTTTTCCTCACTGAGCATCAACGATGGCTTCCACGTCTACCAGAAAGGCTGCTTCCAGGTTTATGAGCAGGGAAAGATGACCTGTAAGACCCCGCCGTCCCCTGGCCAAGCCGTGGAGTGCTGCCAAGGGGACTGGTGTAACAGGAACATCACGGCCCAGCTGCCCACTAAAGGAAAATCCTTCCCTGGAACACAGAATTTCCACTTGGAGGTTGGCCTCATTATTCTCTCTGTAGTGTTCGCAGTATGTCTTTTAGCCTGCCTGCTGGGAGTTGCTCTCCGAAAATTTAAAAGGCGCAACCAAGAACGCCTCAATCCCCGAGACGTGGAGTATGGCACTATCGAAGGGCTCATCACCACCAATGTTGGAGACAGCACTTTAGCAGATTTATTGGATCATTCGTGTACATCA

In [9]:
print(coding.seq)

ATGGTAGATGGAGTGATGATTCTTCCTGTGCTTATCATGATTGCTCTCCCCTCCCCTAGTATGGAAGATGAGAAGCCCAAGGTCAACCCCAAACTCTACATGTGTGTGTGTGAAGGTCTCTCCTGCGGTAATGAGGACCACTGTGAAGGCCAGCAGTGCTTTTCCTCACTGAGCATCAACGATGGCTTCCACGTCTACCAGAAAGGCTGCTTCCAGGTTTATGAGCAGGGAAAGATGACCTGTAAGACCCCGCCGTCCCCTGGCCAAGCCGTGGAGTGCTGCCAAGGGGACTGGTGTAACAGGAACATCACGGCCCAGCTGCCCACTAAAGGAAAATCCTTCCCTGGAACACAGAATTTCCACTTGGAGGTTGGCCTCATTATTCTCTCTGTAGTGTTCGCAGTATGTCTTTTAGCCTGCCTGCTGGGAGTTGCTCTCCGAAAATTTAAAAGGCGCAACCAAGAACGCCTCAATCCCCGAGACGTGGAGTATGGCACTATCGAAGGGCTCATCACCACCAATGTTGGAGACAGCACTTTAGCAGATTTATTGGATCATTCGTGTACATCAGGAAGTGGCTCTGGTCTTCCTTTTCTGGTACAAAGAACAGTGGCTCGCCAGATTACACTGTTGGAGTGTGTCGGGAAAGGCAGGTATGGTGAGGTGTGGAGGGGCAGCTGGCAAGGGGAGAATGTTGCCGTGAAGATCTTCTCCTCCCGTGATGAGAAGTCATGGTTCAGGGAAACGGAATTGTACAACACTGTGATGCTGAGGCATGAAAATATCTTAGGTTTCATTGCTTCAGACATGACATCAAGACACTCCAGTACCCAGCTGTGGTTAATTACACATTATCATGAAATGGGATCGTTGTACGACTATCTTCAGCTTACTACTCTGGATACAGTTAGCTGCCTTCGAATAGTGCTGTCCATAGCTAGTGGTCTTGCACATTTGCACATAGAGATATTTGGGACCCAAGGGAAACCAGCCATTGCCC

In [10]:
print("Lenth of gene:",len(transcript.seq))
print("Lenth of coding sequence:",len(coding.seq))

('Lenth of gene:', 3062)
('Lenth of coding sequence:', 1530)


In [11]:
# find the start of the coding sequence in the gene
# we know from the info above that the gene starts at location 977
# python slicing offers quick access, note that python starts counting at zero:
transcript.seq[976:]

Seq('TTATTGGATCATTCGTGTACATCAGGAAGTGGCTCTGGTCTTCCTTTTCTGGTA...AAA', SingleLetterAlphabet())

# Simple sequence operations 

In [12]:
transcript.seq.count('A')

850

## Two ways to compute GC content

G-C pairs have a stronger bonds due to an addition H-bond
<img src="GC.png">

In [13]:
print(100 * float(transcript.seq.count("G") + transcript.seq.count("C")) / len(transcript.seq))

from Bio.SeqUtils import GC
print(GC(transcript.seq))

44.3500979752
44.3500979752


## find first occurence of a substring

In [14]:
print(transcript.seq.find('ATG'))
print(coding.seq.find('ATG'))

6
0


## obtain the complemenatry sequence
compare with sequence above to see whether it makes sense to you

In [15]:
print(transcript.seq.complement())

CTTCTCTACACCCGGAGACCCCGGCGACCTAAGTCATTGAAGGCAGCCCAAGATCTGACCGAGCCGAGACAGGTCAAACACGGTCTATCAGAGGGTGGGGGAGGGGTGGGGAGGAAAGGGGACCTCTAAACTTGCGACGAACGTACCCTCTTTTCGATGAATCTCTTCTTTTGCAAGGTGAATCATTGTCTTCTTTTCAGAACCAATTTTTCAACAGTACTTAAACCGAAAACCTCTCTCCGTCGTTCGGACCTCGTAACCATTCGCAGTGTGACGGTTTCACTCTCGACGACCTCTTGAGTATTAGGGTCCTTGCGGAGAAGATGAGAGGCTCATGGGGTCACTGGTCTCACTCTCTTCGAGACTTGCTCCCGTGCGCCGAACTTCCTGACACCCGTCTACACTGGTTCTCGGACGTAATTCAACATGTTACCATCTACCTCACTACTAAGAAGGACACGAATAGTACTAACGAGAGGGGAGGGGATCATACCTTCTACTCTTCGGGTTCCAGTTGGGGTTTGAGATGTACACACACACACTTCCAGAGAGGACGCCATTACTCCTGGTGACACTTCCGGTCGTCACGAAAAGGAGTGACTCGTAGTTGCTACCGAAGGTGCAGATGGTCTTTCCGACGAAGGTCCAAATACTCGTCCCTTTCTACTGGACATTCTGGGGCGGCAGGGGACCGGTTCGGCACCTCACGACGGTTCCCCTGACCACATTGTCCTTGTAGTGCCGGGTCGACGGGTGATTTCCTTTTAGGAAGGGACCTTGTGTCTTAAAGGTGAACCTCCAACCGGAGTAATAAGAGAGACATCACAAGCGTCATACAGAAAATCGGACGGACGACCCTCAACGAGAGGCTTTTAAATTTTCCGCGTTGGTTCTTGCGGAGTTAGGGGCTCTGCACCTCATACCGTGATAGCTTCCCGAGTAGTGGTGGTTACAACCTCTGTCGTGAAATCGTCTAAATAACCTAGTAAGCACATGTAGT

# Transcription and translation

In [16]:
print(coding.seq.transcribe())

AUGGUAGAUGGAGUGAUGAUUCUUCCUGUGCUUAUCAUGAUUGCUCUCCCCUCCCCUAGUAUGGAAGAUGAGAAGCCCAAGGUCAACCCCAAACUCUACAUGUGUGUGUGUGAAGGUCUCUCCUGCGGUAAUGAGGACCACUGUGAAGGCCAGCAGUGCUUUUCCUCACUGAGCAUCAACGAUGGCUUCCACGUCUACCAGAAAGGCUGCUUCCAGGUUUAUGAGCAGGGAAAGAUGACCUGUAAGACCCCGCCGUCCCCUGGCCAAGCCGUGGAGUGCUGCCAAGGGGACUGGUGUAACAGGAACAUCACGGCCCAGCUGCCCACUAAAGGAAAAUCCUUCCCUGGAACACAGAAUUUCCACUUGGAGGUUGGCCUCAUUAUUCUCUCUGUAGUGUUCGCAGUAUGUCUUUUAGCCUGCCUGCUGGGAGUUGCUCUCCGAAAAUUUAAAAGGCGCAACCAAGAACGCCUCAAUCCCCGAGACGUGGAGUAUGGCACUAUCGAAGGGCUCAUCACCACCAAUGUUGGAGACAGCACUUUAGCAGAUUUAUUGGAUCAUUCGUGUACAUCAGGAAGUGGCUCUGGUCUUCCUUUUCUGGUACAAAGAACAGUGGCUCGCCAGAUUACACUGUUGGAGUGUGUCGGGAAAGGCAGGUAUGGUGAGGUGUGGAGGGGCAGCUGGCAAGGGGAGAAUGUUGCCGUGAAGAUCUUCUCCUCCCGUGAUGAGAAGUCAUGGUUCAGGGAAACGGAAUUGUACAACACUGUGAUGCUGAGGCAUGAAAAUAUCUUAGGUUUCAUUGCUUCAGACAUGACAUCAAGACACUCCAGUACCCAGCUGUGGUUAAUUACACAUUAUCAUGAAAUGGGAUCGUUGUACGACUAUCUUCAGCUUACUACUCUGGAUACAGUUAGCUGCCUUCGAAUAGUGCUGUCCAUAGCUAGUGGUCUUGCACAUUUGCACAUAGAGAUAUUUGGGACCCAAGGGAAACCAGCCAUUGCCC

In [17]:
print(coding.seq.translate())

MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNEDHCEGQQCFSSLSINDGFHVYQKGCFQVYEQGKMTCKTPPSPGQAVECCQGDWCNRNITAQLPTKGKSFPGTQNFHLEVGLIILSVVFAVCLLACLLGVALRKFKRRNQERLNPRDVEYGTIEGLITTNVGDSTLADLLDHSCTSGSGSGLPFLVQRTVARQITLLECVGKGRYGEVWRGSWQGENVAVKIFSSRDEKSWFRETELYNTVMLRHENILGFIASDMTSRHSSTQLWLITHYHEMGSLYDYLQLTTLDTVSCLRIVLSIASGLAHLHIEIFGTQGKPAIAHRDLKSKNILVKKNGQCCIADLGLAVMHSQSTNQLDVGNNPRVGTKRYMAPEVLDETIQVDCFDSYKRVDIWAFGLVLWEVARRMVSNGIVEDYKPPFYDVVPNDPSFEDMRKVVCVDQQRPNIPNRWFSDPTLTSLAKLMKECWYQNPSARLTALRIKKTLTKIDNSLDKLKTDC*


We could also tranlate the whole RNA:

In [18]:
print(transcript.seq.translate())

EEMWASGAAGFSNFRRVLDWLGSVQFVPDSLPPPPHPSFPLEI*TLLAWEKSYLEKKTFHLVTEEKSWLKSCHEFGFWREAASLEHW*ASHCQSESCWRTHNPRNASSTLRVPQ*PE*EKL*TRARGLKDCGQM*PRACIKLYNGRWSDDSSCAYHDCSPLP*YGR*EAQGQPQTLHVCV*RSLLR**GPL*RPAVLFLTEHQRWLPRLPERLLPGL*AGKDDL*DPAVPWPSRGVLPRGLV*QEHHGPAAH*RKILPWNTEFPLGGWPHYSLCSVRSMSFSLPAGSCSPKI*KAQPRTPQSPRRGVWHYRRAHHHQCWRQHFSRFIGSFVYIRKWLWSSFSGTKNSGSPDYTVGVCRERQVW*GVEGQLARGECCREDLLLP**EVMVQGNGIVQHCDAEA*KYLRFHCFRHDIKTLQYPAVVNYTLS*NGIVVRLSSAYYSGYS*LPSNSAVHS*WSCTFAHRDIWDPRETSHCPSRFKEQKYSG*EEWTVLHSRFGPGSHAFPEHQSA*CGEQSPCGHQALHGPRSSR*NHPGGLFRFL*KGRYLGLWTCFVGSGQADGEQWYSGGLQATVLRCGSQ*PKF*RYEEGSLCGSTKAKHTQQMVLRPDINLSGQANERMLVSKSIRKTHSTAYQKDFDQN**FPRQIEN*LLTFS*CQEGRFDVVVIVQLGPNAGLTGCQNGIHLSPSPNGCFDKADVVPSHVLGRHQNHPNLAR*L*TGHFTNCSHCRD*CWTDTVAKVGTGGTQRNPKRDLGIKSVALHSFHKSPRHSPRETQGGGEFLISNIACASLLYCTRNSLHSLLALLLLILKTQLAKMLAAYSTGLSLDNRNSIWQNKM*CQTLLHFTHVLMFTMMPNIRNCLYTTLQIIYYLCT**FLQNCFVHMLKLIFMWSYDFITEMFLTLYSKMDIFFYYQLKSHFKCFTFVCV*TVTFFQFICRTYLAITHVTPPNILLI*KQRFQ*NFSPERYGENAFSSELSITCI*TLPEKNNYFVLIYFLY



This sequence is different from the translated coding sequence, and does not even contain a subset of the sequence above. And, we get a warning. Why?

An alternative is to start at the beginning of the coding sequence (see above):

In [19]:
print(transcript.seq[976:].translate())

LLDHSCTSGSGSGLPFLVQRTVARQITLLECVGKGRYGEVWRGSWQGENVAVKIFSSRDEKSWFRETELYNTVMLRHENILGFIASDMTSRHSSTQLWLITHYHEMGSLYDYLQLTTLDTVSCLRIVLSIASGLAHLHIEIFGTQGKPAIAHRDLKSKNILVKKNGQCCIADLGLAVMHSQSTNQLDVGNNPRVGTKRYMAPEVLDETIQVDCFDSYKRVDIWAFGLVLWEVARRMVSNGIVEDYKPPFYDVVPNDPSFEDMRKVVCVDQQRPNIPNRWFSDPTLTSLAKLMKECWYQNPSARLTALRIKKTLTKIDNSLDKLKTDC*HFHSVKKEDLTLLSLSSWDLMLA*LVVRMESICLPPQMAALTRQTSYPAMCWGDIKTTLTSLDDCELGISRTVHTAETNVGQTLLQR*GLEEHREILKEIWALSQWLCIAFTSLLDTPHGKLKEVVNF*SAILPVLLFFIALGILCIPYLHCYS*F*RPNLPKCWLRTPLVCLWIIGIQFGKTKCNVRLCCILHMC*CLQ*CRTLGIVYTQLCKLFITCALSSFYKTALCIC*SLFLCGLMILLQKCF*HYTLKWTFSFIIS*NHILSASHLYVCRL*LFFSSYAERI*PLPT*HHRIYY*FRSKDFSRILVLNATGKMHFLQNYPLRAFKLCQKKITILF*STFCI**LFV*IK*TVFKSKKKKKK


This gives a sensible result for our example. However, tge sequence is longer than the one we had above for the coding sequance. Does this matter? Find an example where it does not work. Why? 

We could also search for a start codon:

In [20]:
codingStart = transcript.seq.find('ATG')
print(codingStart)
print(transcript.seq[codingStart:].translate())

6
MWASGAAGFSNFRRVLDWLGSVQFVPDSLPPPPHPSFPLEI*TLLAWEKSYLEKKTFHLVTEEKSWLKSCHEFGFWREAASLEHW*ASHCQSESCWRTHNPRNASSTLRVPQ*PE*EKL*TRARGLKDCGQM*PRACIKLYNGRWSDDSSCAYHDCSPLP*YGR*EAQGQPQTLHVCV*RSLLR**GPL*RPAVLFLTEHQRWLPRLPERLLPGL*AGKDDL*DPAVPWPSRGVLPRGLV*QEHHGPAAH*RKILPWNTEFPLGGWPHYSLCSVRSMSFSLPAGSCSPKI*KAQPRTPQSPRRGVWHYRRAHHHQCWRQHFSRFIGSFVYIRKWLWSSFSGTKNSGSPDYTVGVCRERQVW*GVEGQLARGECCREDLLLP**EVMVQGNGIVQHCDAEA*KYLRFHCFRHDIKTLQYPAVVNYTLS*NGIVVRLSSAYYSGYS*LPSNSAVHS*WSCTFAHRDIWDPRETSHCPSRFKEQKYSG*EEWTVLHSRFGPGSHAFPEHQSA*CGEQSPCGHQALHGPRSSR*NHPGGLFRFL*KGRYLGLWTCFVGSGQADGEQWYSGGLQATVLRCGSQ*PKF*RYEEGSLCGSTKAKHTQQMVLRPDINLSGQANERMLVSKSIRKTHSTAYQKDFDQN**FPRQIEN*LLTFS*CQEGRFDVVVIVQLGPNAGLTGCQNGIHLSPSPNGCFDKADVVPSHVLGRHQNHPNLAR*L*TGHFTNCSHCRD*CWTDTVAKVGTGGTQRNPKRDLGIKSVALHSFHKSPRHSPRETQGGGEFLISNIACASLLYCTRNSLHSLLALLLLILKTQLAKMLAAYSTGLSLDNRNSIWQNKM*CQTLLHFTHVLMFTMMPNIRNCLYTTLQIIYYLCT**FLQNCFVHMLKLIFMWSYDFITEMFLTLYSKMDIFFYYQLKSHFKCFTFVCV*TVTFFQFICRTYLAITHVTPPNILLI*KQRFQ*NFSPERYGENAFSSELSITCI*TLPEKNNYFVLIYFLY

Does this make sense?

# Open Reading Frames (ORFs)

How many start codons are there? Where are they?

In [21]:
print(transcript.seq.count('ATG'))

61


In [22]:
seq = transcript.seq
i = 0
while seq[i:].find('ATG')>-1:
    j = seq[i:].find('ATG')
    i = i + j + 3
    print(i - 3)


6
143
217
399
430
437
445
466
490
497
529
560
611
650
664
834
920
950
1085
1121
1151
1161
1195
1205
1237
1286
1291
1469
1507
1535
1573
1592
1681
1691
1727
1739
1759
1809
1849
1857
2020
2044
2071
2107
2147
2195
2448
2511
2516
2542
2550
2559
2562
2652
2670
2680
2697
2721
2777
2810
2914


This is a list of possible coding region starting points.

## Alternative strategy

Each true ORF should end with a stop codon. So we can translate the sequence using all three different reading farmes, and search for start-> stop codon sequences.

In [23]:
f1 = transcript.seq.translate().split('*')
f2 = transcript.seq[1:].translate().split('*')
f3 = transcript.seq[2:].translate().split('*')
print(f1)


[Seq('EEMWASGAAGFSNFRRVLDWLGSVQFVPDSLPPPPHPSFPLEI', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('TLLAWEKSYLEKKTFHLVTEEKSWLKSCHEFGFWREAASLEHW', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('ASHCQSESCWRTHNPRNASSTLRVPQ', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('PE', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('EKL', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('TRARGLKDCGQM', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('PRACIKLYNGRWSDDSSCAYHDCSPLP', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('YGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('EAQGQPQTLHVCV', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('RSLLR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('GPL', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('RPAVLFLTEHQRWLPRLPERLLPGL', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('AGKDDL', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('DPAVPWPSRGVLPRGLV', HasStopCodon(ExtendedIUPACProtein()

These are all candidate coding sequences ending with a stop codon. Now check if there is a start codon. Since we have now got amino acis sequences, we should search for the amino acid equivalent of 'ATG', which is 'M' (Methionine)

In [24]:
for i,f in enumerate((f1,f2,f3)):
    print("Reading frame: "+str(i+1))
    for p in f:
        M = p.find('M')
        if M>=0:
            print((len(p)-M+1)*3)

Reading frame: 1
126
6
45
48
117
69
75
135
Reading frame: 2
168
1530
12
180
9
27
72
Reading frame: 3
78
9
105
57
51
48
45
15
69
18
57
96
9
36
105
60
117


These are the lengths of all possible open reading frames. There are many different candidates. Often the longest is the correct one, here that is the case too.

Note: here we know the 5' -> 3' direction (can you see why?), but generally both directions should be searched.

## Putting everything together

In [25]:
# a simple ORF finder

seq = transcript.seq

# search for at least 100 amino acid length:
min_len = 100

for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
    for frame in range(3):
        pos = 0
        for prot in nuc[frame:].translate().split('*'):
            if len(prot) >= min_len:
                start = prot.find("M")
                print("%s...%s - fragment length: %i, strand: %i, frame: %i, pos: %i, start: %i, coding length %i, has start %s" % (prot[start:start+10], prot[-3:], (len(prot)-start+1)*3, strand, frame, pos*3, (max(0,start)+pos+frame)*3+1, (len(prot[start:])+1)*3, start>-1))
                pos = pos + len(prot)+1

MLAAYSTGLS...NKM - fragment length: 69, strand: 1, frame: 0, pos: 0, start: 253, coding length 69, has start True
MVDGVMILPV...TDC - fragment length: 1530, strand: 1, frame: 1, pos: 0, start: 49, coding length 1530, has start True
MDFDTSILSL...GNA - fragment length: 372, strand: -1, frame: 0, pos: 0, start: 10, coding length 372, has start True
MSPSIVPYST...PCS - fragment length: 294, strand: -1, frame: 2, pos: 0, start: 49, coding length 294, has start True


In [38]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
my_seq= "MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNEDHCEGQQCFSSLSINDGFHVYQKGCFQVYEQGKMTCKTPPSPGQAVECCQGDWCNRNITAQLPTKGKSFPGTQNFHLEVGLIILSVVFAVCLLACLLGVALRKFKRRNQERLNPRDVEYGTIEGLITTNVGDSTLADLLDHSCTSGSGSGLPFLVQRTVARQITLLECVGKGRYGEVWRGSWQGENVAVKIFSSRDEKSWFRETELYNTVMLRHENILGFIASDMTSRHSSTQLWLITHYHEMGSLYDYLQLTTLDTVSCLRIVLSIASGLAHLHIEIFGTQGKPAIAHRDLKSKNILVKKNGQCCIADLGLAVMHSQSTNQLDVGNNPRVGTKRYMAPEVLDETIQVDCFDSYKRVDIWAFGLVLWEVARRMVSNGIVEDYKPPFYDVVPNDPSFEDMRKVVCVDQQRPNIPNRWFSDPTLTSLAKLMKECWYQNPSARLTALRIKKTLTKIDNSLDKLKTDC"

analysed_seq = ProteinAnalysis(my_seq)


analysed_seq.count_amino_acids()

{'A': 22,
 'C': 21,
 'D': 29,
 'E': 26,
 'F': 17,
 'G': 35,
 'H': 12,
 'I': 27,
 'K': 29,
 'L': 53,
 'M': 14,
 'N': 23,
 'P': 23,
 'Q': 23,
 'R': 26,
 'S': 36,
 'T': 30,
 'V': 40,
 'W': 9,
 'Y': 14}