In [1]:
# import the sequence io
from Bio import SeqIO as seq

In [3]:
# Read every entry of the FASTA file into memory as a list of SeqRecord objects

records = [record for record in seq.parse("hemoglobin_alpha2_dna.fa", "fasta")]

In [4]:
# Peek at the first 5 records (slicing keeps the output short instead of
# dumping the whole list)

records[:5]

[SeqRecord(seq=Seq('ATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTC...TAA', SingleLetterAlphabet()), id='ENSG00000188536', name='ENSG00000188536', description='ENSG00000188536 Human (Homo sapiens)', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGTGCTGTCTCCCGCCGACAAGACCAACATCAAGTCCACTTGGGATAAGATT...TAA', SingleLetterAlphabet()), id='ENSCAFP00000035886', name='ENSCAFP00000035886', description='ENSCAFP00000035886 Dog (Canis lupus familiaris)', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGTGCTGTCAGCCAACGACAAGAGCAACGTCAAGGCCGCTTTCGGCAAAATC...TAA', SingleLetterAlphabet()), id='ENSFALP00000004267', name='ENSFALP00000004267', description='ENSFALP00000004267 Flycatcher (Ficedula albicollis)', dbxrefs=[]),
 SeqRecord(seq=Seq('GATGGGTTCCGTGGGCGCGTACCTCCCCCTAACCCCCCTTCCCCTGGCACGGAT...TGA', SingleLetterAlphabet()), id='ENSOANP00000032350', name='ENSOANP00000032350', description='ENSOANP00000032350 Platypus (Ornithorhynchus anatinus)', dbxrefs=[]),
 SeqRecord(seq=Seq('CTGGTGCTGTCTCCCAACAAAACCAACGTCAAGGCCGCCTGG

In [5]:
# Total number of sequence records parsed from the file

len(records)

75

In [8]:
# Alternative to loading everything into a list: seq.parse() returns a
# generator that yields one record at a time. A generator can only be
# consumed once; re-run this cell to start again from the first record.

record_iterator = seq.parse("hemoglobin_alpha2_dna.fa", "fasta")

record_iterator

<generator object parse at 0x7f2b11e80ba0>

In [9]:
# Grab the first record from the generator.
# NOTE: next() advances the generator, so re-running this cell without
# re-creating record_iterator returns the *following* record instead.

first_record = next(record_iterator)

In [10]:
# print() shows the human-readable summary of the record:
# ID, name, description, number of features and the (truncated) sequence

print(first_record)

ID: ENSG00000188536
Name: ENSG00000188536
Description: ENSG00000188536 Human (Homo sapiens)
Number of features: 0
Seq('ATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTC...TAA', SingleLetterAlphabet())


In [11]:
# Translate the DNA sequence into its amino-acid (protein) sequence.
# Caveat: translate() returns a new record whose id, name and description
# are reset to '<unknown ...>' placeholders, as the output shows.

print(first_record.translate())

ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSA...YR*', HasStopCodon(ExtendedIUPACProtein(), '*'))


In [13]:
# Sanity check: translating the first element of the `records` list should
# give the same protein sequence as the record pulled from the generator
records[0].translate()

SeqRecord(seq=Seq('MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSA...YR*', HasStopCodon(ExtendedIUPACProtein(), '*')), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [14]:
# Here is where we put it all together: translate every DNA record into its
# protein sequence while carrying over the identifying fields.
#
# By default SeqRecord.translate() resets id, name and description to
# '<unknown ...>' placeholders. Passing id=True, name=True and
# description=True copies them from the source record, so there is no need
# to append first and then patch each entry through a hand-maintained index.

translated = [
    dna_record.translate(id=True, name=True, description=True)
    for dna_record in seq.parse("hemoglobin_alpha2_dna.fa", "fasta")
]

In [16]:
# Original DNA records again, shown for comparison with the translated list
records[:5]

[SeqRecord(seq=Seq('ATGGTGCTGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTC...TAA', SingleLetterAlphabet()), id='ENSG00000188536', name='ENSG00000188536', description='ENSG00000188536 Human (Homo sapiens)', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGTGCTGTCTCCCGCCGACAAGACCAACATCAAGTCCACTTGGGATAAGATT...TAA', SingleLetterAlphabet()), id='ENSCAFP00000035886', name='ENSCAFP00000035886', description='ENSCAFP00000035886 Dog (Canis lupus familiaris)', dbxrefs=[]),
 SeqRecord(seq=Seq('ATGGTGCTGTCAGCCAACGACAAGAGCAACGTCAAGGCCGCTTTCGGCAAAATC...TAA', SingleLetterAlphabet()), id='ENSFALP00000004267', name='ENSFALP00000004267', description='ENSFALP00000004267 Flycatcher (Ficedula albicollis)', dbxrefs=[]),
 SeqRecord(seq=Seq('GATGGGTTCCGTGGGCGCGTACCTCCCCCTAACCCCCCTTCCCCTGGCACGGAT...TGA', SingleLetterAlphabet()), id='ENSOANP00000032350', name='ENSOANP00000032350', description='ENSOANP00000032350 Platypus (Ornithorhynchus anatinus)', dbxrefs=[]),
 SeqRecord(seq=Seq('CTGGTGCTGTCTCCCAACAAAACCAACGTCAAGGCCGCCTGG

In [17]:
# Notice that each sequence is now an amino-acid (protein) sequence, while
# id, name and description match the original DNA records
translated[:5]

[SeqRecord(seq=Seq('MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSA...YR*', HasStopCodon(ExtendedIUPACProtein(), '*')), id='ENSG00000188536', name='ENSG00000188536', description='ENSG00000188536 Human (Homo sapiens)', dbxrefs=[]),
 SeqRecord(seq=Seq('MVLSPADKTNIKSTWDKIGGHAGDYGGEALDRTFQSFPTTKTYFPHFDLSPGSA...YR*', HasStopCodon(ExtendedIUPACProtein(), '*')), id='ENSCAFP00000035886', name='ENSCAFP00000035886', description='ENSCAFP00000035886 Dog (Canis lupus familiaris)', dbxrefs=[]),
 SeqRecord(seq=Seq('MVLSANDKSNVKAAFGKIGGQADEYGAETLERMFATYPQTKTYFPHFDLSKGSA...YR*', HasStopCodon(ExtendedIUPACProtein(), '*')), id='ENSFALP00000004267', name='ENSFALP00000004267', description='ENSFALP00000004267 Flycatcher (Ficedula albicollis)', dbxrefs=[]),
 SeqRecord(seq=Seq('DGFRGRVPPPNPPSPGTDSVDSPGTDPVDSPPGTVPTPLTFPALSRRLFLSFPP...SS*', HasStopCodon(ExtendedIUPACProtein(), '*')), id='ENSOANP00000032350', name='ENSOANP00000032350', description='ENSOANP00000032350 Platypus (Ornithorhynchus anatinus)',

In [20]:
# Save the translated records to a new FASTA file; the value returned by
# seq.write() is the number of records written (it should match len(records))
seq.write(translated, "translated_records.fa", "fasta")

75