# Downloading Files from GenBank

In [26]:
from Bio import Entrez
from Bio import SeqIO

# Contact address attached to the Entrez requests made below; NCBI asks
# callers to identify themselves (see the Biopython Bio.Entrez documentation).
Entrez.email = "fabien.allemand@etu.unistra.fr"

In [27]:
# Entrez database names: `search_db` is passed to esearch and `fetch_db`
# to efetch in the cells below.
search_db, fetch_db = "nucleotide", "nuccore"

In [28]:
# Organism whose records are searched for.
orga = "Homo sapiens"

# Feature types that are kept when scanning a record's features.
# ("centromere" was previously misspelled "centrometre", so centromere
# features could never match.)
region = ["CDS", "centromere", "intron", "mobile_element", "ncRNA", "rRNA", "telomere", "tRNA", "3'UTR", "5'UTR"]
# Codons accepted as the first / last three bases of a kept feature.
valid_codon_start = ["ATG", "CTG", "TTG", "GTG", "ATA", "ATC", "ATT", "TTA"]
valid_codon_stop = ["TAA", "TAG", "TGA"]

In [29]:

# Search for the NC_* accessions of the selected organism and keep the
# returned identifier list in `ids`.
term = "({}[Organism] AND NC_*[Accession])".format(orga)
handle = Entrez.esearch(db=search_db, term=term, retmax="99999999", usehistory='y')
try:
    record = Entrez.read(handle)
finally:
    # Close the handle even when the response cannot be parsed.
    handle.close()
ids = record["IdList"]

print(len(ids))
print(ids)

24
['568815597', '568815596', '568815595', '568815594', '568815593', '568815592', '568815591', '568815590', '568815589', '568815588', '568815587', '568815586', '568815585', '568815584', '568815583', '568815582', '568815581', '568815580', '568815579', '568815578', '568815577', '568815576', '568815575', '568815574']


In [60]:
ids = ["NC_018416"] # For testing purpose, very small organism

# Bases accepted in a feature sequence (upper case only).
valid_bases = set("ATGC")

# For each id: download the GenBank record, print its sequence, then print
# every feature whose type is listed in `region` and that passes the checks.
# NOTE(review): the length-multiple-of-3 and start/stop codon checks are
# applied to every type in `region`, not only to CDS -- confirm this is wanted.
for seq_id in ids[:3]:
    print("#### id =", seq_id, "####")
    handle = Entrez.efetch(db=fetch_db, id=seq_id, rettype="gbwithparts", retmode="text")
    try:
        record = SeqIO.read(handle, "genbank")
    except Exception as err:
        # Skip this id: carrying on would reuse the `record` left over from a
        # previous cell or iteration.
        print("ERROR: Unable to read from id =", seq_id, "(%r)" % (err,))
        continue
    finally:
        handle.close()

    DNA_seq = None
    DNA_seq_len = -1
    try:
        DNA_seq = record.seq
        DNA_seq_len = len(DNA_seq)
        print(DNA_seq)
    except Exception as err:
        # Without a sequence none of the feature checks below can run.
        print("ERROR: Unable to read sequence(s) from id =", seq_id, "(%r)" % (err,))
        continue

    try:
        for feature in record.features:
            if feature.type not in region:
                continue

            # Read strand/operator from the location object; the SeqFeature
            # shorthands (.strand, .location_operator) are deprecated in
            # recent Biopython releases.
            location = feature.location
            strand = location.strand
            print("TYPE =", feature.type)
            print("DEBUG =", location.start, location.end)
            print("STRAND =", strand)
            sequence_start = int(location.start)
            sequence_end = int(location.end)
            print("DEBUG =", sequence_start, sequence_end)

            # `end` is exclusive, so end == DNA_seq_len is a valid feature
            # that reaches the last base.
            if sequence_start < 0 or sequence_end > DNA_seq_len or sequence_end <= sequence_start:
                print("ERROR: Invalid sequence start/stop (%d,%d)" % (sequence_start, sequence_end))
                continue

            # NOTE(review): this takes the whole start..end span. For a
            # compound (join) location that includes the gaps between the
            # parts; feature.extract(DNA_seq) would return only the parts --
            # confirm which one is wanted.
            feature_DNA_seq = DNA_seq[sequence_start:sequence_end]
            if len(feature_DNA_seq) % 3 != 0:
                print("ERROR: Invalid sequence length (%d)" % len(feature_DNA_seq))
                continue
            if strand == -1:
                feature_DNA_seq = feature_DNA_seq.reverse_complement()
            print("SEQUENCE =", str(feature_DNA_seq))

            # Compare plain strings so the membership tests do not depend on
            # how Seq objects compare with str.
            codon_start = str(feature_DNA_seq[:3])
            codon_stop = str(feature_DNA_seq[-3:])
            if codon_start not in valid_codon_start:
                print("ERROR: Invalid codon start (%s)" % codon_start)
                continue
            if codon_stop not in valid_codon_stop:
                print("ERROR: Invalid codon stop (%s)" % codon_stop)
                continue
            if set(str(feature_DNA_seq)) - valid_bases:
                print("ERROR: Invalid base found in sequence")
                continue

            print("QUALIFIER =", feature.qualifiers)
            # Locations without an operator attribute report None, as the
            # old shorthand did.
            print("LOCATION_OPERATOR =", getattr(location, "operator", None))
    except Exception as err:
        print("ERROR: Unable to read feature(s) from id =", seq_id, "(%r)" % (err,))


#### id = NC_018416 ####
ATGAATGATATTATTTTTGCAAAAGTAACAGCAGATGGTATTTGTGCGGTAAATATAGTAAAATTATCAGGAAAAAATGTAAACAAATTAATATTCCCTTTAATAAAAAAAAAATTAAAAAAACAAAAAATGATATATACAAATTTATATGGTATAAAAGAGAAATATACAGAAAAAATATTAATAGTTTTTTTTAAATCACCAAATACTTTTACAGGTGAAGATTTAATAGAATTTCATTTAAATGGAAATTATTGTTTATTAAATAAATTAATTAAAGATTTAATTTTTTTAGGTGTTAGACCAGCTAAACCTGGAGAATTTTTAGAAAGAAGATATTCTAGTGGGAAAATTACATTATTTGAATGTGAAATTATTAATGATAAAATTTTATATAGTTATACAAATATGTTTAAATTAACAATGGAAGATCAAAAAAATTTTTATCTTTCTATAATACAAAATTTAAAATTTAAATTTAATATTATTATAATATGTTTAGAATGTTTTTGTATTTTTAATAAAAATTCTTTAAAAAAAGATTTTATTTTTATTAAAAATTTTTTTAAAAAAATAAAAAAATTAATTAATATATTACAAATTAAAATAGAAAAAATTGATTATTTAAAAAATAAATTTGAAATTATGATAATAGGTAGAAGAAATGTAGGAAAATCTACTTTATTTAATAAATTATGTTTACAATATGGTTCTATTGTTACAAATATACCAGGAACAACAAAAAATAATATAAATAAACAAATTTTTTTTTCTTCTAAAAAAATAAATATTAATGATACAGCAGGAATAAAAATTAAATCTAAAAATTTAATAGAAAAAATAGGTATATTAAAAAATATTAATAAAATTTTTAAAAGTAGTTTAATATTATATATAGTTGACGAATTTAACTTAAAAAAAAGTTTATTTAATGCTCCTTTAAATATATATGAAAAAATAATACAAAATAAAATT

## Notes:

- `.strand` – shorthand for `.location.strand`: the strand on the sequence that the feature is located on. For a double-stranded nucleotide sequence this may be either 1 for the top strand, −1 for the bottom strand, 0 if the strand is important but unknown, or None if it doesn’t matter. This is None for proteins or single-stranded sequences.

In [16]:
# Scratch check of loop control flow: the "end boucle" branch is never taken
# because 1 != 1 is always false, so only "boucle <i>" is printed ten times.
for i in range(10):
    if 1 != 1:
        print("end boucle")
    else:
        print("boucle", i)
        continue

boucle 0
boucle 1
boucle 2
boucle 3
boucle 4
boucle 5
boucle 6
boucle 7
boucle 8
boucle 9


In [41]:
# Scratch check: print the last three characters of a string.
test = "bonjour"
tail = test[-3:]
print(tail)

our
