In [43]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [44]:
gb_file = r"C:\Users\jojoa\Emory University\EC-Phy-Kim Lab - Documents\1 Lab Shared Documents\SeqCenter Results\EmmaDawson221020\Assemblies\AMK50-PM-WT\AMK50-PM-WT.gbk"

In [45]:
for gb_record in SeqIO.parse(open(gb_file,"r"), "genbank") :
    # now do something with the record
    print("Name %s, %i features" % (gb_record.name, len(gb_record.features)))
    print(repr(gb_record.seq))
#there are 3795 features that SeqCenter has annotated for us where the 3796th one if the source

Name 1, 3796 features
Seq('GTGTCACTTTCGCTTTGGCAGCATTGTCTTGCCCGATTGCAGGATGAGTTACCT...GCC')


Let's do an example with feature 26.

In [27]:
gb_feature = gb_record.features[26]

In [32]:
start=gb_feature.location.nofuzzy_start #first base pair in seq for feature
end=gb_feature.location.nofuzzy_end #last base pair in seq for feature

In [33]:
gb_record.seq[start:end].reverse_complement()

Seq('ATGATGTTGTTTAAAGACGATAGTAAAAAATCTTTATGGAATTTATCCATTATT...TAA')

Now that we know it works, let's loop through all features to get their respective sequences.

In [50]:
seq_list=[]
for i in range(0,len(gb_record.features)):
    gb_feature = gb_record.features[i]
    start=gb_feature.location.nofuzzy_start #first base pair in seq for feature
    end=gb_feature.location.nofuzzy_end #last base pair in seq for feature
    seq=gb_record.seq[start:end].reverse_complement()
    seq_list.append(seq) #full list of all sequences for every feature

In [51]:
len(seq_list)

3796

In [52]:
seq_list

[Seq('GGCGGACTCCCCTCAAAAACAAGACTAAAAAATAAACAGGAAAAATGTGATTAA...CAC'),
 Seq('CTAGGATGATAATGTTCTGATTAAGTTAGAAAAATCTTCTTTGATATCGTGACT...CAC'),
 Seq('TTTAGCATTGGCTTTACGTTCCATAATACTGTTGCCCACCGCATGCAATAGATG...ATA'),
 Seq('TTACAAGCGCATCGGCATAACAACATAAGCTGCGGCTGCACTTGCCACATTCTC...CAT'),
 Seq('CTATGGTTTAACCTCTATTTTGCCATGTTCTACGCTAAATAGCCTGCTATTTAC...CAT'),
 Seq('TTAGATATCGATATTTTCTGCTTTTAGTGCATTCTCTTCGATAAAGGCACGACG...CAT'),
 Seq('ATGAAAAATGTTGGTTTTGTGGGCTGGCGTGGTATGGTCGGCTCCGTTTTAATG...TAA'),
 Seq('TTACACTATTTTTGGGGTGCAATAATAAACCCAAGCAGGCGGTACATTTAACAT...CAT'),
 Seq('TCATTTTGCTAGAGCTTGTCTAATAAACCCTTTATTTTGCGCCAACAGCTGTGT...CAT'),
 Seq('ATGGAAATCTACGTTTATGCTGATTGGACTGAATCAACTGACCCTGTATTAGTT...TAA'),
 Seq('ATGAATACCATCGAATCAACAACAAAGCGCACTGCTGTCGTTTTTCCAAAACAT...TAG'),
 Seq('ATGATATGGGTTAGCGCACAGGAAGTTATCGCTTTCCATGATCGTATATTACAA...TAA'),
 Seq('ATGCGTACATATACCTCGACACAAGCCCGAGCCAATATTTCAGAAGTATTAGAT...TGA'),
 Seq('ATGATCCGTAGTATGACCGCTTTCGCTCGTCGAGATATCAAAAAAGAGTGGGGT...TAA'),
 Seq('TTATTGCTTTAGCG

Let's define a function that separates out the CDS features from the rest.

In [54]:
def index_genbank_features(gb_record, feature_type, qualifier) :
    answer = dict()
    for (index, feature) in enumerate(gb_record.features) :
        if feature.type==feature_type :
            if qualifier in feature.qualifiers :
                #There should only be one locus_tag per feature, but there
                #are usually several db_xref entries
                for value in feature.qualifiers[qualifier] :
                    if value in answer :
                        print("WARNING - Duplicate key %s for %s features %i and %i" \
                           % (value, feature_type, answer[value], index))
                    else :
                        answer[value] = index
    return answer

In [56]:
locus_tag_cds_index=index_genbank_features(gb_record,"CDS","locus_tag")
locus_tag_cds_index

{'KBMPMAKL_00001': 1,
 'KBMPMAKL_00003': 3,
 'KBMPMAKL_00004': 4,
 'KBMPMAKL_00005': 5,
 'KBMPMAKL_00006': 6,
 'KBMPMAKL_00007': 7,
 'KBMPMAKL_00008': 8,
 'KBMPMAKL_00009': 9,
 'KBMPMAKL_00010': 10,
 'KBMPMAKL_00011': 11,
 'KBMPMAKL_00012': 12,
 'KBMPMAKL_00013': 13,
 'KBMPMAKL_00014': 14,
 'KBMPMAKL_00015': 15,
 'KBMPMAKL_00016': 16,
 'KBMPMAKL_00017': 17,
 'KBMPMAKL_00018': 18,
 'KBMPMAKL_00019': 19,
 'KBMPMAKL_00020': 20,
 'KBMPMAKL_00021': 21,
 'KBMPMAKL_00022': 22,
 'KBMPMAKL_00023': 23,
 'KBMPMAKL_00024': 24,
 'KBMPMAKL_00025': 25,
 'KBMPMAKL_00026': 26,
 'KBMPMAKL_00027': 27,
 'KBMPMAKL_00028': 28,
 'KBMPMAKL_00029': 29,
 'KBMPMAKL_00030': 30,
 'KBMPMAKL_00031': 31,
 'KBMPMAKL_00032': 32,
 'KBMPMAKL_00033': 33,
 'KBMPMAKL_00034': 34,
 'KBMPMAKL_00035': 35,
 'KBMPMAKL_00036': 36,
 'KBMPMAKL_00037': 37,
 'KBMPMAKL_00038': 38,
 'KBMPMAKL_00039': 39,
 'KBMPMAKL_00040': 40,
 'KBMPMAKL_00041': 41,
 'KBMPMAKL_00042': 42,
 'KBMPMAKL_00043': 43,
 'KBMPMAKL_00044': 44,
 'KBMPMAKL_00045': 

This shows that there are 3617 out of 3796 features that are CDS features.

In [57]:
len(locus_tag_cds_index)

3617