In [1]:
from Bio import SeqIO
import re
import csv

Get records from .gb file

In [2]:
file_path = 'FMDV_records.gb'
genbank_records = SeqIO.parse(file_path, 'gb')

Create empty lists for storing necessary information

In [3]:
hosts = []
collection_dates = []
countries = []
cities = []
isolation_sources = []
isolates = []
strains = []
serotypes = []
record_ids = []
number_of_records = 0
CDS_count = 0

Functions

In [4]:
# The function checks if the qualifier is a list and returns the appropriate value.
# If the qualifier is a list, its first element is returned.
# If the qualifier is not a list, it is returned as is.
# If the qualifier is not found by the key, the string 'Unknown' is returned.
def parse_qualifier(qualifier, key):
    value = qualifier.get(key, 'Unknown')
    return value if not isinstance(value, list) else value[0]

# The function applies parse_qualifier to the necessary qualifiers.
def extract_data_from_record(record):
    features = record.features
    source_feature_list = [feature for feature in features if feature.type == 'source']

    for source_feature in source_feature_list:
        qual_dict = source_feature.qualifiers

        # Obtain the required values of the qualifiers.
        host = parse_qualifier(qual_dict, 'host')
        collection_date = parse_qualifier(qual_dict, 'collection_date')
        country_city = parse_qualifier(qual_dict, 'country')
        isolation_source = parse_qualifier(qual_dict, 'isolation_source')
        isolate = parse_qualifier(qual_dict, 'isolate')
        strain = parse_qualifier(qual_dict, 'strain')
        serotype = parse_qualifier(qual_dict, 'serotype')

        # Split the country qualifier because it contains information about both the country and the city.
        country_name, city_name = country_city.split(': ') if ':' in country_city else (country_city, 'Unknown')

        # Save all values in lists.
        hosts.append(host)
        collection_dates.append(collection_date)
        countries.append(country_name)
        cities.append(city_name)
        isolation_sources.append(isolation_source)
        isolates.append(isolate)
        strains.append(strain)
        serotypes.append(serotype)

    record_ids.append(record.id)

# Execute the function for each record in the .gb file.
for record in genbank_records:
    extract_data_from_record(record)
    CDS_count += sum(1 for feature in record.features if feature.type == 'CDS')
    number_of_records += 1

print("Number of records:", number_of_records)
print("Number of CDS:", CDS_count)
print("Hosts:", hosts[:5])
print("Collection Dates:", collection_dates[:5])
print("Countries:", countries[:5])
print("Cities:", cities[:5])
print("Isolation Sources:", isolation_sources[:5])
print("Isolates:", isolates[:5])
print("Strains:", strains[:5])
print("Serotypes:", serotypes[:5])
print("Record IDs:", record_ids[:5])


Number of records: 507
Number of CDS: 507
Hosts: ['Bos taurus', 'Bos taurus', 'Bos taurus', 'Bos taurus', 'Bos taurus']
Collection Dates: ['07-Sep-2015', '26-Aug-2015', '26-Aug-2015', '26-Aug-2015', '26-Aug-2015']
Countries: ['Viet Nam', 'Viet Nam', 'Viet Nam', 'Viet Nam', 'Viet Nam']
Cities: ['Dong Thap', 'Dong Thap', 'Dong Thap', 'Dong Thap', 'Dong Thap']
Isolation Sources: ['passaged probang', 'passaged probang', 'passaged probang', 'passaged probang', 'passaged probang']
Isolates: ['A/VIT/361/2015', 'A/VIT/98/2015', 'A/VIT/95/2015', 'A/VIT/78/2015', 'A/VIT/75/2015']
Strains: ['Sea-97', 'Sea-97', 'Sea-97', 'Sea-97', 'Sea-97']
Serotypes: ['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown']
Record IDs: ['ON652985.1', 'ON652984.1', 'ON652983.1', 'ON652982.1', 'ON652981.1']


In [5]:
# Function for reading map.csv files
def read_csv(file_name):
    with open(file_name) as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=",", 
                                fieldnames=["base", "new"])
        result = {}
        
        for row in reader:
            base_value = row["base"].strip()
            new_value = row["new"].strip().rstrip(';')
            result[base_value] = new_value
        csvfile.close()
        return result

In [6]:
# The function maps values from a list to values in a CSV file.
def map_qualifiers(qualifier, csv_file):
    data_dict = read_csv(csv_file)
    feature_map_comp = [(re.compile(key), value) for key, value in data_dict.items()]
    mapped_values = []
    for name in qualifier:
        for regex, new_name in feature_map_comp:
            match = regex.search(name)
            if match:
                mapped_values.append(new_name)
                break
        else:
            mapped_values.append(name)
    return mapped_values

In [7]:
# Mapping hosts
hosts_mapped = map_qualifiers(hosts, 'Maps/host_map.csv')

# Mapping countries
countries_mapped = map_qualifiers(countries, 'Maps/country_map.csv')

# Converting all dates to years
collection_years = [date[-4:] for date in collection_dates]

In [8]:
# The function prints unique values and their count in the list
def print_sorted_counts(items):
    count_dict = {}
    for item in items:    
        count_dict[item] = count_dict.get(item, 0) + 1
    
    for key, value in sorted(count_dict.items(), key=lambda x: x[1], reverse=True):
        print(f"{key}: {value}")

Below you can output the values ​​of qualifiers with frequencies

In [9]:
# Values of the host qualifier
print("Counts for hosts:")
print_sorted_counts(hosts_mapped)

Counts for hosts:
cow: 237
cattle: 121
pig: 68
buffalo: 25
Unknown: 21
sheep: 17
goat: 7
gazelle: 6
water-buffalo: 4
elephant: 1


In [10]:
# Values of the country qualifier
print("Counts for countries:")
print_sorted_counts(countries_mapped)

Counts for countries:
VNM: 194
KEN: 155
Unknown: 61
PAK: 24
NER: 21
TUN: 11
ARG: 5
DZA: 5
THA: 4
UGA: 4
BRA: 3
IRQ: 3
EGY: 2
ISR: 2
ETH: 2
IDN: 2
GBR: 2
IND: 1
URY: 1
DEU: 1
BGD: 1
ZMB: 1
SDN: 1
NLD: 1


In [11]:
# Values for collection_date qualifier
print("Counts for collection_dates:")
print_sorted_counts(collection_years)

Counts for collection_dates:
2016: 154
2018: 87
2007: 59
2019: 51
2015: 29
2014: 26
2020: 21
2017: 19
2012: 15
2013: 6
2006: 6
2010: 6
2011: 5
2022: 3
2001: 2
2009: 2
1988: 1
1993: 1
2008: 1
2003: 1
1970: 1
1969: 1
1971: 1
1990: 1
1991: 1
2-15: 1
1-22: 1
2021: 1
1963: 1
2002: 1
2000: 1
1987: 1


In [12]:
# Values for isolation_source qualifier
print("Counts for isolation_sources:")
print_sorted_counts(isolation_sources)

Counts for isolation_sources:
Unknown: 255
epithelial tissue: 64
oropharyngeal fluid (probang sample): 62
passaged probang: 16
vesicular epithelium: 15
Cell supernatant: 14
cattle probang tissue; Beit She'arim, Jezreel Valiey Regional Concil, Israel, 2007: 5
probang: 4
Tongue Epi Tissue: 4
cell culture supernatant: 4
cattle probang tissue; Neve Ur, Emek HaMa'ayanot Regional Council, Israel, 2007: 3
wild gazelle head tissue; Kochav Hayarden, Lower Galilee, Israel, 2007: 3
cattle probang tissue; Ramat Magshimim, Southern Golan Heights, Israel, 2007: 2
wild gazelle head tissue; Ramat Yissahar, Lower Galilee, Israel, 2007: 2
goat epithelium tissue; Hura, Southern District, Israel, 2007: 2
tongue epithelium: 2
tissue (vesicle): 1
GPVF: 1
GPVF Passage 6: 1
Cell culture supernatant: 1
wild gazelle epithelium tissue; Zur Natan, Drom Hasaron, Israel, 2007: 1
sheep heart tissue; Bethlehem, Palestinian Authority, 2007: 1
sheep heart tissue; Zababdeh, Jenin district, Palestinian Authority, 2007: 1

In [13]:
# Values for isolate qualifier
print("Counts for isolates:")
print_sorted_counts(isolates)

Counts for isolates:
Unknown: 166
O/VIT/DL-P44-1/2017_pro: 2
O/VIT/DT-P-108-2/2018_pro: 2
O/VIT/DT-P-69-2/2018_pro: 2
A/VIT/361/2015: 1
A/VIT/98/2015: 1
A/VIT/95/2015: 1
A/VIT/78/2015: 1
A/VIT/75/2015: 1
A/VIT/74/2015: 1
A/VIT/441/2015: 1
A/VIT/352/2015: 1
A/VIT/350/2015: 1
A/VIT/239/2015: 1
A/VIT/216/2015: 1
A/VIT/212/2015: 1
A/VIT/209/2015: 1
A/VIT/207/2015: 1
A/VIT/206/2015: 1
A/VIT/203/2015: 1
A/VIT/201/2015: 1
A/VIT/165/2015: 1
A/VIT/14661/2013: 1
A/VIT/141/2015: 1
A/VIT/109/2015: 1
A/IND61/1988: 1
FBR03588: 1
FBR02520: 1
FBR02756DI: 1
FBR04313: 1
FBR04302: 1
FBR04274: 1
FBR03587: 1
FBR03563: 1
FBR03143: 1
FBR02770: 1
FBR01218: 1
FBR00044: 1
FBR00015: 1
FBR00926: 1
KVI_550119: 1
KVI_545682: 1
KVI_548463*163: 1
KVI_550746: 1
KVI_546402*129: 1
KVI_541556: 1
KVI_(468)547412: 1
KVI_(482)547412: 1
KVI_547412: 1
KVI_546423*3: 1
KVI_546423 *2: 1
KVI_547801: 1
KVI_547793: 1
KVI_548462*162: 1
KVI_544011: 1
KVI_546826: 1
KVI_540421: 1
KVI_(246)543181: 1
KVI_(230)543181: 1
KVI_537323: 1
KVI_

In [14]:
# Values for strain qualifier
print("Counts for strains:")
print_sorted_counts(strains)

Counts for strains:
Unknown: 319
Sea-97: 21
O/ALG/1/2015: 1
O/ALG/5/2014: 1
O/ALG/3/2014: 1
O/ALG/2/2014: 1
O/ALG/1/2014: 1
O/TUN/14/2014: 1
O/TUN/13/2014: 1
O/TUN/12/2014: 1
O/TUN/11/2014: 1
O/TUN/10/2014: 1
O/TUN/9/2014: 1
O/TUN/8/2014: 1
O/TUN/7/2014: 1
O/TUN/6/2014: 1
O/TUN/5/2014: 1
O/TUN/3/2014: 1
C1 GC: 1
K29/Kenya/29Jan2014/SAT1: 1
K14/Kenya/10Jan2014/SAT1: 1
88/Kenya/13Jan2016/SAT2: 1
87/Kenya/13Jan2016/SAT2: 1
86/Kenya/13Jan2016/SAT2: 1
83/Kenya/13Jan2016/SAT2: 1
81/Kenya/13Jan2016/SAT2: 1
80/Kenya/13Jan2016/SAT2: 1
76/Kenya/13Jan2016/SAT2: 1
75/Kenya/13Jan2016/SAT2: 1
74/Kenya/13Jan2016/SAT2: 1
73/Kenya/13Jan2016/SAT1: 1
72/Kenya/13Jan2016/SAT1: 1
71/Kenya/13Jan2016/SAT2: 1
69/Kenya/13Jan2016/SAT2: 1
67/Kenya/13Jan2016/SAT2: 1
65/Kenya/13Jan2016/SAT2: 1
63/Kenya/13Jan2016/SAT2: 1
62/Kenya/13Jan2016/SAT2: 1
61/Kenya/13Jan2016/SAT2: 1
61/Kenya/13Jan2016/SAT1: 1
60/Kenya/13Jan2016/SAT2: 1
59/Kenya/12Jan2016/SAT2: 1
59/Kenya/12Jan2016/SAT1: 1
58/Kenya/12Jan2016/SAT2: 1
56/Kenya/

In [15]:
# Values for serotype qualifier
print("Counts for serotypes:")
print_sorted_counts(serotypes)

Counts for serotypes:
Unknown: 291
O: 174
A: 19
SAT2: 13
C: 4
SAT1: 3
Asia 1: 2
SAT3: 1


The function extracts CDS sequences and creates a FASTA file with headers in the format >GenbankAC/country/host/year/serotype.

For this purpose, mapped lists are used.

In [16]:
def write_fasta_from_genbank(genbank_records, hosts, collection_dates, countries, serotypes, record_ids, output_file):
    with open(output_file, 'w') as fasta_file:
        for i, record in enumerate(genbank_records):

            # Constructing the header
            header = f">{record_ids[i]}/{countries[i]}/{hosts[i]}/{collection_dates[i]}/{serotypes[i]}"
            fasta_file.write(header + '\n')

            # Getting the coordinates of the coding sequence
            cds_start = record.features[1].location.start.position
            cds_end = record.features[1].location.end.position

            # Extracting the coding sequence from the full sequence (ORIGIN)
            cds_sequence = record.seq[cds_start:cds_end]

            # Writing the sequence, moving to a new line every 70 characters
            for i in range(0, len(cds_sequence), 70):
                fasta_file.write(str(cds_sequence[i:i+70]) + '\n')


In [17]:
genbank_records = SeqIO.parse(file_path, 'gb')
write_fasta_from_genbank(genbank_records, hosts_mapped, collection_years, countries_mapped, serotypes, record_ids, 'extracted_CDS.fasta')

