In [110]:
from Bio import SeqIO
import re
import csv
import pandas as pd

Get records from .gb file

In [111]:
# GenBank flat file to analyse (path is relative to the notebook's working directory).
file_path = 'FMDV_all_records.gb'
# SeqIO.parse returns a one-pass iterator over the records; the extraction loop
# below exhausts it, which is why the file is parsed again before the FASTA export.
genbank_records = SeqIO.parse(file_path, 'gb')

Create empty lists for storing necessary information

In [112]:
# Module-level accumulators appended to by extract_data_from_record(): each call
# adds exactly one entry to every list it fills, so index i refers to the i-th
# parsed record in all of them.
# NOTE: re-run this cell before re-running the extraction loop, otherwise the
# lists keep their old content and every record is appended a second time.
hosts = []
collection_dates = []
countries = []
cities = []  # not filled by any visible cell; only referenced in a commented-out print
isolation_sources = []
isolates = []
strains = []
serotypes = []
record_ids = []
# Counters updated by the loop that iterates over the GenBank records.
number_of_records = 0
CDS_count = 0

Functions

In [113]:
"""
The function checks if the qualifier is a list and returns the appropriate value.
If the qualifier is a list, its first element is returned.
If the qualifier is not a list, it is returned as is.
If the qualifier is not found by the key, the string 'Unknown' is returned. 
"""
def parse_qualifier(qualifier, key):
    """Return a single value for ``key`` from a qualifiers mapping.

    Args:
        qualifier: Mapping of qualifier names to values (anything with ``.get``).
        key: Name of the qualifier to look up.

    Returns:
        The first element when the stored value is a non-empty list, the value
        itself when it is not a list, and the string 'Unknown' when the key is
        missing or the stored list is empty.
    """
    value = qualifier.get(key, 'Unknown')
    if isinstance(value, list):
        # An empty list carries no information; indexing it would raise IndexError.
        return value[0] if value else 'Unknown'
    return value

def merge_values(values):
    """Collapse several candidate values into one.

    Returns the first entry of ``values`` that differs from 'Unknown'; if there
    is no such entry (including when ``values`` is empty) returns 'Unknown'.
    """
    informative = (candidate for candidate in values if candidate != 'Unknown')
    return next(informative, 'Unknown')

# Collects the 'source' qualifiers of one record into the module-level lists.
def extract_data_from_record(record):
    """Append one entry per tracked qualifier (and the record name) to the global lists.

    Every feature of type 'source' in ``record.features`` is read with
    parse_qualifier(); for each qualifier the candidates from all source
    features are reduced with merge_values(), so the first non-'Unknown' value
    wins. The result is appended to hosts, collection_dates, countries,
    isolation_sources, isolates, strains and serotypes, and ``record.name`` is
    appended to record_ids.
    """
    tracked_keys = (
        'host',
        'collection_date',
        'country',
        'isolation_source',
        'isolate',
        'strain',
        'serotype',
    )

    # One candidate list per qualifier, filled in feature order.
    candidates = {key: [] for key in tracked_keys}
    for feature in record.features:
        if feature.type != 'source':
            continue
        for key in tracked_keys:
            candidates[key].append(parse_qualifier(feature.qualifiers, key))

    # Reduce every candidate list to a single value.
    merged = {key: merge_values(found) for key, found in candidates.items()}

    hosts.append(merged['host'])
    collection_dates.append(merged['collection_date'])
    countries.append(merged['country'])
    isolation_sources.append(merged['isolation_source'])
    isolates.append(merged['isolate'])
    strains.append(merged['strain'])
    serotypes.append(merged['serotype'])

    record_ids.append(record.name)

# Single pass over the parsed GenBank records: collect the qualifiers of each
# record into the module-level lists and count records and 'CDS' features.
# The loop consumes the genbank_records iterator.
for record in genbank_records:
    extract_data_from_record(record)
    CDS_count += sum(1 for feature in record.features if feature.type == 'CDS')
    number_of_records += 1

# Summary counters followed by a preview of the first five entries of each list.
print("Number of records:", number_of_records)
print("Number of CDS:", CDS_count)
print("Hosts:", hosts[:5])
print("Collection Dates:", collection_dates[:5])
print("Countries:", countries[:5])
#print("Cities:", cities[:5])
print("Isolation Sources:", isolation_sources[:5])
print("Isolates:", isolates[:5])
print("Strains:", strains[:5])
print("Serotypes:", serotypes[:5])
print("Record IDs:", record_ids[:5])


Number of records: 1870
Number of CDS: 1870
Hosts: ['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Bubalus bubalis']
Collection Dates: ['Unknown', 'Unknown', 'Unknown', 'Unknown', '01-Dec-2013']
Countries: ['Spain', 'Spain', 'Spain:Olot', 'Unknown', 'India']
Isolation Sources: ['Unknown', 'Unknown', 'Unknown', 'Unknown', 'Unknown']
Isolates: ['rp146', 'rp99', 'C-s8c1', 'Unknown', 'FMDV/Goat/India/2013/Shahjadpur/P0']
Strains: ['C', 'C', 'C', 'A10-61', 'Unknown']
Serotypes: ['Unknown', 'Unknown', 'Unknown', 'Unknown', 'O']
Record IDs: ['AJ133359', 'AJ133358', 'AJ133357', 'X00429', 'OK422491']


In [114]:
# Function for reading map.csv files
def read_csv(file_name):
    """Read a two-column, header-less mapping CSV into a dict.

    The first column ("base") becomes the key and the second column ("new")
    the value. Both are stripped of surrounding whitespace, and trailing ';'
    characters are removed from the value. When the same key occurs more than
    once, the last occurrence wins.

    Args:
        file_name: Path of the comma-separated mapping file.

    Returns:
        dict mapping the "base" text to the "new" text.
    """
    result = {}
    # newline='' is what the csv module expects so that quoted fields containing
    # line breaks are parsed correctly; the with-block closes the file itself.
    with open(file_name, newline='') as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=",",
                                fieldnames=["base", "new"])
        for row in reader:
            base_value = row["base"]
            new_value = row["new"]
            # DictReader fills missing columns with None; a row with fewer than
            # two columns defines no mapping, so skip it instead of crashing.
            if base_value is None or new_value is None:
                continue
            result[base_value.strip()] = new_value.strip().rstrip(';')
    return result

In [115]:
# Translates raw qualifier values through the regex table stored in a CSV file.
def map_qualifiers(qualifier, csv_file):
    """Map every value in ``qualifier`` using the patterns read from ``csv_file``.

    The CSV is loaded with read_csv(); each key is compiled as a regular
    expression. For every input value the patterns are tried in file order with
    ``search`` and the replacement of the first matching pattern is used. A
    value that matches no pattern is kept unchanged.

    Returns:
        list with one (possibly translated) entry per input value, in order.
    """
    rules = []
    for pattern_text, replacement in read_csv(csv_file).items():
        rules.append((re.compile(pattern_text), replacement))

    def translate(raw_name):
        # First matching rule wins; unmatched names pass through untouched.
        for rule, replacement in rules:
            if rule.search(raw_name):
                return replacement
        return raw_name

    return [translate(raw_name) for raw_name in qualifier]

In [116]:
# Mapping hosts: each raw host string is replaced by the value of the first
# matching pattern in Maps/host_map.csv (unmatched strings are kept as they are).
hosts_mapped = map_qualifiers(hosts, 'Maps/host_map.csv')

# Mapping countries: same procedure with the patterns from Maps/country_map.csv.
countries_mapped = map_qualifiers(countries, 'Maps/country_map.csv')

# Converting all dates to years: 'Unknown' stays 'Unknown'; otherwise the last
# four characters are used when they are all digits (e.g. '01-Dec-2013' -> '2013'),
# and the first four characters are used in every other case.
collection_years = ['Unknown' if date == 'Unknown' else (date[-4:] if date[-4:].isdigit() else date[:4]) for date in collection_dates]

In [117]:
# Prints a frequency table of the values found in a list.
def print_sorted_counts(items):
    """Print every distinct item with its number of occurrences.

    Lines have the form "<item>: <count>" and are ordered by descending count;
    items with equal counts keep the order in which they first appeared.
    """
    tally = {}
    for entry in items:
        if entry in tally:
            tally[entry] += 1
        else:
            tally[entry] = 1

    # Sorting on the negated count is stable, so ties stay in first-seen order.
    ranked = sorted(tally.items(), key=lambda pair: -pair[1])
    for label, occurrences in ranked:
        print(f"{label}: {occurrences}")

The cells below print the values of each qualifier together with their frequencies.

In [118]:
# Frequency table of the host qualifier after mapping with Maps/host_map.csv
# (values that matched no pattern appear unchanged).
print("Counts for hosts:")
print_sorted_counts(hosts_mapped)

Counts for hosts:
cattle: 618
cow: 589
Unknown: 244
pig: 181
buffalo: 156
sheep: 40
goat: 12
water-buffalo: 11
gazelle: 8
unknown: 7
dog: 2
elephant: 1
bullock: 1


In [119]:
# Frequency table of the country qualifier after mapping with Maps/country_map.csv
# (values that matched no pattern appear unchanged).
print("Counts for countries:")
print_sorted_counts(countries_mapped)

Counts for countries:
VNM: 273
Unknown: 262
KEN: 177
GBR: 144
PAK: 139
JPN: 107
IND: 87
ARG: 57
NER: 47
THA: 44
KOR: 39
USA: 35
UGA: 32
CHN: 29
IRQ: 24
ZWE: 21
TUR: 19
ISR: 18
ESP: 16
NPL: 16
EGY: 15
ETH: 15
BWA: 14
MYS: 14
BRA: 13
TUN: 12
SAU: 12
ZAF: 11
BGD: 10
DZA: 10
IRN: 10
MMR: 10
ZMB: 9
BGR: 8
DEU: 7
BTN: 7
TZA: 6
GHA: 6
ARE: 5
MUS: 5
LBY: 5
MNG: 5
TWN: 5
URY: 4
CMR: 4
LKA: 4
RUS: 4
NAM: 4
HKG: 3
IDN: 3
SDN: 3
LAO: 3
AFG: 3
VEN: 3
PHL: 3
LBN: 3
MOZ: 2
BHR: 2
TCD: 2
CHE: 2
ITA: 2
COL: 2
FRA: 2
NLD: 2
MWI: 1
ECU: 1
GRC: 1
MAR: 1
PSE: 1
POL: 1
BEL: 1
PER: 1
IRL: 1
KAZ: 1


In [120]:
# Frequency table of the collection_date qualifier reduced to a year
# (collection_years, not the raw collection_dates list).
print("Counts for collection_dates:")
print_sorted_counts(collection_years)

Counts for collection_dates:
Unknown: 364
2016: 234
2010: 152
2007: 141
2018: 131
2012: 119
2001: 83
2013: 72
2014: 68
2015: 63
2019: 60
2017: 55
2009: 51
2011: 33
2002: 26
2006: 24
2020: 21
2005: 17
2008: 12
1967: 10
1970: 9
2000: 9
1968: 8
1999: 8
1990: 7
1991: 7
2004: 7
1989: 6
1976: 5
2003: 4
1969: 4
1996: 4
1974: 4
1972: 4
1993: 3
1971: 3
2022: 3
1994: 3
1983: 3
1988: 2
2023: 2
1975: 2
1997: 2
1977: 2
1965: 2
1964: 2
1992: 2
1986: 2
1958: 1
2021: 1
1979: 1
1980: 1
1984: 1
1934: 1
1966: 1
1978: 1
1981: 1
1998: 1
1973: 1
1959: 1
1982: 1
1963: 1
1987: 1


In [121]:
# Frequency table of the raw isolation_source qualifier (no mapping applied).
print("Counts for isolation_sources:")
print_sorted_counts(isolation_sources)

Counts for isolation_sources:
Unknown: 1297
epithelial tissue: 86
oropharyngeal fluid (probang sample): 62
oropharyngeal fluid: 39
epithelium: 25
tongue epithelium: 23
passaged probang: 16
vesicular epithelium: 15
Cell supernatant: 14
probang: 12
vesicle on tongue: 10
postmortem tissue, dorsal soft palate: 9
postmortem tissue, dorsal nasopharynx: 8
serum: 8
vesicle on coronary band, right hind foot: 7
epithelium from clinically infected animals: 7
oropharyngeal fluid from persistently infected cattle: 6
cattle probang tissue; Beit She'arim, Jezreel Valiey Regional Concil, Israel, 2007: 5
vesicle on coronary band, left hind foot: 5
Tongue Epi Tissue: 4
cell culture supernatant: 4
vesicle on coronary band, right front foot: 4
vesicle on dental pad: 4
vesicle on coronary band, left front foot: 4
cattle probang tissue; Neve Ur, Emek HaMa'ayanot Regional Council, Israel, 2007: 3
wild gazelle head tissue; Kochav Hayarden, Lower Galilee, Israel, 2007: 3
oropharyngeal fluid from persistently i

In [122]:
# Frequency table of the raw isolate qualifier (no mapping applied).
print("Counts for isolates:")
print_sorted_counts(isolates)

Counts for isolates:
Unknown: 442
O/VIT/17-19073/2017: 2
O/VIT/DL-P44-1/2017_pro: 2
O/VIT/DT-P-108-2/2018_pro: 2
O/VIT/DT-P-69-2/2018_pro: 2
O/VIT/19-005/2019: 2
O/VIT/18CD-1610.2/2018: 2
O/VIT/18-5490/2018: 2
O/VIT/18-3766/2018: 2
A/VIT/DL-P087-1/2017_pro: 2
O/IRN/16/2016: 2
O/IRN/9/2016: 2
o6pirbright iso58: 2
UKG/1734/2001: 2
UKG/1558/2001: 2
UKG/1450/2001: 2
rp146: 1
rp99: 1
C-s8c1: 1
FMDV/Goat/India/2013/Shahjadpur/P0: 1
FMDV/Goat/India/2013/Shahjadpur/P50-WT: 1
O/SKR/2000: 1
O/SKR/2002: 1
A_PAK_C8_2018-Clone-03: 1
A_PAK_C8_2018: 1
A_PAK_C6_2017: 1
FMDV_O/NGO_372_P_2015: 1
FMDV_O/MOY2_099_P_2016: 1
FMDV_O/BUS_379_P_2015: 1
FMDV_O/KYA_004_P_2015: 1
FMDV_O/NAP_189_P_2015: 1
O/Uganda/2006: 1
FMDV/Goat/India/2013/Shahjadpur/P50-KO: 1
FMDV/SAT2/EGY/Ismailia/2018: 1
329773: 1
319463: 1
330653: 1
335278: 1
FMDV_B14-112_A24/NP/1dpi: 1
FMDV_B14-20_A24/NP/1dpi: 1
FMDV_B14-53_A24/NP/3dpi: 1
FMDV_B14-44_A24/SER/4dpi: 1
FMDV_B14-44_A24/NP/4dpi: 1
FMDV_B14-44_A24/VES/4dpi: 1
FMDV_B14-25_A24/SER

In [123]:
# Frequency table of the raw strain qualifier (no mapping applied).
print("Counts for strains:")
print_sorted_counts(strains)

Counts for strains:
Unknown: 1202
A24 Cruzeiro: 35
A/Arg/01: 32
Sea-97: 21
A/IRQ/09: 16
Aarg2001: 5
O/MYA: 4
C: 3
PanAsia: 3
FMDV/A/ASIA/Iran-05/SIS-13: 3
O/SKR: 3
Asia1Leb83: 3
A22 Iraq: 3
O6: 2
O PanAsia: 2
O UK2001: 2
O1 Campos: 2
O1 BFS: 2
O10 Philippines: 2
A10-61: 1
vaccine IND 63/72: 1
ZB/CHA/58(att): 1
A/Arg/01-A01Lc: 1
A/Arg/01-CapLc: 1
Asia1-9/Shamir/ISR/89: 1
Asia1-8/Shamir/ISR/89: 1
Asia1/Shamir/ISR/89: 1
O/CHN/Mya98/33-P: 1
O/GSLX/2010: 1
O/ALG/1/2015: 1
O/ALG/5/2014: 1
O/ALG/3/2014: 1
O/ALG/2/2014: 1
O/ALG/1/2014: 1
O/TUN/14/2014: 1
O/TUN/13/2014: 1
O/TUN/12/2014: 1
O/TUN/11/2014: 1
O/TUN/10/2014: 1
O/TUN/9/2014: 1
O/TUN/8/2014: 1
O/TUN/7/2014: 1
O/TUN/6/2014: 1
O/TUN/5/2014: 1
O/TUN/3/2014: 1
Iraq 09: 1
A/TAI/72/2016: 1
A/TAI/56/2016: 1
A/TAI/32-1/2016: 1
A/TAI/137/2015: 1
A/TAI/108/2015: 1
A/TAI/67/2015: 1
A/TAI/46-1/2015: 1
A/TAI/98-1/2014: 1
A/TAI/74/2014: 1
A/TAI/15-1/2014: 1
A/TAI/94/2013: 1
A/TAI/80-3/2013: 1
A/TAI/53/2013: 1
A/TAI/36/2012: 1
A/TAI/3/2012: 1
A/TAI/

In [124]:
# Frequency table of the raw serotype qualifier. No mapping is applied to this
# list, so spelling variants of the same serotype are counted separately.
print("Counts for serotypes:")
print_sorted_counts(serotypes)

Counts for serotypes:
O: 738
Unknown: 622
A: 313
SAT2: 44
SAT1: 33
C: 30
SAT3: 28
Asia 1: 15
Asia1: 15
Pan Asia O: 11
O/CATHAY: 5
SAT 2: 4
SAT 3: 4
SAT 1: 2
FMDV-Asia 1: 1
type O: 1
Asia-1: 1
Asia l: 1
Asial: 1
A22: 1


Save qualifier values to a table

In [125]:
# One row per GenBank record. Country, Host and CollectionDate use the
# mapped/normalised lists; the remaining columns hold the raw qualifier values.
# All lists were filled in the same pass, so they have equal length and order.
df = pd.DataFrame({
    'GenBankAccession': record_ids,
    'Country': countries_mapped,
    'Host': hosts_mapped,
    'CollectionDate': collection_years,
    'Serotype': serotypes,
    'Strain': strains,
    'Isolate': isolates,
    'IsolationSource': isolation_sources
})
# Written with pandas' defaults, so the row index is saved as an unnamed first column.
df.to_csv('qualifiers_table.csv')

The function extracts CDS sequences and creates a FASTA file with headers in the format >GenbankAC/country/host/year/serotype.

For this purpose, mapped lists are used.

In [126]:
def write_fasta_from_genbank(genbank_records, hosts, collection_dates, countries, serotypes, record_ids, output_file):
    """Write the polyprotein CDS of every record to a FASTA file.

    For each record, every 'CDS' feature whose first 'product' qualifier starts
    (case-insensitively) with one of the polyprotein spellings is sliced out of
    ``record.seq`` from the feature's start to its end; multiple matching
    features are concatenated in feature order. The sequence is written under
    the header ``>record_id/country/host/collection_date/serotype`` (spaces
    replaced by '-') and wrapped at 70 characters per line.

    Records without any matching CDS are skipped. The sequence is rebuilt from
    scratch for every record, so a record can never be written with the
    sequence of the record before it.

    Args:
        genbank_records: Iterable of records exposing ``features`` and ``seq``.
        hosts, collection_dates, countries, serotypes, record_ids: Lists of
            strings aligned by position with ``genbank_records``.
        output_file: Path of the FASTA file to create (overwritten if present).

    Raises:
        IndexError: If one of the lists is shorter than the position of a
            record that has a matching CDS.
    """
    # Compiled once; .match() anchors at the start of the product text.
    polyprotein_pattern = re.compile(r'(?i)(polyprotein|poylprotein|polyprotein precursor|polypeptide)')
    line_width = 70

    with open(output_file, 'w') as fasta_file:
        for index, record in enumerate(genbank_records):
            # Collected per record so nothing leaks over from the previous one.
            cds_list = []
            for feature in record.features:
                if feature.type != 'CDS':
                    continue
                products = feature.qualifiers.get('product')
                if not products or not polyprotein_pattern.match(products[0]):
                    continue

                # int() yields the plain coordinate without relying on the
                # legacy `.position` attribute of Biopython position objects
                # (NOTE(review): reported as deprecated in recent releases -
                # confirm against the installed Biopython version).
                cds_start = int(feature.location.start)
                cds_end = int(feature.location.end)
                cds_list.append(str(record.seq[cds_start:cds_end]))

            full_cds = ''.join(cds_list)
            if not full_cds:
                # No polyprotein CDS: skip rather than emit a header with no sequence.
                continue

            # Constructing the header
            header_fields = (
                record_ids[index],
                countries[index],
                hosts[index],
                collection_dates[index],
                serotypes[index],
            )
            header = '>' + '/'.join(field.replace(' ', '-') for field in header_fields)
            fasta_file.write(header + '\n')

            # Writing the sequence, moving to a new line every 70 characters
            for offset in range(0, len(full_cds), line_width):
                fasta_file.write(full_cds[offset:offset + line_width] + '\n')

In [127]:
# Parse the file again: the iterator created earlier was consumed by the
# extraction loop, and the export needs to walk the records from the start so
# that record i lines up with entry i of the qualifier lists.
genbank_records = SeqIO.parse(file_path, 'gb')
# Headers use the mapped hosts/countries and the year-only dates; serotypes and record IDs are passed unmodified.
write_fasta_from_genbank(genbank_records, hosts_mapped, collection_years, countries_mapped, serotypes, record_ids, 'extracted_CDS_full.fasta')