### Installing packages

In [1]:
# Importing packages
import os
import sys
import argparse
import re
import json
import Bio
from Bio import SeqIO
from Bio import Entrez
from tqdm import tqdm

In [2]:
# Asserting name of file.
# In fact shortened form of bacteria name - first letter of genus and nine (or lesser) letters of specie.
json_file_name = "M_adhaerens.json"
# Asserting email for Entrez.  Print your own mail in the line below
Entrez.email = "vibrio.choleri.1854@gmail.com"

In [61]:
json_path = f"../../data/jsons/{json_file_name}"
with open(json_path, "r") as json_organism:
    json_organism = json.load(json_organism)

# Creating a variables for futher working with links
organism = json_organism[0]
db_search = "assembly"
db_current = "nucleotide"
pre_complete_ids = list(json_organism[1].values())[0]

In [3]:
organism = "Escherichia_coli"
db_search = "assembly"
db_current = "nucleotide"
pre_complete_ids = ["GCF_024399055", "GCF_024298945", "GCF_000017765", "GCF_027945015", "GCF_022439465", "GCF_001753545", "GCF_013167395", "GCF_024734095", "GCF_030038915", "GCF_030988825", "GCF_004358365", "GCF_022494685", "GCF_013201215", "GCF_025426195", "GCF_024585805", "GCF_025853875", "GCF_006777065", "GCF_002855415", "GCF_009867035", "GCF_000008865", "GCF_000270105", "GCF_014169855", "GCF_005222445", "GCF_021166435", "GCF_002011965", "GCF_023658265", "GCF_017570585", "GCF_028609765", "GCF_001900795", "GCF_000026245", "GCF_025402935", "GCF_001677515", "GCF_012221365", "GCF_029536635", "GCF_022493835", "GCF_019211745", "GCF_029855165", "GCF_904863445", "GCF_022759745", "GCF_019971035", "GCF_008635785", "GCF_019428625", "GCF_900636055", "GCF_021513335", "GCF_004301465", "GCF_013394575", "GCF_016942635", "GCF_019334665", "GCF_002249955", "GCF_002934895", "GCF_030347015", "GCF_900636105", "GCF_000273425", "GCF_014904175", "GCF_030012765", "GCF_003018875", "GCF_902810335", "GCF_017165275", "GCF_020424045", "GCF_904863305", "GCF_013374315", "GCF_022654545", "GCF_022494805", "GCF_030036755", "GCF_014361285", "GCF_003703795", "GCF_005221565", "GCF_030013435", "GCF_013167615", "GCF_005954605", "GCF_023516115", "GCF_023657975", "GCF_019428725", "GCF_022846575", "GCF_002966755", "GCF_027925825", "GCF_016775825", "GCF_002164645", "GCF_000299455", "GCF_030908745", "GCF_955652485", "GCF_900636235", "GCF_001901065", "GCF_020827555", "GCF_019645975", "GCF_018884505", "GCF_030028155", "GCF_005221805", "GCF_002105735", "GCF_030295645", "GCF_004010595", "GCF_018884285", "GCF_024585405", "GCF_008080655", "GCF_904831835", "GCF_013425955", "GCF_026210495", "GCF_008926185", "GCF_018972245", "GCF_900186905"]

In [4]:
def extract_insdc(links): 
    linkset = [ls for ls in links[0]['LinkSetDb'] if
               ls['LinkName'] == 'assembly_nuccore_refseq']
    if 0 != len(linkset):
        uids = [link['Id'] for link in linkset[0]['Link']]
    else:
        uids = 0
    return uids

def download_links(db_search, db_current, complete_id, timer, num_link):
    if timer > 0:
        link_handle = Entrez.elink(dbfrom=db_search, db=db_current, from_uid=complete_id)
        link_record = Entrez.read(link_handle)
        uids = extract_insdc(link_record)
        if uids != 0:
            for uid in uids:
                if uid not in links_checked:  # Checking for duplicates
                    links_checked.append(uid)
                    links.append((uid, num_link))
                    cumulative = 1
                else:
                    cumulative = 0
            num_link += cumulative
    return num_link
            
def forced_download_links(db_search, db_current, complete_id, timer, level, num_link):   # Наверное, это можно реализовать декоратором
    try:
        num_link = download_links(db_search=db_search, db_current=db_current, complete_id=complete_id, timer=timer, num_link=num_link)    
    except RuntimeError:
        print(f"Problem is with {complete_id}")
        timer -= 1
        level += 1
        print(f"We are on the {level} level now")
        num_link = forced_download_links(db_search, db_current, complete_id, timer, level, num_link)
    return num_link

In [5]:
def gcf_transformer(db_search, gcf, level, timer):
    if timer > 0:
        try:
            search_handle = Entrez.esearch(db_search, gcf)
            search_record = Entrez.read(search_handle)
            return search_record
        except RuntimeError:
            print(f"Problem is with {gcf}")
            timer -= 1
            level += 1
            print(f"We are on the {level} level now")
            search_record = gcf_transformer(db_search, gcf, level, timer)

In [7]:
if pre_complete_ids[0][0:3] == "GCF":
    complete_ids = []
    for gcf in tqdm(pre_complete_ids):
        level = 0
        timer = 5 
        search_record = gcf_transformer(db_search, gcf, level, timer)
        complete_ids.append(search_record["IdList"])
else:
    complete_ids = pre_complete_ids

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:36<00:00,  1.04it/s]


In [65]:
# Taking ids for fetching. It collected all non-duplicated links in nucleotide database from assembly database.
# We use try-except for excepting problems with network temorary lags, which ruined our code
links = []
links_checked = []
num_link = 0
for complete_id in tqdm(complete_ids):
    timer = 5
    level = 1
    num_link = forced_download_links(db_search, db_current, complete_id, timer, level, num_link)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:27<00:00,  2.00s/it]


In [50]:
# Collecting data about assemblies
gb_records = []
for link in tqdm(links):
    gb_handle = Entrez.efetch(db=db_current, rettype="fasta", retmode="text", id=link[0])
    gb_record = SeqIO.read(gb_handle, 'fasta')
    gb_records.append((gb_record, link[1]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:22<00:00,  2.86s/it]


On the next stage we need to create fasta files for each assembly, to reaanotate them.

In [52]:
orglist = organism.split()
if len(orglist[-1]) < 10:
    last_letter = len(orglist[-1])
else:
    last_letter = 9
name = f"{orglist[0][0]}_{orglist[-1][:last_letter]}"

In [53]:
dna_type, tuples, source_list = [], [], [] # Creating list for identyfing the number of every assemblie DNA molecules (chromosome and any plasmids)
orglist = organism.split()
if len(orglist[-1]) < 10:
    last_letter = len(orglist[-1])
else:
    last_letter = 9
name = f"{orglist[0][0]}_{orglist[-1][:last_letter]}"
for rec in tqdm(gb_records):
    try:
        source = rec[1] # Number of assembly
        descr = rec[0].description
        if "plasmid" in descr:
            dna_type.append("plasmid")
        elif "scaffold" in descr:
            dna_type.append("scaffold")
        else:
            dna_type.append("chromosome")
        if "scaffold" not in descr: # Для скаффолда иной подход
            source_list.append(source) 
            number = source_list.count(source) # Counting the DNA molecule of assembly
            tuples.append((source, number))
            mask = f"{name}{source}_{number}"
            with open (f"../../{name}/data/for_prokka_fasta/{mask}.fasta", "w") as for_prokka_fasta:
                for_prokka_fasta.write(">")
                for_prokka_fasta.write(mask)
                for_prokka_fasta.write("\n")
                for_prokka_fasta.write(str(rec[0].seq))
                for_prokka_fasta.write("\n")
        elif "scaffold" in descr:
            pass
    except Bio.Seq.UndefinedSequenceError:
        print(rec[0].name)
        continue

  0%|                                                                                                                 | 0/50 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '../../M_adhaerens/data/for_prokka_fasta/M_adhaerens0_1.fasta'

In [11]:
# Saving plasmid_code
jsonpc1 = json.dumps(tuples)
with open(f"../../{name}/data/{name}_plasmid_code1.json", "w") as jspc1:
    jspc1.write(jsonpc1)
jsonpc2 = json.dumps(dna_type)
with open(f"../../{name}/data/{name}_plasmid_code2.json", "w") as jspc2:
    jspc2.write(jsonpc2)