### Installing and importing packages

In [3]:
# Installing packages
#!pip3 install pandas
#!pip3 install biopython

# Importing packages
import Bio
import math
import os
import sys
import argparse
import shutil
import re
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.SeqRecord import SeqRecord
from Bio import Entrez
from itertools import chain
from tqdm import tqdm
#from google.colab import files   # For working with colab only
#from google.colab import drive   # For working with colab only

# Contact e-mail that Biopython attaches to Entrez requests.
# It can be overridden through the ENTREZ_EMAIL environment variable so that
# other users do not have to edit the notebook; the original address is kept
# as the fallback, so behaviour is unchanged when the variable is not set.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "bogdan.sotnikov.1999@mail.ru")

# Google drive mounting
#drive.mount('/content/gdrive')    # For working with colab only

In [None]:
# Command-line interface: two required positional string arguments.
parser = argparse.ArgumentParser()

parser.add_argument("organism_name", type=str)  # copied into organism_name below
parser.add_argument("mail", type=str)  # copied into Entrez.email below

# parse_args() without arguments parses sys.argv[1:].
# NOTE(review): this cell has no execution count and organism_name is assigned
# directly right after it; presumably it is only meant for running the code as
# a script rather than inside a notebook kernel - confirm.
arguments = parser.parse_args()

organism_name = arguments.organism_name
Entrez.email = arguments.mail
# TODO: if the argument is present among the arguments, set up the API
# (presumably an NCBI API key - confirm the intent).

In [4]:
# Hard-coded organism name; this assignment replaces any value that was given
# to organism_name earlier in the file (for example from the command line).
organism_name = "Campylobacter upsaliensis"

In [5]:
# Adapted, with modifications, from the source referenced above.
def extract_insdc(links):
    """Return the ids linked through the 'assembly_nuccore_insdc' link set.

    According to the original note, this is used to keep GenBank nucleotide
    data only.

    Parameters
    ----------
    links : sequence
        Parsed link result; only ``links[0]['LinkSetDb']`` is inspected. Every
        entry of that list must provide a ``'LinkName'`` key, and the selected
        entry a ``'Link'`` list whose items carry an ``'Id'`` key.

    Returns
    -------
    list or int
        The ``'Id'`` values of the first link set whose ``'LinkName'`` equals
        ``'assembly_nuccore_insdc'``, or the integer ``0`` when there is no
        such link set (callers compare the result against ``0``).
    """
    wanted = 'assembly_nuccore_insdc'
    matching = list(filter(lambda entry: entry['LinkName'] == wanted,
                           links[0]['LinkSetDb']))
    if not matching:
        return 0
    return [item['Id'] for item in matching[0]['Link']]

In [6]:
# Settings shared by the Entrez calls that follow.
# db_search is the database that is searched and linked from;
# db_current is the database that is linked to and fetched from.
db_search, db_current = "assembly", "nucleotide"
organism = organism_name  # search term for Entrez.esearch

In [7]:
# The first search is only used to learn the total number of hits, so that the
# second search can ask for every matching id at once through retmax.
search_handle = Entrez.esearch(db=db_search, term=organism)
try:
    search_record = Entrez.read(search_handle)
finally:
    # Close the handle even when parsing fails, so the connection is released.
    search_handle.close()
count = int(search_record["Count"])

# Second search: same query, with retmax raised to the full hit count.
search_handle = Entrez.esearch(db=db_search, term=organism, retmax=count)
try:
    search_record = Entrez.read(search_handle)
finally:
    search_handle.close()

In [8]:
# Request the summary of every assembly id found by the search and keep only
# the ids whose AssemblyStatus is "Complete Genome".
idlist = search_record["IdList"]
complete_ids = []   # ids of the assemblies reported as complete genomes
for ids in tqdm(idlist):
    handle = Entrez.esummary(db=db_search, id=ids)
    try:
        record = Entrez.read(handle)
    finally:
        # One request is made per id, so every handle is closed right away
        # instead of leaving hundreds of them open.
        handle.close()
    if record['DocumentSummarySet']['DocumentSummary'][0]['AssemblyStatus'] == "Complete Genome":
        complete_ids.append(ids)

 33%|█████████████████████████████████▍                                                                    | 223/679 [05:20<10:55,  1.44s/it]


URLError: <urlopen error [Errno 101] Network is unreachable>

In [7]:
# Collect the ids to fetch: every nucleotide-database id linked to a complete
# assembly, without duplicates. Each id is stored together with the running
# number of the assembly it was first seen in, as (uid, n).
links = []
links_checked = []   # every uid that has already been recorded
n = 0                # running assembly number
for complete_id in tqdm(complete_ids):
    link_handle = Entrez.elink(dbfrom=db_search, db=db_current, from_uid=complete_id)
    try:
        link_record = Entrez.read(link_handle)
    finally:
        link_handle.close()   # release the connection even when parsing fails
    uids = extract_insdc(link_record)
    if uids == 0:             # extract_insdc returns 0 when nothing is linked
        continue
    added_new = False
    for uid in uids:
        if uid not in links_checked:    # Checking for duplicates
            links_checked.append(uid)
            links.append((uid, n))
            added_new = True
    # Advance the assembly number when this assembly contributed at least one
    # new uid. Looking only at the last uid (as before) let two assemblies
    # share a number when the last uid was a duplicate, and relied on a stale
    # or undefined flag when the uid list was empty.
    if added_new:
        n += 1

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 87/87 [01:16<00:00,  1.14it/s]


In [8]:
# Download the GenBank record of every collected uid and store it together
# with its assembly number as (record, assembly_number).
gb_records = []
for link in tqdm(links):
    # link is a (uid, assembly_number) pair
    gb_handle = Entrez.efetch(db=db_current, rettype="gb", retmode="text", id=link[0])
    try:
        gb_record = SeqIO.read(gb_handle, 'genbank')
    finally:
        # One request is made per record; close each handle right away so that
        # connections do not pile up during the long download.
        gb_handle.close()
    gb_records.append((gb_record, link[1]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 578/578 [1:06:32<00:00,  6.91s/it]


At the next stage we need to create FASTA files for each assembly so that they can be re-annotated.

In [1]:
# Example of how the short organism tag is built: first letter of the first
# word, an underscore, then at most the first nine characters of the last word.
# NOTE(review): this reassigns `organism`, which the FASTA-writing cell further
# down reads again - confirm that this is intended on a top-to-bottom run.
organism = "Proteus penneri"
orglist = organism.split()
last_letter = min(len(orglist[-1]), 9)   # cap the species part at 9 characters
name = "{}_{}".format(orglist[0][0], orglist[-1][:last_letter])
print(name)

P_penneri


In [23]:
# Write one single-sequence FASTA file per fetched GenBank record and collect,
# for every record:
#   dna_type    - "plasmid" when the word occurs in the record description,
#                 otherwise "chromosome";
#   tuples      - (assembly number, ordinal of the molecule within that assembly);
#   source_list - the assembly number of every record processed so far.
dna_type, tuples, source_list = [], [], []

# Short organism tag: first letter of the first word, "_", then at most the
# first nine characters of the last word of the organism name.
orglist = organism.split()
last_letter = min(len(orglist[-1]), 9)
name = f"{orglist[0][0]}_{orglist[-1][:last_letter]}"

# open(..., "w") does not create missing directories, so make sure the output
# directory exists before the first file is written.
fasta_dir = f"../{name}/data/for_prokka_fasta"
os.makedirs(fasta_dir, exist_ok=True)

for rec in tqdm(gb_records):
    source = rec[1]  # Number of assembly
    if "plasmid" in rec[0].description:
        dna_type.append("plasmid")
    else:
        dna_type.append("chromosome")
    source_list.append(source)
    number = source_list.count(source)  # ordinal of this molecule within its assembly
    tuples.append((source, number))
    mask = f"{name}{source}_{number}"   # used both as file name and as FASTA header
    with open(f"{fasta_dir}/{mask}.fasta", "w") as for_prokka_fasta:
        for_prokka_fasta.write(f">{mask}\n{str(rec[0].seq)}\n")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 47/47 [00:00<00:00, 633.51it/s]


In [35]:
# Store the bookkeeping lists as JSON next to the data:
#   *_plasmid_code1.json - the `tuples` list,
#   *_plasmid_code2.json - the `dna_type` list.
with open(f"../{name}/data/{name}_plasmid_code1.json", "w") as jspc1:
    json.dump(tuples, jspc1)
with open(f"../{name}/data/{name}_plasmid_code2.json", "w") as jspc2:
    json.dump(dna_type, jspc2)