### Importing packages

In [1]:
# Importing packages
import Bio
import math
import os
import sys
import argparse
import shutil
import re
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.SeqRecord import SeqRecord
from Bio import Entrez
from itertools import chain
from tqdm import tqdm

In [2]:
# Name of the organism's JSON file.
# It is a shortened form of the bacterium name: the first letter of the genus and up to nine letters of the species.
json_file_name = "S_ruber.json"
# E-mail address for Entrez. Put your own e-mail address in the line below.
Entrez.email = "vibrio.choleri.1854@gmail.com"

In [8]:
json_path = f"../../data/jsons/{json_file_name}"
# The file handle gets its own name so it is not confused with the parsed data,
# and the file is read explicitly as UTF-8 instead of the platform default.
with open(json_path, "r", encoding="utf-8") as json_handle:
    json_organism = json.load(json_handle)

# Variables for further work with links.
# json_organism[0] is used below as the organism name; json_organism[1] is a
# mapping whose first value is the list of ids looked up in `db_search`.
organism = json_organism[0]
db_search = "assembly"     # database the ids are linked from
db_current = "nucleotide"  # database the ids are linked to
complete_ids = list(json_organism[1].values())[0]

In [9]:
# The last element of complete_ids may itself be a list of ids (presumably
# scaffold-level assemblies -- confirm against the JSON producer). If so, splice
# its items into the flat list in place. An empty complete_ids is left untouched
# instead of raising IndexError.
if complete_ids and isinstance(complete_ids[-1], list):
    scaffs = complete_ids.pop()
    complete_ids.extend(scaffs)

In [10]:
# Code from the source above, with modifications. It selects GenBank (INSDC) nucleotide data only.
def extract_insdc(links):
    """Return the ids linked through 'assembly_nuccore_insdc' in an elink result.

    Parameters
    ----------
    links : list
        Parsed elink result; only the first element is inspected. It must
        provide a 'LinkSetDb' list whose items have 'LinkName' and 'Link' keys.

    Returns
    -------
    list or int
        The 'Id' of every link in the first 'assembly_nuccore_insdc' link set,
        or the sentinel 0 when `links` is empty or has no such link set.
        Callers compare the result against 0, so the sentinel is kept.
    """
    if not links:
        # Nothing came back at all: report "no links" instead of failing on links[0].
        return 0
    for link_set in links[0]['LinkSetDb']:
        if link_set['LinkName'] == 'assembly_nuccore_insdc':
            return [link['Id'] for link in link_set['Link']]
    return 0

In [12]:
# Taking ids for fetching. Collects all non-duplicated links to the nucleotide database from the assembly database.
links = []          # (nucleotide uid, assembly number) pairs, in discovery order
links_checked = []  # uids already collected, used to skip duplicates
n = 0               # number given to the assembly currently being processed
for complete_id in tqdm(complete_ids):
    link_handle = Entrez.elink(dbfrom=db_search, db=db_current, from_uid=complete_id)
    try:
        link_record = Entrez.read(link_handle)
    finally:
        # Always release the connection, even if parsing fails.
        link_handle.close()
    uids = extract_insdc(link_record)
    if uids != 0:
        added_new = False
        for uid in uids:
            if uid not in links_checked:  # Checking for duplicates
                links_checked.append(uid)
                links.append((uid, n))
                added_new = True
        # Advance the assembly number when this assembly contributed at least
        # one new uid. Looking only at the last uid would let the next assembly
        # reuse the same number whenever that last uid was a duplicate.
        if added_new:
            n += 1

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:37<00:00,  1.01it/s]


In [13]:
# Collecting data about assemblies
gb_records = []  # (parsed GenBank record, assembly number) pairs
for uid, assembly_number in tqdm(links):
    gb_handle = Entrez.efetch(db=db_current, rettype="gb", retmode="text", id=uid)
    try:
        gb_record = SeqIO.read(gb_handle, 'genbank')
    finally:
        # Close the handle for every record so connections are not leaked.
        gb_handle.close()
    gb_records.append((gb_record, assembly_number))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [01:40<00:00,  3.87s/it]


At the next stage, we need to create FASTA files for each assembly so that they can be re-annotated.

In [14]:
# Short organism tag: first letter of the first word, an underscore,
# then at most nine letters of the last word of the organism name.
orglist = organism.split()
last_letter = min(len(orglist[-1]), 9)
name = f"{orglist[0][0]}_{orglist[-1][:last_letter]}"

S_ruber


In [16]:
# Lists for identifying the number of every DNA molecule (chromosome and any plasmids) of each assembly:
#   dna_type    - "plasmid" or "chromosome" for each record
#   tuples      - (assembly number, ordinal of the molecule within that assembly)
#   source_list - assembly number of each record processed so far
dna_type, tuples, source_list = [], [], []
# `name` (the short organism tag) is computed in the previous cell and reused here
# instead of being derived a second time.
fasta_dir = f"../../{name}/data/for_prokka_fasta"
# Create the output directory if needed so a fresh checkout does not fail on open().
os.makedirs(fasta_dir, exist_ok=True)
for record, source in tqdm(gb_records):
    # `source` is the number of the assembly this record belongs to.
    dna_type.append("plasmid" if "plasmid" in record.description else "chromosome")
    source_list.append(source)
    number = source_list.count(source)  # Ordinal of this DNA molecule within its assembly
    tuples.append((source, number))
    mask = f"{name}{source}_{number}"
    # One single-record FASTA file per molecule; the header is the mask itself.
    with open(f"{fasta_dir}/{mask}.fasta", "w") as for_prokka_fasta:
        for_prokka_fasta.write(f">{mask}\n")
        for_prokka_fasta.write(str(record.seq))
        for_prokka_fasta.write("\n")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 338.21it/s]


In [17]:
# Saving plasmid_code: file 1 holds the (assembly, molecule) pairs, file 2 the molecule types.
plasmid_code_outputs = (
    (f"../../{name}/data/{name}_plasmid_code1.json", tuples),
    (f"../../{name}/data/{name}_plasmid_code2.json", dna_type),
)
for out_path, payload in plasmid_code_outputs:
    # Serialize before opening, so a failed dump never truncates the target file.
    serialized = json.dumps(payload)
    with open(out_path, "w") as out_file:
        out_file.write(serialized)