In [207]:
import time

import pandas as pd

from Bio import Entrez, SeqIO
from Bio.SeqRecord import SeqRecord

from entrez import select_phyla
from fasta_processing import read_fasta, read_single_fasta, dict_align_to_fasta
from data_processing import choose_best_frameshift, save_subset_df_transcripts, find_codon, download_subset_df_datasets

In [2]:
# Labels applied to the columns of ../all_nxf1.txt. The two trailing
# "to_delete_*" columns are dropped right after loading.
column_names = [
    "tax_id", "org_name", "gene_id", "current_id", "status", "symbol", "aliases", "description",
    "other_designations", "map_location", "chromosome", "genomic_nucleotide_accession.version",
    "start_position_on_the_genomic_accession", "end_position_on_the_genomic_accession", "orientation",
    "exon_count", "to_delete_1", "to_delete_2",
]

# The first line of the file is skipped and replaced by `column_names`;
# the first column (tax_id) becomes the index.
df_cnidaria = (
    pd.read_csv("../all_nxf1.txt", sep="\t", skiprows=1, names=column_names, index_col=0)
    .drop(columns=["to_delete_1", "to_delete_2"])
)
df_cnidaria.head()

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8407,Rana temporaria,120917577,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,11,NC_053499.1,120752259.0,120809664.0,plus,22.0
8364,Xenopus tropicalis,734058,0,live,nxf1,"mex67, tap",nuclear RNA export factor 1,nuclear RNA export factor 1,,4,NC_030680.2,36493131.0,36532401.0,minus,23.0
9598,Pan troglodytes,451267,0,live,NXF1,CK820_G0030982,nuclear RNA export factor 1,nuclear RNA export factor 1,,9,NC_072407.2,65104305.0,65118449.0,minus,23.0
9913,Bos taurus,512136,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,29,NC_037356.1,41099477.0,41110792.0,minus,22.0
9615,Canis lupus familiaris,483780,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,18,NC_051822.1,54936964.0,54947631.0,plus,21.0


In [3]:
# cnidaria = select_phyla(df_cnidaria, "Cnidaria")

In [4]:
# tax_ids of the four species shown in the subset table below, in this order:
# Stylophora pistillata, Actinia tenebrosa, Hydra vulgaris, Nematostella vectensis.
# NOTE(review): presumably a hard-coded copy of what select_phyla(df_cnidaria, "Cnidaria")
# returned — confirm before relying on it.
cnidaria_copy = {'Cnidaria': [50429, 6105, 6087, 45351]}

In [147]:
# df_cnidaria is indexed by tax_id, so .loc with the list of tax_ids selects
# the matching rows in the order the ids are listed.
cnidaria_subset = df_cnidaria.loc[cnidaria_copy['Cnidaria']]

In [6]:
cnidaria_subset

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
50429,Stylophora pistillata,111339613,0,live,LOC111339613,"AWC38_SpisGene17253, NXF1",nuclear RNA export factor 1-like,nuclear RNA export factor 1-like|Nuclear RNA e...,,Un,NW_019218197.1,98177.0,131366.0,plus,21.0
6105,Actinia tenebrosa,116292710,0,live,LOC116292710,,nuclear RNA export factor 1-like,nuclear RNA export factor 1-like,,Un,NW_022259381.1,16092.0,33206.0,plus,24.0
6087,Hydra vulgaris,100214173,0,live,LOC100214173,,nuclear RNA export factor 1,nuclear RNA export factor 1,,12,NC_088975.1,736736.0,739315.0,minus,2.0
45351,Nematostella vectensis,5510737,0,live,LOC5510737,NEMVEDRAFT_v1g244041,nuclear RNA export factor 1,nuclear RNA export factor 1|predicted protein,,7,NC_064040.1,11955766.0,11973360.0,plus,24.0


In [7]:
# download_subset_df_datasets(cnidaria_subset)

In [204]:
def _parse_gene_header(header: str) -> tuple:
    """Split a ``>ACCESSION:[c]START-STOP ...`` FASTA header into its parts.

    Returns:
        ``(accession, seq_start, seq_stop, strand)`` where the coordinates are
        integers in ascending order and ``strand`` is 2 when the span is
        prefixed with ``c`` (complement) and 1 otherwise.

    Raises:
        ValueError: If the header does not contain ``ACCESSION:START-STOP``.
    """
    fields = header.split()
    if not fields or ":" not in fields[0]:
        raise ValueError(f"Cannot parse gene location from FASTA header: {header!r}")

    location = fields[0]
    if location.startswith(">"):
        location = location[1:]
    accession, _, span = location.partition(":")

    # Only a leading "c" marks the complement strand.
    on_minus_strand = span.startswith("c")
    if on_minus_strand:
        span = span[1:]

    first, _, second = span.partition("-")
    try:
        first, second = int(first), int(second)
    except ValueError:
        raise ValueError(f"Cannot parse gene location from FASTA header: {header!r}") from None

    # Complement spans are written high-to-low in the header. efetch documents
    # seq_start/seq_stop as the first/last base, so pass them in ascending
    # order and let `strand` select the orientation.
    return accession, min(first, second), max(first, second), 2 if on_minus_strand else 1


def download_gene_gb(org_names: list, delay: float = 0.333333334) -> None:
    """Download the GenBank record of each organism's gene region.

    For every name in ``org_names`` the first line of
    ``../Datasets/{org_name}/ncbi_dataset/data/gene.fna`` is read to obtain
    the accession, the coordinate range and the strand. That region is fetched
    with ``Entrez.efetch`` (``db="nucleotide"``, ``rettype="gb"``) and written
    to ``gene.gb`` in the same directory.

    Args:
        org_names: Directory names under ``../Datasets``.
        delay: Seconds to sleep after each request. The default of about a
            third of a second keeps the rate at roughly three requests per
            second.

    Raises:
        ValueError: If a ``gene.fna`` header cannot be parsed.
    """
    for org_name in org_names:
        data_dir = f"../Datasets/{org_name}/ncbi_dataset/data"

        # Obtain the accession and range of the gene from the FASTA header
        with open(f"{data_dir}/gene.fna") as infile:
            header = infile.readline().rstrip()
        gene_acc, seq_start, seq_stop, strand = _parse_gene_header(header)

        # Fetch the region in GenBank format; always release the handle
        stream = Entrez.efetch(db="nucleotide", id=gene_acc, idtype="acc",
                               seq_start=seq_start, seq_stop=seq_stop, strand=strand,
                               rettype="gb", retmode="text")
        try:
            genbank_text = stream.read()
        finally:
            stream.close()

        # Write only after the download succeeded, so a failed request does
        # not leave an empty gene.gb behind.
        with open(f"{data_dir}/gene.gb", "w") as outfile:
            outfile.write(genbank_text)

        print(f"{org_name}.gb has been downloaded")

        time.sleep(delay)

In [205]:
download_gene_gb(["stylophora_pistillata", "nematostella_vectensis", "actinia_tenebrosa", "hydra_vulgaris"])

stylophora_pistillata.gb has been downloaded
nematostella_vectensis.gb has been downloaded
actinia_tenebrosa.gb has been downloaded
hydra_vulgaris.gb has been downloaded


In [291]:
def parse_exon_ranges(org_names: list, feature_type: str = "mRNA") -> dict:
    """Collect the location parts of a feature from each organism's ``gene.gb``.

    For every name in ``org_names`` the file
    ``../Datasets/{org_name}/ncbi_dataset/data/gene.gb`` is parsed as GenBank
    and the parts of the ``feature_type`` feature's location are recorded.

    If a file holds several features of the requested type, the last one
    encountered is used as a whole. Parts of different features are never
    mixed in one result.

    Args:
        org_names: Directory names under ``../Datasets``.
        feature_type: ``"mRNA"`` (default) or ``"CDS"``.

    Returns:
        ``{org_name: {part_index: [start, end]}}`` where ``start`` and ``end``
        are ``int(part.start)`` and ``int(part.end)`` as reported by the
        parser. The mapping is empty when no matching feature is found.

    Raises:
        ValueError: If ``feature_type`` is not one of the supported types.
    """
    feature_types = ("mRNA", "CDS")
    if feature_type not in feature_types:
        raise ValueError(f"Feature type {feature_type} is not supported")

    exons_dict = {}

    for org_name in org_names:
        exons_range = {}
        records = SeqIO.parse(f"../Datasets/{org_name}/ncbi_dataset/data/gene.gb", "genbank")
        for record in records:
            for feature in record.features:
                if feature.type != feature_type:
                    continue
                # Rebuild the mapping for each matching feature instead of
                # updating it index by index. Otherwise leftover parts of a
                # longer, earlier feature would survive in the result.
                exons_range = {
                    part_i: [int(part.start), int(part.end)]
                    for part_i, part in enumerate(feature.location.parts)
                }
        exons_dict[org_name] = exons_range

    return exons_dict

In [297]:
org_names = ["stylophora_pistillata", "nematostella_vectensis", "actinia_tenebrosa", "hydra_vulgaris"]

In [298]:
parse_exon_ranges(org_names, feature_type="mRNA")

{'stylophora_pistillata': {0: [0, 243],
  1: [882, 1092],
  2: [18604, 18771],
  3: [19302, 19441],
  4: [20748, 20852],
  5: [21589, 21659],
  6: [22484, 22573],
  7: [23168, 23276],
  8: [23896, 24012],
  9: [24495, 24546],
  10: [26302, 26363],
  11: [26884, 26943],
  12: [27066, 27128],
  13: [28278, 28394],
  14: [28638, 28681],
  15: [29443, 29507],
  16: [30474, 30532],
  17: [30851, 30938],
  18: [31526, 31588],
  19: [31927, 31988],
  20: [32669, 33190]},
 'nematostella_vectensis': {0: [0, 218],
  1: [1286, 1536],
  2: [2152, 2300],
  3: [2733, 2817],
  4: [3225, 3330],
  5: [4828, 4939],
  6: [5047, 5117],
  7: [5635, 5724],
  8: [6199, 6307],
  9: [7012, 7128],
  10: [8119, 8156],
  11: [8298, 8367],
  12: [8531, 8587],
  13: [10051, 10112],
  14: [10520, 10576],
  15: [11031, 11093],
  16: [11664, 11780],
  17: [12355, 12398],
  18: [12862, 12926],
  19: [13499, 13557],
  20: [13681, 13819],
  21: [14909, 14974],
  22: [16026, 16087],
  23: [17169, 17595]},
 'actinia_tenebr

In [231]:
records = SeqIO.parse(f"../Datasets/{org_name}/ncbi_dataset/data/gene.gb", "genbank")

for record in records:


TypeError: list indices must be integers or slices, not str

In [166]:
def download_exon_ranges_auto(org_names: list) -> dict:
    """Read the spans listed under PRIMARY in each organism's ``rna.gb``.

    For every name in ``org_names`` the file
    ``../Datasets/{org_name}/ncbi_dataset/data/rna.gb`` is scanned. Lines that
    follow a line starting with ``PRIMARY`` are collected until a line
    starting with ``FEATURES`` is reached. The first whitespace-separated
    token of each collected line is split on ``-`` and its first two pieces
    are converted to ``int``.

    Returns:
        A dict mapping each organism name to a list of ``[start, end]`` pairs
        in file order. The list is empty when the file has no PRIMARY line.
    """
    ranges_by_org = {}

    for organism in org_names:
        rows = []
        in_primary = False

        with open(f"../Datasets/{organism}/ncbi_dataset/data/rna.gb") as handle:
            for raw_line in handle:
                text = raw_line.rstrip()
                if text.startswith("PRIMARY"):
                    # The PRIMARY line only switches collection on; it is
                    # never collected itself.
                    in_primary = True
                elif in_primary:
                    if text.startswith("FEATURES"):
                        break
                    rows.append(text.split())

        spans = []
        for tokens in rows:
            pieces = tokens[0].split("-")
            lower, upper = pieces[0], pieces[1]
            spans.append([int(lower), int(upper)])
        ranges_by_org[organism] = spans

    return ranges_by_org

In [None]:
# NOTE(review): unexecuted scratch cell (it has no execution count). It reads
# `org_name`, `record` and `exons`, none of which is assigned in this cell
# before use, so it only works if those names already exist in the kernel.
with open(f"../Datasets/{org_name}/ncbi_dataset/data/rna.gb") as infile:
    for line_i, line in enumerate(infile.readlines()):
        line = line.rstrip()
        # Show each line together with its 0-based index, pausing 2 s per line.
        print(f"line {line_i}: {line}")
        time.sleep(2)
        # NOTE(review): the prompt and the re-scan below are indented inside
        # the per-line loop, so they run once for every line of the file.
        # Presumably they were meant to run once after all lines are printed — confirm.
        print("Enter the range of exons via space:")
        # Expects two space-separated integers; they are compared against line indices below.
        custom_range = [int(i) for i in input().split()]

        # Re-opens the same file; this rebinds `infile`, `line_i` and `line`
        # that the enclosing loop is also using.
        with open(f"../Datasets/{org_name}/ncbi_dataset/data/rna.gb") as infile:
            for line_i, line in enumerate(infile.readlines()):
                line = line.rstrip()
                # Collection starts on the line after index custom_range[0] ...
                if line_i == custom_range[0]:
                    record = True
                    continue
                if record:
                    # ... and stops at index custom_range[1], which is not collected.
                    if line_i == custom_range[1]:
                        break
                    else:
                        # Store the line with runs of whitespace collapsed to single spaces.
                        exons.append(" ".join(line.split()))

In [None]:
# NOTE(review): this rebinds `org_names`, which an earlier cell set to four
# species; after this cell runs only these two remain.
org_names = ["stylophora_pistillata", "nematostella_vectensis"]

In [168]:
download_exon_ranges_auto(org_names)

{'stylophora_pistillata': [[1, 243],
  [244, 453],
  [454, 620],
  [621, 759],
  [760, 976],
  [977, 1080],
  [1081, 1150],
  [1151, 1239],
  [1240, 1347],
  [1348, 1463],
  [1464, 1514],
  [1515, 1575],
  [1576, 1634],
  [1635, 1696],
  [1697, 1812],
  [1813, 1855],
  [1856, 1919],
  [1920, 1977],
  [1978, 2064],
  [2065, 2126],
  [2127, 2187],
  [2188, 2708]]}

In [None]:
# Why did I get 22 exons when the table says 21?
# Perhaps the reason is that GARY01005553.1 corresponds to the UTR.

In [None]:
org_name = "stylophora_pistillata"
stylophora_rna = read_single_fasta(f"../Datasets/{org_name}/ncbi_dataset/data/rna.fasta")


In [120]:
# Adds an "exons" column with one value per row; the list length must equal
# the number of rows (four here).
# NOTE(review): 1..4 look like placeholder values for experimenting with
# column assignment — confirm before using the column.
cnidaria_subset["exons"] = [1, 2, 3, 4]

In [126]:
# Set "exons" for the row(s) whose gene_id is 111339613. A boolean mask
# addresses exactly the matching rows; going through `.index` would instead
# select every row that shares an index label with a match.
cnidaria_subset.loc[cnidaria_subset["gene_id"] == 111339613, "exons"] = 1337

In [127]:
cnidaria_subset

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count,exons
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
50429,Stylophora pistillata,111339613,0,live,LOC111339613,"AWC38_SpisGene17253, NXF1",nuclear RNA export factor 1-like,nuclear RNA export factor 1-like|Nuclear RNA e...,,Un,NW_019218197.1,98177.0,131366.0,plus,21.0,1337
6105,Actinia tenebrosa,116292710,0,live,LOC116292710,,nuclear RNA export factor 1-like,nuclear RNA export factor 1-like,,Un,NW_022259381.1,16092.0,33206.0,plus,24.0,2
6087,Hydra vulgaris,100214173,0,live,LOC100214173,,nuclear RNA export factor 1,nuclear RNA export factor 1,,12,NC_088975.1,736736.0,739315.0,minus,2.0,3
45351,Nematostella vectensis,5510737,0,live,LOC5510737,NEMVEDRAFT_v1g244041,nuclear RNA export factor 1,nuclear RNA export factor 1|predicted protein,,7,NC_064040.1,11955766.0,11973360.0,plus,24.0,4
