In [1]:
import os

import pandas as pd

from os.path import commonprefix

from fasta_processing import read_single_fasta, dict_align_to_fasta
from datasets import select_all_phylas, download_all_files_ncbi
from taxonomy_processing import create_taxonomy
from data_processing import analyze_exons, create_cassette, concat_cassette

In [2]:
# Column labels applied to the gene table on load. The two trailing
# "to_delete_*" labels are placeholders for columns that are removed
# immediately after reading.
column_names = [
    "tax_id",
    "org_name",
    "gene_id",
    "current_id",
    "status",
    "symbol",
    "aliases",
    "description",
    "other_designations",
    "map_location",
    "chromosome",
    "genomic_nucleotide_accession.version",
    "start_position_on_the_genomic_accession",
    "end_position_on_the_genomic_accession",
    "orientation",
    "exon_count",
    "to_delete_1",
    "to_delete_2",
]

# Read the tab-separated table (first row skipped, labels taken from
# column_names, first column "tax_id" used as the index) and drop the two
# placeholder columns in the same expression.
df = pd.read_csv(
    "../all_nxf1_2.txt",
    sep="\t",
    skiprows=1,
    names=column_names,
    index_col=0,
).drop(columns=["to_delete_1", "to_delete_2"])

# Preview the first rows.
df.head()

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9606,Homo sapiens,10482,0,live,NXF1,"MEX67, TAP",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,11q12.3,11,NC_000011.10,62792130.0,62805440.0,minus,22.0
10090,Mus musculus,53319,0,live,Nxf1,"Mex67, Mvb1, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|Mex 67 homolog|mRN...,19 5.5 cM,19,NC_000085.7,8734467.0,8748274.0,plus,20.0
10116,Rattus norvegicus,59087,0,live,Nxf1,"Mex67h, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,1q43,1,NC_086019.1,215084563.0,215097756.0,plus,21.0
8407,Rana temporaria,120917577,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,11,NC_053499.1,120752259.0,120809664.0,plus,22.0
8364,Xenopus tropicalis,734058,0,live,nxf1,"mex67, tap",nuclear RNA export factor 1,nuclear RNA export factor 1,,4,NC_030680.2,36493131.0,36532401.0,minus,23.0


In [3]:
# Two-column, tab-separated table: integer taxid (used as the index) and the
# lineage string stored under "taxonomy".
df_taxonomy = pd.read_csv(
    "../all_phylas_taxonomy.tsv",
    sep="\t",
    names=["taxid", "taxonomy"],
    index_col=0,
    dtype={"taxid": int, "taxonomy": str},
)

In [4]:
# Rows whose lineage string mentions "Protostomia".
is_protostome = df_taxonomy["taxonomy"].str.contains("Protostomia")
df_protostomia = df_taxonomy.loc[is_protostome]

# Of those, keep only the rows whose lineage does NOT mention "Arthropoda".
is_arthropod = df_protostomia["taxonomy"].str.contains("Arthropoda")
df_prot = df_protostomia.loc[~is_arthropod]

In [5]:
# Split the non-arthropod protostomes by the clade name found in the lineage string.
in_ecdysozoa = df_prot["taxonomy"].str.contains("Ecdysozoa")
df_ecdysozoa = df_prot.loc[in_ecdysozoa]

in_spiralia = df_prot["taxonomy"].str.contains("Spiralia")
df_spiralia = df_prot.loc[in_spiralia]

In [6]:
# Map each clade name to the list of taxids (the frame index) selected for it.
prot_taxids = {
    clade: clade_frame.index.tolist()
    for clade, clade_frame in (
        ("Ecdysozoa", df_ecdysozoa),
        ("Spiralia", df_spiralia),
    )
}
prot_taxids

{'Ecdysozoa': [6239, 6279, 6334, 37621],
 'Spiralia': [6185, 6604, 400727, 7574, 6198, 1735272, 6565, 29159, 6500]}

In [7]:
# Disabled step: when uncommented, calls download_all_files_ncbi with the gene
# table `df`, the clade -> taxid mapping `prot_taxids`, and the clade names as
# `phylas`.
# NOTE(review): presumably left commented out because the datasets were already
# fetched into ../Datasets — confirm before re-enabling.
# download_all_files_ncbi(df, prot_taxids, phylas=list(prot_taxids.keys()))

In [8]:
# Display the filtered taxonomy table (lineage contains "Protostomia" but not
# "Arthropoda") for a visual check.
df_prot

Unnamed: 0_level_0,taxonomy
taxid,Unnamed: 1_level_1
6239,cellular organisms; Eukaryota; Opisthokonta; M...
6185,cellular organisms; Eukaryota; Opisthokonta; M...
6604,cellular organisms; Eukaryota; Opisthokonta; M...
400727,cellular organisms; Eukaryota; Opisthokonta; M...
6279,cellular organisms; Eukaryota; Opisthokonta; M...
6334,cellular organisms; Eukaryota; Opisthokonta; M...
7574,cellular organisms; Eukaryota; Opisthokonta; M...
6198,cellular organisms; Eukaryota; Opisthokonta; M...
1735272,cellular organisms; Eukaryota; Opisthokonta; M...
6565,cellular organisms; Eukaryota; Opisthokonta; M...


In [9]:
# Disabled step: when uncommented, writes df_prot to ../protostomia_taxonomy.tsv
# as tab-separated text, keeping the taxid index and omitting the header row.
# df_prot.to_csv("../protostomia_taxonomy.tsv", sep="\t", index=True, header=False)

In [10]:
# Disabled step: when uncommented, passes the ../protostomia_taxonomy.tsv path
# to create_taxonomy.
# NOTE(review): that file is only produced by the (also disabled) to_csv export
# of df_prot, so re-enable both together — confirm.
# create_taxonomy("../protostomia_taxonomy.tsv")

---

In [11]:
# Show the entries present under the Ecdysozoa dataset directory; the names
# listed here are the per-organism folder names used when building exon paths.
# os.listdir returns entries in arbitrary order.
os.listdir("../Datasets/Ecdysozoa")

['trichinella_spiralis_3',
 'priapulus_caudatus_4',
 'brugia_malayi_2',
 'caenorhabditis_elegans_0']

In [12]:
# Path pieces combined as {prefix}/{phylum}/<organism folder>/{postfix}/exons.fa
# when locating each organism's exon file.
prefix, phylum, postfix = (
    "../Datasets",
    "Ecdysozoa",
    "ncbi_dataset/data",
)

In [13]:
def _load_cassette(organism, exons_i):
    """Run analyze_exons on one organism's exons.fa, then build its cassette.

    organism -- dataset folder name under {prefix}/{phylum}
    exons_i  -- list forwarded unchanged as create_cassette's ``exons_i`` argument

    Returns a pair: (analyze_exons result, create_cassette result).
    """
    exons = analyze_exons(f"{prefix}/{phylum}/{organism}/{postfix}/exons.fa")
    cassette = create_cassette(phylum, organism, exons, exons_i=exons_i)
    return exons, cassette


# One call per organism, in the same order as before; the exon index pairs are
# chosen per organism by hand.
caenorhabditis_elegans_0, caenorhabditis_elegans_0_cassette = _load_cassette(
    "caenorhabditis_elegans_0", [4, 5]
)

brugia_malayi_2, brugia_malayi_2_cassette = _load_cassette(
    "brugia_malayi_2", [8, 9]
)

trichinella_spiralis_3, trichinella_spiralis_3_cassette = _load_cassette(
    "trichinella_spiralis_3", [5, 6]
)

# Original author's note for this organism: "116-43" (meaning not stated — TODO confirm).
priapulus_caudatus_4, priapulus_caudatus_4_cassette = _load_cassette(
    "priapulus_caudatus_4", [10, 11]
)

In [14]:
# Species label -> concat_cassette(..., "eie") for each Ecdysozoa cassette.
# NOTE(review): "eie" presumably selects exon + intron + exon — confirm against
# concat_cassette.
alignment_dict = {
    species: concat_cassette(cassette, "eie")
    for species, cassette in (
        ("Caenorhabditis_elegans", caenorhabditis_elegans_0_cassette),
        ("Brugia_malayi", brugia_malayi_2_cassette),
        ("Trichinella_spiralis", trichinella_spiralis_3_cassette),
        ("Priapulus_caudatus", priapulus_caudatus_4_cassette),
    )
}

In [15]:
# Write the current alignment_dict twice: once under the .fa name and once
# under the .aln name.
for cassette_out_path in (
    "../Alignment/Ecdysozoa_cassette.fa",
    "../Alignment/Ecdysozoa_cassette.aln",
):
    dict_align_to_fasta(alignment_dict, cassette_out_path)

In [16]:
# Rebuild alignment_dict with the "ee" layout for the same four cassettes.
# NOTE(review): "ee" presumably selects the two exons without the intron —
# confirm against concat_cassette.
alignment_dict = {
    species: concat_cassette(cassette, "ee")
    for species, cassette in (
        ("Caenorhabditis_elegans", caenorhabditis_elegans_0_cassette),
        ("Brugia_malayi", brugia_malayi_2_cassette),
        ("Trichinella_spiralis", trichinella_spiralis_3_cassette),
        ("Priapulus_caudatus", priapulus_caudatus_4_cassette),
    )
}

In [17]:
# Write the current alignment_dict twice: once under the .fa name and once
# under the .aln name.
for exons_out_path in (
    "../Alignment/Ecdysozoa_2_exons.fa",
    "../Alignment/Ecdysozoa_2_exons.aln",
):
    dict_align_to_fasta(alignment_dict, exons_out_path)

In [18]:
# Rebuild alignment_dict with the "i" layout for the same four cassettes.
# NOTE(review): "i" presumably selects the intron only — confirm against
# concat_cassette.
alignment_dict = {
    species: concat_cassette(cassette, "i")
    for species, cassette in (
        ("Caenorhabditis_elegans", caenorhabditis_elegans_0_cassette),
        ("Brugia_malayi", brugia_malayi_2_cassette),
        ("Trichinella_spiralis", trichinella_spiralis_3_cassette),
        ("Priapulus_caudatus", priapulus_caudatus_4_cassette),
    )
}

In [20]:
# Write the current alignment_dict twice: once under the .fa name and once
# under the .aln name.
for introns_out_path in (
    "../Alignment/Ecdysozoa_introns.fa",
    "../Alignment/Ecdysozoa_introns.aln",
):
    dict_align_to_fasta(alignment_dict, introns_out_path)

In [None]:
# Reminder only: this cell holds the command line as a plain string literal, so
# running the cell does not launch the script. The command takes the cassette
# FASTA as --input and the introns FASTA as --paint.
# NOTE(review): the relative paths suggest it is meant to be run from the
# repository root in a shell — confirm.
"python Scripts/build_rna_structures.py --input Alignment/Ecdysozoa_cassette.fa --paint Alignment/Ecdysozoa_introns.fa"