In [1]:
import os

import pandas as pd

from os.path import commonprefix
from pathlib import Path
from IPython.display import HTML

from highlight import highlight_intron_in_seq
from fasta_processing import read_single_fasta, dict_align_to_fasta, read_fasta
from datasets import select_all_phylas, download_all_files_ncbi, check_transcript_count
from taxonomy_processing import create_taxonomy
from data_processing import analyze_exons, create_cassette, concat_cassette, dict_align_create, find_codon, \
    dict_align_info_analyze, dict_align_update_keys
from build_rna_structures import run_rnafold_with_highlight

In [2]:
# Column names applied to the tab-separated gene table. The two `to_delete_*`
# names are placeholders for trailing columns that are dropped right after loading.
column_names = ["tax_id", "org_name", "gene_id", "current_id", "status", "symbol", "aliases", "description",
                "other_designations", "map_location", "chromosome", "genomic_nucleotide_accession.version",
                "start_position_on_the_genomic_accession", "end_position_on_the_genomic_accession", "orientation",
                "exon_count", "to_delete_1", "to_delete_2"]

# Load and trim in a single chained expression instead of mutating the frame
# in place: the first line of the file is skipped, `column_names` supplies the
# header and the first column (`tax_id`) becomes the index.
df = (
    pd.read_csv("../all_nxf1_2.txt", sep="\t", skiprows=1, names=column_names, index_col=0)
    .drop(columns=["to_delete_1", "to_delete_2"])
)
df.head()

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9606,Homo sapiens,10482,0,live,NXF1,"MEX67, TAP",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,11q12.3,11,NC_000011.10,62792130.0,62805440.0,minus,22.0
10090,Mus musculus,53319,0,live,Nxf1,"Mex67, Mvb1, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|Mex 67 homolog|mRN...,19 5.5 cM,19,NC_000085.7,8734467.0,8748274.0,plus,20.0
10116,Rattus norvegicus,59087,0,live,Nxf1,"Mex67h, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,1q43,1,NC_086019.1,215084563.0,215097756.0,plus,21.0
8407,Rana temporaria,120917577,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,11,NC_053499.1,120752259.0,120809664.0,plus,22.0
8364,Xenopus tropicalis,734058,0,live,nxf1,"mex67, tap",nuclear RNA export factor 1,nuclear RNA export factor 1,,4,NC_030680.2,36493131.0,36532401.0,minus,23.0


In [3]:
# Taxonomy table: one lineage string per taxid, with taxid (first column) as the index.
df_taxonomy = pd.read_csv(
    "../all_phylas_taxonomy.tsv",
    sep="\t",
    names=["taxid", "taxonomy"],
    dtype={"taxid": int, "taxonomy": str},
    index_col=0,
)

In [4]:
# Keep the lineages that mention Protostomia, then discard those that mention Arthropoda.
df_protostomia = df_taxonomy.loc[lambda frame: frame["taxonomy"].str.contains("Protostomia")]
df_prot = df_protostomia.loc[lambda frame: ~frame["taxonomy"].str.contains("Arthropoda")]

In [5]:
# Split the non-arthropod protostomes into the two clades named in their lineage string.
df_ecdysozoa = df_prot.loc[lambda frame: frame["taxonomy"].str.contains("Ecdysozoa")]
df_spiralia = df_prot.loc[lambda frame: frame["taxonomy"].str.contains("Spiralia")]

In [6]:
# Map each clade name to the list of taxids (the index values) of its frame.
prot_taxids = {
    clade: clade_frame.index.tolist()
    for clade, clade_frame in (
        ("Ecdysozoa", df_ecdysozoa),
        ("Spiralia", df_spiralia),
    )
}
prot_taxids

{'Ecdysozoa': [6239, 6279, 6334, 37621],
 'Spiralia': [6185, 6604, 400727, 7574, 6198, 1735272, 6565, 29159, 6500]}

In [7]:
# download_all_files_ncbi(df, prot_taxids, phylas=list(prot_taxids.keys()))

In [8]:
# After downloading, the cds and other files may contain several sequences, because multiple transcripts are possible.
# We therefore simply delete the second and any further transcript variants: the exons are loaded from the GenBank file, and the order appears to match in both places.

In [9]:
# After the manual edits to the downloaded files, this function should print nothing.
check_transcript_count(["Ecdysozoa", "Spiralia"])

In [10]:
# Display the Protostomia taxonomy rows that remain after excluding Arthropoda.
df_prot

Unnamed: 0_level_0,taxonomy
taxid,Unnamed: 1_level_1
6239,cellular organisms; Eukaryota; Opisthokonta; M...
6185,cellular organisms; Eukaryota; Opisthokonta; M...
6604,cellular organisms; Eukaryota; Opisthokonta; M...
400727,cellular organisms; Eukaryota; Opisthokonta; M...
6279,cellular organisms; Eukaryota; Opisthokonta; M...
6334,cellular organisms; Eukaryota; Opisthokonta; M...
7574,cellular organisms; Eukaryota; Opisthokonta; M...
6198,cellular organisms; Eukaryota; Opisthokonta; M...
1735272,cellular organisms; Eukaryota; Opisthokonta; M...
6565,cellular organisms; Eukaryota; Opisthokonta; M...


In [11]:
# df_prot.to_csv("../protostomia_taxonomy.tsv", sep="\t", index=True, header=False)

In [12]:
# create_taxonomy("../protostomia_taxonomy.tsv")

---

In [13]:
# Trying to build secondary structures for the cassettes in Ecdysozoa

In [14]:
# List the entries of the Ecdysozoa dataset directory.
os.listdir("../Datasets/Ecdysozoa")

['trichinella_spiralis_3',
 'priapulus_caudatus_4',
 'brugia_malayi_2',
 'caenorhabditis_elegans_0']

In [15]:
# Removed caenorhabditis_elegans_1 because it holds the nxf-2 gene.
# The remaining Ecdysozoa datasets (the cell only displays this list):

[
    'caenorhabditis_elegans_0',
    'brugia_malayi_2',
    'trichinella_spiralis_3',
    'priapulus_caudatus_4',
]

['caenorhabditis_elegans_0',
 'brugia_malayi_2',
 'trichinella_spiralis_3',
 'priapulus_caudatus_4']

In [16]:
# Path pieces, combined as <prefix>/<phylum>/<organism>/<postfix>/<file name>.
phylum = "Ecdysozoa"
nof = "exons.fa"
prefix, postfix = "../Datasets", "ncbi_dataset/data"

In [17]:
# Run analyze_exons on the exon file of every Ecdysozoa dataset. The file name
# is taken from `nof` (set to "exons.fa" in the configuration cell) instead of
# being hard-coded, matching how the Spiralia cell builds its paths.
# The trailing notes are the author's records, kept verbatim; the numbers after
# "|" match the `exons_i` values used for create_cassette, and the leading
# numbers are presumably exon lengths (TODO confirm).

# Nematoda
caenorhabditis_elegans_0 = analyze_exons(
    f"{prefix}/{phylum}/caenorhabditis_elegans_0/{postfix}/{nof}")  # 110 37 | 4, 5
brugia_malayi_2 = analyze_exons(f"{prefix}/{phylum}/brugia_malayi_2/{postfix}/{nof}")  # 110 37 | 8, 9
trichinella_spiralis_3 = analyze_exons(f"{prefix}/{phylum}/trichinella_spiralis_3/{postfix}/{nof}")  # 110 37 | 5, 6

# Scalidophora
priapulus_caudatus_4 = analyze_exons(
    f"{prefix}/{phylum}/priapulus_caudatus_4/{postfix}/{nof}")  # 110 37 | 10, 11 || 116 43 | 16, 17

In [18]:
# Call create_cassette for every Ecdysozoa organism, passing the analyze_exons
# result and the pair of exon indices (`exons_i`) chosen for that organism.

# Nematoda
caenorhabditis_elegans_0_cassette = create_cassette(
    phylum, "caenorhabditis_elegans_0", caenorhabditis_elegans_0, exons_i=[4, 5],
)
brugia_malayi_2_cassette = create_cassette(
    phylum, "brugia_malayi_2", brugia_malayi_2, exons_i=[8, 9],
)
trichinella_spiralis_3_cassette = create_cassette(
    phylum, "trichinella_spiralis_3", trichinella_spiralis_3, exons_i=[5, 6],
)

# Scalidophora
priapulus_caudatus_4_cassette = create_cassette(
    phylum, "priapulus_caudatus_4", priapulus_caudatus_4, exons_i=[10, 11],
)

In [19]:
# One "eie" concatenation per species (presumably exon-intron-exon — confirm
# against concat_cassette), written out under both the .fa and the .aln name.
alignment_dict = {
    species: concat_cassette(cassette, "eie")
    for species, cassette in (
        ("Caenorhabditis_elegans", caenorhabditis_elegans_0_cassette),
        ("Brugia_malayi", brugia_malayi_2_cassette),
        ("Trichinella_spiralis", trichinella_spiralis_3_cassette),
        ("Priapulus_caudatus", priapulus_caudatus_4_cassette),
    )
}

dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_cassette.fa")
dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_cassette.aln")

In [20]:
# One "ee" concatenation per species (presumably the two exons without the
# intron — confirm against concat_cassette), written as .fa and .aln.
alignment_dict = {
    species: concat_cassette(cassette, "ee")
    for species, cassette in (
        ("Caenorhabditis_elegans", caenorhabditis_elegans_0_cassette),
        ("Brugia_malayi", brugia_malayi_2_cassette),
        ("Trichinella_spiralis", trichinella_spiralis_3_cassette),
        ("Priapulus_caudatus", priapulus_caudatus_4_cassette),
    )
}

dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_2_exons.fa")
dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_2_exons.aln")

In [21]:
# One "i" concatenation per species (presumably the intron alone — confirm
# against concat_cassette), written as .fa and .aln.
alignment_dict = {
    species: concat_cassette(cassette, "i")
    for species, cassette in (
        ("Caenorhabditis_elegans", caenorhabditis_elegans_0_cassette),
        ("Brugia_malayi", brugia_malayi_2_cassette),
        ("Trichinella_spiralis", trichinella_spiralis_3_cassette),
        ("Priapulus_caudatus", priapulus_caudatus_4_cassette),
    )
}

dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_introns.fa")
dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_introns.aln")

In [22]:
# Command line for Scripts/build_rna_structures.py, kept as a plain string:
# the cell only echoes it and does not run anything.
"python Scripts/build_rna_structures.py --input Alignment/Ecdysozoa/Ecdysozoa_cassette.fa --paint Alignment/Ecdysozoa/Ecdysozoa_introns.fa"

'python Scripts/build_rna_structures.py --input Alignment/Ecdysozoa/Ecdysozoa_cassette.fa --paint Alignment/Ecdysozoa/Ecdysozoa_introns.fa'

---

In [23]:
# Trying to see what happens if the structure is built for all exons plus the cassette intron.
# It turns out that the structures formed in the cassette by the intron do not differ from those formed by the exons.

In [24]:
# List the entries of the Ecdysozoa dataset directory.
os.listdir("../Datasets/Ecdysozoa")

['trichinella_spiralis_3',
 'priapulus_caudatus_4',
 'brugia_malayi_2',
 'caenorhabditis_elegans_0']

In [25]:
# Path pieces, combined as <prefix>/<phylum>/<organism>/<postfix>/<nof>.
phylum = "Ecdysozoa"
nof = "cds_cassette.fa"
prefix, postfix = "../Datasets", "ncbi_dataset/data"

In [26]:
# Read the `nof` FASTA file of every Ecdysozoa dataset with read_single_fasta.
caenorhabditis_elegans_0_cds_cassette = read_single_fasta(
    f"{prefix}/{phylum}/caenorhabditis_elegans_0/{postfix}/{nof}"
)
brugia_malayi_2_cds_cassette = read_single_fasta(
    f"{prefix}/{phylum}/brugia_malayi_2/{postfix}/{nof}"
)
trichinella_spiralis_3_cds_cassette = read_single_fasta(
    f"{prefix}/{phylum}/trichinella_spiralis_3/{postfix}/{nof}"
)
priapulus_caudatus_4_cds_cassette = read_single_fasta(
    f"{prefix}/{phylum}/priapulus_caudatus_4/{postfix}/{nof}"
)

In [27]:
# Collect the values read from the cds_cassette files under species names and
# write them out under both the .fa and the .aln name.
alignment_dict = dict(
    Caenorhabditis_elegans=caenorhabditis_elegans_0_cds_cassette,
    Brugia_malayi=brugia_malayi_2_cds_cassette,
    Trichinella_spiralis=trichinella_spiralis_3_cds_cassette,
    Priapulus_caudatus=priapulus_caudatus_4_cds_cassette,
)
dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_cds_cassette.fa")
dict_align_to_fasta(alignment_dict, "../Alignment/Ecdysozoa/Ecdysozoa_cds_cassette.aln")

---

In [28]:
# Let's try to build cassette secondary structures for Spiralia

In [29]:
# List the entries of the Spiralia dataset directory.
os.listdir("../Datasets/Spiralia")

['gigantopelta_aegis_6',
 'crassostrea_virginica_7',
 'schistosoma_haematobium_1',
 'mya_arenaria_2',
 'opisthorchis_viverrini_5',
 'lingula_anatina_4',
 'pomacea_canaliculata_3',
 'magallana_gigas_8',
 'aplysia_californica_9']

In [30]:
# Removed schistosoma_haematobium_0 because it has 4 exons and comes from an earlier assembly.
# The remaining Spiralia datasets (the cell only displays this list):

[
    'schistosoma_haematobium_1',
    'opisthorchis_viverrini_5',
    'mya_arenaria_2',
    'crassostrea_virginica_7',
    'magallana_gigas_8',
    'pomacea_canaliculata_3',
    'gigantopelta_aegis_6',
    'aplysia_californica_9',
    'lingula_anatina_4',
]

['schistosoma_haematobium_1',
 'opisthorchis_viverrini_5',
 'mya_arenaria_2',
 'crassostrea_virginica_7',
 'magallana_gigas_8',
 'pomacea_canaliculata_3',
 'gigantopelta_aegis_6',
 'aplysia_californica_9',
 'lingula_anatina_4']

In [31]:
# Path pieces, combined as <prefix>/<phylum>/<organism>/<postfix>/<nof>.
phylum = "Spiralia"
nof = "exons.fa"
prefix, postfix = "../Datasets", "ncbi_dataset/data"

In [32]:
# Run analyze_exons on the `nof` file of every Spiralia dataset. The note above
# each call is the author's record, kept verbatim; the numbers after "|" match
# the `exons_i` values used for create_cassette, and the leading numbers are
# presumably exon lengths (TODO confirm).

# --- Platyhelminthes ---
# 239 37 | 4, 5
schistosoma_haematobium_1 = analyze_exons(f"{prefix}/{phylum}/schistosoma_haematobium_1/{postfix}/{nof}")
# 1 exon
opisthorchis_viverrini_5 = analyze_exons(f"{prefix}/{phylum}/opisthorchis_viverrini_5/{postfix}/{nof}")

# --- Mollusca / Bivalvia ---
# 110 37 | 10, 11 || 113 43 | 15, 16
mya_arenaria_2 = analyze_exons(f"{prefix}/{phylum}/mya_arenaria_2/{postfix}/{nof}")
# 110 37 | 11, 12 || 122 43 | 17, 18
crassostrea_virginica_7 = analyze_exons(f"{prefix}/{phylum}/crassostrea_virginica_7/{postfix}/{nof}")
# 110 37 | 11, 12 || 122 43 | 17, 18
magallana_gigas_8 = analyze_exons(f"{prefix}/{phylum}/magallana_gigas_8/{postfix}/{nof}")

# --- Mollusca / Gastropoda ---
# 2 exons
pomacea_canaliculata_3 = analyze_exons(f"{prefix}/{phylum}/pomacea_canaliculata_3/{postfix}/{nof}")
# 110 37 | 11, 12 || 116 43 | 17, 18
gigantopelta_aegis_6 = analyze_exons(f"{prefix}/{phylum}/gigantopelta_aegis_6/{postfix}/{nof}")
# 221 37 | 5, 6 || 116 43 | 11, 12
aplysia_californica_9 = analyze_exons(f"{prefix}/{phylum}/aplysia_californica_9/{postfix}/{nof}")
# NOTE(review): Lingula anatina is a brachiopod rather than a gastropod — confirm the grouping.
# 1 exon
lingula_anatina_4 = analyze_exons(f"{prefix}/{phylum}/lingula_anatina_4/{postfix}/{nof}")

In [33]:
# Call create_cassette for the Spiralia organisms, passing the analyze_exons
# result and the chosen pair of exon indices (`exons_i`). No cassette is built
# here for opisthorchis_viverrini_5, pomacea_canaliculata_3 or lingula_anatina_4.

# Platyhelminthes
schistosoma_haematobium_1_cassette = create_cassette(
    phylum, "schistosoma_haematobium_1", schistosoma_haematobium_1, exons_i=[4, 5],
)

# Mollusca / Bivalvia
mya_arenaria_2_cassette = create_cassette(
    phylum, "mya_arenaria_2", mya_arenaria_2, exons_i=[10, 11],
)
crassostrea_virginica_7_cassette = create_cassette(
    phylum, "crassostrea_virginica_7", crassostrea_virginica_7, exons_i=[11, 12],
)
magallana_gigas_8_cassette = create_cassette(
    phylum, "magallana_gigas_8", magallana_gigas_8, exons_i=[11, 12],
)

# Mollusca / Gastropoda
gigantopelta_aegis_6_cassette = create_cassette(
    phylum, "gigantopelta_aegis_6", gigantopelta_aegis_6, exons_i=[11, 12],
)
aplysia_californica_9_cassette = create_cassette(
    phylum, "aplysia_californica_9", aplysia_californica_9, exons_i=[5, 6],
)

In [34]:
# One "eie" concatenation per species (presumably exon-intron-exon — confirm
# against concat_cassette), written out under both the .fa and the .aln name.
alignment_dict = {
    species: concat_cassette(cassette, "eie")
    for species, cassette in (
        ("Schistosoma_haematobium", schistosoma_haematobium_1_cassette),
        ("Mya_arenaria", mya_arenaria_2_cassette),
        ("Crassostrea_virginica", crassostrea_virginica_7_cassette),
        ("Magallana_gigas", magallana_gigas_8_cassette),
        ("Gigantopelta_aegis", gigantopelta_aegis_6_cassette),
        ("Aplysia_californica", aplysia_californica_9_cassette),
    )
}

dict_align_to_fasta(alignment_dict, "../Alignment/Spiralia/Spiralia_cassette.fa")
dict_align_to_fasta(alignment_dict, "../Alignment/Spiralia/Spiralia_cassette.aln")

In [35]:
# One "i" concatenation per species (presumably the intron alone — confirm
# against concat_cassette), written as .fa and .aln.
alignment_dict = {
    species: concat_cassette(cassette, "i")
    for species, cassette in (
        ("Schistosoma_haematobium", schistosoma_haematobium_1_cassette),
        ("Mya_arenaria", mya_arenaria_2_cassette),
        ("Crassostrea_virginica", crassostrea_virginica_7_cassette),
        ("Magallana_gigas", magallana_gigas_8_cassette),
        ("Gigantopelta_aegis", gigantopelta_aegis_6_cassette),
        ("Aplysia_californica", aplysia_californica_9_cassette),
    )
}

dict_align_to_fasta(alignment_dict, "../Alignment/Spiralia/Spiralia_introns.fa")
dict_align_to_fasta(alignment_dict, "../Alignment/Spiralia/Spiralia_introns.aln")

In [36]:
# Command line for Scripts/build_rna_structures.py, kept as a plain string:
# the cell only echoes it and does not run anything.
"python Scripts/build_rna_structures.py --input Alignment/Spiralia/Spiralia_cassette.fa --paint Alignment/Spiralia/Spiralia_introns.fa"

'python Scripts/build_rna_structures.py --input Alignment/Spiralia/Spiralia_cassette.fa --paint Alignment/Spiralia/Spiralia_introns.fa'

---

In [37]:
# Characterization of the cassette introns for Protostomia

In [38]:
# Dataset directory names to analyze, grouped by clade.
dict_align_info = dict(
    Ecdysozoa=[
        "caenorhabditis_elegans_0",
        "brugia_malayi_2",
        "trichinella_spiralis_3",
        "priapulus_caudatus_4",
    ],
    Spiralia=[
        "schistosoma_haematobium_1",
        "mya_arenaria_2",
        "crassostrea_virginica_7",
        "magallana_gigas_8",
        "gigantopelta_aegis_6",
        "aplysia_californica_9",
    ],
)

In [39]:
# Every row here should show True in the equal_to_cds column, BUT if a file contains 2 transcripts it will be False.
# (Fixed above.)

protostomia_df_cds, protostomia_dict_align_cds = dict_align_info_analyze(dict_align_info, "cds")
protostomia_df_cds

Unnamed: 0,phylum,org_name,stop_codon_pos,equal_to_cds,cassette_start,length_to_stop_codon,intron_length
0,Ecdysozoa,caenorhabditis_elegans_0,1827,True,-1,1828,106
1,Ecdysozoa,brugia_malayi_2,1950,True,-1,1951,243
2,Ecdysozoa,trichinella_spiralis_3,1479,True,-1,1480,185
3,Ecdysozoa,priapulus_caudatus_4,1995,True,-1,1996,2114
4,Spiralia,schistosoma_haematobium_1,1995,True,-1,1996,652
5,Spiralia,mya_arenaria_2,1986,True,-1,1987,1727
6,Spiralia,crassostrea_virginica_7,2004,True,-1,2005,1613
7,Spiralia,magallana_gigas_8,1929,True,-1,1930,1537
8,Spiralia,gigantopelta_aegis_6,1941,True,-1,1942,1869
9,Spiralia,aplysia_californica_9,1980,True,-1,1981,4146


In [40]:
# Same analysis as for "cds", this time with "cds_cassette" as the second argument.
protostomia_df_cds_cassette, protostomia_dict_align_cds_cassette = dict_align_info_analyze(dict_align_info,
                                                                                           "cds_cassette")
protostomia_df_cds_cassette

Unnamed: 0,phylum,org_name,stop_codon_pos,equal_to_cds,cassette_start,length_to_stop_codon,intron_length
0,Ecdysozoa,caenorhabditis_elegans_0,1089,False,896,193,106
1,Ecdysozoa,brugia_malayi_2,1095,False,956,139,243
2,Ecdysozoa,trichinella_spiralis_3,972,False,761,211,185
3,Ecdysozoa,priapulus_caudatus_4,1083,False,1082,1,2114
4,Spiralia,schistosoma_haematobium_1,981,False,980,1,652
5,Spiralia,mya_arenaria_2,1110,False,1109,1,1727
6,Spiralia,crassostrea_virginica_7,1110,False,1109,1,1613
7,Spiralia,magallana_gigas_8,1035,False,1034,1,1537
8,Spiralia,gigantopelta_aegis_6,1050,False,1049,1,1869
9,Spiralia,aplysia_californica_9,1068,False,1067,1,4146


In [41]:
# Show the caenorhabditis_elegans_0 sequence with its intron highlighted
# (going by the helper's name; no output is captured for this cell).
highlight_intron_in_seq("Ecdysozoa", "caenorhabditis_elegans_0")

In [42]:
# Show the trichinella_spiralis_3 sequence with its intron highlighted
# (going by the helper's name; no output is captured for this cell).
highlight_intron_in_seq("Ecdysozoa", "trichinella_spiralis_3")

---

In [43]:
# Trying to build cds + intron sequences and compute secondary structures for them

In [44]:
# Build the "cds_cassette" dictionary for the Ecdysozoa datasets, rename its
# keys with dict_align_update_keys and write the result as .fa and .aln.
# NOTE(review): these are the same output paths that the earlier
# read_single_fasta-based cell writes to, so the files are overwritten here.
org_names = [
    "caenorhabditis_elegans_0",
    "brugia_malayi_2",
    "trichinella_spiralis_3",
    "priapulus_caudatus_4",
]

dict_align = dict_align_create("Ecdysozoa", org_names, "cds_cassette")
new_dict_align = dict_align_update_keys(dict_align)

dict_align_to_fasta(new_dict_align, "../Alignment/Ecdysozoa/Ecdysozoa_cds_cassette.fa")
dict_align_to_fasta(new_dict_align, "../Alignment/Ecdysozoa/Ecdysozoa_cds_cassette.aln")

In [45]:
# Command line for Scripts/build_rna_structures.py, kept as a plain string:
# the cell only echoes it and does not run anything.
"python Scripts/build_rna_structures.py --input Alignment/Ecdysozoa/Ecdysozoa_cds_cassette.fa --paint Alignment/Ecdysozoa/Ecdysozoa_introns.fa --output rnafold_output/Ecdysozoa_cds_cassette"

'python Scripts/build_rna_structures.py --input Alignment/Ecdysozoa/Ecdysozoa_cds_cassette.fa --paint Alignment/Ecdysozoa/Ecdysozoa_introns.fa --output rnafold_output/Ecdysozoa_cds_cassette'

In [46]:
# run_rnafold_with_highlight(
#     Path("../Alignment/Ecdysozoa/Ecdysozoa_cds_cassette.fa"),
#     Path("../Alignment/Ecdysozoa/Ecdysozoa_introns.fa"),
#     Path("../rnafold_output/Ecdysozoa_cds_cassette")
# )

In [47]:
# Build the "cds_cassette" dictionary for the Spiralia datasets, rename its
# keys with dict_align_update_keys and write the result as .fa and .aln.
org_names = [
    "schistosoma_haematobium_1",
    "mya_arenaria_2",
    "crassostrea_virginica_7",
    "magallana_gigas_8",
    "gigantopelta_aegis_6",
    "aplysia_californica_9",
]

dict_align = dict_align_create("Spiralia", org_names, "cds_cassette")
new_dict_align = dict_align_update_keys(dict_align)

dict_align_to_fasta(new_dict_align, "../Alignment/Spiralia/Spiralia_cds_cassette.fa")
dict_align_to_fasta(new_dict_align, "../Alignment/Spiralia/Spiralia_cds_cassette.aln")

In [48]:
# Command line for Scripts/build_rna_structures.py, kept as a plain string:
# the cell only echoes it and does not run anything.
"python Scripts/build_rna_structures.py --input Alignment/Spiralia/Spiralia_cds_cassette.fa --paint Alignment/Spiralia/Spiralia_introns.fa --output rnafold_output/Spiralia_cds_cassette"

'python Scripts/build_rna_structures.py --input Alignment/Spiralia/Spiralia_cds_cassette.fa --paint Alignment/Spiralia/Spiralia_introns.fa --output rnafold_output/Spiralia_cds_cassette'

In [49]:
# run_rnafold_with_highlight(
#     Path("../Alignment/Spiralia/Spiralia_cds_cassette.fa"),
#     Path("../Alignment/Spiralia/Spiralia_introns.fa"),
#     Path("../rnafold_output/Spiralia_cds_cassette")
# )

In [50]:
# Manually created the files for highlighting the full Drosophila gene -- it turned out to be pointless
# RNAplot --pre "" < mfe.fold
# RNAplot --pre "" < centroid.fold

# In those files I manually added the annotation for the introns (yellow -- all of them, green -- the cassette one)
# % Start Annotations
# 1 504 8 10 10 0 omark
# 602 665 8 10 10 0 omark
# 827 893 8 10 10 0 omark
# 987 9878 8 10 10 0 omark
# 10574 12175 8 0 10 0 omark
# 12626 12692 8 10 10 0 omark
# 12865 12921 8 10 10 0 omark
# 13419 13484 8 10 10 0 omark
# 13585 14341 8 10 10 0 omark
# % End Annotations

# run_rnafold_with_highlight(
#     fasta_path=Path("../References/sbr_RA_gene_for_rnafold.fa"),
#     output_dir=Path("../rnafold_output")
# )

---

In [51]:
# Some species gave something very interesting; let's look at the cds without the intron for comparison

In [52]:
# Build the "cds" dictionary for the Ecdysozoa datasets, rename its keys with
# dict_align_update_keys and write the result as .fa and .aln.
org_names = [
    "caenorhabditis_elegans_0",
    "brugia_malayi_2",
    "trichinella_spiralis_3",
    "priapulus_caudatus_4",
]

dict_align = dict_align_create("Ecdysozoa", org_names, "cds")
new_dict_align = dict_align_update_keys(dict_align)

dict_align_to_fasta(new_dict_align, "../Alignment/Ecdysozoa/Ecdysozoa_cds.fa")
dict_align_to_fasta(new_dict_align, "../Alignment/Ecdysozoa/Ecdysozoa_cds.aln")

In [53]:
# Build the "cds" dictionary for the Spiralia datasets, rename its keys with
# dict_align_update_keys and write the result as .fa and .aln.
org_names = [
    "schistosoma_haematobium_1",
    "mya_arenaria_2",
    "crassostrea_virginica_7",
    "magallana_gigas_8",
    "gigantopelta_aegis_6",
    "aplysia_californica_9",
]

dict_align = dict_align_create("Spiralia", org_names, "cds")
new_dict_align = dict_align_update_keys(dict_align)

dict_align_to_fasta(new_dict_align, "../Alignment/Spiralia/Spiralia_cds.fa")
dict_align_to_fasta(new_dict_align, "../Alignment/Spiralia/Spiralia_cds.aln")

In [54]:
# run_rnafold_with_highlight(
#     fasta_path=Path("../Alignment/Ecdysozoa/Ecdysozoa_cds.fa"),
#     output_dir=Path("../rnafold_output/Ecdysozoa_cds")
# )
#
# run_rnafold_with_highlight(
#     fasta_path=Path("../Alignment/Spiralia/Spiralia_cds.fa"),
#     output_dir=Path("../rnafold_output/Spiralia_cds")
# )