In [1]:
import os

import pandas as pd

from os.path import commonprefix
from pathlib import Path
from IPython.display import HTML

from Scripts.data_processing import obtain_data
from Scripts.fasta_processing import plain_to_fasta
from highlight import highlight_intron_in_seq
from fasta_processing import read_single_fasta, dict_align_to_fasta, read_fasta, dict_align_to_fasta_upd
from datasets import select_all_phylas, download_all_files_ncbi, check_transcript_count, update_data_for_species
from taxonomy_processing import create_taxonomy
from data_processing import analyze_exons, create_cassette, concat_cassette, dict_align_create, find_codon, \
    dict_align_info_analyze, dict_align_update_keys
from build_rna_structures import run_rnafold_with_highlight

In [2]:
column_names = ["tax_id", "org_name", "gene_id", "current_id", "status", "symbol", "aliases", "description",
                "other_designations", "map_location", "chromosome", "genomic_nucleotide_accession.version",
                "start_position_on_the_genomic_accession", "end_position_on_the_genomic_accession", "orientation",
                "exon_count", "to_delete_1", "to_delete_2"]

df = pd.read_csv("../all_nxf1_2.txt", sep="\t", skiprows=1, names=column_names, index_col=0)
df.drop(["to_delete_1", "to_delete_2"], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9606,Homo sapiens,10482,0,live,NXF1,"MEX67, TAP",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,11q12.3,11,NC_000011.10,62792130.0,62805440.0,minus,22.0
10090,Mus musculus,53319,0,live,Nxf1,"Mex67, Mvb1, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|Mex 67 homolog|mRN...,19 5.5 cM,19,NC_000085.7,8734467.0,8748274.0,plus,20.0
10116,Rattus norvegicus,59087,0,live,Nxf1,"Mex67h, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,1q43,1,NC_086019.1,215084563.0,215097756.0,plus,21.0
8407,Rana temporaria,120917577,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,11,NC_053499.1,120752259.0,120809664.0,plus,22.0
8364,Xenopus tropicalis,734058,0,live,nxf1,"mex67, tap",nuclear RNA export factor 1,nuclear RNA export factor 1,,4,NC_030680.2,36493131.0,36532401.0,minus,23.0


In [3]:
df_taxonomy = pd.read_csv("../all_phylas_taxonomy.tsv", sep="\t", names=["taxid", "taxonomy"], index_col=0,
                          dtype={"taxid": int, "taxonomy": str})

In [4]:
df_deuterostomia = df_taxonomy[df_taxonomy.taxonomy.str.contains("Deuterostomia")]

In [5]:
# df_deuterostomia.to_csv("../deuterostomia_taxonomy.tsv", sep="\t", header=False)
# create_taxonomy("../deuterostomia_taxonomy.tsv")

In [6]:
df_whippomorpha = df_taxonomy[df_taxonomy.taxonomy.str.contains("Cetacea")]

df_mysticeti = df_whippomorpha[df_whippomorpha.taxonomy.str.contains("Mysticeti")]
df_odontoceti = df_whippomorpha[df_whippomorpha.taxonomy.str.contains("Odontoceti")]

whippomorpha_taxids = {
    "Mysticeti": df_mysticeti.index.tolist(),
    "Odontoceti": df_odontoceti.index.tolist(),
}

In [7]:
# df_whippomorpha.to_csv("../whippomorpha_taxonomy.tsv", sep="\t", index=True, header=False)

In [8]:
# create_taxonomy("../whippomorpha_taxonomy.tsv")

In [9]:
# download_all_files_ncbi(df, whippomorpha_taxids, phylas=list(whippomorpha_taxids.keys()))

In [10]:

# после изменения эта функция не должна ничего печатать
# whippomorpha_species_to_update = check_transcript_count(["Mysticeti", "Odontoceti"])

In [11]:
# update_data_for_species(whippomorpha_species_to_update)

In [12]:
def create_many_cassettes(phylum: str, data: dict) -> dict:
    introns = {}
    for org_name, (df, exons_i) in data.items():
        cassette = create_cassette(phylum, org_name, df, exons_i=exons_i)
        introns[org_name] = concat_cassette(cassette, "i")
    return introns

In [13]:
os.listdir("../Datasets/Mysticeti")

['balaenoptera_ricei_2',
 'balaenoptera_musculus_3',
 'balaenoptera_acutorostrata_4',
 'eschrichtius_robustus_0',
 'eubalaena_glacialis_1']

In [14]:
prefix = "../Datasets"
phylum = "Mysticeti"
postfix = "ncbi_dataset/data"
nof = "exons.fa"

# Mysticeti
balaenoptera_ricei_2 = analyze_exons(
    f"{prefix}/{phylum}/balaenoptera_ricei_2/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

balaenoptera_musculus_3 = analyze_exons(
    f"{prefix}/{phylum}/balaenoptera_musculus_3/{postfix}/{nof}")  # 9 экзон ОГРОМНЫЙ 1977 нт || 116 43 | 14, 15

balaenoptera_acutorostrata_4 = analyze_exons(
    f"{prefix}/{phylum}/balaenoptera_acutorostrata_4/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

eschrichtius_robustus_0 = analyze_exons(
    f"{prefix}/{phylum}/eschrichtius_robustus_0/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

eubalaena_glacialis_1 = analyze_exons(
    f"{prefix}/{phylum}/eubalaena_glacialis_1/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

# у всех одинаково все, кроме
# balaenoptera_musculus_3 - объединение экзонов и интронов?

In [15]:
data_mysticeti = {
    "balaenoptera_ricei_2": (balaenoptera_ricei_2, [9, 10]),
    "balaenoptera_acutorostrata_4": (balaenoptera_acutorostrata_4, [9, 10]),
    "eschrichtius_robustus_0": (eschrichtius_robustus_0, [9, 10]),
    "eubalaena_glacialis_1": (eubalaena_glacialis_1, [9, 10]),
}

In [16]:
mysticeti_introns = create_many_cassettes("Mysticeti", data_mysticeti)

In [17]:
prefix = "../Datasets"
phylum = "Mysticeti"
postfix = "ncbi_dataset/data"
align_type = "cds_cassette"

org_names_mysticeti = os.listdir(f"{prefix}/Mysticeti")
org_names_mysticeti.remove("balaenoptera_musculus_3")

mysticeti_alignment_dict = dict_align_create(phylum, org_names_mysticeti, align_type)
mysticeti_alignment_dict_upd = {
    "_".join(key.split("_")[:-1]).capitalize(): value
    for key, value in mysticeti_alignment_dict.items()
}

os.makedirs(f"../Alignment/{phylum}", exist_ok=True)
dict_align_to_fasta(mysticeti_alignment_dict_upd, "../Alignment/Mysticeti/Mysticeti_cds_cassette.fa")
# dict_align_to_fasta(mysticeti_alignment_dict_upd, "../Alignment/Mysticeti/Mysticeti_cds_cassette.aln")

# выравнилось все, даже интрон

---

In [18]:
# удалил Odontoceti/phocoena_sinus_7, phocoena_sinus_6 кажется более полным

os.listdir("../Datasets/Odontoceti")

['delphinus_delphis_4',
 'pseudorca_crassidens_0',
 'delphinapterus_leucas_11',
 'monodon_monoceros_9',
 'kogia_breviceps_5',
 'lipotes_vexillifer_12',
 'globicephala_melas_8',
 'orcinus_orca_15',
 'lagenorhynchus_albirostris_2',
 'tursiops_truncatus_14',
 'physeter_macrocephalus_13',
 'phocoena_phocoena_1',
 'phocoena_sinus_6',
 'mesoplodon_densirostris_3',
 'neophocaena_asiaeorientalis_asiaeorientalis_10',
 'sagmatias_obliquidens_16']

---

In [19]:
prefix = "../Datasets"
phylum = "Odontoceti"
postfix = "ncbi_dataset/data"
nof = "exons.fa"

# Odontoceti
delphinus_delphis_4 = analyze_exons(
    f"{prefix}/{phylum}/delphinus_delphis_4/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

pseudorca_crassidens_0 = analyze_exons(
    f"{prefix}/{phylum}/pseudorca_crassidens_0/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

delphinapterus_leucas_11 = analyze_exons(
    f"{prefix}/{phylum}/delphinapterus_leucas_11/{postfix}/{nof}")  # 110 37 | 10, 11 || 116 43 | 16, 17

monodon_monoceros_9 = analyze_exons(
    f"{prefix}/{phylum}/monodon_monoceros_9/{postfix}/{nof}")  # 110 37 | 10, 11 || 116 43 | 16, 17

kogia_breviceps_5 = analyze_exons(
    f"{prefix}/{phylum}/kogia_breviceps_5/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

lipotes_vexillifer_12 = analyze_exons(
    f"{prefix}/{phylum}/lipotes_vexillifer_12/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

globicephala_melas_8 = analyze_exons(
    f"{prefix}/{phylum}/globicephala_melas_8/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

orcinus_orca_15 = analyze_exons(
    f"{prefix}/{phylum}/orcinus_orca_15/{postfix}/{nof}")  # 110 37 | 10, 11 || 116 43 | 16, 17

lagenorhynchus_albirostris_2 = analyze_exons(
    f"{prefix}/{phylum}/lagenorhynchus_albirostris_2/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

tursiops_truncatus_14 = analyze_exons(
    f"{prefix}/{phylum}/tursiops_truncatus_14/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

physeter_macrocephalus_13 = analyze_exons(
    f"{prefix}/{phylum}/physeter_macrocephalus_13/{postfix}/{nof}")  # 110 37 | 8, 9 || 116 43 | 14, 15

phocoena_phocoena_1 = analyze_exons(
    f"{prefix}/{phylum}/phocoena_phocoena_1/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

phocoena_sinus_6 = analyze_exons(
    f"{prefix}/{phylum}/phocoena_sinus_6/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

mesoplodon_densirostris_3 = analyze_exons(
    f"{prefix}/{phylum}/mesoplodon_densirostris_3/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

neophocaena_asiaeorientalis_asiaeorientalis_10 = analyze_exons(
    f"{prefix}/{phylum}/neophocaena_asiaeorientalis_asiaeorientalis_10/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

sagmatias_obliquidens_16 = analyze_exons(
    f"{prefix}/{phylum}/sagmatias_obliquidens_16/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

In [20]:
data_odontoceti = {
    "delphinus_delphis_4": (delphinus_delphis_4, [9, 10]),
    "pseudorca_crassidens_0": (pseudorca_crassidens_0, [9, 10]),
    "delphinapterus_leucas_11": (delphinapterus_leucas_11, [10, 11]),
    "monodon_monoceros_9": (monodon_monoceros_9, [10, 11]),
    "kogia_breviceps_5": (kogia_breviceps_5, [9, 10]),
    "lipotes_vexillifer_12": (lipotes_vexillifer_12, [9, 10]),
    "globicephala_melas_8": (globicephala_melas_8, [9, 10]),
    "orcinus_orca_15": (orcinus_orca_15, [10, 11]),
    "lagenorhynchus_albirostris_2": (lagenorhynchus_albirostris_2, [9, 10]),
    "tursiops_truncatus_14": (tursiops_truncatus_14, [9, 10]),
    "physeter_macrocephalus_13": (physeter_macrocephalus_13, [8, 9]),
    "phocoena_phocoena_1": (phocoena_phocoena_1, [9, 10]),
    "phocoena_sinus_6": (phocoena_sinus_6, [9, 10]),
    "mesoplodon_densirostris_3": (mesoplodon_densirostris_3, [9, 10]),
    "neophocaena_asiaeorientalis_asiaeorientalis_10": (neophocaena_asiaeorientalis_asiaeorientalis_10, [9, 10]),
    "sagmatias_obliquidens_16": (sagmatias_obliquidens_16, [9, 10]),
}

In [21]:
odontoceti_introns = create_many_cassettes("Odontoceti", data_odontoceti)

In [22]:
prefix = "../Datasets"
phylum = "Odontoceti"
postfix = "ncbi_dataset/data"
align_type = "cds_cassette"

org_names = os.listdir(f"{prefix}/{phylum}")

odontoceti_alignment_dict = dict_align_create(phylum, org_names, align_type)
odontoceti_alignment_dict_upd = {
    "_".join(key.split("_")[:-1]).capitalize(): value
    for key, value in odontoceti_alignment_dict.items()
}

os.makedirs(f"../Alignment/{phylum}", exist_ok=True)
dict_align_to_fasta(odontoceti_alignment_dict_upd, "../Alignment/Odontoceti/Odontoceti_cds_cassette.fa")
# dict_align_to_fasta(odontoceti_alignment_dict_upd, "../Alignment/Odontoceti/Odontoceti_cds_cassette.aln")

---

In [23]:
prefix = "../Datasets"

org_names_mysticeti = os.listdir(f"{prefix}/Mysticeti")
org_names_mysticeti.remove("balaenoptera_musculus_3")

org_names_odontoceti = os.listdir(f"{prefix}/Odontoceti")

dict_align_info = {
    "Mysticeti": org_names_mysticeti,
    "Odontoceti": org_names_odontoceti,
}

In [24]:
# здесь везде должно быть True в столбце equal_to_cds

whippomorpha_df_cds, whippomorpha_dict_align_cds = dict_align_info_analyze(dict_align_info, "cds")
whippomorpha_df_cds

Unnamed: 0,phylum,org_name,stop_codon_pos,equal_to_cds,cassette_start,length_to_stop_codon,intron_length
0,Mysticeti,balaenoptera_ricei_2,1869,True,-1,1870,1831
1,Mysticeti,balaenoptera_acutorostrata_4,1869,True,-1,1870,1835
2,Mysticeti,eschrichtius_robustus_0,1869,True,-1,1870,1835
3,Mysticeti,eubalaena_glacialis_1,1860,True,-1,1861,1830
4,Odontoceti,delphinus_delphis_4,1860,True,-1,1861,1825
5,Odontoceti,pseudorca_crassidens_0,1860,True,-1,1861,1824
6,Odontoceti,delphinapterus_leucas_11,1860,True,-1,1861,1824
7,Odontoceti,monodon_monoceros_9,1860,True,-1,1861,1828
8,Odontoceti,kogia_breviceps_5,1860,True,-1,1861,1846
9,Odontoceti,lipotes_vexillifer_12,1860,True,-1,1861,1914


In [25]:
whippomorpha_df_cds_cassette, whippomorpha_dict_align_cds_cassette = dict_align_info_analyze(dict_align_info, "cds_cassette")
whippomorpha_df_cds_cassette

Unnamed: 0,phylum,org_name,stop_codon_pos,equal_to_cds,cassette_start,length_to_stop_codon,intron_length
0,Mysticeti,balaenoptera_ricei_2,1080,False,1028,52,1831
1,Mysticeti,balaenoptera_acutorostrata_4,1080,False,1028,52,1835
2,Mysticeti,eschrichtius_robustus_0,1080,False,1028,52,1835
3,Mysticeti,eubalaena_glacialis_1,1071,False,1019,52,1830
4,Odontoceti,delphinus_delphis_4,1071,False,1019,52,1825
5,Odontoceti,pseudorca_crassidens_0,1071,False,1019,52,1824
6,Odontoceti,delphinapterus_leucas_11,1071,False,1019,52,1824
7,Odontoceti,monodon_monoceros_9,1071,False,1019,52,1828
8,Odontoceti,kogia_breviceps_5,1071,False,1019,52,1846
9,Odontoceti,lipotes_vexillifer_12,1071,False,1019,52,1914


In [26]:
dict_align_to_fasta_upd(whippomorpha_dict_align_cds_cassette, "../Alignment/Whippomorpha_cds_cassette.fa")
# dict_align_to_fasta_upd(whippomorpha_dict_align_cds_cassette, "../Alignment/Whippomorpha_cds_cassette.aln")

---

In [27]:
prefix = "../Datasets"
phylum = "Mysticeti"
postfix = "ncbi_dataset/data"
align_type = "protein"

org_names = os.listdir(f"{prefix}/{phylum}")
org_names.remove("balaenoptera_musculus_3")

alignment_dict = dict_align_create(phylum, org_names, align_type)
alignment_dict_upd = {
    "_".join(key.split("_")[:-1]).capitalize(): value
    for key, value in alignment_dict.items()
}

dict_align_to_fasta(alignment_dict_upd, f"../Alignment/{phylum}/{phylum}_protein.fa")
# dict_align_to_fasta(alignment_dict_upd, f"../Alignment/{phylum}/{phylum}_protein.aln")

In [28]:
prefix = "../Datasets"
phylum = "Odontoceti"
postfix = "ncbi_dataset/data"
align_type = "protein"

org_names = os.listdir(f"{prefix}/{phylum}")

alignment_dict = dict_align_create(phylum, org_names, align_type)
alignment_dict_upd = {
    "_".join(key.split("_")[:-1]).capitalize(): value
    for key, value in alignment_dict.items()
}

dict_align_to_fasta(alignment_dict_upd, f"../Alignment/{phylum}/{phylum}_protein.fa")
# dict_align_to_fasta(alignment_dict_upd, f"../Alignment/{phylum}/{phylum}_protein.aln")

---

In [29]:
prefix = "../Datasets"
phylum = "Mysticeti"
postfix = "ncbi_dataset/data"

org_names = os.listdir(f"{prefix}/{phylum}")
org_names.remove("balaenoptera_musculus_3")

alignment_dict = {}
for org_name in org_names:
    data = obtain_data(phylum, org_name)
    formatted_org_name = org_name.rsplit("_", 1)[0].capitalize()
    alignment_dict[formatted_org_name] = data["protein_sliced"]

dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_protein_sliced.fa")
# dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_protein_sliced.aln")

In [30]:
prefix = "../Datasets"
phylum = "Odontoceti"
postfix = "ncbi_dataset/data"

org_names = os.listdir(f"{prefix}/{phylum}")

alignment_dict = {}
for org_name in org_names:
    data = obtain_data(phylum, org_name)
    formatted_org_name = org_name.rsplit("_", 1)[0].capitalize()
    alignment_dict[formatted_org_name] = data["protein_sliced"]

dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_protein_sliced.fa")
# dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_protein_sliced.aln")

---

### RNAfold

In [31]:
prefix = "../Datasets"
phylum = "Mysticeti"
postfix = "ncbi_dataset/data"

org_names = os.listdir(f"{prefix}/{phylum}")
org_names.remove("balaenoptera_musculus_3")

alignment_dict = {}
for org_name in org_names:
    formatted_org_name = org_name.rsplit("_", 1)[0].capitalize()
    alignment_dict[formatted_org_name] = mysticeti_introns[org_name]

dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_introns.fa")
# dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_introns.aln")

In [32]:
# prefix = "../Alignment"
# phylum = "Mysticeti"
#
# run_rnafold_with_highlight(
#     Path(f"{prefix}/{phylum}/{phylum}_cds_cassette.fa"),
#     Path(f"{prefix}/{phylum}/{phylum}_introns.fa"),
#     Path(f"../rnafold_output/{phylum}_cds_cassette")
# )

In [33]:
prefix = "../Datasets"
phylum = "Odontoceti"
postfix = "ncbi_dataset/data"

org_names = os.listdir(f"{prefix}/{phylum}")

alignment_dict = {}
for org_name in org_names:
    formatted_org_name = org_name.rsplit("_", 1)[0].capitalize()
    alignment_dict[formatted_org_name] = odontoceti_introns[org_name]

dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_introns.fa")
# dict_align_to_fasta(alignment_dict, f"../Alignment/{phylum}/{phylum}_introns.aln")

In [34]:
# prefix = "../Alignment"
# phylum = "Odontoceti"
#
# run_rnafold_with_highlight(
#     Path(f"{prefix}/{phylum}/{phylum}_cds_cassette.fa"),
#     Path(f"{prefix}/{phylum}/{phylum}_introns.fa"),
#     Path(f"../rnafold_output/{phylum}_cds_cassette")
# )

---

### Homo sapiens

In [35]:
df_homo = df_taxonomy[df_taxonomy.taxonomy.str.contains("Homo")]

In [37]:
homo_taxids = {
    "Homo": df_homo.index.tolist(),
}

df_homo_subset = df.loc[homo_taxids["Homo"]]
df_homo_subset_nxf1 = df_homo_subset[df_homo_subset["symbol"] == "NXF1"]

In [38]:
# download_all_files_ncbi(df_homo_subset_nxf1, homo_taxids, phylas=list(homo_taxids.keys()))

In [41]:
# после изменения эта функция не должна ничего печатать
homo_species_to_update = check_transcript_count(["Homo"])

In [40]:
update_data_for_species(homo_species_to_update)

In [42]:
prefix = "../Datasets"
phylum = "Homo"
postfix = "ncbi_dataset/data"
nof = "exons.fa"

# Homo
homo_sapiens_0 = analyze_exons(
    f"{prefix}/{phylum}/homo_sapiens_0/{postfix}/{nof}")  # 110 37 | 9, 10 || 116 43 | 15, 16

In [43]:
data_homo = {
    "homo_sapiens_0": (homo_sapiens_0, [9, 10]),
}

In [44]:
homo_introns = create_many_cassettes("Homo", data_homo)

In [46]:
prefix = "../Datasets"
phylum = "Homo"
postfix = "ncbi_dataset/data"
align_type = "cds_cassette"

org_names_homo = os.listdir(f"{prefix}/{phylum}")

homo_alignment_dict = dict_align_create(phylum, org_names_homo, align_type)
homo_alignment_dict_upd = {
    "_".join(key.split("_")[:-1]).capitalize(): value
    for key, value in homo_alignment_dict.items()
}

os.makedirs(f"../Alignment/{phylum}", exist_ok=True)
dict_align_to_fasta(homo_alignment_dict_upd, "../Alignment/Homo/Homo_cds_cassette.fa")
# dict_align_to_fasta(homo_alignment_dict_upd, "../Alignment/Homo/Homo_cds_cassette.aln")

In [48]:
dict_align_info = {
    "Homo": org_names_homo,
}

In [49]:
homo_df_cds_cassette, homo_dict_align_cds_cassette = dict_align_info_analyze(dict_align_info, "cds")
homo_df_cds_cassette

Unnamed: 0,phylum,org_name,stop_codon_pos,equal_to_cds,cassette_start,length_to_stop_codon,intron_length
0,Homo,homo_sapiens_0,1815,True,-1,1816,1801


In [50]:
homo_df_cds_cassette, homo_dict_align_cds_cassette = dict_align_info_analyze(dict_align_info, "cds_cassette")
homo_df_cds_cassette

Unnamed: 0,phylum,org_name,stop_codon_pos,equal_to_cds,cassette_start,length_to_stop_codon,intron_length
0,Homo,homo_sapiens_0,1068,False,1016,52,1801
