In [82]:
import pandas as pd

from entrez import select_phyla
from fasta_processing import read_fasta, read_single_fasta, dict_align_to_fasta
from data_processing import choose_best_frameshift, save_subset_df_transcripts, find_codon

In [2]:
# Column labels applied to ../all_nxf1.txt; the first line of the file is
# skipped and these names are used instead. The last two labels name columns
# that are dropped right after loading.
column_names = [
    "tax_id",
    "org_name",
    "gene_id",
    "current_id",
    "status",
    "symbol",
    "aliases",
    "description",
    "other_designations",
    "map_location",
    "chromosome",
    "genomic_nucleotide_accession.version",
    "start_position_on_the_genomic_accession",
    "end_position_on_the_genomic_accession",
    "orientation",
    "exon_count",
    "to_delete_1",
    "to_delete_2",
]

# The first column (tax_id) becomes the index, so rows can be selected by taxon id.
df = pd.read_csv(
    "../all_nxf1.txt", sep="\t", skiprows=1, names=column_names, index_col=0
).drop(["to_delete_1", "to_delete_2"], axis=1)
df.head()

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8407,Rana temporaria,120917577,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,11,NC_053499.1,120752259.0,120809664.0,plus,22.0
8364,Xenopus tropicalis,734058,0,live,nxf1,"mex67, tap",nuclear RNA export factor 1,nuclear RNA export factor 1,,4,NC_030680.2,36493131.0,36532401.0,minus,23.0
9598,Pan troglodytes,451267,0,live,NXF1,CK820_G0030982,nuclear RNA export factor 1,nuclear RNA export factor 1,,9,NC_072407.2,65104305.0,65118449.0,minus,23.0
9913,Bos taurus,512136,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,29,NC_037356.1,41099477.0,41110792.0,minus,22.0
9615,Canis lupus familiaris,483780,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,18,NC_051822.1,54936964.0,54947631.0,plus,21.0


In [4]:
# Query the full gene table `df`: `df_cnidaria` is itself built from this
# result in a later cell, so passing it here fails on a fresh kernel.
cnidaria = select_phyla(df, "Cnidaria")

50429 Stylophora pistillata
6105 Actinia tenebrosa
6087 Hydra vulgaris
45351 Nematostella vectensis


In [5]:
cnidaria

{'Cnidaria': [50429, 6105, 6087, 45351]}

In [6]:
df_cnidaria = df.loc[cnidaria["Cnidaria"]]

In [4]:
df_cnidaria

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
50429,Stylophora pistillata,111339613,0,live,LOC111339613,"AWC38_SpisGene17253, NXF1",nuclear RNA export factor 1-like,nuclear RNA export factor 1-like|Nuclear RNA e...,,Un,NW_019218197.1,98177.0,131366.0,plus,21.0
6105,Actinia tenebrosa,116292710,0,live,LOC116292710,,nuclear RNA export factor 1-like,nuclear RNA export factor 1-like,,Un,NW_022259381.1,16092.0,33206.0,plus,24.0
6087,Hydra vulgaris,100214173,0,live,LOC100214173,,nuclear RNA export factor 1,nuclear RNA export factor 1,,12,NC_088975.1,736736.0,739315.0,minus,2.0
45351,Nematostella vectensis,5510737,0,live,LOC5510737,NEMVEDRAFT_v1g244041,nuclear RNA export factor 1,nuclear RNA export factor 1|predicted protein,,7,NC_064040.1,11955766.0,11973360.0,plus,24.0


In [3]:
df_cnidaria = df.loc[[50429, 6105, 6087, 45351]]

---

In [4]:
cnidaria_seqs = save_subset_df_transcripts(df_cnidaria)

Found 2 mRNA sequences for GeneID 111339613.
Saved transcript to ../References/stylophora_pistillata/XM_022946298.1.fasta
Saved transcript to ../References/stylophora_pistillata/NW_019218197.1.fasta

Found 2 mRNA sequences for GeneID 116292710.
Saved transcript to ../References/actinia_tenebrosa/XM_031700062.1.fasta
Saved transcript to ../References/actinia_tenebrosa/NW_022259381.1.fasta

Found 3 mRNA sequences for GeneID 100214173.
Saved transcript to ../References/hydra_vulgaris/XM_065811174.1.fasta
Saved transcript to ../References/hydra_vulgaris/XM_002156424.5.fasta


KeyboardInterrupt: 

In [54]:
hydra_vulgaris_cds = read_single_fasta("../References/hydra_vulgaris/XM_065811174.1.fasta")
hydra_vulgaris = choose_best_frameshift(hydra_vulgaris_cds, translate=True)

Frameshift 0: Start: 48, Stop: 63, Length: 15
Frameshift 1: Start: 4, Stop: 49, Length: 45
Frameshift 2: Start: 284, Stop: 2225, Length: 1941


In [50]:
# Run choose_best_frameshift (translate=True) on each species' entry in cnidaria_seqs.
# NOTE(review): in top-to-bottom order this reassigns `hydra_vulgaris`, which the
# previous cell computed from XM_065811174.1.fasta. The output below reports only
# 21/33/21-nt candidates for the hydra entry here — confirm which value is meant
# to feed `align_dict`.
stylophora_pistillata = choose_best_frameshift(cnidaria_seqs["stylophora_pistillata"], translate=True)
actinia_tenebrosa = choose_best_frameshift(cnidaria_seqs["actinia_tenebrosa"], translate=True)
hydra_vulgaris = choose_best_frameshift(cnidaria_seqs["hydra_vulgaris"], translate=True)
nematostella_vectensis = choose_best_frameshift(cnidaria_seqs["nematostella_vectensis"], translate=True)

Frameshift 0: Start: 0, Stop: 2226, Length: 2226
Frameshift 1: Start: 82, Stop: 172, Length: 90
Frameshift 2: Start: 788, Stop: 821, Length: 33
Frameshift 0: Start: 129, Stop: 147, Length: 18
Frameshift 1: Start: 496, Stop: 628, Length: 132
Frameshift 2: Start: 98, Stop: 2192, Length: 2094
Frameshift 0: Start: 201, Stop: 222, Length: 21
Frameshift 1: Start: 130, Stop: 163, Length: 33
Frameshift 2: Start: 110, Stop: 131, Length: 21
Frameshift 0: Start: 579, Stop: 594, Length: 15
Frameshift 1: Start: 127, Stop: 2227, Length: 2100
Frameshift 2: Start: 29, Stop: 89, Length: 60


In [56]:
align_dict = {
    "stylophora_pistillata": stylophora_pistillata,
    "actinia_tenebrosa": actinia_tenebrosa,
    "hydra_vulgaris": hydra_vulgaris,
    "nematostella_vectensis": nematostella_vectensis,
}
dict_align_to_fasta(align_dict, "cnidaria_4_sp")

In [3]:
from Bio import Entrez, SeqIO

Entrez.email = "artemvaskaa@gmail.com"
gene_id = "111339613"

In [4]:
# Build the query once and reuse it (it was previously duplicated inline and
# `search_term` was left unused).
search_term = f"{gene_id}[GeneID]"
handle = Entrez.esearch(db="nucleotide", term=search_term, idtype="acc", retmax=3)
try:
    search_results = Entrez.read(handle)
finally:
    # Close the handle even if parsing the response raises.
    handle.close()

In [6]:
seq_id = search_results["IdList"][0]

In [17]:
# Fetch the GenBank record for the accession picked in the previous cell.
fetch_handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="gb", retmode="text")
try:
    record = SeqIO.read(fetch_handle, "genbank")
finally:
    # Close the handle even if parsing the record raises.
    fetch_handle.close()

Может вот так последовательность доставать? Раз я ее сразу закидываю в choose_best_frameshift

In [21]:
print(record.seq)

ATGTGGTTCCTACCACCAGTGGTCTGCTTCGATAATAAGCTTCATTCCCAAGAGCCAGTCATCTACAGGGCAAAGCGTATGGATGAAGATGCTGCTCAGATCTTTCAGGTTGAACTGTTCGAAGAGAACATTAAGACGATTCATGAGCAATTCGAATTCACAAAGAAGGTGATCTTCACTCAAGAGGATAGGCAAACTTTCAATGCAAAGATGTCGTTGTCGGCTCTTCCATCGCCAAAGACGAAGACAAAACTTGTATCAGATAATCCAGTCTATTGTGGAATGAGCATCCTCGATATCAACAAGACACTGATGTATGACTTCCTCTATAACTACATCAAAGAGAAGAATGGAGATCGTGCAAAGCTTCTATTTACAGACACTGACAATCTCACGTATGAGATTGAAACTAAAGACTTTTACAAGGATATGGGTGAAGGTGTTGATGACAATAAGGAGACANCAGGCAGAAGCAGGGGTAGAGGAAGGGGGAGAGGAAGAGGGCGTAGCAGTAGCCACCATGGAAGCTATCCTCATCCTAGATCACACTTAGCTGGTGATGATGATGATGATGATGACATAGACATGGATGAAGGAAGCCAAAGATCTCAATCAAGATACAATCCTTACAACAGAAGACCTCCGTCTCGTCGTGGAGACAGAGGAAGTAATAGAGGTGATGTAAGGTCTAGATTAGGGGCAGTCCCTGATAGAGCTGCTGGAACTCAAGGAAATAAGTCGGACTGGCATAAAGTTGTGATACCACAAGGGAAAAAACACGACAAAGAATGGCTTATAAAAAAACTGCAGAATACATGTGAAGAGGCCTTCCAACCAGTTAGTTTCCATCCCTTCAAGGGAGAGTCTTCAGCCTTCTTTGTTGAAGGAAGTAAAGCTGCAGAAGCATTGAAAAGAGTTAGTCACAAAATAACTGTCAAGGATGGATCAAAGCTGATAGTGAGTGTTCGTGCCAGTGCCCCACCCCAAAACCAAACCAATA

In [None]:
# stylophora_pistillata
"datasets download gene gene-id 111339613 --include gene,rna,protein --filename Datasets/stylophora_pistillata.zip"
"unzip Datasets/stylophora_pistillata.zip -d Datasets/stylophora_pistillata && rm -r Datasets/stylophora_pistillata.zip"

In [None]:
# actinia_tenebrosa
"datasets download gene gene-id 116292710 --include gene,rna,protein --filename Datasets/actinia_tenebrosa.zip"
"unzip Datasets/actinia_tenebrosa.zip -d Datasets/actinia_tenebrosa && rm -r Datasets/actinia_tenebrosa.zip"

In [None]:
# hydra_vulgaris
"datasets download gene gene-id 100214173 --include gene,rna,protein --filename Datasets/hydra_vulgaris.zip"
"unzip Datasets/hydra_vulgaris.zip -d Datasets/hydra_vulgaris && rm -r Datasets/hydra_vulgaris.zip"

In [None]:
# nematostella_vectensis
"datasets download gene gene-id 5510737 --include gene,rna,protein --filename Datasets/nematostella_vectensis.zip"
"unzip Datasets/nematostella_vectensis.zip -d Datasets/nematostella_vectensis && rm -r Datasets/nematostella_vectensis.zip"

In [17]:
org_name = "stylophora_pistillata"
path = f"../Datasets/{org_name}/ncbi_dataset/data"

gene = read_single_fasta(f"{path}/gene.fna")
rna = read_single_fasta(f"{path}/rna.fna")
protein = read_single_fasta(f"{path}/protein.faa")

In [18]:
dict_align = {
    "stylophora_pistillata_gene": gene,
    "stylophora_pistillata_rna": rna,
}

In [21]:
dict_align_to_fasta(dict_align, f"../Alignment/stylophora_gene_rna.fa")
dict_align_to_fasta(dict_align, f"../Alignment/stylophora_gene_rna.aln")

In [20]:
"AAGGAAGTAAAGCTGCAGAAGCATTGAAAAGAGTTAGTCACAAAAT" in gene

False

In [48]:
# TODO: rework. Just build a table (index [0..last exon], length, sequence); the name is NOT needed.

def find_exons(org_name: str, exons_len: tuple = (110, 37)) -> dict:
    """Collect the exons of an organism whose length equals one of two target lengths.

    Reads ``../Datasets/<org_name>/ncbi_dataset/data/<org_name>_exons.fa`` with
    ``read_fasta`` (used here as a mapping of header -> sequence), prints the
    length of every exon in file order, and groups the matching sequences.

    Args:
        org_name: Directory / file prefix of the organism, e.g. ``"hydra_vulgaris"``.
        exons_len: Pair of exon lengths to look for.

    Returns:
        Dict keyed by the two requested lengths (as given in ``exons_len``),
        each mapping to the list of exon sequences of exactly that length,
        in file order. A length with no match maps to an empty list.
    """
    print(f"Selected lengths are {exons_len[0]} and {exons_len[1]}\n")

    # Keys are the lengths themselves. The previous version created these keys
    # but appended under str(length), which raised KeyError on the first match.
    exon_seq = {
        exons_len[0]: [],
        exons_len[1]: []
    }
    path = f"../Datasets/{org_name}/ncbi_dataset/data/"
    exons = read_fasta(f"{path}/{org_name}_exons.fa")

    lengths = {}
    for i, seq in enumerate(exons.values()):
        seq_len = len(seq)
        lengths[i] = seq_len
        # A single membership test also avoids appending the same exon twice
        # when both requested lengths are equal.
        if seq_len in exon_seq:
            exon_seq[seq_len].append(seq)

    for key, value in lengths.items():
        print(f"{key}: {value}")

    return exon_seq

Попробовать вытащить экзоны из выравнивания транскрипта на ген (если нет файла {org_name}_exons.fa).

И потом уже искать сначала экзоны длиной 110 и 37.

если не находятся, то просматриваем примерно похожие, вытаскиваем пары и выравниваем на ген (т.к. нам нужны интроны и кассетный интрон в частности)

In [26]:
stylophora_pistillata_exons = read_fasta("../Datasets/stylophora_pistillata/ncbi_dataset/data/stylophora_pistillata_exons.fa")

In [69]:
def analyze_exons(org_name: str) -> pd.DataFrame:
    """Tabulate the exons of one organism.

    Reads ``../Datasets/<org_name>/ncbi_dataset/data/<org_name>_exons.fa``
    with ``read_fasta`` (used here as a mapping whose values are the exon
    sequences) and returns one row per exon, in file order.

    Args:
        org_name: Directory / file prefix of the organism.

    Returns:
        DataFrame with columns ``length`` and ``sequence`` and a default
        0..n-1 index.
    """
    exon_records = read_fasta(
        f"../Datasets/{org_name}/ncbi_dataset/data/{org_name}_exons.fa"
    )

    # Materialise the values once so both columns come from the same snapshot.
    sequences = list(exon_records.values())
    exon_table = pd.DataFrame(
        {
            "length": [len(sequence) for sequence in sequences],
            "sequence": sequences,
        }
    )
    return exon_table

In [70]:
df_nematostella_vectensis = analyze_exons("nematostella_vectensis")
df_stylophora_pistillata = analyze_exons("stylophora_pistillata")
df_actinia_tenebrosa = analyze_exons("actinia_tenebrosa")
# df_hydra_vulgaris = analyze_exons("hydra_vulgaris")

In [71]:
df_nematostella_vectensis

Unnamed: 0,length,sequence
0,218,GTAACCGCGCCTGCCTCTTTAAGGCCAGGATGCCAAGTAAACTTGA...
1,250,GTGACGGCATGGAGGATGATAGGCAGAGTAATAGAGGTAGAGGTCG...
2,148,CACACCATATGGCGCGCGACCCCCTAGTAGGCGTGGCTATCAGCAT...
3,84,GTTAACAGGGCCAAGATCCATGACAAAGAATGGCTCATCAAACGGC...
4,105,TTCCATTACATGGGTGAAAGTGCTGTGTTTTTTGTGGAGGGCTCAA...
5,111,TTGATAATTACAGTACGGCCAAGCCAGAAACCATTTAAAAGTCACA...
6,70,GAATGTCTTAGCAATCGATACAATCCTCAAACAAAAACTATGGATC...
7,89,TTCTAAAAGCAAATAATGTTTTTGGAGCTCTGAACAAGTATCCACT...
8,108,GTTGAGTCCCTAGATGTCAGCAATAACCGGCTATTCCAGTTGGACC...
9,116,TTAAAAAGTATTGATGAACTCGACAAAGTGAAGGGACTCAAGGAGC...


In [72]:
df_stylophora_pistillata

Unnamed: 0,length,sequence
0,243,ATGTGGTTCCTACCACCAGTGGTCTGCTTCGATAATAAGCTTCATT...
1,210,AAGACAAAACTTGTATCAGATAATCCAGTCTATTGTGGAATGAGCA...
2,167,AAGGAGACANCAGGCAGAAGCAGGGGTAGAGGAAGGGGGAGAGGAA...
3,139,CAATCCTTACAACAGAAGACCTCCGTCTCGTCGTGGAGACAGAGGA...
4,104,CCCCACCCCAAAACCAAACCAATAATTTTCCTGTTCGTGAAGTTCA...
5,70,GAGTGTCTCAGTAAACGTTATGATAATATGGCAAAAGCACTCGACC...
6,89,TGCTAAATTCAAATAATGTACAAGGAGTACTGTCAAGATATTGGCT...
7,108,GTTCAGTCACTTGACTTAAGTAACAATCGCCTAAAGAGCTTGGATG...
8,116,TTAAGAGCTGTAGAAGAACTGGACAAGATAAAGTCCTTGAGTCAGC...
9,51,TTTTTTTGGTGATCCTGCAGTTAAGGACCTGGTGCTAAAATTCATT...


In [73]:
df_actinia_tenebrosa

Unnamed: 0,length,sequence
0,183,TTTTTCTCTAGTGTCGCGAGGATTATTGTATAAATATTGTTCAACA...
1,229,ATAACCAAGATTCAAGCTCAAACTCTAATAGAGGAGGATATCGAGG...
2,133,TACGCCATATGGAACAAGACCACCTTCAAGGCGTGGTTTTCAAAAT...
3,84,ATTGTAAAAGGGAAGCAGCACGACAAAGATTGGTTACTCAGAAAAC...
4,105,TTCCATTACAAGGGAGACACTGCTATGTTTTTTGTAGAGGGTAGTA...
5,123,TTGACCCTCACTGTTAGACAAAGTGAACGGCCATTTATGACAAATC...
6,70,GAATGTTTGAGCAATCGCTACAATATGGAAACAAAGACTATGGATT...
7,89,TTTTAAAGGCTAATAATGTTCAAGGAAGTTTATATAGAGCTCCTGT...
8,108,ATGCAAGGTTTGGATATGAGTGATAATCGCCTGTACAATCTTGAAG...
9,116,ATGAGGAATATTGAAGAACTGGATAAACTCAAAGGACTTACTGGAG...


In [41]:
def concat_2_exons(df: pd.DataFrame, org_name: str, indices: list):
    """Write two exons, joined end to end, as a single-record FASTA file.

    The rows at positions ``indices[0]`` and ``indices[1]`` of ``df`` are
    looked up, their ``sequence`` values are concatenated in that order, and
    the result is saved under ``../Datasets/<org_name>/ncbi_dataset/data/``
    as ``<org_name>_<i0>-<len0>-<i1>_<len1>.fa`` with ``>org_name`` as the
    header line.

    Args:
        df: Exon table with a ``sequence`` column.
        org_name: Directory / file prefix of the organism.
        indices: Positional row indices of the two exons to join.
    """
    first_idx, second_idx = indices[0], indices[1]
    first_seq = df.iloc[first_idx].sequence
    second_seq = df.iloc[second_idx].sequence

    # Other cells open these files by name, so the pattern must stay as is.
    filename = f"{org_name}_{first_idx}-{len(first_seq)}-{second_idx}_{len(second_seq)}.fa"
    path = f"../Datasets/{org_name}/ncbi_dataset/data/"

    with open(f"{path}/{filename}", "w") as handle:
        handle.write(f">{org_name}\n" + f"{first_seq + second_seq}\n")

In [42]:
# Use the stylophora exon table from analyze_exons: the bare `df` is the gene
# table loaded at the top of the notebook and has no `sequence` column.
concat_2_exons(df_stylophora_pistillata, "stylophora_pistillata", [8, 9])

In [43]:
# Use the stylophora exon table from analyze_exons: the bare `df` is the gene
# table loaded at the top of the notebook and has no `sequence` column.
concat_2_exons(df_stylophora_pistillata, "stylophora_pistillata", [13, 14])

In [74]:
concat_2_exons(df_nematostella_vectensis, "nematostella_vectensis", [9, 10])

In [75]:
concat_2_exons(df_actinia_tenebrosa, "actinia_tenebrosa", [9, 10])

In [45]:
org_name_1 = "nematostella_vectensis"
org_name_2 = "stylophora_pistillata"

dict_align = {
    f"{org_name_1}_gene": read_single_fasta(f"../Datasets/{org_name_1}/ncbi_dataset/data/gene.fna"),
    f"{org_name_2}_2_exons": read_single_fasta(f"../Datasets/{org_name_2}/ncbi_dataset/data/{org_name_2}_8-116-9_51.fa"),
}
dict_align_to_fasta(dict_align, f"../Alignment/nematostella_gene_stylophora_2_exon.fa")
dict_align_to_fasta(dict_align, f"../Alignment/nematostella_gene_stylophora_2_exon.aln")

In [77]:
org_name_1 = "nematostella_vectensis"
org_name_2 = "actinia_tenebrosa"

dict_align = {
    f"{org_name_1}_gene": read_single_fasta(f"../Datasets/{org_name_1}/ncbi_dataset/data/gene.fna"),
    f"{org_name_1}_2_exons": read_single_fasta(f"../Datasets/{org_name_1}/ncbi_dataset/data/{org_name_1}_9-116-10_37.fa"),
}
dict_align_to_fasta(dict_align, f"../Alignment/nematostella_gene_2_exon.fa")
dict_align_to_fasta(dict_align, f"../Alignment/nematostella_gene_2_exon.aln")

dict_align = {
    f"{org_name_2}_gene": read_single_fasta(f"../Datasets/{org_name_2}/ncbi_dataset/data/gene.fna"),
    f"{org_name_2}_2_exons": read_single_fasta(f"../Datasets/{org_name_2}/ncbi_dataset/data/{org_name_2}_9-116-10_37.fa"),
}
dict_align_to_fasta(dict_align, f"../Alignment/actinia_gene_2_exon.fa")
dict_align_to_fasta(dict_align, f"../Alignment/actinia_gene_2_exon.aln")

In [78]:
org_name_1 = "nematostella_vectensis"
org_name_2 = "actinia_tenebrosa"

dict_align = {
    f"{org_name_1}_2_exons": read_single_fasta(f"../Datasets/{org_name_1}/ncbi_dataset/data/{org_name_1}_9-116-10_37.fa"),
    f"{org_name_2}_2_exons": read_single_fasta(f"../Datasets/{org_name_2}/ncbi_dataset/data/{org_name_2}_9-116-10_37.fa"),
}
dict_align_to_fasta(dict_align, f"../Alignment/nematostella_actinia_2_exon.fa")
dict_align_to_fasta(dict_align, f"../Alignment/nematostella_actinia_2_exon.aln")

In [59]:
def dict_align_create(org_names: list, align_type: str) -> dict:
    """Load the same kind of sequence file for several organisms.

    For every organism the file
    ``../Datasets/<org_name>/ncbi_dataset/data/<align_type>.<ext>`` is read
    with ``read_single_fasta``; ``ext`` is ``fna`` for ``"gene"`` and
    ``"rna"`` and ``faa`` for ``"protein"``.

    Args:
        org_names: Organism directory names.
        align_type: One of ``"gene"``, ``"rna"`` or ``"protein"``.

    Returns:
        Dict mapping each organism name to whatever ``read_single_fasta``
        returned for its file, in the order of ``org_names``.

    Raises:
        ValueError: If ``align_type`` is not one of the three supported kinds.
    """
    if align_type == "protein":
        ext = "faa"
    elif align_type == "gene" or align_type == "rna":
        ext = "fna"
    else:
        raise ValueError(f"Unknown alignment type: {align_type}")

    filename = f"{align_type}.{ext}"
    return {
        f"{org_name}": read_single_fasta(f"../Datasets/{org_name}/ncbi_dataset/data/{filename}")
        for org_name in org_names
    }

In [57]:
org_names = [
    "nematostella_vectensis",
    "stylophora_pistillata",
    "actinia_tenebrosa",
    "hydra_vulgaris",
]

In [62]:
dict_align = dict_align_create(org_names, "gene")

In [63]:
dict_align_to_fasta(dict_align, f"../Alignment/4_cnidaria_gene.fa")
dict_align_to_fasta(dict_align, f"../Alignment/4_cnidaria_gene.aln")

In [None]:
>NW_022259381.1:16092-33206 LOC116292710 [organism=Actinia tenebrosa] [GeneID=116292710] [chromosome=Un]
>NC_088975.1:c739315-736736 LOC100214173 [organism=Hydra vulgaris] [GeneID=100214173] [chromosome=12]
>NC_064040.1:11955766-11973360 LOC5510737 [organism=Nematostella vectensis] [GeneID=5510737] [chromosome=7]
>NW_019218197.1:98177-131366 LOC111339613 [organism=Stylophora pistillata] [GeneID=111339613] [chromosome=Un]

In [79]:
actinia_tenebrosa_cassette = "ATGAGGAATATTGAAGAACTGGATAAACTCAAAGGACTTACTGGAGTAGTTACCCTTTTTCTGAATGGGAATCCATTCTGTGATAAATTTGAAGGCAAAGAATCAAGCTATATAAGGTTAGTTGAGTGAAATTATTATTCCCTAACAAAATAGATGTTATGATGTCACTTTGAAAACCTAGCATGTAGGTATGATGCTAGATTATATTCTCAGCAGCTTCTTCTAACCAGTAACTACAGTTAGAGAAAAGATAGTAGAATCTAATAAAATAAATGCTTTTACTTTTCAGTGCGGTTAGAAGCAGGTTTCCAAAAGTCTTAAATCTG"

In [83]:
find_codon(actinia_tenebrosa_cassette, which="stop", frame_shift=0, print_seq=True)

ATG-AGG-AAT-ATT-GAA-GAA-CTG-GAT-AAA-CTC-AAA-GGA-CTT-ACT-GGA-GTA-GTT-ACC-CTT-TTT-CTG-AAT-GGG-AAT-CCA-TTC-TGT-GAT-AAA-TTT-GAA-GGC-AAA-GAA-TCA-AGC-TAT-ATA-AGG-TTA-GTT-GAG-TGA


126

In [84]:
actinia_tenebrosa_2_exon = "ATGAGGAATATTGAAGAACTGGATAAACTCAAAGGACTTACTGGAGTAGTTACCCTTTTTCTGAATGGGAATCCATTCTGTGATAAATTTGAAGGCAAAGAATCAAGCTATATAAGTGCGGTTAGAAGCAGGTTTCCAAAAGTCTTAAATCTG"

In [90]:
actinia_tenebrosa_rna = read_single_fasta("../Datasets/actinia_tenebrosa/ncbi_dataset/data/rna.fna")

In [88]:
actinia_tenebrosa_rna_cassette = actinia_tenebrosa_rna.replace(actinia_tenebrosa_2_exon, actinia_tenebrosa_cassette)

In [91]:
len(actinia_tenebrosa_rna)

2869

In [92]:
len(actinia_tenebrosa_2_exon)

153

In [93]:
len(actinia_tenebrosa_cassette)

326

In [94]:
len(actinia_tenebrosa_rna_cassette)

3042

In [95]:
326-153 == 3042-2869

True

In [103]:
actinia_tenebrosa_rna_cassette[98:1253]

'ATGGCAGGTCTGTTTGGGAAGGCAATGAAGGATGCCTCATTATCGGTAACGACCACAAGAGAAGGCTCTCGGACGTTCGATCAAGATAACCAAGATTCAAGCTCAAACTCTAATAGAGGAGGATATCGAGGCAGAAGTCAAGGAAGAGGTCAAAAGAGAGGTCGTGGTAGAGGAAACTACTGGAGGCCTAGGGGKAGAGGTGGTAGAGGAAGAGCCAATCCACAGAATCCCACACCAAGGTCATACCTGATTGATGAAGAAGATGATGAAAGCATGGGCGATGAAGAAGACAATGCCACATCTTTTTCAAGATATACGCCATATGGAACAAGACCACCTTCAAGGCGTGGTTTTCAAAATGACAGAGGGAACAACAGGGGAACTGGTGGTATTAAACGATGGCTTGGAAATCAACCCCAAGGAAAATCAGACTGGTACAAAGTTGCTATTGTAAAAGGGAAGCAGCACGACAAAGATTGGTTACTCAGAAAACTACAAAATGCTTCAGAAGAAGCCTTTCAACCAGTTGAGTTCCATTACAAGGGAGACACTGCTATGTTTTTTGTAGAGGGTAGTAGTGCTGCAGATGCATTAAAAAAAGTCAGTCATCAAATAACTGTCAAAGATGGTTCCAAGTTGACCCTCACTGTTAGACAAAGTGAACGGCCATTTATGACAAATCATAGTGGTAAAGAAGGCAGTGCTTTTAGTGGAACTGATAGCTCACAATGGAATGCAGAAACAGAGCAAGCATTAAAGGAATGTTTGAGCAATCGCTACAATATGGAAACAAAGACTATGGATTTATCTGATCTTTTTCATGATGAAGTTTTAAAGGCTAATAATGTTCAAGGAAGTTTATATAGAGCTCCTGTGGCAAATGCAATCTTGAAACTTATTGGTGAAAATTGTCCTGATATGCAAGGTTTGGATATGAGTGATAATCGCCTGTACAATCTTGAAGCAATGAAAGACTTGCCTACTTATGCACCCAGTATA

In [99]:
actinia_tenebrosa_rna_cassette_prot = choose_best_frameshift(actinia_tenebrosa_rna_cassette, translate=True)

Frameshift 0: Start: 129, Stop: 147, Length: 18
Frameshift 1: Start: 496, Stop: 628, Length: 132
Frameshift 2: Start: 98, Stop: 1253, Length: 1155


In [100]:
actinia_tenebrosa_rna_prot = choose_best_frameshift(actinia_tenebrosa_rna, translate=True)

Frameshift 0: Start: 129, Stop: 147, Length: 18
Frameshift 1: Start: 496, Stop: 628, Length: 132
Frameshift 2: Start: 98, Stop: 2192, Length: 2094


In [101]:
actinia_tenebrosa_rna_cassette_prot

'MAGLFGKAMKDASLSVTTTREGSRTFDQDNQDSSSNSNRGGYRGRSQGRGQKRGRGRGNYWRPRGRGGRGRANPQNPTPRSYLIDEEDDESMGDEEDNATSFSRYTPYGTRPPSRRGFQNDRGNNRGTGGIKRWLGNQPQGKSDWYKVAIVKGKQHDKDWLLRKLQNASEEAFQPVEFHYKGDTAMFFVEGSSAADALKKVSHQITVKDGSKLTLTVRQSERPFMTNHSGKEGSAFSGTDSSQWNAETEQALKECLSNRYNMETKTMDLSDLFHDEVLKANNVQGSLYRAPVANAILKLIGENCPDMQGLDMSDNRLYNLEAMKDLPTYAPSIQHLKLSNNQMRNIEELDKLKGLTGVVTLFLNGNPFCDKFEGKESSYIRLVE*'

In [102]:
actinia_tenebrosa_rna_prot

'MAGLFGKAMKDASLSVTTTREGSRTFDQDNQDSSSNSNRGGYRGRSQGRGQKRGRGRGNYWRPRGRGGRGRANPQNPTPRSYLIDEEDDESMGDEEDNATSFSRYTPYGTRPPSRRGFQNDRGNNRGTGGIKRWLGNQPQGKSDWYKVAIVKGKQHDKDWLLRKLQNASEEAFQPVEFHYKGDTAMFFVEGSSAADALKKVSHQITVKDGSKLTLTVRQSERPFMTNHSGKEGSAFSGTDSSQWNAETEQALKECLSNRYNMETKTMDLSDLFHDEVLKANNVQGSLYRAPVANAILKLIGENCPDMQGLDMSDNRLYNLEAMKDLPTYAPSIQHLKLSNNQMRNIEELDKLKGLTGVVTLFLNGNPFCDKFEGKESSYISAVRSRFPKVLNLDGVEHAPPIGFDLATTTALPKVQGSYLVDPEIKKLLLSFLEQYFRIYDSNDRQPLLEAYHDQAIFSMSVNPGSFNREKGPRGPSLGEYMKSSRNMIRRKEQDVRASLIKHNRLSVVAMLNELPPTTHELSSFVVDVSLAIPTCLHFSIRGFFMEGNKTMRSFTRVFVALPAAGGKSLKIVNDELHVRGPALPQIQAFKKQQESITATSTLQVPAAVPTIPTTLPTIVTVPTLTPSTSLTTLPIPSGLTPEQQQMILQFSRESRMNAEWSKKCLADNGWDYQKSAECFTSLNNQGLIPPGAFIKT*'

Пайплайн:

- прочитать табличку генов nxf
- достать из нее сначала все таксоны (это индекс в табличке), затем по ним сделать сабсет таблицы и достать gene_id
- с помощью datasets по gene_id скачать весь ген, рнк и белок (через модуль subprocess)
- отдельно скачать все экзоны (КАК?)
- функция для dict_align....