In [1]:
import pandas as pd

from entrez import select_phyla
from fasta_processing import read_single_fasta, dict_align_to_fasta
from data_processing import choose_best_frameshift, save_subset_df_transcripts

In [2]:
column_names = ["tax_id", "org_name", "gene_id", "current_id", "status", "symbol", "aliases", "description",
                "other_designations", "map_location", "chromosome", "genomic_nucleotide_accession.version",
                "start_position_on_the_genomic_accession", "end_position_on_the_genomic_accession", "orientation",
                "exon_count", "to_delete_1", "to_delete_2"]

df = pd.read_csv("../all_nxf1.txt", sep="\t", skiprows=1, names=column_names, index_col=0)
df.drop(["to_delete_1", "to_delete_2"], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8407,Rana temporaria,120917577,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,11,NC_053499.1,120752259.0,120809664.0,plus,22.0
8364,Xenopus tropicalis,734058,0,live,nxf1,"mex67, tap",nuclear RNA export factor 1,nuclear RNA export factor 1,,4,NC_030680.2,36493131.0,36532401.0,minus,23.0
9598,Pan troglodytes,451267,0,live,NXF1,CK820_G0030982,nuclear RNA export factor 1,nuclear RNA export factor 1,,9,NC_072407.2,65104305.0,65118449.0,minus,23.0
9913,Bos taurus,512136,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,29,NC_037356.1,41099477.0,41110792.0,minus,22.0
9615,Canis lupus familiaris,483780,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,18,NC_051822.1,54936964.0,54947631.0,plus,21.0


In [4]:
cnidaria = select_phyla(df_cnidaria, "Cnidaria")

50429 Stylophora pistillata
6105 Actinia tenebrosa
6087 Hydra vulgaris
45351 Nematostella vectensis


In [5]:
cnidaria

{'Cnidaria': [50429, 6105, 6087, 45351]}

In [6]:
df_cnidaria = df.loc[cnidaria["Cnidaria"]]

In [4]:
df_cnidaria

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
50429,Stylophora pistillata,111339613,0,live,LOC111339613,"AWC38_SpisGene17253, NXF1",nuclear RNA export factor 1-like,nuclear RNA export factor 1-like|Nuclear RNA e...,,Un,NW_019218197.1,98177.0,131366.0,plus,21.0
6105,Actinia tenebrosa,116292710,0,live,LOC116292710,,nuclear RNA export factor 1-like,nuclear RNA export factor 1-like,,Un,NW_022259381.1,16092.0,33206.0,plus,24.0
6087,Hydra vulgaris,100214173,0,live,LOC100214173,,nuclear RNA export factor 1,nuclear RNA export factor 1,,12,NC_088975.1,736736.0,739315.0,minus,2.0
45351,Nematostella vectensis,5510737,0,live,LOC5510737,NEMVEDRAFT_v1g244041,nuclear RNA export factor 1,nuclear RNA export factor 1|predicted protein,,7,NC_064040.1,11955766.0,11973360.0,plus,24.0


In [3]:
df_cnidaria = df.loc[[50429, 6105, 6087, 45351]]

---

In [4]:
cnidaria_seqs = save_subset_df_transcripts(df_cnidaria)

Found 2 mRNA sequences for GeneID 111339613.
Saved transcript to ../References/stylophora_pistillata/XM_022946298.1.fasta
Saved transcript to ../References/stylophora_pistillata/NW_019218197.1.fasta

Found 2 mRNA sequences for GeneID 116292710.
Saved transcript to ../References/actinia_tenebrosa/XM_031700062.1.fasta
Saved transcript to ../References/actinia_tenebrosa/NW_022259381.1.fasta

Found 3 mRNA sequences for GeneID 100214173.
Saved transcript to ../References/hydra_vulgaris/XM_065811174.1.fasta
Saved transcript to ../References/hydra_vulgaris/XM_002156424.5.fasta


KeyboardInterrupt: 

In [54]:
hydra_vulgaris_cds = read_single_fasta("../References/hydra_vulgaris/XM_065811174.1.fasta")
hydra_vulgaris = choose_best_frameshift(hydra_vulgaris_cds, translate=True)

Frameshift 0: Start: 48, Stop: 63, Length: 15
Frameshift 1: Start: 4, Stop: 49, Length: 45
Frameshift 2: Start: 284, Stop: 2225, Length: 1941


In [50]:
stylophora_pistillata = choose_best_frameshift(cnidaria_seqs["stylophora_pistillata"], translate=True)
actinia_tenebrosa = choose_best_frameshift(cnidaria_seqs["actinia_tenebrosa"], translate=True)
hydra_vulgaris = choose_best_frameshift(cnidaria_seqs["hydra_vulgaris"], translate=True)
nematostella_vectensis = choose_best_frameshift(cnidaria_seqs["nematostella_vectensis"], translate=True)

Frameshift 0: Start: 0, Stop: 2226, Length: 2226
Frameshift 1: Start: 82, Stop: 172, Length: 90
Frameshift 2: Start: 788, Stop: 821, Length: 33
Frameshift 0: Start: 129, Stop: 147, Length: 18
Frameshift 1: Start: 496, Stop: 628, Length: 132
Frameshift 2: Start: 98, Stop: 2192, Length: 2094
Frameshift 0: Start: 201, Stop: 222, Length: 21
Frameshift 1: Start: 130, Stop: 163, Length: 33
Frameshift 2: Start: 110, Stop: 131, Length: 21
Frameshift 0: Start: 579, Stop: 594, Length: 15
Frameshift 1: Start: 127, Stop: 2227, Length: 2100
Frameshift 2: Start: 29, Stop: 89, Length: 60


In [56]:
align_dict = {
    "stylophora_pistillata": stylophora_pistillata,
    "actinia_tenebrosa": actinia_tenebrosa,
    "hydra_vulgaris": hydra_vulgaris,
    "nematostella_vectensis": nematostella_vectensis,
}
dict_align_to_fasta(align_dict, "cnidaria_4_sp")

In [3]:
from Bio import Entrez, SeqIO

Entrez.email = "artemvaskaa@gmail.com"
gene_id = "111339613"

In [4]:
search_term = f"{gene_id}[GeneID]"
handle = Entrez.esearch(db="nucleotide", term=f"{gene_id}[GeneID]", idtype="acc", retmax=3)
search_results = Entrez.read(handle)
handle.close()

In [6]:
seq_id = search_results["IdList"][0]

In [17]:
fetch_handle = Entrez.efetch(db="nucleotide", id=seq_id, rettype="gb", retmode="text")
record = SeqIO.read(fetch_handle, "genbank")
fetch_handle.close()

Может вот так последовательность доставать? Раз я ее сразу закидываю в choose_best_frameshift

In [21]:
print(record.seq)

ATGTGGTTCCTACCACCAGTGGTCTGCTTCGATAATAAGCTTCATTCCCAAGAGCCAGTCATCTACAGGGCAAAGCGTATGGATGAAGATGCTGCTCAGATCTTTCAGGTTGAACTGTTCGAAGAGAACATTAAGACGATTCATGAGCAATTCGAATTCACAAAGAAGGTGATCTTCACTCAAGAGGATAGGCAAACTTTCAATGCAAAGATGTCGTTGTCGGCTCTTCCATCGCCAAAGACGAAGACAAAACTTGTATCAGATAATCCAGTCTATTGTGGAATGAGCATCCTCGATATCAACAAGACACTGATGTATGACTTCCTCTATAACTACATCAAAGAGAAGAATGGAGATCGTGCAAAGCTTCTATTTACAGACACTGACAATCTCACGTATGAGATTGAAACTAAAGACTTTTACAAGGATATGGGTGAAGGTGTTGATGACAATAAGGAGACANCAGGCAGAAGCAGGGGTAGAGGAAGGGGGAGAGGAAGAGGGCGTAGCAGTAGCCACCATGGAAGCTATCCTCATCCTAGATCACACTTAGCTGGTGATGATGATGATGATGATGACATAGACATGGATGAAGGAAGCCAAAGATCTCAATCAAGATACAATCCTTACAACAGAAGACCTCCGTCTCGTCGTGGAGACAGAGGAAGTAATAGAGGTGATGTAAGGTCTAGATTAGGGGCAGTCCCTGATAGAGCTGCTGGAACTCAAGGAAATAAGTCGGACTGGCATAAAGTTGTGATACCACAAGGGAAAAAACACGACAAAGAATGGCTTATAAAAAAACTGCAGAATACATGTGAAGAGGCCTTCCAACCAGTTAGTTTCCATCCCTTCAAGGGAGAGTCTTCAGCCTTCTTTGTTGAAGGAAGTAAAGCTGCAGAAGCATTGAAAAGAGTTAGTCACAAAATAACTGTCAAGGATGGATCAAAGCTGATAGTGAGTGTTCGTGCCAGTGCCCCACCCCAAAACCAAACCAATA