In [1]:
import ast
import os

import pandas as pd
from Bio import Entrez

from data_processing import analyze_exons
from fasta_processing import read_single_fasta, dict_align_to_fasta
from parse_psi_blast_results import parse_psiblast_xml, filter_psiblast_hits, update_df, save_files, \
    create_many_cassettes, dict_align_info_analyze

Entrez.email = "artemvaskaa@gmail.com"

### Platyhelminthes

[schistosoma_haematobium_platyhelminthes](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_051072204.1%20NXF1_3%20%5Borganism%3DSchistosoma%20haematobium%5D%20%5BGeneID%3D24590952%5D%0AMPHGNHRNKSNHYRWRSHGENDRRIDTSDEYLRRREHGGKFSSGRKSIISGNNVDMLKRAMNMNLIGGSA%0AASVVTAQNSGLAPGEVWVRITIVHGANHPMMDLQQLVTTIVGTQLRFYNTCVEGRNALMHAKIRQKDVQS%0AYRKSLQNLRDPSQGSQLITDITIVPEPRVPSSSDKRNESPNTSPLPETWIEALKQCFVQRYQPTTRSLDL%0ASSLHTDPVLLSQGLYLPLNKQAVVHTLITILKQNQAQLAVLNLSNNRLTHLNAFSPLSSTSAGFIPVSIE%0ARIDLSSNPLSSIPVLSGLRDIVGLVELDLTETPLMSKFNPNDKSFAAKLHTILPTIKRLNGQELPQTVQF%0AAIEQGSDSSKRPPTKPLPQSILGFFPNDEVKIALLSFLKLYLSRYDSKPRGESLLPYYTTVSQLVFSVSP%0AENRFPNSQNVSFTARVEIQNGSDQPTTAYLTTSRLNQAYFLRSRNLLRCRDQSRRRDMVVRGSLAIAHFL%0ADELPTTEHQLESLSVDVAFHSGTQMLFTMGGVFYEVSSMGSSSTNSSSHEKSVRKVLRCFTRTMILIAPG%0AGHIVQDDYIVSNPTTSLCKKYITEMATRCKQDSQASNQQQNVLSSDPSAPEVKENIVIEFSRRTGMNIPF%0ASRQCLEEYEWNANAALTAFETMNLAGKIPPEAFSV&JOB_TITLE=schistosoma_haematobium_platyhelminthes&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Platyhelminthes%20%28taxid%3A6157%29&EQ_MENU1=Schistosoma%20haematobium%20(taxid:6185)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

In [2]:
# Section configuration. NOTE(review): `dir` shadows the builtin of the same name;
# the name is kept because the following cells read it.
phyla = "Platyhelminthes"
dir = f"../Sequences_protein_id/{phyla}"

# PSI-BLAST result file. The original note flags "XML2" — presumably the file must be
# exported in the BLAST XML2 format; confirm against parse_psiblast_xml.
name_of_blast_res = f"../Blast_res/psi_blast/schistosoma_haematobium_{phyla.lower()}.xml"

# parse -> filter -> update, written as a single pipeline. Only min_sbjct_len is a
# non-zero threshold here.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0,
        min_ident=0,
        min_sbjct_len=500,
    )
)

KeyError: ProteinID XP_018646589.1 GeneID XM_018799620.1 -> skipping...
IndexError: ProteinID CDI98011.1 GeneID not found -> skipping...
IndexError: ProteinID CDS21420.1 GeneID not found -> skipping...
IndexError: ProteinID CDS25628.1 GeneID not found -> skipping...
KeyError: ProteinID XP_024351515.1 GeneID XM_024494126.1 -> skipping...


In [3]:
# Create the per-phylum directory (no-op when it already exists) and store the hit
# table there as a tab-separated file, index included.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [4]:
# Pass the hit table and the per-phylum directory (same path as `dir`) to save_files.
# NOTE(review): presumably this creates one sub-directory per protein_id — the analysis
# cell below reads {dir}/{protein_id}/exons.fa — confirm in parse_psi_blast_results.
save_files(df, f"../Sequences_protein_id/{phyla}")

In [9]:
# Section configuration, repeated so this cell can run without the download cells.
# NOTE(review): `dir` shadows the builtin; kept because later cells read it.
phyla = "Platyhelminthes"
dir = f"../Sequences_protein_id/{phyla}"

# Reload the hit table written by the save cell of this section. The columns at
# positions 11 and 13 are turned back into Python objects with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Exon length(s) that identify the exon of interest.
ref_exon_len = [37]
found_protein_ids = {}  # protein_id -> reference lengths present among its exons
exons_by_protein = {}   # protein_id -> exon table, parsed once and reused below

# Each exons.fa is analysed exactly once per distinct protein_id.
# Assumes analyze_exons is a plain read of the file — TODO confirm.
for protein_id in df.protein_id.unique():
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein[protein_id] = df_exons

# Keep only the hits that have an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

data = {}

# One entry per protein_id; org_name is taken from its first row in the table.
first_hits = df_found_protein_ids.drop_duplicates(subset="protein_id")
for protein_id, org_name in zip(first_hits.protein_id, first_hits.org_name):
    df_exons = exons_by_protein[protein_id]
    # First exon whose length is one of the matched reference lengths
    # (with ref_exon_len = [37] this is the first 37-long exon).
    matched = df_exons["length"].isin(found_protein_ids[protein_id])
    exon_37_idx = df_exons[matched].index[0]
    # The exon immediately before it.
    # NOTE(review): if the matched exon is the first row there is no preceding exon —
    # confirm this cannot happen for these gene models.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [10]:
# Build cassettes for every selected hit. `data` maps "<org_name>__<protein_id>" to
# (exon table, [exon_110_idx, exon_37_idx]) as assembled in the previous cell.
# NOTE(review): `introns` is not read again in the cells visible here.
introns = create_many_cassettes(dir, data)

In [11]:
# Run dict_align_info_analyze in "cds" mode for the selected hits; it returns a summary
# frame and a dict (presumably name -> sequence for alignment — confirm).
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

Mesocestoides_corti__VDD84209.1: start codon not in the beginning of sequence


array([ True])

In [12]:
# Run dict_align_info_analyze in "cds_cassette" mode for the selected hits and display
# the resulting summary frame.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Mesocestoides_corti__VDD84209.1: start codon not in the beginning of sequence


Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Schistosoma_guineensis__CAH8656942.1,981,False,980,1,652
1,Schistosoma_curassoni__CAH8663826.1,981,False,980,1,652
2,Schistosoma_bovis__CAH8671530.1,981,False,980,1,652
3,Schistosoma_margrebowiei__CAH8667553.1,981,False,980,1,650
4,Schistosoma_intercalatum__CAH8646207.1,981,False,980,1,652
5,Schistosoma_rodhaini__CAH8677847.1,981,False,980,1,671
6,Schistosoma_japonicum__TNN16962.1,978,False,977,1,847
7,Schistosoma_turkestanicum__CAH8587689.1,978,False,977,1,905
8,Paragonimus_westermani__KAF8560478.1,993,False,992,1,13971
9,Taenia_crassiceps__KAL5103217.1,981,False,980,1,278


In [13]:
# os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)
#
# dict_align_cds_cassette["Schistosoma_haematobium_reference"] = read_single_fasta("../Datasets/Spiralia/schistosoma_haematobium_1/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Mollusca/Bivalvia

[mya_arenaria](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_052767696.1%20LOC128208251%20%5Borganism%3DMya%20arenaria%5D%20%5BGeneID%3D128208251%5D%0AMAEFRVTTDRDGSRSFNDHDDRWTRGGGSGRYRGRGGNFNRRPRRGGGYGYGGSSYNRGRGGYSGGRGRG%0AGDRDGPGPRSRLDDDGDETMGNSETKDIARYNPYGRSSRGNFRGNSKYRSDNRNRPTQIGGDAGIFKRLG%0ALPLDRKTMGDSDWYKVTIPWGKKTEKDFILKSINDHIDVPFVPTYFHYEDKNAVFYVNDSRAAEGLKSTT%0AKRVTMPNGYKMTILVKPSQPPNIPMGKEEIDKLKVCMSNRYDPATKALNLSCLHTDQELAQSNLFMCLAR%0APQVMSNVVKVIKENIPELVQLDVSNNKLQSLEHLGGLVPSTPDMKVLNLANNKINMLEELRKVQKWKIDW%0ALTLDGNPLCDKFNDHTAYVSGVRRLFPKVLKLDTTDLPPPITFDIEARTDLPKSKDSYFPNDTVKNGVVK%0AFLKDYFLVYDSDDRTGLAGAYHETAMFSLSTSYNPTVQNKQTSLSTYIDETRNLLRVYKDTSRKFKTLKN%0AGNKIVAQLCLLPKTQHDPNSFVVDCNFATAQMISFNIQGVFKEVDKKSDKPPMRAFSRTFVTVASGSGMV%0AIVNDVLTVTNASPDQIQTAFKNPAPTPSSSPIPQTSPTEPFAAAGLTEIQTQMVASFMNDSRMNSEWSAK%0ACLVQNNWEYAKAGQNFLELQQKGLIPPEAFKT&JOB_TITLE=mya_arenaria_bivalvia&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Bivalvia%20%28taxid%3A6544%29&EQ_MENU1=Mya%20arenaria%20(taxid:6604)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NEW_VIEW=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

[crassostrea_virginica_bivalvia](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_052767696.1%20LOC128208251%20%5Borganism%3DMya%20arenaria%5D%20%5BGeneID%3D128208251%5D%0AMAEFRVTTDRDGSRSFNDHDDRWTRGGGSGRYRGRGGNFNRRPRRGGGYGYGGSSYNRGRGGYSGGRGRG%0AGDRDGPGPRSRLDDDGDETMGNSETKDIARYNPYGRSSRGNFRGNSKYRSDNRNRPTQIGGDAGIFKRLG%0ALPLDRKTMGDSDWYKVTIPWGKKTEKDFILKSINDHIDVPFVPTYFHYEDKNAVFYVNDSRAAEGLKSTT%0AKRVTMPNGYKMTILVKPSQPPNIPMGKEEIDKLKVCMSNRYDPATKALNLSCLHTDQELAQSNLFMCLAR%0APQVMSNVVKVIKENIPELVQLDVSNNKLQSLEHLGGLVPSTPDMKVLNLANNKINMLEELRKVQKWKIDW%0ALTLDGNPLCDKFNDHTAYVSGVRRLFPKVLKLDTTDLPPPITFDIEARTDLPKSKDSYFPNDTVKNGVVK%0AFLKDYFLVYDSDDRTGLAGAYHETAMFSLSTSYNPTVQNKQTSLSTYIDETRNLLRVYKDTSRKFKTLKN%0AGNKIVAQLCLLPKTQHDPNSFVVDCNFATAQMISFNIQGVFKEVDKKSDKPPMRAFSRTFVTVASGSGMV%0AIVNDVLTVTNASPDQIQTAFKNPAPTPSSSPIPQTSPTEPFAAAGLTEIQTQMVASFMNDSRMNSEWSAK%0ACLVQNNWEYAKAGQNFLELQQKGLIPPEAFKT&JOB_TITLE=mya_arenaria_bivalvia&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Bivalvia%20%28taxid%3A6544%29&EQ_MENU1=Mya%20arenaria%20(taxid:6604)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NEW_VIEW=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

[magallana_gigas_bivalvia](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_052767696.1%20LOC128208251%20%5Borganism%3DMya%20arenaria%5D%20%5BGeneID%3D128208251%5D%0AMAEFRVTTDRDGSRSFNDHDDRWTRGGGSGRYRGRGGNFNRRPRRGGGYGYGGSSYNRGRGGYSGGRGRG%0AGDRDGPGPRSRLDDDGDETMGNSETKDIARYNPYGRSSRGNFRGNSKYRSDNRNRPTQIGGDAGIFKRLG%0ALPLDRKTMGDSDWYKVTIPWGKKTEKDFILKSINDHIDVPFVPTYFHYEDKNAVFYVNDSRAAEGLKSTT%0AKRVTMPNGYKMTILVKPSQPPNIPMGKEEIDKLKVCMSNRYDPATKALNLSCLHTDQELAQSNLFMCLAR%0APQVMSNVVKVIKENIPELVQLDVSNNKLQSLEHLGGLVPSTPDMKVLNLANNKINMLEELRKVQKWKIDW%0ALTLDGNPLCDKFNDHTAYVSGVRRLFPKVLKLDTTDLPPPITFDIEARTDLPKSKDSYFPNDTVKNGVVK%0AFLKDYFLVYDSDDRTGLAGAYHETAMFSLSTSYNPTVQNKQTSLSTYIDETRNLLRVYKDTSRKFKTLKN%0AGNKIVAQLCLLPKTQHDPNSFVVDCNFATAQMISFNIQGVFKEVDKKSDKPPMRAFSRTFVTVASGSGMV%0AIVNDVLTVTNASPDQIQTAFKNPAPTPSSSPIPQTSPTEPFAAAGLTEIQTQMVASFMNDSRMNSEWSAK%0ACLVQNNWEYAKAGQNFLELQQKGLIPPEAFKT&JOB_TITLE=mya_arenaria_bivalvia&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Bivalvia%20%28taxid%3A6544%29&EQ_MENU1=Mya%20arenaria%20(taxid:6604)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NEW_VIEW=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

#### mya_arenaria

In [14]:
# Section configuration. NOTE(review): `dir` shadows the builtin of the same name;
# the name is kept because the following cells read it.
phyla = "Bivalvia"
dir = f"../Sequences_protein_id/{phyla}"

# PSI-BLAST result file. The original note flags "XML2" — presumably the file must be
# exported in the BLAST XML2 format; confirm against parse_psiblast_xml.
name_of_blast_res = f"../Blast_res/psi_blast/mya_arenaria_{phyla.lower()}.xml"

# parse -> filter -> update, written as a single pipeline. Only min_sbjct_len is a
# non-zero threshold here.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0,
        min_ident=0,
        min_sbjct_len=500,
    )
)

In [15]:
# Create the per-phylum directory (no-op when it already exists) and store the hit
# table there as a tab-separated file, index included.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [16]:
# Pass the hit table and the per-phylum directory (same path as `dir`) to save_files.
# NOTE(review): presumably this creates one sub-directory per protein_id — the analysis
# cell below reads {dir}/{protein_id}/exons.fa — confirm in parse_psi_blast_results.
save_files(df, f"../Sequences_protein_id/{phyla}")

In [17]:
# Section configuration, repeated so this cell can run without the download cells.
# NOTE(review): `dir` shadows the builtin; kept because later cells read it.
phyla = "Bivalvia"
dir = f"../Sequences_protein_id/{phyla}"

# Reload the hit table written by the save cell of this section. The columns at
# positions 11 and 13 are turned back into Python objects with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Exon length(s) that identify the exon of interest.
ref_exon_len = [37]
found_protein_ids = {}  # protein_id -> reference lengths present among its exons
exons_by_protein = {}   # protein_id -> exon table, parsed once and reused below

# Each exons.fa is analysed exactly once per distinct protein_id.
# Assumes analyze_exons is a plain read of the file — TODO confirm.
for protein_id in df.protein_id.unique():
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein[protein_id] = df_exons

# Keep only the hits that have an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

data = {}

# One entry per protein_id; org_name is taken from its first row in the table.
first_hits = df_found_protein_ids.drop_duplicates(subset="protein_id")
for protein_id, org_name in zip(first_hits.protein_id, first_hits.org_name):
    df_exons = exons_by_protein[protein_id]
    # First exon whose length is one of the matched reference lengths
    # (with ref_exon_len = [37] this is the first 37-long exon).
    matched = df_exons["length"].isin(found_protein_ids[protein_id])
    exon_37_idx = df_exons[matched].index[0]
    # The exon immediately before it.
    # NOTE(review): if the matched exon is the first row there is no preceding exon —
    # confirm this cannot happen for these gene models.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [18]:
# Build cassettes for every selected hit. `data` maps "<org_name>__<protein_id>" to
# (exon table, [exon_110_idx, exon_37_idx]) as assembled in the previous cell.
# NOTE(review): `introns` is not read again in the cells visible here.
introns = create_many_cassettes(dir, data)

In [19]:
# Run dict_align_info_analyze in "cds" mode for the selected hits; it returns a summary
# frame and a dict (presumably name -> sequence for alignment — confirm).
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

Dreissena_polymorpha__KAH3852511.1: start codon not in the beginning of sequence
Sinanodonta_woodiana__KAL3874362.1: no stop codon found


array([ True])

In [20]:
# Run dict_align_info_analyze in "cds_cassette" mode for the selected hits and display
# the resulting summary frame.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Dreissena_polymorpha__KAH3852511.1: start codon not in the beginning of sequence


Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Mercenaria_mercenaria__XP_053373641.1,1107,False,1106,1,1690
1,Dreissena_polymorpha__XP_052275796.1,1107,False,1106,1,2207
2,Ruditapes_philippinarum__XP_060562241.1,1095,False,1094,1,1646
3,Mactra_antiquata__KAL4236273.1,840,False,839,1,2319
4,Mytilus_coruscus__CAC5418521.1,1089,False,1088,1,1234
5,Potamilus_streckersoni__KAK3583320.1,1071,False,1070,1,4567
6,Mytilus_californianus__XP_052058679.1,1089,False,1088,1,1248
7,Saccostrea_echinata__XP_061162409.1,1098,False,1097,1,1556
8,Mytilus_edulis__CAG2249952.1,1089,False,1088,1,1360
9,Mytilus_edulis__XP_071161937.1,1089,False,1088,1,1360


In [24]:
# Make sure the per-phylum alignment output directory exists.
os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)

# Export is currently disabled. The lines below would add the reference cds_cassette
# sequence to the dict and write the dict to both a .fa and an .aln file.
# dict_align_cds_cassette["Mya_arenaria_reference"] = read_single_fasta("../Datasets/Spiralia/mya_arenaria_2/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Mollusca/Gastropoda

[gigantopelta_aegis](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_041353365.1%20LOC121371499%20%5Borganism%3DGigantopelta%20aegis%5D%20%5BGeneID%3D121371499%5D%0AMSSLTVTAGRDGSRSFGGHDDRWSGKQRNQRRGRGKSYYSRGSYNRGGNKGPRGGGVNPRSRFTDEDDDV%0ANMDSSSGHINRRFDPYGQGGGWNRRNRGRGSFKGGRYHGDFQSHKDTGAMQRMGLPVSRQNQWYKITIPH%0AGKKTGKDEILKLINSVIGAPFQPVYFHYQNKDALFYVKDWDQATALRRTSKMITLPSGFKMIVVVSPCSP%0APVIPMDKESIEKLQKRMSERYDPASKCLDLSSLYQDEVLSASGLYLALNRANTMSNVVKIIQENIPELVG%0ALDLSSNRLLSLSHMTDLVSAAPFVTKLNIGKNQLRSIEELQKIEGWKLLQLVLDGNDLCDRYKERSEYIS%0AMVRKRFPKVINLDGHELPPPITFDLETTTVIPPSKGSYFMNDEVKTIVVKFLKEYYTIYDSDNRQPLFEA%0AYHEQAVFCIATAYNGTLDYKQPSLTDYLQESRNIFRVKDTARREKSIKNGRLPVVSQLCLLPKTTHDPNS%0AFVVDINIVRPTLLSFSLTGIFKETESKSDKPPIRAFNRLFVTVPCGSGMVITNDVLTITNASPEQSQSAF%0AKSTGPTPSSSPVSSAPPAVPSTSLSPIPGTSAVPNKEMIQRFSAESGMNAEWSHKCLEENGWNYKKAALV%0AFTELHSQGKIPQDAFVK&JOB_TITLE=gigantopelta_aegis_gastropoda&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Gastropoda%20%28taxid%3A6448%29&EQ_MENU1=Gigantopelta%20aegis%20(taxid:1735272)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [4]:
# Section configuration. NOTE(review): `dir` shadows the builtin of the same name;
# the name is kept because the following cells read it.
phyla = "Gastropoda"
dir = f"../Sequences_protein_id/{phyla}"

# PSI-BLAST result file. The original note flags "XML2" — presumably the file must be
# exported in the BLAST XML2 format; confirm against parse_psiblast_xml.
name_of_blast_res = f"../Blast_res/psi_blast/gigantopelta_aegis_{phyla.lower()}.xml"

# parse -> filter -> update, written as a single pipeline. Only min_sbjct_len is a
# non-zero threshold here.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0,
        min_ident=0,
        min_sbjct_len=500,
    )
)

KeyError: ProteinID XP_009055075.1 GeneID XM_009056827.1 -> skipping...


In [5]:
# Create the per-phylum directory (no-op when it already exists) and store the hit
# table there as a tab-separated file, index included.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [6]:
# Pass the hit table and the per-phylum directory (same path as `dir`) to save_files.
# NOTE(review): presumably this creates one sub-directory per protein_id — the analysis
# cell below reads {dir}/{protein_id}/exons.fa — confirm in parse_psi_blast_results.
save_files(df, f"../Sequences_protein_id/{phyla}")

In [7]:
# Section configuration, repeated so this cell can run without the download cells.
# NOTE(review): `dir` shadows the builtin; kept because later cells read it.
phyla = "Gastropoda"
dir = f"../Sequences_protein_id/{phyla}"

# Reload the hit table written by the save cell of this section. The columns at
# positions 11 and 13 are turned back into Python objects with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Exon length(s) that identify the exon of interest.
ref_exon_len = [37]
found_protein_ids = {}  # protein_id -> reference lengths present among its exons
exons_by_protein = {}   # protein_id -> exon table, parsed once and reused below

# Each exons.fa is analysed exactly once per distinct protein_id.
# Assumes analyze_exons is a plain read of the file — TODO confirm.
for protein_id in df.protein_id.unique():
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein[protein_id] = df_exons

# Keep only the hits that have an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

data = {}

# One entry per protein_id; org_name is taken from its first row in the table.
first_hits = df_found_protein_ids.drop_duplicates(subset="protein_id")
for protein_id, org_name in zip(first_hits.protein_id, first_hits.org_name):
    df_exons = exons_by_protein[protein_id]
    # First exon whose length is one of the matched reference lengths
    # (with ref_exon_len = [37] this is the first 37-long exon).
    matched = df_exons["length"].isin(found_protein_ids[protein_id])
    exon_37_idx = df_exons[matched].index[0]
    # The exon immediately before it.
    # NOTE(review): if the matched exon is the first row there is no preceding exon —
    # confirm this cannot happen for these gene models.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [8]:
# Build cassettes for every selected hit. `data` maps "<org_name>__<protein_id>" to
# (exon table, [exon_110_idx, exon_37_idx]) as assembled in the previous cell.
# NOTE(review): `introns` is not read again in the cells visible here.
introns = create_many_cassettes(dir, data)

In [9]:
# Run dict_align_info_analyze in "cds" mode for the selected hits; it returns a summary
# frame and a dict (presumably name -> sequence for alignment — confirm).
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

Physella_acuta__XP_059147995.1: start codon not in the beginning of sequence
Candidula_unifasciata__CAG5136897.1: start codon not in the beginning of sequence


array([ True])

In [10]:
# Run dict_align_info_analyze in "cds_cassette" mode for the selected hits and display
# the resulting summary frame.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Physella_acuta__XP_059147995.1: start codon not in the beginning of sequence
Candidula_unifasciata__CAG5136897.1: start codon not in the beginning of sequence


Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Haliotis_cracherodii__XP_071079091.1,1041,False,1040,1,2506
1,Haliotis_rufescens__XP_046327842.1,1041,False,1040,1,2505
2,Haliotis_asinina__XP_067680995.1,1038,False,1037,1,2375
3,Patella_vulgata__XP_055957145.1,1119,False,1118,1,1384
4,Patella_caerulea__KAK6181635.1,1092,False,1091,1,1362
5,Patella_caerulea__KAK6181636.1,1092,False,1091,1,1362
6,Littorina_saxatilis__XP_070180466.1,1095,False,1076,19,6746
7,Batillaria_attramentaria__KAK7466614.1,1005,False,1004,1,8614
8,Lymnaea_stagnalis__CAL1544423.1,1074,False,1073,1,2705
9,Pomacea_canaliculata__PVD26032.1,471,False,470,1,255


In [11]:
# Make sure the per-phylum alignment output directory exists.
os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)

# Export is currently disabled. The lines below would add the reference cds_cassette
# sequence to the dict and write the dict to both a .fa and an .aln file.
# dict_align_cds_cassette["Gigantopelta_aegis_reference"] = read_single_fasta("../Datasets/Spiralia/gigantopelta_aegis_6/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Ecdysozoa/Nematoda

[caenorhabditis_elegans](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3ENP_001129880.1%20nxf-1%20%5Borganism%3DCaenorhabditis%20elegans%5D%20%5BGeneID%3D191736%5D%0AMNRKGFGGHRDAKQLSRTKNRFARLDPDTQSRYEDDDEPAVPVRASLTSASSRGRGGSSRGFGQSAASIA%0ANTGVRNADIVYKCRATGAAKKVDAKWLIKQLNQIIENFKPLLWTDNARGDFEWYVRDEDTASTIRANNRR%0AVVHKESGTRVEFYTSKVPAPWMKLKREEIEIIHRVVDKRHNAENRVLDLSNFHEDEEFKAKDMMMNLTKG%0ANVMLTVLDHIDDKYGNIVALSLSNNRIRHLDYASALVSIAKFVMELDLSHNHISTEKELEKFAGLPVERF%0AFFEGNPVVESFTQRAAYISYIHQSFPRCNMLDGVEVQPLVVGPDLDIHDAMPFRAGYYPNPQIRVLVEQF%0AVTSYFDFYDGPDGQRTRRNLHNAYDADASTFSLTIEHLRGSSHARHHNDECFAQYAGVSHNVLKQERFAR%0AHRASRSARGAMDIAVALSKLPTSSHMRDTFIVDVFLQSNDLLGFTVQGLFCDGDLTQTPSPSFFSRSFLV%0ASPRENDSVAVISDQLFITVASLDRLEKFKKLYDQSIANGAAVEQVSAVQIAQIGVNGMGFDGAPALPIRE%0AEMIKAMCQFSGMIPPFSEKCLADCAWNFDFACQKFNEIKSSVPAEAFAH&JOB_TITLE=caenorhabditis_elegans_nematoda&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Nematoda%20%28taxid%3A6231%29&EQ_MENU1=Caenorhabditis%20elegans%20(taxid:6239)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [12]:
# Section configuration. NOTE(review): `dir` shadows the builtin of the same name;
# the name is kept because the following cells read it.
phyla = "Nematoda"
dir = f"../Sequences_protein_id/{phyla}"

# PSI-BLAST result file, named <query species>_<lower-cased phylum>.xml like the other
# sections of this notebook (the resulting path is unchanged). The original note flags
# "XML2" — presumably the file must be exported in the BLAST XML2 format; confirm
# against parse_psiblast_xml.
name_of_blast_res = f"../Blast_res/psi_blast/caenorhabditis_elegans_{phyla.lower()}.xml"

# parse -> filter -> update. Only min_sbjct_len is a non-zero threshold here.
df = parse_psiblast_xml(name_of_blast_res)
df = filter_psiblast_hits(df, min_qc=0, min_ident=0, min_sbjct_len=500)
df = update_df(df)

KeyError: ProteinID XP_045092724.1 GeneID XM_045238126.1 -> skipping...
KeyError: ProteinID XP_064060911.1 GeneID XM_064205030.1 -> skipping...
IndexError: ProteinID OZC11383.1 GeneID not found -> skipping...


In [13]:
# Create the per-phylum directory (no-op when it already exists) and store the hit
# table there as a tab-separated file, index included.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [14]:
# Pass the hit table and the per-phylum directory (same path as `dir`) to save_files.
# NOTE(review): presumably this creates one sub-directory per protein_id — the analysis
# cell below reads {dir}/{protein_id}/exons.fa — confirm in parse_psi_blast_results.
save_files(df, f"../Sequences_protein_id/{phyla}")

In [15]:
# Section configuration, repeated so this cell can run without the download cells.
# NOTE(review): `dir` shadows the builtin; kept because later cells read it.
phyla = "Nematoda"
dir = f"../Sequences_protein_id/{phyla}"

# Reload the hit table written by the save cell of this section. The columns at
# positions 11 and 13 are turned back into Python objects with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Exon length(s) that identify the exon of interest.
ref_exon_len = [37]
found_protein_ids = {}  # protein_id -> reference lengths present among its exons
exons_by_protein = {}   # protein_id -> exon table, parsed once and reused below

# Each exons.fa is analysed exactly once per distinct protein_id.
# Assumes analyze_exons is a plain read of the file — TODO confirm.
for protein_id in df.protein_id.unique():
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein[protein_id] = df_exons

# Keep only the hits that have an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

data = {}

# One entry per protein_id; org_name is taken from its first row in the table.
first_hits = df_found_protein_ids.drop_duplicates(subset="protein_id")
for protein_id, org_name in zip(first_hits.protein_id, first_hits.org_name):
    df_exons = exons_by_protein[protein_id]
    # First exon whose length is one of the matched reference lengths
    # (with ref_exon_len = [37] this is the first 37-long exon).
    matched = df_exons["length"].isin(found_protein_ids[protein_id])
    exon_37_idx = df_exons[matched].index[0]
    # The exon immediately before it.
    # NOTE(review): if the matched exon is the first row there is no preceding exon —
    # confirm this cannot happen for these gene models.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [16]:
# Build cassettes for every selected hit. `data` maps "<org_name>__<protein_id>" to
# (exon table, [exon_110_idx, exon_37_idx]) as assembled in the previous cell.
# NOTE(review): `introns` is not read again in the cells visible here.
introns = create_many_cassettes(dir, data)

In [17]:
# Run dict_align_info_analyze in "cds" mode for the selected hits; it returns a summary
# frame and a dict (presumably name -> sequence for alignment — confirm).
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

Angiostrongylus_cantonensis__KAE9413040.1: no stop codon found
Mesorhabditis_belari__CAJ0960930.1: no stop codon found
Pristionchus_fissidentatus__GMT09523.1: start codon not in the beginning of sequence
Mesorhabditis_spiculigera__CAJ0568410.1: no stop codon found
Pristionchus_pacificus__KAF8386643.1: no stop codon found
Pristionchus_entomophagus__GMS77887.1: start codon not in the beginning of sequence
Pristionchus_pacificus__KAF8382567.1: start codon not in the beginning of sequence
Onchocerca_ochengi__VDK67817.1: no stop codon found
Aphelenchoides_avenae__KAH7707721.1: start codon not in the beginning of sequence


array([ True])

In [18]:
# Run dict_align_info_analyze in "cds_cassette" mode for the selected hits and display
# the resulting summary frame.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Pristionchus_fissidentatus__GMT09523.1: start codon not in the beginning of sequence
Pristionchus_entomophagus__GMS77887.1: start codon not in the beginning of sequence
Pristionchus_pacificus__KAF8382567.1: start codon not in the beginning of sequence
Aphelenchoides_avenae__KAH7707721.1: start codon not in the beginning of sequence


Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Caenorhabditis_nigoni__PIC28742.1,1140,False,926,214,142
1,Caenorhabditis_briggsae__UMM37086.1,1143,False,926,217,145
2,Caenorhabditis_brenneri__EGT34035.1,1095,False,920,175,130
3,Caenorhabditis_brenneri__EGT56062.1,1143,False,968,175,130
4,Caenorhabditis_sp._36_PRJEB53466__CAI2353387.1,1134,False,929,205,133
5,Caenorhabditis_angaria__CAI5450371.1,993,False,914,79,96
6,Caenorhabditis_bovis__CAB3397759.1,1236,False,920,316,235
7,Caenorhabditis_bovis__CAB3397758.1,1236,False,920,316,235
8,Caenorhabditis_auriculariae__CAD6196466.1,1047,False,902,145,156
9,Necator_americanus__ETN84757.1,1050,False,914,136,243


In [21]:
# How often each intron_length_to_stop_codon value occurs in the cds_cassette summary.
# NOTE(review): this cell carries execution count 21 but sits between cells 18 and 19,
# i.e. it was run out of order — re-run the notebook top to bottom before sharing.
df_cds_cassette.intron_length_to_stop_codon.value_counts()

intron_length_to_stop_codon
316    4
19     4
214    3
91     3
106    3
25     2
175    2
97     2
205    2
145    2
304    2
10     2
103    2
217    1
229    1
190    1
322    1
136    1
79     1
376    1
151    1
181    1
112    1
211    1
37     1
88     1
115    1
253    1
223    1
334    1
157    1
7      1
30     1
100    1
43     1
Name: count, dtype: int64

In [19]:
# Make sure the per-phylum alignment output directory exists.
os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)

# Export is currently disabled. The lines below would add the reference cds_cassette
# sequence to the dict and write the dict to both a .fa and an .aln file.
# dict_align_cds_cassette["Caenorhabditis_elegans_reference"] = read_single_fasta("../Datasets/Ecdysozoa/caenorhabditis_elegans_0/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Ecdysozoa/Scalidophora

[priapulus_caudatus](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_014677623.1%20LOC106817467%20%5Borganism%3DPriapulus%20caudatus%5D%20%5BGeneID%3D106817467%5D%0AMSYQITTNRDGDRNFYGNRGRRRGRGGNGNQSGGGPTPRSRLDAVDEEGDIDMGGGAGRQVHFNRYTPYG%0AGGGGRRPYRGGPRGDRGGYRGNSSYQPPSTMSDRAGSSRNSLERLGLPLQRGGRRGSGNYNKDRHDERQR%0ASVWYKIMIPYGREMGKDFILRSLTTECSVPFTPISFRLDGKMAIFNVEGWDTAEALMKVSRLVTAPSGFK%0AMLVKATPSGPPMPQNLSPTILEQVKEVLNKRYNSELKTLNLTDFHNDPDFKAKKLYLPLNRQNVILSVID%0ALIGKTASEVVALDFTNNKLPNCDSISTLYTKTPKLKGLNLSKNYIRSEYDLDRIKDLNLQELILEGNPLC%0ASQFKDKMSYVSAIRKRFPKVIKLDGLDLPPPIGFDVETHLELPKSKESFFVNDVVKTLVLRFIQEYFMVY%0ADSNDRQQLLNAYHEQAFFSMFAVRKDNSRGPYLQEYLSESRNMIKVKGFERRSKLLKRGRLAVVSMLTTL%0APKTQHDPTSFCIDINHYSAQLLSFTVTGVFKEGDKSSPPVRSFTRVFVVVPVGDGLCIVNEQMCINNATV%0AEQVQASFKSPTPTPSSSPSYPQPGPSGLTSSPVTGAPLVAVNSPSSLTVEQKKVMVEQFMKDSGMNSEWS%0AAKCLEENAWNYEGAGQVFLELRKVGSIPMEAFTKS&JOB_TITLE=priapulus_caudatus_scalidophora&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Scalidophora%20%28taxid%3A1215728%29&EQ_MENU1=Priapulus%20caudatus%20(taxid:37621)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

No significant similarity found.

### Cnidaria

In [32]:
import os

import pandas as pd

from datasets import download_all_files_ncbi, check_transcript_count, update_data_for_species
from data_processing import analyze_exons, create_cassette, concat_cassette, dict_align_info_analyze

In [24]:
# Column labels applied to the gene table; the last two are placeholders that are
# dropped right after loading.
column_names = [
    "tax_id", "org_name", "gene_id", "current_id", "status", "symbol", "aliases", "description",
    "other_designations", "map_location", "chromosome", "genomic_nucleotide_accession.version",
    "start_position_on_the_genomic_accession", "end_position_on_the_genomic_accession", "orientation",
    "exon_count", "to_delete_1", "to_delete_2",
]

# skiprows=1 discards the file's first line, `names` supplies the labels instead, and
# tax_id (column 0) becomes the index.
df = (
    pd.read_csv("../all_nxf1_2.txt", sep="\t", skiprows=1, names=column_names, index_col=0)
    .drop(columns=["to_delete_1", "to_delete_2"])
)
df.head()

Unnamed: 0_level_0,org_name,gene_id,current_id,status,symbol,aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9606,Homo sapiens,10482,0,live,NXF1,"MEX67, TAP",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,11q12.3,11,NC_000011.10,62792130.0,62805440.0,minus,22.0
10090,Mus musculus,53319,0,live,Nxf1,"Mex67, Mvb1, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|Mex 67 homolog|mRN...,19 5.5 cM,19,NC_000085.7,8734467.0,8748274.0,plus,20.0
10116,Rattus norvegicus,59087,0,live,Nxf1,"Mex67h, Tap",nuclear RNA export factor 1,nuclear RNA export factor 1|mRNA export factor...,1q43,1,NC_086019.1,215084563.0,215097756.0,plus,21.0
8407,Rana temporaria,120917577,0,live,NXF1,,nuclear RNA export factor 1,nuclear RNA export factor 1,,11,NC_053499.1,120752259.0,120809664.0,plus,22.0
8364,Xenopus tropicalis,734058,0,live,nxf1,"mex67, tap",nuclear RNA export factor 1,nuclear RNA export factor 1,,4,NC_030680.2,36493131.0,36532401.0,minus,23.0


In [25]:
# Taxonomy lookup table: a TSV read without a header row (labels come from `names`),
# with the taxid column used as the index and the taxonomy column read as str.
df_taxonomy = pd.read_csv("../all_phylas_taxonomy.tsv", sep="\t", names=["taxid", "taxonomy"], index_col=0,
                          dtype={"taxid": int, "taxonomy": str})

In [26]:
# Rows whose taxonomy string mentions "Cnidaria". regex=False makes this a literal
# substring test; na=False treats a missing taxonomy as "no match" instead of putting
# NaN into the boolean mask, which would make the row selection raise.
df_cnidaria = df_taxonomy[df_taxonomy.taxonomy.str.contains("Cnidaria", regex=False, na=False)]

In [29]:
# Phylum name -> list of taxids, taken from the index of the Cnidaria rows of the
# taxonomy table.
cnidaria_taxids = {
    "Cnidaria": df_cnidaria.index.tolist(),
}

In [30]:
# Download the data for every phylum key in cnidaria_taxids. Per the recorded output
# this fetches gene, mRNA, protein and GenBank files and creates exons for four species.
download_all_files_ncbi(df, cnidaria_taxids, phylas=list(cnidaria_taxids.keys()))

Gene, mRNA, protein for Cnidaria/stylophora_pistillata_0 downloaded successfully
Gene, mRNA, protein for Cnidaria/actinia_tenebrosa_1 downloaded successfully
Gene, mRNA, protein for Cnidaria/hydra_vulgaris_2 downloaded successfully
Gene, mRNA, protein for Cnidaria/nematostella_vectensis_3 downloaded successfully
Gene GenBank for Cnidaria/stylophora_pistillata_0 downloaded successfully
Gene GenBank for Cnidaria/actinia_tenebrosa_1 downloaded successfully
Gene GenBank for Cnidaria/hydra_vulgaris_2 downloaded successfully
Gene GenBank for Cnidaria/nematostella_vectensis_3 downloaded successfully
Exons for Cnidaria/stylophora_pistillata_0 created successfully
Exons for Cnidaria/actinia_tenebrosa_1 created successfully
Exons for Cnidaria/hydra_vulgaris_2 created successfully
Exons for Cnidaria/nematostella_vectensis_3 created successfully



In [34]:
# Check transcript counts for the Cnidaria datasets; the recorded output reports
# hydra_vulgaris_2 as having 2 transcripts and asks for manual clean-up of its files.
# NOTE(review): the return value presumably lists the species needing an update — confirm.
cnidaria_species_to_update = check_transcript_count(["Cnidaria"])

Cnidaria/hydra_vulgaris_2: 2 transcripts

Delete other transcripts from cds.fna, protein.faa and rna.fna
Also check gene.fna !!!


In [35]:
# Apply update_data_for_species to the species returned by check_transcript_count.
# NOTE(review): presumably meant to run after the manual clean-up requested in the
# previous cell's output — confirm in the datasets module.
update_data_for_species(cnidaria_species_to_update)

In [37]:
def create_many_cassettes(phylum: str, data: dict) -> dict:
    """Build a cassette for every organism in ``data`` and return the concatenated results.

    NOTE(review): this definition rebinds ``create_many_cassettes``, which the first
    cell imports from ``parse_psi_blast_results`` and a later import cell binds again;
    which version a call uses therefore depends on cell execution order.

    Args:
        phylum: forwarded unchanged as the first argument of ``create_cassette``.
        data: mapping ``org_name -> (exon table, exon indices)``; the indices are
            passed to ``create_cassette`` as ``exons_i``.

    Returns:
        dict mapping each ``org_name`` to ``concat_cassette(cassette, "i")``, in the
        iteration order of ``data``.
    """
    return {
        organism: concat_cassette(
            create_cassette(phylum, organism, exon_table, exons_i=exon_indices),
            "i",
        )
        for organism, (exon_table, exon_indices) in data.items()
    }

In [38]:
# Show which species directories exist under the Cnidaria dataset folder
# (os.listdir returns them in arbitrary order).
os.listdir("../Datasets/Cnidaria")

['hydra_vulgaris_2',
 'stylophora_pistillata_0',
 'nematostella_vectensis_3',
 'actinia_tenebrosa_1']

In [39]:
# Path pieces: {prefix}/{phylum}/<species dir>/{postfix}/{nof}
prefix = "../Datasets"
phylum = "Cnidaria"
postfix = "ncbi_dataset/data"
nof = "exons.fa"

# Cnidaria: one exon table per species.
# The trailing notes look like "<exon lengths> | <exon indices>" (alternatives separated
# by "||") — presumably recorded by hand from the tables; confirm. The pair 9, 10 is the
# one used for nematostella_vectensis_3 and actinia_tenebrosa_1 in the next cell.
hydra_vulgaris_2 = analyze_exons(
    f"{prefix}/{phylum}/hydra_vulgaris_2/{postfix}/{nof}")  # nothing (presumably: no matching exon pair found — confirm)

stylophora_pistillata_0 = analyze_exons(
    f"{prefix}/{phylum}/stylophora_pistillata_0/{postfix}/{nof}")  # 116 43 | 13, 14

nematostella_vectensis_3 = analyze_exons(
    f"{prefix}/{phylum}/nematostella_vectensis_3/{postfix}/{nof}")  # 116 37 | 9, 10 || 116 43 | 16, 17

actinia_tenebrosa_1 = analyze_exons(
    f"{prefix}/{phylum}/actinia_tenebrosa_1/{postfix}/{nof}")  # 116 37 | 9, 10 || 116 43 | 16, 17

In [40]:
# Species directory name -> (exon table, [two exon indices]) for the two species whose
# tables were annotated with the "116 37 | 9, 10" pair in the previous cell.
data_cnidaria = {
    "nematostella_vectensis_3": (nematostella_vectensis_3, [9, 10]),
    "actinia_tenebrosa_1": (actinia_tenebrosa_1, [9, 10]),
}

In [41]:
# Build the cassettes for the selected Cnidaria species. In execution order this call
# uses the create_many_cassettes defined in this notebook (phylum name as first
# argument), not the one imported from parse_psi_blast_results.
cnidaria_introns = create_many_cassettes("Cnidaria", data_cnidaria)

---

In [43]:
import ast
import os

import pandas as pd
from Bio import Entrez

from data_processing import analyze_exons
from fasta_processing import read_single_fasta, dict_align_to_fasta
from parse_psi_blast_results import parse_psiblast_xml, filter_psiblast_hits, update_df, save_files, \
    create_many_cassettes, dict_align_info_analyze

Entrez.email = "artemvaskaa@gmail.com"

#### Anthozoa

[nematostella_vectensis](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_001631177.1%20LOC5510737%20%5Borganism%3DNematostella%20vectensis%5D%20%5BGeneID%3D5510737%5D%0AMAGIFGRAMKDVAFSVTTSGDGSRTFNSNIGDGMEDDRQSNRGRGRGGYNNNQGSYRSRGRGRGRGRGRG%0AGNNRGRGGRGGRQDANPRSYLADEEDDESMGDADDNTGGYTSRYTPYGARPPSRRGYQHDDRSGGAGGGI%0AKSRLGQRQHSGWKSFDSDWHRVVVNRAKIHDKEWLIKRLQSSSEEAFQPVEFHYMGESAVFFVEGSRAAE%0AALKRVSHSITVKDGSKLIITVRPSQKPFKSHKGGAGGGGGSSQWSPENEQVLKECLSNRYNPQTKTMDLT%0ADMFHDEVLKANNVFGALNKYPLAQEILKLIGENCPDVESLDVSNNRLFQLDHFKDLATQAPGIKCLNLSN%0ATMLKSIDELDKVKGLKELVTLNLTGNTFCKSFEGKSSAYVSAVRSRFPKVTNLDGNELPPPIGFDLQTST%0AVLPTVQGSYIPDPAVKDLVLKFLEQYFKIYDSGDRQPLLDAYHDQAIFSMCVDTQANSSTKGSRGPSLGP%0AYMRNSRNMKRVTETDHRAALIKHNRLSVVAMLNDFPGTKHDLASFVVDINLTLSTIISFSVRGLFMEEDK%0ATARSFTRVFVAVPAAGGKALSIINDELHIRNASSSQAEKLASAASASIITPNVGSGIMAPAPSATPLPAA%0ATTVPAAGMQNLTPQQQQQMVLQFSNQSQMNPEWSFKCLSENGWNYEKSAEIFTSLQAAGSIPPDAFVKK&JOB_TITLE=nematostella_vectensis_anthozoa&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Anthozoa%20%28taxid%3A6101%29&EQ_MENU1=Nematostella%20vectensis%20(taxid:45351)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [45]:
phyla = "Anthozoa"
dir = f"../Sequences_protein_id/{phyla}"

# PSI-BLAST result for the Nematostella vectensis query; per the original note
# this has to be the XML2 export.
name_of_blast_res = f"../Blast_res/psi_blast/nematostella_vectensis_anthozoa.xml"

# Parse -> filter -> update in one pipeline. Only the subject-length threshold
# (500) is non-zero here; min_qc and min_ident are passed as 0.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0,
        min_ident=0,
        min_sbjct_len=500,
    )
)

In [46]:
# Create the per-phylum output directory if needed, then store the hit table
# as a tab-separated file (index included) so later cells can reload it.
os.makedirs(f"{dir}", exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [47]:
# Pass the hit table to save_files together with the per-phylum output
# directory (the same path that `dir` holds).
save_files(df, f"../Sequences_protein_id/{phyla}")

In [48]:
phyla = "Anthozoa"
dir = f"../Sequences_protein_id/{phyla}"
# Reload the saved hit table; columns 11 and 13 are parsed back into Python
# objects with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Exon length used to recognise the cassette; a single constant keeps the
# filter below and the index lookup in the second loop in agreement.
cassette_exon_len = 37
ref_exon_len = [cassette_exon_len]

# protein_id -> set of reference lengths found among that protein's exons.
found_protein_ids = {}
# protein_id -> parsed exon table, kept so every exons.fa is read only once.
exons_by_protein_id = {}

for protein_id in df.protein_id:
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein_id[protein_id] = df_exons

# Keep only the hits whose exon set contains an exon of the reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

# "<org_name>__<protein_id>" -> (exon table, [index of preceding exon, index of matching exon])
data = {}

for protein_id in df_found_protein_ids.protein_id:
    df_subset = df_found_protein_ids[df_found_protein_ids["protein_id"] == protein_id]
    org_name = df_subset.org_name.iloc[0]
    df_exons = exons_by_protein_id[protein_id]
    # First exon whose length equals the reference length.
    exon_37_idx = df_exons[df_exons["length"] == cassette_exon_len].index[0]
    # The cassette also needs the exon directly before it.
    # NOTE(review): assumes an integer exon index and that the match is never
    # the first exon (index 0 would yield -1) — confirm against analyze_exons.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [49]:
# Build the cassettes for every selected Anthozoa protein.
# NOTE(review): the first argument here is the directory path `dir`, whereas
# the Cnidaria call passed the phylum name "Cnidaria" — confirm that the
# create_many_cassettes in scope (re-imported from parse_psi_blast_results)
# expects a path.
introns = create_many_cassettes(dir, data)

In [50]:
# Analyse the "cds" sequences of the selected proteins and show the distinct
# values of the equal_to_cds column as a sanity check.
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

Exaiptasia_diaphana__XP_020894941.1: start codon not in the beginning of sequence
Porites_lobata__CAH3150435.1: start codon not in the beginning of sequence


array([ True])

In [51]:
# Same analysis as for "cds", now for the "cds_cassette" sequences; the
# resulting table is displayed as the cell output.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Exaiptasia_diaphana__XP_020894941.1: start codon not in the beginning of sequence
Porites_lobata__CAH3150435.1: start codon not in the beginning of sequence


Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Actinia_tenebrosa__XP_031555922.1,1152,False,1142,10,173
1,Pocillopora_verrucosa__XP_058954564.2,1152,False,1118,34,390
2,Porites_lutea__XP_073247758.1,1173,False,1112,61,711
3,Pocillopora_damicornis__XP_027046453.1,1164,False,1118,46,392
4,Porites_lutea__XP_073247755.1,1173,False,1112,61,711
5,Porites_lutea__XP_073247757.1,1173,False,1112,61,711
6,Pocillopora_damicornis__XP_027046452.1,1164,False,1118,46,392
7,Montipora_foliosa__XP_068703612.1,1155,False,1124,31,907
8,Montipora_foliosa__XP_068703613.1,1155,False,1124,31,907
9,Pocillopora_verrucosa__XP_058954563.2,1152,False,1118,34,390


In [52]:
# Count how many entries share each intron_length_to_stop_codon value.
df_cds_cassette.intron_length_to_stop_codon.value_counts()

intron_length_to_stop_codon
61     5
40     5
46     3
34     2
10     2
31     2
76     1
103    1
Name: count, dtype: int64

In [53]:
# Make sure the per-phylum alignment directory exists.
os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)

# Add the Nematostella vectensis cassette from the Cnidaria dataset as a reference entry.
dict_align_cds_cassette["Nematostella_vectensis_reference"] = read_single_fasta("../Datasets/Cnidaria/nematostella_vectensis_3/ncbi_dataset/data/cds_cassette.fa")
# NOTE(review): the same dictionary is written under both the .fa and the .aln
# name — presumably the .aln file is aligned afterwards by an external tool; confirm.
dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")