In [2]:
import ast
import os

import pandas as pd
from Bio import Entrez

from data_processing import analyze_exons
from fasta_processing import read_single_fasta, dict_align_to_fasta
from parse_psi_blast_results import parse_psiblast_xml, filter_psiblast_hits, update_df, save_files, \
    create_many_cassettes, dict_align_info_analyze

# Bio.Entrez sends this contact address along with NCBI E-utilities requests.
# It can be overridden through the ENTREZ_EMAIL environment variable so the
# notebook does not have to be edited per user; the original address stays
# the fallback, so existing runs behave exactly as before.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "artemvaskaa@gmail.com")

### Cephalochordata Бесчерепные + 2 вида

[branchiostoma_floridae_cephalochordata](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_035686715.1%20LOC118422917%20%5Borganism%3DBranchiostoma%20floridae%5D%20%5BGeneID%3D118422917%5D%0AMKKLLHPGNSLKLRREGVRLYMLWLQALQENASEECLLIFACLIPGFPSPSSDSGMYTLEGLVTGAFHNG%0ASASDAVVPTEISAVLTPQSGERVTDDQTKFFLDAVLEFMVRKLEWKDAQFQPKGFNFLFQHFQQHYLTHI%0AFPSMSSTTSLYQPKXSQVQLFQCISHDDRTSSGSRDRGGGFGFNRDRGGGGGSWRGGYDQNRDGGYRSGG%0ARGGQSRGGNRNRYRGNRRGGWKKGGPGRGGQGGGRGAGPTPRSRFEDDEGDIQMSDDASDSQHSQRYNPY%0AGRPDSRRSNRPNNSGRGGRGRGGGYRDLDAPSTSHSDRSEGGDEDGWHKITIVQGKRSNKDWLMSTLQKT%0ACPVPFQPTEFHYEKNNAVFYVPDKATADGLTGISRKVRTKEGYRVVVFSRVETPETIKMVNKTTMEAIKL%0ASMAKRFDAATSALNLSNLFGDIELQAQDIRVALSRKMYMNSVIAIIKENVPVIEHLDMSNNRLFHLGDLA%0ADLVSVRKGVKYLNLSHNELKSEFELDKIKEWKLDELWLDGNPVCNHFKEQSAYISAVRKRFPKVARLDGH%0AELPPPIAFDLESNTTLPETKGSNFDNNDMCRKIITDFLEKFFIVYDSDDRQGLLEAYHDQAYFSFCMSYP%0APGAPHYHRKSLQDFCRDSRNLLFVNDLSQRVKFLKHSRLNVVAFLNELPHTQHDPNSFVIDVGVAMNSLI%0ACFTVSGVFREVVSKSGGNPPIRAFSRVFTAVPAPQGLCIVNDMMTISAATPAQEKAAFTSPAPTPSPSPV%0APGPSGLSEPQQQMIKMFAEQSGMNEEWSQSCLEQNGWEYNKSAQVFTELKAQNKIPPEAFLK&JOB_TITLE=branchiostoma_floridae_cephalochordata&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Cephalochordata%20%28taxid%3A7735%29&EQ_MENU1=Branchiostoma%20floridae%20(taxid:7739)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

In [4]:
# Taxon handled by this section and the directory its per-protein files live in.
phyla = "Cephalochordata"
dir = f"../Sequences_protein_id/{phyla}"

# NOTE(review): the original author flags this input as "XML2" - presumably the
# parser needs the BLAST XML2 export rather than the legacy XML; confirm.
name_of_blast_res = f"../Blast_res/psi_blast/branchiostoma_floridae_{phyla.lower()}.xml"

# Parse -> filter -> enrich, composed into a single expression.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0.5,
        min_ident=0,
        min_sbjct_len=500,
    )
)

In [5]:
# Make sure the per-taxon directory exists, then store the hit table as TSV.
# The index is written out because the table is later re-read with index_col=0.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [6]:
# `dir` already holds "../Sequences_protein_id/<phyla>", so reuse it here.
save_files(df, dir)

In [7]:
# Reload the saved hit table so this cell can be run without re-parsing BLAST output.
phyla = "Cephalochordata"
dir = f"../Sequences_protein_id/{phyla}"
# The columns at positions 11 and 13 are stored as Python literals in the TSV
# and are parsed back with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Reference exon length(s) to look for in every hit's exon table.
ref_exon_len = [37]
found_protein_ids = {}    # protein_id -> set of reference lengths present in its exons
exons_by_protein_id = {}  # protein_id -> exon table, so each exons.fa is parsed only once

for protein_id in df.protein_id:
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein_id[protein_id] = df_exons

# Keep only the hits that contain an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

# "<org_name>__<protein_id>" -> (exon table, [index of preceding exon, index of matching exon])
data = {}

for protein_id in df_found_protein_ids.protein_id:
    df_subset = df_found_protein_ids[df_found_protein_ids["protein_id"] == protein_id]
    org_name = df_subset.org_name.iloc[0]
    df_exons = exons_by_protein_id[protein_id]
    # First exon whose length is one of the reference lengths (37 with the current setting).
    exon_37_idx = df_exons[df_exons["length"].isin(ref_exon_len)].index[0]
    # The exon immediately before it; the "110" in the name presumably refers to
    # its length in the reference gene - TODO confirm.
    # NOTE(review): if the matching exon is the first row, this index points
    # outside the table; verify how create_many_cassettes handles that case.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [8]:
# Build cassettes for every entry collected in `data`; the result is kept as `introns`.
# NOTE(review): presumably this also writes cassette files under `dir` - confirm in parse_psi_blast_results.
introns = create_many_cassettes(dir, data)

In [9]:
# Analyze the "cds" sequences of the selected hits; the displayed array is a sanity check.
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

array([ True])

In [10]:
# Same analysis for the "cds_cassette" sequences; the resulting table is displayed as the cell output.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Branchiostoma_belcheri__XP_019647902.1,1227,False,1226,1,1663
1,Branchiostoma_belcheri__XP_019647903.1,1227,False,1226,1,1663
2,Branchiostoma_lanceolatum__XP_066280099.1,1224,False,1223,1,1753
3,Branchiostoma_lanceolatum__CAH1263602.1,1323,False,1322,1,1746
4,Branchiostoma_belcheri__KAI8485387.1,1695,False,1640,55,625


In [None]:
# Disabled export step (intentionally left commented out): it would add the
# Branchiostoma floridae reference cassette to dict_align_cds_cassette and write
# the whole collection to ../Alignment/psi_blast/{phyla}/ under both the .fa and
# .aln file names.
# os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)
#
# dict_align_cds_cassette["Branchiostoma_floridae_reference"] = read_single_fasta("../Datasets/Cephalochordata/branchiostoma_floridae_0/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Tunicata Оболочники + 1 вид

[styela_clava_tunicata](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_039257222.1%20LOC120333895%20%5Borganism%3DStyela%20clava%5D%20%5BGeneID%3D120333895%5D%0AMSYRGRSRNNYNKRGSYSYRGDYNNKYSQHDNRSGSPSYNNRDKESGESYNDRQYSNDRYEPRGRGGGKF%0ARGRRPNGRFSARRGSRGRGGSYRGRGGRGDFNQNNHIDPDGDIDMEGGTSTGKARPAPYARPSRDNRPPR%0AGNPRFHQNNRNGPNWSLVTIVQGAMLDREWLIEAIQKEIMVQIKPVQYFHDGENSQFCIEDAEVAQNIRD%0ACNRKITGPDGNKLLILKTYCSPPITDDQLVILRDALSKRYNTDTLHLDASDLYSDKTLRENRIDMKLKFS%0AKAMYILINIIGEHISGLLSLDLSNNRMDNLSYLKNLVVVTPRLKCLKLERNELRHSKELDNIREWDLTEL%0ANIDGNPLCQHFESQSDYISAIRDKFPNLQVLDGKKLPPPVKFDLEKVTKLVPAIPNHIPPAVTEIIKQFV%0AKQYFALYDTNRENLSQAYDENCMFSLTIPSNPRGAPLTRYLDYTRNLRRLKNAKLRLSYLRKSRSEICET%0ALKKLPKTEHDISGFCVDVPLVSPTMIKFIIRGVFKEKHSGKDNSCMRAFTRAFLCLTDGARLSIINEEIH%0AIRNTTIVEYKSAFAKPPVTPSPSPVPEQTIASPAQSTSSSVPTVAPVVVGSPSKLDMVAAFCKESGMNAG%0AFSEQCLNENGWDYMKAGQAFLSLKNEGKIPAEAFVR&JOB_TITLE=styela_clava_tunicata&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Tunicata%20%28taxid%3A7712%29&EQ_MENU1=Styela%20clava%20(taxid:7725)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [11]:
# Taxon handled by this section and the directory its per-protein files live in.
phyla = "Tunicata"
dir = f"../Sequences_protein_id/{phyla}"

# NOTE(review): the original author flags this input as "XML2" - presumably the
# parser needs the BLAST XML2 export rather than the legacy XML; confirm.
name_of_blast_res = f"../Blast_res/psi_blast/styela_clava_{phyla.lower()}.xml"

# Parse -> filter -> enrich, composed into a single expression.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0.5,
        min_ident=0,
        min_sbjct_len=500,
    )
)

IndexError: ProteinID CAB3264527.1 GeneID not found -> skipping...


In [12]:
# Make sure the per-taxon directory exists, then store the hit table as TSV.
# The index is written out because the table is later re-read with index_col=0.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [13]:
# `dir` already holds "../Sequences_protein_id/<phyla>", so reuse it here.
save_files(df, dir)

In [14]:
# Reload the saved hit table so this cell can be run without re-parsing BLAST output.
phyla = "Tunicata"
dir = f"../Sequences_protein_id/{phyla}"
# The columns at positions 11 and 13 are stored as Python literals in the TSV
# and are parsed back with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Reference exon length(s) to look for in every hit's exon table.
ref_exon_len = [37]
found_protein_ids = {}    # protein_id -> set of reference lengths present in its exons
exons_by_protein_id = {}  # protein_id -> exon table, so each exons.fa is parsed only once

for protein_id in df.protein_id:
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein_id[protein_id] = df_exons

# Keep only the hits that contain an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

# "<org_name>__<protein_id>" -> (exon table, [index of preceding exon, index of matching exon])
data = {}

for protein_id in df_found_protein_ids.protein_id:
    df_subset = df_found_protein_ids[df_found_protein_ids["protein_id"] == protein_id]
    org_name = df_subset.org_name.iloc[0]
    df_exons = exons_by_protein_id[protein_id]
    # First exon whose length is one of the reference lengths (37 with the current setting).
    exon_37_idx = df_exons[df_exons["length"].isin(ref_exon_len)].index[0]
    # The exon immediately before it; the "110" in the name presumably refers to
    # its length in the reference gene - TODO confirm.
    # NOTE(review): if the matching exon is the first row, this index points
    # outside the table; verify how create_many_cassettes handles that case.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [15]:
# Build cassettes for every entry collected in `data`; the result is kept as `introns`.
# NOTE(review): presumably this also writes cassette files under `dir` - confirm in parse_psi_blast_results.
introns = create_many_cassettes(dir, data)

In [16]:
# Analyze the "cds" sequences of the selected hits; the displayed array is a sanity check.
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

array([ True])

In [17]:
# Same analysis for the "cds_cassette" sequences; the resulting table is displayed as the cell output.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Ciona_intestinalis__XP_002129680.2,1116,False,1085,31,261
1,Ciona_intestinalis__XP_018671856.1,1116,False,1085,31,261


In [None]:
# Disabled export step (intentionally left commented out): it would add the
# Styela clava reference cassette to dict_align_cds_cassette and write the whole
# collection to ../Alignment/psi_blast/{phyla}/ under both the .fa and .aln
# file names.
# os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)
#
# dict_align_cds_cassette["Styela_clava_reference"] = read_single_fasta("../Datasets/Tunicata/styela_clava_0/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Cyclostomata Круглоротые + 2 вида

[petromyzon_marinus_cyclostomata](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_032819871.1%20NXF1%20%5Borganism%3DPetromyzon%20marinus%5D%20%5BGeneID%3D116947809%5D%0AMSQKQDADQFFDGRHYSEHDDRTGSQAVDRRNRGRGPRLYMDGRSSGSPARSSGGGGSSGGGGGGGGGGG%0AGRRSGHGYFRGNRKGRNNGGGGGGGGGGGGGRGGGGGRIGGPNPRSHLNDDDDVDMWEDMPSSRARFTPY%0ASTNPNRWRQQQRTDHRRSAVEVTFRPDSTGYGGRPSHSTSGGHRERSSWYKITIPFGKKYNKTWLLQSLQ%0AQASSTPFTPVQFHYEDKRAVFHLEDRAAAEALKSISRQIVDTDNFKVAVVMNQSGPPPSLLNDLKEEDLQ%0AHIKECLSRRFEPAEQALDLSGIRNDEELQARGVDLVMNRKSCMDAVTKIISENIPTIMSLNLSSNRLYRL%0ADDLAELAQHTPSLKTMNLSRNELKSERDLDRIKGFKLEELWLSGNSLCDSFRDQSAYISAVRQRFPKLMK%0ALDGQELPPPIAFDLEAPTTLPPTREGYFPNDEIKSLILRFLTQYFTVYDSANRQSLLNAYHDTACCSITI%0APHSPQNPSKSSLGEYFKESRNIKRLKDPIMRQKLLKHTRLNVVAFLNELPHTQHDLPSFVIDVSVHLASL%0ALNFTVNGFFKETEGKCRESVRAFSRVFLAVPAANGGLCIVNDQLFVRNASTEEIRRAFVMPAPTPSSSPV%0APTLTPLQQEMLQAFSTQSGMNLEWSQRCLQQHSWDFERSAQIFTQLNTAGHIPKEAFVK&JOB_TITLE=petromyzon_marinus_cyclostomata&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Cyclostomata%20%28taxid%3A1476529%29&EQ_MENU1=Petromyzon%20marinus%20(taxid:7757)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [18]:
# Taxon handled by this section and the directory its per-protein files live in.
phyla = "Cyclostomata"
dir = f"../Sequences_protein_id/{phyla}"

# NOTE(review): the original author flags this input as "XML2" - presumably the
# parser needs the BLAST XML2 export rather than the legacy XML; confirm.
name_of_blast_res = f"../Blast_res/psi_blast/petromyzon_marinus_{phyla.lower()}.xml"

# Parse -> filter -> enrich, composed into a single expression.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0.6,
        min_ident=0,
        min_sbjct_len=500,
    )
)

In [19]:
# Make sure the per-taxon directory exists, then store the hit table as TSV.
# The index is written out because the table is later re-read with index_col=0.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [20]:
# `dir` already holds "../Sequences_protein_id/<phyla>", so reuse it here.
save_files(df, dir)

In [21]:
# Reload the saved hit table so this cell can be run without re-parsing BLAST output.
phyla = "Cyclostomata"
dir = f"../Sequences_protein_id/{phyla}"
# The columns at positions 11 and 13 are stored as Python literals in the TSV
# and are parsed back with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Reference exon length(s) to look for in every hit's exon table.
ref_exon_len = [37]
found_protein_ids = {}    # protein_id -> set of reference lengths present in its exons
exons_by_protein_id = {}  # protein_id -> exon table, so each exons.fa is parsed only once

for protein_id in df.protein_id:
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein_id[protein_id] = df_exons

# Keep only the hits that contain an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

# "<org_name>__<protein_id>" -> (exon table, [index of preceding exon, index of matching exon])
data = {}

for protein_id in df_found_protein_ids.protein_id:
    df_subset = df_found_protein_ids[df_found_protein_ids["protein_id"] == protein_id]
    org_name = df_subset.org_name.iloc[0]
    df_exons = exons_by_protein_id[protein_id]
    # First exon whose length is one of the reference lengths (37 with the current setting).
    exon_37_idx = df_exons[df_exons["length"].isin(ref_exon_len)].index[0]
    # The exon immediately before it; the "110" in the name presumably refers to
    # its length in the reference gene - TODO confirm.
    # NOTE(review): if the matching exon is the first row, this index points
    # outside the table; verify how create_many_cassettes handles that case.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [22]:
# Build cassettes for every entry collected in `data`; the result is kept as `introns`.
# NOTE(review): presumably this also writes cassette files under `dir` - confirm in parse_psi_blast_results.
introns = create_many_cassettes(dir, data)

In [23]:
# Analyze the "cds" sequences of the selected hits; the displayed array is a sanity check.
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True

array([ True])

In [24]:
# Same analysis for the "cds_cassette" sequences; the resulting table is displayed as the cell output.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Lampetra_planeri__CAM9977466.1,1272,False,1247,25,3126
1,Lampetra_planeri__CAL9852373.1,1287,False,1262,25,3122
2,Lampetra_fluviatilis__CAM9885821.1,1308,False,1283,25,3116
3,Lampetra_fluviatilis__CAL5915659.1,2313,False,2288,25,3123


In [None]:
# Disabled export step (intentionally left commented out): it would add the
# Petromyzon marinus reference cassette to dict_align_cds_cassette and write the
# whole collection to ../Alignment/psi_blast/{phyla}/ under both the .fa and
# .aln file names.
# os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)
#
# dict_align_cds_cassette["Petromyzon_marinus_reference"] = read_single_fasta("../Datasets/Cyclostomata/petromyzon_marinus_2/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Chondrichthyes Хрящевые рыбы + 11 видов

[amblyraja_radiata_chondrichthyes](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_032871106.1%20nxf1%20%5Borganism%3DAmblyraja%20radiata%5D%20%5BGeneID%3D116968450%5D%0AFETKLLPWLSSAPQRSGSVNYQFQGKGTNEGRWNWRKDKGDRYGEHDDRVGGNFPIRKKKGRGPFRWKMY%0ASDANHKPRNRGGGGGNPRLRFEDEDGDVAMNDHDVPRPRFSPYGSRPSRRPGNWHDSEGGPSNIHVTVKP%0ANSERGSSNNANTRRNWFKITIPYGKKYDKTWLLSNLQSMSSVPFNPVQFHYDGNKALFYVEDSTTANALK%0AQISRRITDKDLYRVVIIINQSAPPSSVSNELKAEEIVHIRQCMSKRYDGSQQALDLNTVRSDPDLVSQNI%0AEVVLNRRNSMSTVVKIIEENIPELLSLNLGNNKLYRLEDLTDLISKAPGLKILNLSRNELKSERDLDKIK%0AGFKLEELWLEGNPLCGNFRDQATYVSSVREKFPKLLRLDGHDLPPPISFDVETPTTLPSCKGSYFGTEEI%0AKVIVTRFIQQYYSVYDSTDRQGLLDAYHDTACCSLSIPFTQQNPARSSLGEYFKESRNVKRLKDPTLRAR%0ALLKHTRLNVVAFLNELPKTQHDTASFVVDVSTQTNTLLCFTVHGVFKEVDSKSRESVRAFSRVFVAVPAG%0ANAGLCIVNDQLFIRNATTEEIRMAFVTPAPTPSSSPVPTLTAPQQEMLQLFSQQSNMNIEWSQKCLQDND%0AWDFNQAAQIFTQLKAEGKIPEIAFVRQL&JOB_TITLE=amblyraja_radiata_chondrichthyes&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Chondrichthyes%20%28taxid%3A7777%29&EQ_MENU1=Amblyraja%20radiata%20(taxid:386614)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [25]:
# Taxon handled by this section and the directory its per-protein files live in.
phyla = "Chondrichthyes"
dir = f"../Sequences_protein_id/{phyla}"

# NOTE(review): the original author flags this input as "XML2" - presumably the
# parser needs the BLAST XML2 export rather than the legacy XML; confirm.
name_of_blast_res = f"../Blast_res/psi_blast/amblyraja_radiata_{phyla.lower()}.xml"

# Parse -> filter -> enrich, composed into a single expression.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0.7,
        min_ident=0,
        min_sbjct_len=500,
    )
)

In [26]:
# Make sure the per-taxon directory exists, then store the hit table as TSV.
# The index is written out because the table is later re-read with index_col=0.
os.makedirs(dir, exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [27]:
# `dir` already holds "../Sequences_protein_id/<phyla>", so reuse it here.
save_files(df, dir)

In [35]:
# Reload the saved hit table so this cell can be run without re-parsing BLAST output.
phyla = "Chondrichthyes"
dir = f"../Sequences_protein_id/{phyla}"
# The columns at positions 11 and 13 are stored as Python literals in the TSV
# and are parsed back with ast.literal_eval.
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

# Reference exon length(s) to look for in every hit's exon table.
ref_exon_len = [37]
found_protein_ids = {}    # protein_id -> set of reference lengths present in its exons
exons_by_protein_id = {}  # protein_id -> exon table, so each exons.fa is parsed only once

for protein_id in df.protein_id:
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition
        exons_by_protein_id[protein_id] = df_exons

# Keep only the hits that contain an exon of a reference length.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

# "<org_name>__<protein_id>" -> (exon table, [index of preceding exon, index of matching exon])
data = {}

for protein_id in df_found_protein_ids.protein_id:
    df_subset = df_found_protein_ids[df_found_protein_ids["protein_id"] == protein_id]
    org_name = df_subset.org_name.iloc[0]
    df_exons = exons_by_protein_id[protein_id]
    # First exon whose length is one of the reference lengths (37 with the current setting).
    exon_37_idx = df_exons[df_exons["length"].isin(ref_exon_len)].index[0]
    # The exon immediately before it; the "110" in the name presumably refers to
    # its length in the reference gene - TODO confirm.
    # NOTE(review): if the matching exon is the first row, this index points
    # outside the table; verify how create_many_cassettes handles that case.
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [29]:
# Build cassettes for every entry collected in `data`; the result is kept as `introns`.
# NOTE(review): presumably this also writes cassette files under `dir` - confirm in parse_psi_blast_results.
introns = create_many_cassettes(dir, data)

In [30]:
# Analyze the "cds" sequences of the selected hits; the displayed array is a sanity check.
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # should contain only True
# Scyliorhinus_torazame__XP_072345558.1 - the stop codon lies inside the coding sequence, not at its end

array([ True, False])

In [36]:
# Same analysis for the "cds_cassette" sequences; the resulting table is displayed as the cell output.
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Pristis_pectinata__XP_051901084.1,1023,False,1018,5,2981
1,Pristiophorus_japonicus__XP_070724389.1,1218,False,1121,97,3326
2,Pristiophorus_japonicus__XP_070724388.1,1218,False,1121,97,3326
3,Stegostoma_tigrinum__XP_048381107.1,1203,False,1001,202,3219
4,Heptranchias_perlo__XP_067831842.1,1212,False,1022,190,3838
5,Carcharodon_carcharias__XP_041037859.1,1284,False,1115,169,3263
6,Heptranchias_perlo__XP_067831841.1,1212,False,1022,190,3838
7,Hemiscyllium_ocellatum__XP_060711579.1,1110,False,1016,94,3273
8,Pristiophorus_japonicus__XP_070724386.1,1218,False,1121,97,3326
9,Chiloscyllium_punctatum__GCC19136.1,1299,False,1157,142,3151


In [41]:
# Count distinct organisms: the organism name is the part of
# "org_name_protein_id" that precedes the "__" separator.
len(df_cds_cassette["org_name_protein_id"].str.split("__").str[0].unique())

11

In [33]:
# Frequency of each value in the intron_length_to_stop_codon column.
df_cds_cassette["intron_length_to_stop_codon"].value_counts()

intron_length_to_stop_codon
 97     6
 190    3
 247    3
 169    3
 94     2
 202    2
 133    2
 5      1
-502    1
 142    1
Name: count, dtype: int64

In [34]:
# Disabled export step (intentionally left commented out): it would add the
# Amblyraja radiata reference cassette to dict_align_cds_cassette and write the
# whole collection to ../Alignment/psi_blast/{phyla}/ under both the .fa and
# .aln file names.
# os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)
#
# dict_align_cds_cassette["Amblyraja_radiata_reference"] = read_single_fasta("../Datasets/Chondrichthyes/amblyraja_radiata_0/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Coelacanthimorpha Латимерии (Целакантообразные) + 0 видов

[latimeria_chalumnae_coelacanthimorpha](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_005988847.1%20NXF1B%20%5Borganism%3DLatimeria%20chalumnae%5D%20%5BGeneID%3D102357219%5D%0AMSTGEDGRYHNEHDDRLCSGFIVRKKKGRGPFRWKTHNDVNYKHRSRGGGGPNPRSRLEDDDGDIPMSDS%0ASHDVTRGRYNPYSSRPNRRGDDWHDRDRGGPSNVQVTVRRDPLNQDRGLGSTGPRKNWFKITIPYGKKYD%0AKSWLLTSVQNLCSVPFSPVEFHYDHNKAVFYVEDSTTANALKQVSRKITDRDNYKVSLMVSLSSPPSSVQ%0ANELKPEHLEHLKQCMSKRYDGSQQALDLNSIRSDPDLVSQNIDVVLSKKSSLLAVIKIIEENIPELLSLN%0ALGNNRLFKLDDLTDLVTKAPNIKILNLSRNVLKSDRELDKVKGFKLEELWLDGNPLCDSFRDQSAYISAI%0ARERFPKLLRLDGHDLPPPIAFDVEAPTTLPPCKGSYFITDDIKVLVLRFLQQYYSVYDSGDRQALLDAYH%0ADGACCSLSIPYVSPNPSRCSLGEYFKDSRNVKKLKDPTLRFKLLKHTRLNVVAFLNELPKTQHDVNSFVV%0ADVNAQTNTLLCFTVHGIFKEVDGKSRDSVRAFSRVFIAVPAGNAGLCIVNDELFIRNATTEEIRKAFVTP%0AAPTPSSSPVPTLTAPQQEMLQVFSVQSGMNLEWSQKCLQDNDWDFNRSAQIFTQLKAEGKIPEVAFIK&JOB_TITLE=latimeria_chalumnae_coelacanthimorpha&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Coelacanthimorpha%20%28taxid%3A118072%29&EQ_MENU1=Latimeria%20chalumnae%20(taxid:7897)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [None]:
# No significant similarity found.

### Dipnomorpha Рипидистии (Лопастеперые рыбы) + 0 видов

[protopterus_annectens_dipnomorpha](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_043931305.1%20NXF1%20%5Borganism%3DProtopterus%20annectens%5D%20%5BGeneID%3D122805282%5D%0AMPEIKSIVGDKPRFAYRSCEALKQKICYKDFQVTYGKKYDREWLLESIKKCCSLPFTPIEYHFEENKAVF%0AYVQDSNVANALKQVTRKITDKDNYRVKLLVHRSGAPLSVRNELKPEQLEVVKQCMSKRFNAAEQSLSLKN%0AICSDPDLVAQKIDDVFFKRFCIYAVTKIIAEHIPELMSLDLSSNKLHYLDDVTDLLFGIRNLKILKLSQN%0AELKYDRELDKLKGLKLDELWLDENPLCKTFADQAAYISNIRDRFPKLRRLDGHELPPPICFEIASETTLP%0ATCKGSYFCTDDVKGLILHFLQQFYTFYDSDDRQGLLAAYHDNACFSVCTKSPPLNSTRALNDYLRENRNL%0AKKVKDAALRFRLLRHKRLNIIAFLNELPKTEHDLKSFVVDVSVQTNTLLSFTVNGIFKEANDKAKDPVMA%0AFSRVFVAVPAGSNGLCIVNDHLCVRSATSGEIRKAFVSPAPTPSSSPVPTLSATQQEMIQAFSLQSGMNF%0AEWSQKCLNDNNWDYTKAAHVFTVLKWIFY&JOB_TITLE=protopterus_annectens_dipnomorpha&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Dipnomorpha%20%28taxid%3A7878%29&EQ_MENU1=Protopterus%20annectens%20(taxid:7888)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

In [None]:
# No significant similarity found.