In [1]:
import ast
import os

import pandas as pd
from Bio import Entrez

from data_processing import analyze_exons
from fasta_processing import read_single_fasta, dict_align_to_fasta
from parse_psi_blast_results import parse_psiblast_xml, filter_psiblast_hits, update_df, save_files, \
    create_many_cassettes, dict_align_info_analyze

Entrez.email = "artemvaskaa@gmail.com"

### Platyhelminthes

[schistosoma_haematobium_platyhelminthes](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_051072204.1%20NXF1_3%20%5Borganism%3DSchistosoma%20haematobium%5D%20%5BGeneID%3D24590952%5D%0AMPHGNHRNKSNHYRWRSHGENDRRIDTSDEYLRRREHGGKFSSGRKSIISGNNVDMLKRAMNMNLIGGSA%0AASVVTAQNSGLAPGEVWVRITIVHGANHPMMDLQQLVTTIVGTQLRFYNTCVEGRNALMHAKIRQKDVQS%0AYRKSLQNLRDPSQGSQLITDITIVPEPRVPSSSDKRNESPNTSPLPETWIEALKQCFVQRYQPTTRSLDL%0ASSLHTDPVLLSQGLYLPLNKQAVVHTLITILKQNQAQLAVLNLSNNRLTHLNAFSPLSSTSAGFIPVSIE%0ARIDLSSNPLSSIPVLSGLRDIVGLVELDLTETPLMSKFNPNDKSFAAKLHTILPTIKRLNGQELPQTVQF%0AAIEQGSDSSKRPPTKPLPQSILGFFPNDEVKIALLSFLKLYLSRYDSKPRGESLLPYYTTVSQLVFSVSP%0AENRFPNSQNVSFTARVEIQNGSDQPTTAYLTTSRLNQAYFLRSRNLLRCRDQSRRRDMVVRGSLAIAHFL%0ADELPTTEHQLESLSVDVAFHSGTQMLFTMGGVFYEVSSMGSSSTNSSSHEKSVRKVLRCFTRTMILIAPG%0AGHIVQDDYIVSNPTTSLCKKYITEMATRCKQDSQASNQQQNVLSSDPSAPEVKENIVIEFSRRTGMNIPF%0ASRQCLEEYEWNANAALTAFETMNLAGKIPPEAFSV&JOB_TITLE=schistosoma_haematobium_platyhelminthes&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Platyhelminthes%20%28taxid%3A6157%29&EQ_MENU1=Schistosoma%20haematobium%20(taxid:6185)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

In [2]:
phyla = "Platyhelminthes"
dir = f"../Sequences_protein_id/{phyla}"

name_of_blast_res = f"../Blast_res/psi_blast/schistosoma_haematobium_{phyla.lower()}.xml"  # XML2 !!!
df = parse_psiblast_xml(name_of_blast_res)
df = filter_psiblast_hits(df, min_qc=0, min_ident=0, min_sbjct_len=500)
df = update_df(df)

KeyError: ProteinID XP_018646589.1 GeneID XM_018799620.1 -> skipping...
IndexError: ProteinID CDI98011.1 GeneID not found -> skipping...
IndexError: ProteinID CDS21420.1 GeneID not found -> skipping...
IndexError: ProteinID CDS25628.1 GeneID not found -> skipping...
KeyError: ProteinID XP_024351515.1 GeneID XM_024494126.1 -> skipping...


In [3]:
os.makedirs(f"{dir}", exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [4]:
save_files(df, f"../Sequences_protein_id/{phyla}")

In [9]:
phyla = "Platyhelminthes"
dir = f"../Sequences_protein_id/{phyla}"
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

ref_exon_len = [37]
found_protein_ids = {}

for protein_id in df.protein_id:
    df_subset = df[df["protein_id"] == protein_id]
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition

df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]  # !!!

data = {}

for protein_id in df_found_protein_ids.protein_id:
    df_subset = df_found_protein_ids[df_found_protein_ids["protein_id"] == protein_id]
    org_name = df_subset.org_name.iloc[0]
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    exon_37_idx = df_exons[df_exons["length"] == 37].index[0]
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [10]:
introns = create_many_cassettes(dir, data)

In [11]:
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # должно быть только True

Mesocestoides_corti__VDD84209.1: start codon not in the beginning of sequence


array([ True])

In [12]:
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Mesocestoides_corti__VDD84209.1: start codon not in the beginning of sequence


Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Schistosoma_guineensis__CAH8656942.1,981,False,980,1,652
1,Schistosoma_curassoni__CAH8663826.1,981,False,980,1,652
2,Schistosoma_bovis__CAH8671530.1,981,False,980,1,652
3,Schistosoma_margrebowiei__CAH8667553.1,981,False,980,1,650
4,Schistosoma_intercalatum__CAH8646207.1,981,False,980,1,652
5,Schistosoma_rodhaini__CAH8677847.1,981,False,980,1,671
6,Schistosoma_japonicum__TNN16962.1,978,False,977,1,847
7,Schistosoma_turkestanicum__CAH8587689.1,978,False,977,1,905
8,Paragonimus_westermani__KAF8560478.1,993,False,992,1,13971
9,Taenia_crassiceps__KAL5103217.1,981,False,980,1,278


In [13]:
# os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)
#
# dict_align_cds_cassette["Schistosoma_haematobium_reference"] = read_single_fasta("../Datasets/Spiralia/schistosoma_haematobium_1/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")

### Mollusca/Bivalvia

[mya_arenaria](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_052767696.1%20LOC128208251%20%5Borganism%3DMya%20arenaria%5D%20%5BGeneID%3D128208251%5D%0AMAEFRVTTDRDGSRSFNDHDDRWTRGGGSGRYRGRGGNFNRRPRRGGGYGYGGSSYNRGRGGYSGGRGRG%0AGDRDGPGPRSRLDDDGDETMGNSETKDIARYNPYGRSSRGNFRGNSKYRSDNRNRPTQIGGDAGIFKRLG%0ALPLDRKTMGDSDWYKVTIPWGKKTEKDFILKSINDHIDVPFVPTYFHYEDKNAVFYVNDSRAAEGLKSTT%0AKRVTMPNGYKMTILVKPSQPPNIPMGKEEIDKLKVCMSNRYDPATKALNLSCLHTDQELAQSNLFMCLAR%0APQVMSNVVKVIKENIPELVQLDVSNNKLQSLEHLGGLVPSTPDMKVLNLANNKINMLEELRKVQKWKIDW%0ALTLDGNPLCDKFNDHTAYVSGVRRLFPKVLKLDTTDLPPPITFDIEARTDLPKSKDSYFPNDTVKNGVVK%0AFLKDYFLVYDSDDRTGLAGAYHETAMFSLSTSYNPTVQNKQTSLSTYIDETRNLLRVYKDTSRKFKTLKN%0AGNKIVAQLCLLPKTQHDPNSFVVDCNFATAQMISFNIQGVFKEVDKKSDKPPMRAFSRTFVTVASGSGMV%0AIVNDVLTVTNASPDQIQTAFKNPAPTPSSSPIPQTSPTEPFAAAGLTEIQTQMVASFMNDSRMNSEWSAK%0ACLVQNNWEYAKAGQNFLELQQKGLIPPEAFKT&JOB_TITLE=mya_arenaria_bivalvia&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Bivalvia%20%28taxid%3A6544%29&EQ_MENU1=Mya%20arenaria%20(taxid:6604)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NEW_VIEW=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

[crassostrea_virginica_bivalvia](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_052767696.1%20LOC128208251%20%5Borganism%3DMya%20arenaria%5D%20%5BGeneID%3D128208251%5D%0AMAEFRVTTDRDGSRSFNDHDDRWTRGGGSGRYRGRGGNFNRRPRRGGGYGYGGSSYNRGRGGYSGGRGRG%0AGDRDGPGPRSRLDDDGDETMGNSETKDIARYNPYGRSSRGNFRGNSKYRSDNRNRPTQIGGDAGIFKRLG%0ALPLDRKTMGDSDWYKVTIPWGKKTEKDFILKSINDHIDVPFVPTYFHYEDKNAVFYVNDSRAAEGLKSTT%0AKRVTMPNGYKMTILVKPSQPPNIPMGKEEIDKLKVCMSNRYDPATKALNLSCLHTDQELAQSNLFMCLAR%0APQVMSNVVKVIKENIPELVQLDVSNNKLQSLEHLGGLVPSTPDMKVLNLANNKINMLEELRKVQKWKIDW%0ALTLDGNPLCDKFNDHTAYVSGVRRLFPKVLKLDTTDLPPPITFDIEARTDLPKSKDSYFPNDTVKNGVVK%0AFLKDYFLVYDSDDRTGLAGAYHETAMFSLSTSYNPTVQNKQTSLSTYIDETRNLLRVYKDTSRKFKTLKN%0AGNKIVAQLCLLPKTQHDPNSFVVDCNFATAQMISFNIQGVFKEVDKKSDKPPMRAFSRTFVTVASGSGMV%0AIVNDVLTVTNASPDQIQTAFKNPAPTPSSSPIPQTSPTEPFAAAGLTEIQTQMVASFMNDSRMNSEWSAK%0ACLVQNNWEYAKAGQNFLELQQKGLIPPEAFKT&JOB_TITLE=mya_arenaria_bivalvia&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Bivalvia%20%28taxid%3A6544%29&EQ_MENU1=Mya%20arenaria%20(taxid:6604)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NEW_VIEW=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

[magallana_gigas_bivalvia](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Proteins&PROGRAM=blastp&QUERY=%3EXP_052767696.1%20LOC128208251%20%5Borganism%3DMya%20arenaria%5D%20%5BGeneID%3D128208251%5D%0AMAEFRVTTDRDGSRSFNDHDDRWTRGGGSGRYRGRGGNFNRRPRRGGGYGYGGSSYNRGRGGYSGGRGRG%0AGDRDGPGPRSRLDDDGDETMGNSETKDIARYNPYGRSSRGNFRGNSKYRSDNRNRPTQIGGDAGIFKRLG%0ALPLDRKTMGDSDWYKVTIPWGKKTEKDFILKSINDHIDVPFVPTYFHYEDKNAVFYVNDSRAAEGLKSTT%0AKRVTMPNGYKMTILVKPSQPPNIPMGKEEIDKLKVCMSNRYDPATKALNLSCLHTDQELAQSNLFMCLAR%0APQVMSNVVKVIKENIPELVQLDVSNNKLQSLEHLGGLVPSTPDMKVLNLANNKINMLEELRKVQKWKIDW%0ALTLDGNPLCDKFNDHTAYVSGVRRLFPKVLKLDTTDLPPPITFDIEARTDLPKSKDSYFPNDTVKNGVVK%0AFLKDYFLVYDSDDRTGLAGAYHETAMFSLSTSYNPTVQNKQTSLSTYIDETRNLLRVYKDTSRKFKTLKN%0AGNKIVAQLCLLPKTQHDPNSFVVDCNFATAQMISFNIQGVFKEVDKKSDKPPMRAFSRTFVTVASGSGMV%0AIVNDVLTVTNASPDQIQTAFKNPAPTPSSSPIPQTSPTEPFAAAGLTEIQTQMVASFMNDSRMNSEWSAK%0ACLVQNNWEYAKAGQNFLELQQKGLIPPEAFKT&JOB_TITLE=mya_arenaria_bivalvia&GAPCOSTS=11%201&DATABASE=nr&BLAST_PROGRAMS=psiBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&I_THRESH=0.005&EQ_MENU=Bivalvia%20%28taxid%3A6544%29&EQ_MENU1=Mya%20arenaria%20(taxid:6604)&ORG_EXCLUDE1=on&NUM_ORG=2&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NEW_VIEW=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

#### mya_arenaria

In [14]:
phyla = "Bivalvia"
dir = f"../Sequences_protein_id/{phyla}"

name_of_blast_res = f"../Blast_res/psi_blast/mya_arenaria_{phyla.lower()}.xml"  # XML2 !!!
df = parse_psiblast_xml(name_of_blast_res)
df = filter_psiblast_hits(df, min_qc=0, min_ident=0, min_sbjct_len=500)
df = update_df(df)

In [15]:
os.makedirs(f"{dir}", exist_ok=True)
df.to_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index=True)

In [16]:
save_files(df, f"../Sequences_protein_id/{phyla}")

In [17]:
phyla = "Bivalvia"
dir = f"../Sequences_protein_id/{phyla}"
df = pd.read_csv(f"{dir}/df_{phyla.lower()}.tsv", sep="\t", index_col=0,
                 converters={11: ast.literal_eval, 13: ast.literal_eval})

ref_exon_len = [37]
found_protein_ids = {}

for protein_id in df.protein_id:
    df_subset = df[df["protein_id"] == protein_id]
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    condition = set(ref_exon_len) & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition

df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]  # !!!

data = {}

for protein_id in df_found_protein_ids.protein_id:
    df_subset = df_found_protein_ids[df_found_protein_ids["protein_id"] == protein_id]
    org_name = df_subset.org_name.iloc[0]
    df_exons = analyze_exons(f"{dir}/{protein_id}/exons.fa")
    exon_37_idx = df_exons[df_exons["length"] == 37].index[0]
    exon_110_idx = exon_37_idx - 1
    data[f"{org_name}__{protein_id}"] = (df_exons, [exon_110_idx, exon_37_idx])

In [18]:
introns = create_many_cassettes(dir, data)

In [19]:
df_cds, dict_align_cds = dict_align_info_analyze(df_found_protein_ids, "cds", dir)
df_cds.equal_to_cds.unique()  # должно быть только True

Dreissena_polymorpha__KAH3852511.1: start codon not in the beginning of sequence
Sinanodonta_woodiana__KAL3874362.1: no stop codon found


array([ True])

In [20]:
df_cds_cassette, dict_align_cds_cassette = dict_align_info_analyze(df_found_protein_ids, "cds_cassette", dir)
df_cds_cassette

Dreissena_polymorpha__KAH3852511.1: start codon not in the beginning of sequence


Unnamed: 0,org_name_protein_id,stop_codon_pos,equal_to_cds,cassette_intron_start,intron_length_to_stop_codon,intron_length
0,Mercenaria_mercenaria__XP_053373641.1,1107,False,1106,1,1690
1,Dreissena_polymorpha__XP_052275796.1,1107,False,1106,1,2207
2,Ruditapes_philippinarum__XP_060562241.1,1095,False,1094,1,1646
3,Mactra_antiquata__KAL4236273.1,840,False,839,1,2319
4,Mytilus_coruscus__CAC5418521.1,1089,False,1088,1,1234
5,Potamilus_streckersoni__KAK3583320.1,1071,False,1070,1,4567
6,Mytilus_californianus__XP_052058679.1,1089,False,1088,1,1248
7,Saccostrea_echinata__XP_061162409.1,1098,False,1097,1,1556
8,Mytilus_edulis__CAG2249952.1,1089,False,1088,1,1360
9,Mytilus_edulis__XP_071161937.1,1089,False,1088,1,1360


In [22]:
dict_align_cds_cassette

{'Mercenaria_mercenaria__XP_053373641.1': 'ATGGCTGAATTTCGGGTGACGACTGGAAGAGACGGAATTCGATCATTTGGTGAGCATGATGACAGATGGACAGATAGTGGGGGTTCCGGGAAGTTCCGAGGACGTGGCAGTTACAGAGGAAATAGGAGATCATTCAACAACAAAAAAAGCTATGGCTTCAGCAGTTACAGTCGAAGAAGTTCTGGTAGCCCATTCAGGAGTAGAGGTCGAGGTAGAGGTGGGCCTAACCCCCGGAGTCGTGTAGATGATGAAGGAGATGATGTCATGGGTGGCGAAGATGCTCAAACGTCACATAGCAGATTCAATCCATATGGTCGACCAAGTAAAGGTGGTTTCCATGGAAAAAATAGTAGATATGACAGCAACTCAAATAAAAGAACTGGTGGTAGCATGGGTACGTTTAAAAGACTAGGTCTTCCTATAGACTCAAGAAGAGATGATGGAGAGCCACAGTGGTTCAAGATTGTGATACCATGGGGTAAAAAAGCAGATAAAGATTTCATTTTGAAGAATATAAATAATCACGTTGATGTACCATTTGTGCCAACATATTTTCACTATGAGGACAACACTGCAATTTTTTTTGTAAATGACAAGAGAGCAGCGGAATCTATCAGAGGGATCACAAAAAGAATAACCATGCCAACTGGATATAAAATGACAATTCTTGTGAAAAATAGTATTCCTCCAAACATTCCTATGGGCACAGAGGAAGTAGACAAACTCAAAGTTGTAATGAGCAACAGATATGATCCAACAACAAAGGCACTGAATTTGAGCAGTTTGCACACAGATAAAGAATTGAGTCAAGACAACCTGTATATGAACCTGGCAAGACCTCAAGTGATGACAAATGTTGTAAAGATCATCCAAGAAAACATTCCAGAGTTGTGTTGTTTGGACATGAGTGACAACAAGCTCTTCAGTCTGGATCATCTAGCAGCATTAGTGTCATCA

In [24]:
os.makedirs(f"../Alignment/psi_blast/{phyla}", exist_ok=True)

# dict_align_cds_cassette["Mya_arenaria_reference"] = read_single_fasta("../Datasets/Spiralia/mya_arenaria_2/ncbi_dataset/data/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.fa")
# dict_align_to_fasta(dict_align_cds_cassette, f"../Alignment/psi_blast/{phyla}/cds_cassette.aln")