In [12]:
import pandas as pd
from Bio import Entrez, SeqIO

from data_processing import analyze_exons
from fasta_processing import dict_align_to_fasta, read_single_fasta, plain_to_fasta
from parse_psi_blast_results import parse_psiblast_xml, filter_psiblast_hits, update_df, save_genes, save_cdss_exons, save_proteins

# Contact address that Biopython attaches to NCBI Entrez (E-utilities) requests.
# No Entrez call is made directly in the cells visible here; presumably the
# imported helper modules perform the requests and rely on this module-level
# setting — TODO confirm.
Entrez.email = "artemvaskaa@gmail.com"

In [2]:
# PSI-BLAST result file to parse. The original note on this line read "XML2 !!!";
# presumably the file must be in BLAST XML2 output format — TODO confirm.
name_of_blast_res = "../Blast_res/schistosoma_haematobium_psi_blast.xml"

# Parse -> filter -> enrich, expressed as a single pipeline. The query-coverage
# and identity thresholds are 0; the only non-zero threshold is
# min_sbjct_len=500.
df = update_df(
    filter_psiblast_hits(
        parse_psiblast_xml(name_of_blast_res),
        min_qc=0,
        min_ident=0,
        min_sbjct_len=500,
    )
)

# Persist the hit table (index included).
df.to_csv("../Sequences_protein_id/df_no_filters.tsv", sep="\t", index=True)

# Run the three export helpers on the hit table, in the original order.
for save_step in (save_genes, save_cdss_exons, save_proteins):
    save_step(df)

---

In [14]:
# Load the hit table from the TSV file; presumably this lets the cells below run
# without repeating the PSI-BLAST parsing step — TODO confirm.
# index_col=0 turns the first column back into the DataFrame index (the table is
# written elsewhere in this notebook with index=True).
df = pd.read_csv("../Sequences_protein_id/df_no_filters.tsv", sep="\t", index_col=0)

In [8]:
# Root directory with one sub-folder per protein_id; this cell reads
# <prefix>/<protein_id>/exons.fa and <prefix>/<protein_id>/protein.faa.
prefix = "../Sequences_protein_id"
# Exon lengths to screen for. A hit is kept when at least one of its exons has
# one of these lengths. Presumably these are the exon lengths of the reference
# gene — TODO confirm.
ref_exon_len = [107, 110, 113, 116, 37]
# Maps protein_id -> set of reference exon lengths found among its exons.
found_protein_ids = {}

# Built once: the reference set does not change between iterations.
ref_exon_len_set = set(ref_exon_len)

# Visit every protein_id once, in order of first appearance. If `df` carries the
# same protein_id on several rows, this avoids re-parsing the same exons.fa;
# the resulting dict (keys, values, insertion order) is unchanged.
for protein_id in df["protein_id"].drop_duplicates():
    df_exons = analyze_exons(f"{prefix}/{protein_id}/exons.fa")
    condition = ref_exon_len_set & set(df_exons.length.tolist())
    if condition:
        found_protein_ids[protein_id] = condition

# All rows of `df` whose protein passed the exon-length screen.
df_found_protein_ids = df[df["protein_id"].isin(found_protein_ids.keys())]

# Sequences to export, keyed by FASTA record name; the reference goes first.
dict_align = {"Schistosoma_haematobium_reference": read_single_fasta("../Datasets/Spiralia/schistosoma_haematobium_1/ncbi_dataset/data/protein.faa")}

# One row per selected protein (the first one), which supplies the org_name used
# in the record name. This replaces a full-frame boolean filter per iteration.
first_rows = df_found_protein_ids.drop_duplicates(subset="protein_id")
for protein_id, org_name in zip(first_rows["protein_id"], first_rows["org_name"]):
    dict_align[f"{org_name}_{protein_id}"] = read_single_fasta(f"{prefix}/{protein_id}/protein.faa")

dict_align_to_fasta(dict_align, "../Alignment/psi_blast/protein.fa")
# NOTE(review): the same records are written unchanged to a ".aln" path; no
# alignment step is visible in this cell. Confirm this second file is intended
# (e.g. as a placeholder that an external aligner overwrites).
dict_align_to_fasta(dict_align, "../Alignment/psi_blast/protein.aln")

---