In [1]:
from Scripts.data_processing import obtain_data
from Scripts.fasta_processing import read_single_fasta

from Bio import Entrez
from Bio import Blast

from entrez import nucl_search, save_esearch_results
from parse_blast_results import calculate_qc, filter_df, update_df
from fasta_processing import plain_to_fasta, read_fasta
from data_processing import (cluster_analysis_preview,
                             cluster_analysis,
                             save_seqs,
                             extract_genome_coverages,
                             add_genome_coverages,
                             select_max_ids,
                             filter_genome_coverages
                             )
from group_species import group_species, group_species_genome_coverage

# The same contact address is set on both the Entrez and Blast modules from Bio.
Entrez.email = Blast.email = "artemvaskaa@gmail.com"

In [None]:
# Leading and trailing path fragments of the local dataset tree.
prefix, postfix = "../Datasets", "ncbi_dataset/data"

### Platyhelminthes

phylum: Spiralia Platyhelminthes

query: schistosoma_haematobium_1 | cds.fna

blast_settings:
- job title: schistosoma_haematobium
- db: wgs
- organism: Platyhelminthes (taxid:6157), Schistosoma haematobium (taxid:6185) exclude
- blast_algorithm: megablast

additional_params:
- word size: 28 (NOTE: the saved blastn link below has `WORD_SIZE=16` — confirm which value was actually used)

[blastn_link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=MegaBlast&PROGRAM=blastn&JOB_TITLE=schistosoma_haematobium&NEWWIN=on&NEWWIN=on&GAPCOSTS=0%200&MATCH_SCORES=1,-2&DATABASE=Whole_Genome_Shotgun_contigs&BLAST_PROGRAMS=megaBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=16&REPEATS=repeat_9606&TEMPLATE_TYPE=0&TEMPLATE_LENGTH=0&FILTER=L&FILTER=m&EQ_MENU=Platyhelminthes%20%28taxid%3A6157%29&EQ_MENU1=Schistosoma%20haematobium%20(taxid:6185)&ORG_EXCLUDE1=on&NUM_ORG=2&DB_GROUP=wgsOrg&PROG_DEFAULTS=on&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=on&BOOKMARK=on)

phylum: Spiralia Platyhelminthes

query: schistosoma_haematobium_1 | protein.faa

blast_settings:
- job title: schistosoma_haematobium_tblastn
- db: wgs
- organism: Platyhelminthes (taxid:6157), Schistosoma haematobium (taxid:6185) exclude

[tblastn_link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=Translations&PROGRAM=tblastn&JOB_TITLE=schistosoma_haematobium_tblastn&GAPCOSTS=11%201&DATABASE=Whole_Genome_Shotgun_contigs&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=5&MATRIX_NAME=BLOSUM62&COMPOSITION_BASED_STATISTICS=2&FILTER=L&EQ_MENU=Platyhelminthes%20%28taxid%3A6157%29&EQ_MENU1=Schistosoma%20haematobium%20(taxid:6185)&ORG_EXCLUDE1=on&NUM_ORG=2&DB_GROUP=wgsOrg&BLAST_PROGRAMS=tblastn&SHOW_OVERVIEW=on&SHOW_LINKOUT=on&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=on&NEW_VIEW=on&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ADV_VIEW=on&BOOKMARK=on)

In [3]:
# Phylum label and query-organism label for this section.
phylum, org_name = "Spiralia", "schistosoma_haematobium_1"

# Look up the data registered for that phylum / organism pair.
schistosoma_data = obtain_data(phylum, org_name)

---

### QC >= 45%

In [4]:
# Saved BLAST result for the Schistosoma blastn search.
# NOTE(review): the original "XML2 !!!" remark presumably means the file must be
# exported in the XML2 format — confirm.
name_of_blast_res = "../Blast_res/schistosoma_blastn.xml"  # XML2 !!!

# Open in binary mode as before, but through a context manager so the handle is
# closed even if parsing fails. Blast.read consumes the whole stream, so the
# record can be used after the file is closed.
with open(name_of_blast_res, "rb") as result_stream:
    blast_record = Blast.read(result_stream)

# Build the QC table from the parsed record.
df = calculate_qc(blast_record)

In [5]:
df = update_df(df, blast_record)  # pass the QC table through update_df together with the BLAST record; the result replaces df

In [6]:
# Apply this section's cut-offs; qc_threshold=0.45 matches the "QC >= 45%" heading.
df = filter_df(
    df,
    qc_threshold=0.45,
    range_threshold=1900,
)

In [7]:
cluster_analysis_preview(df)  # prints per-eps cluster summaries (n_clusters, QC range, item count); use them to select eps

eps: 0.01, n_clusters: 4
cluster: 3, qcs_range: (0.4925, 0.4925), items_in_cluster: 1
cluster: 2, qcs_range: (0.6887, 0.6887), items_in_cluster: 1
cluster: 1, qcs_range: (0.8053, 0.8053), items_in_cluster: 1
cluster: 0, qcs_range: (1.01, 1.0115), items_in_cluster: 2

eps: 0.02, n_clusters: 4
cluster: 3, qcs_range: (0.4925, 0.4925), items_in_cluster: 1
cluster: 2, qcs_range: (0.6887, 0.6887), items_in_cluster: 1
cluster: 1, qcs_range: (0.8053, 0.8053), items_in_cluster: 1
cluster: 0, qcs_range: (1.01, 1.0115), items_in_cluster: 2

eps: 0.03, n_clusters: 4
cluster: 3, qcs_range: (0.4925, 0.4925), items_in_cluster: 1
cluster: 2, qcs_range: (0.6887, 0.6887), items_in_cluster: 1
cluster: 1, qcs_range: (0.8053, 0.8053), items_in_cluster: 1
cluster: 0, qcs_range: (1.01, 1.0115), items_in_cluster: 2

eps: 0.04, n_clusters: 4
cluster: 3, qcs_range: (0.4925, 0.4925), items_in_cluster: 1
cluster: 2, qcs_range: (0.6887, 0.6887), items_in_cluster: 1
cluster: 1, qcs_range: (0.8053, 0.8053), items_in

In [8]:
df = cluster_analysis(df, eps=0.12)  # eps picked after the preview; expected to yield 3 clusters

In [None]:
# Slow cell: expect a long run time.
save_seqs(
    df,
    "Platyhelminthes",
    seq_length_threshold=1900,
)

In [None]:
# NOTE(review): the two strings are presumably the source and destination
# folder names — confirm against group_species.
group_species(
    df,
    "Platyhelminthes",
    "Platyhelminthes_grouped",
)

In [None]:
# long-time execution
# The result is a list of raw "Genome Coverage :: <N>x" strings (see the
# inspection cell that follows).

genome_coverages = extract_genome_coverages(df)

In [13]:
genome_coverages  # display the raw coverage strings to check that everything is OK

['            Genome Coverage        :: 0x',
 '            Genome Coverage        :: 0x',
 '            Genome Coverage        :: 100.0x',
 '            Genome Coverage        :: 100.0x',
 '            Genome Coverage        :: 0x']

In [14]:
df = add_genome_coverages(genome_coverages, df)  # combine the extracted coverage strings with df; the result replaces df

In [15]:
df = select_max_ids(df)  # NOTE(review): selection rule is not visible here — see select_max_ids in data_processing

In [16]:
# There are few species, so we keep all of them
# df = filter_genome_coverages(df, genome_coverage_threshold=50)

In [17]:
# Regroup using the coverage-annotated table, from "Platyhelminthes" into
# "Platyhelminthes_filtered".
group_species_genome_coverage(
    df,
    folder_name="Platyhelminthes",
    new_folder_name="Platyhelminthes_filtered",
)

---

In [None]:
from Scripts.fasta_processing import dict_align_to_fasta

# all ranges
# Sequence label -> FASTA file it is read from: the query cassette first, then
# the hits from each QC-range folder.
alignment_sources = {
    "Schistosoma_haematobium": "../Datasets/Spiralia/schistosoma_haematobium_1/ncbi_dataset/data/cds_cassette.fa",
    "Schistosoma_curassoni": "../Sequences/Platyhelminthes_filtered/1.01-1.01/Schistosoma_curassoni_genome_assembly_CAJSMQ010000001.1.fa",
    "Schistosoma_margrebowiei_strain_Zambia": "../Sequences/Platyhelminthes_filtered/1.01-1.01/Schistosoma_margrebowiei_strain_Zambia_genome_assembly_UZAI01001861.1.fa",
    "Schistosoma_bovis_strain_TAN1997_SBOVIS_629": "../Sequences/Platyhelminthes_filtered/0.69-0.81/Schistosoma_bovis_strain_TAN1997_SBOVIS_629_QMKO01000629.1.fa",
    "Schistosoma_bovis_strain_TAN1997_SBOVIS_2705": "../Sequences/Platyhelminthes_filtered/0.69-0.81/Schistosoma_bovis_strain_TAN1997_SBOVIS_2705_QMKO01002703.1.fa",
    "Schistosoma_curassoni_strain_Dakar": "../Sequences/Platyhelminthes_filtered/0.49-0.49/Schistosoma_curassoni_strain_Dakar_UZAK01042702.1.fa",
}

# Read every file in the order listed above, keyed by its label.
alignment_dict = {
    label: read_single_fasta(fasta_path)
    for label, fasta_path in alignment_sources.items()
}

# dict_align_to_fasta(alignment_dict, "../Alignment/Spiralia/Platyhelminthes_blast.fa")
# dict_align_to_fasta(alignment_dict, "../Alignment/Spiralia/Platyhelminthes_blast.aln")

In [None]:
# Schistosoma_curassoni_strain_Dakar: no intron - remove it

### QC >= 10%

In [None]:
# Nothing good came out of this; a proper (higher) QC threshold is needed

### Bivalvia

phylum: Spiralia Bivalvia

query: mya_arenaria_2 | cds.fna

blast_settings:
- job title: mya_arenaria_cds_blastn
- db: wgs
- organism: Bivalvia (taxid:6544), Mya arenaria (taxid:6604) exclude
- blast_algorithm: megablast

additional_params:
- word size: 28

[blastn_link](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&USER_FORMAT_DEFAULTS=on&SET_SAVED_SEARCH=true&PAGE=MegaBlast&PROGRAM=blastn&JOB_TITLE=mya_arenaria_cds_blastn&NEWWIN=on&NEWWIN=on&GAPCOSTS=0%200&MATCH_SCORES=1,-2&DATABASE=Whole_Genome_Shotgun_contigs&BLAST_PROGRAMS=megaBlast&MAX_NUM_SEQ=100&SHORT_QUERY_ADJUST=on&EXPECT=0.05&WORD_SIZE=28&REPEATS=566037&TEMPLATE_TYPE=0&TEMPLATE_LENGTH=0&FILTER=L&FILTER=m&EQ_MENU=Bivalvia%20%28taxid%3A6544%29&EQ_MENU1=Mya%20arenaria%20(taxid:6604)&ORG_EXCLUDE1=on&NUM_ORG=2&DB_GROUP=wgsOrg&PROG_DEFAULTS=on&SHOW_OVERVIEW=true&SHOW_LINKOUT=true&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&GET_SEQUENCE=true&NCBI_GI=false&NUM_OVERVIEW=100&DESCRIPTIONS=100&ALIGNMENTS=100&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&SHOW_CDS_FEATURE=false&ADV_VIEW=true&BOOKMARK=on)