In [2]:
from Bio import Entrez
from Bio import Blast

from entrez import nucl_search, save_esearch_results
from parse_blast_results import calculate_qc, filter_df, update_df
from fasta_processing import plain_to_fasta, read_fasta
from data_processing import (cluster_analysis_preview,
                             cluster_analysis,
                             save_seqs,
                             extract_genome_coverages,
                             add_genome_coverages,
                             select_max_ids,
                             filter_genome_coverages
                             )
from group_species import group_species, group_species_genome_coverage

# Register the same contact address with both Biopython modules in one statement.
Entrez.email = Blast.email = "artemvaskaa@gmail.com"

In [3]:
# Path to the saved BLAST result; the original note flags that it is XML2 output.
name_of_blast_res = "../blast_res/full_sbr_RA_wgs_megablast_250_16.xml" # XML2 !!!

# Open the file in a context manager so the handle is closed as soon as
# parsing finishes (the handle was previously left open for the kernel's lifetime).
with open(name_of_blast_res, "rb") as result_stream:
    blast_record = Blast.read(result_stream)

# Build the working table from the parsed BLAST record.
df = calculate_qc(blast_record)

In [4]:
# NOTE: this step is interactive. When update_df cannot derive a target range
# automatically it prints a list of candidate coordinates and asks for manual
# input (see the cell output). The trailing comment records the coordinates
# presumably entered for this run: 8395741 8410026 for the first prompt and
# 3349617 3363902 for the second - TODO confirm.
df = update_df(df, blast_record)  # 8395741 8410026 3349617 3363902

Target range cannot be calculated automatically. Please enter coordinates manually from the list below:
[7188510, 7189026, 7297434, 7297950, 8395741, 8395989, 8396190, 8396961, 8397432, 8397472, 8398959, 8399803, 8400836, 8400909, 8402321, 8402572, 8402845, 8402932, 8405238, 8405296, 8406927, 8408741, 8409289, 8409341, 8409703, 8410026]
Target range cannot be calculated automatically. Please enter coordinates manually from the list below:
[2269971, 2270487, 3349617, 3349865, 3350066, 3350837, 3351311, 3351351, 3352835, 3353679, 3354712, 3354785, 3356197, 3356448, 3356721, 3356808, 3359114, 3359172, 3360803, 3362617, 3363165, 3363217, 3363579, 3363902]


In [5]:
# Apply the project's filter to the table and rebind df to the result.
# NOTE(review): the filtering criteria live in parse_blast_results.filter_df
# and are not visible in this notebook.
df = filter_df(df)

In [6]:
# Print a per-cluster summary (cluster id, qcs range, item count) so that an
# eps value can be picked by eye for the clustering step that follows.
cluster_analysis_preview(df)  # select eps

eps: 0.01, n_clusters: 16
cluster: 15, qcs_range: (0.1554, 0.1803), items_in_cluster: 7
cluster: 14, qcs_range: (0.1963, 0.1963), items_in_cluster: 1
cluster: 11, qcs_range: (0.2065, 0.3523), items_in_cluster: 100
cluster: 13, qcs_range: (0.3883, 0.3883), items_in_cluster: 1
cluster: 12, qcs_range: (0.4376, 0.4376), items_in_cluster: 1
cluster: 9, qcs_range: (0.4747, 0.4808), items_in_cluster: 3
cluster: 10, qcs_range: (0.4984, 0.4984), items_in_cluster: 1
cluster: 8, qcs_range: (0.5211, 0.5304), items_in_cluster: 6
cluster: 4, qcs_range: (0.5582, 0.6002), items_in_cluster: 14
cluster: 6, qcs_range: (0.6168, 0.6168), items_in_cluster: 1
cluster: 5, qcs_range: (0.651, 0.6661), items_in_cluster: 9
cluster: 7, qcs_range: (0.6859, 0.6874), items_in_cluster: 2
cluster: 1, qcs_range: (0.8843, 0.9011), items_in_cluster: 9
cluster: 2, qcs_range: (0.9171, 0.9171), items_in_cluster: 3
cluster: 3, qcs_range: (0.9375, 0.9382), items_in_cluster: 2
cluster: 0, qcs_range: (0.9649, 0.9797), items_in_c

In [7]:
# Run the clustering with the manually chosen eps and rebind df to the result.
# NOTE(review): eps=0.04 is a hand-picked value; the preview cell's printed
# output was produced with eps 0.01, so the rationale for 0.04 should be
# recorded here - TODO confirm.
df = cluster_analysis(df, eps=0.04)

In [8]:
# Long-running step: expect this cell to take a while.
# Presumably writes the sequences for the rows in df into the "Drosophilidae"
# folder - TODO confirm against data_processing.save_seqs.

save_seqs(df, "Drosophilidae")

In [9]:
# Group the saved data by species.
# Presumably reads from "Drosophilidae" and writes to "Drosophilidae_grouped"
# - TODO confirm against group_species.group_species.
group_species(df, "Drosophilidae", "Drosophilidae_grouped")

In [10]:
# Long-running step: expect this cell to take a while.
# Collects one genome-coverage entry per record in df; the result is inspected
# in the next cell before being merged back into the table.

genome_coverages = extract_genome_coverages(df)

In [11]:
# Manual sanity check: display the raw "Genome Coverage :: <N>x" strings and
# look for unexpected entries before they are attached to df.
genome_coverages  # check if everything is OK

['            Genome Coverage        :: 12x',
 '            Genome Coverage        :: 12x',
 '            Genome Coverage        :: 50x',
 '            Genome Coverage        :: 50x',
 '            Genome Coverage        :: 12x',
 '            Genome Coverage        :: 50x',
 '            Genome Coverage        :: 12x',
 '            Genome Coverage        :: 103.8x',
 '            Genome Coverage        :: 165.0x',
 '            Genome Coverage        :: 160.0x',
 '            Genome Coverage           :: 120.0x',
 '            Genome Coverage        :: 0x',
 '            Genome Coverage        :: 0x',
 '            Genome Coverage        :: 50x',
 '            Genome Coverage        :: 0x',
 '            Genome Coverage        :: 180.0x',
 '            Genome Coverage        :: 0x',
 '            Genome Coverage        :: 104.0x',
 '            Genome Coverage        :: 12x',
 '            Genome Coverage        :: 75.0x',
 '            Genome Coverage        :: 123.1x',
 '          

In [12]:
# Attach the extracted coverages, keep the best id per group, then drop
# low-coverage rows. Each stage gets its own name and df is rebound only once,
# at the end, so a failure in a later stage no longer leaves df half-transformed
# (which previously made a retry re-apply the earlier stages).
# NOTE(review): this assumes the helpers return new objects rather than
# mutating their argument in place - TODO confirm in data_processing.
df_with_coverage = add_genome_coverages(genome_coverages, df)
df_max_ids = select_max_ids(df_with_coverage)

# Threshold is presumably in the same "x" units shown in the coverage strings
# - TODO confirm.
df = filter_genome_coverages(df_max_ids, genome_coverage_threshold=50)

In [13]:
# Regroup by species using the coverage-filtered table: takes the
# "Drosophilidae" folder as input and "Drosophilidae_filtered" as the new
# folder name (per the keyword arguments).
group_species_genome_coverage(df, folder_name="Drosophilidae", new_folder_name="Drosophilidae_filtered")

In [35]:
# NOTE(review): edit_names_for_alignment is not among the names imported in the
# import cell of this notebook, and this cell's execution count (35) is out of
# sequence with the cells before it. It likely relied on kernel state from a
# cell that is no longer present; add the missing import to the import cell so
# that Restart & Run All succeeds - TODO confirm the defining module.
edit_names_for_alignment("Drosophilidae_filtered")