# Imports

In [298]:
from Bio import Entrez
from Bio import Blast

from entrez import nucl_search, save_esearch_results
from parse_blast_results import extract_target_range, calculate_qc
from fasta_processing import plain_to_fasta, read_fasta

# Contact e-mail that Biopython attaches to NCBI requests
# (Entrez queries and remote BLAST submissions).
Entrez.email = "artemvaskaa@gmail.com"
Blast.email = "artemvaskaa@gmail.com"

# Fasta processing funcs

In [42]:
# Directory holding the reference sequences. NOTE(review): absolute,
# machine-specific path — adjust it before running on another machine.
path_to_plain_dir = "/home/artemvaska/Master_degree/Diploma/References/"

In [None]:
# Plain-text sbr-RA gene sequence -> FASTA; presumably wrapped at 80 columns
# and upper-cased (confirm in fasta_processing.plain_to_fasta).
gene_plain_path = path_to_plain_dir + "sbr_RA_gene_plain.fa"
plain_to_fasta(gene_plain_path, fasta_line_length=80, uppercase=True)

In [45]:
# Plain-text sbr-RA CDS sequence -> FASTA; presumably wrapped at 80 columns
# and upper-cased (confirm in fasta_processing.plain_to_fasta).
cds_plain_path = path_to_plain_dir + "sbr_RA_CDS_plain.fa"
plain_to_fasta(cds_plain_path, fasta_line_length=80, uppercase=True)

# Entrez funcs

In [8]:
# Entrez search term, one clause per line:
#   1. organism: Drosophilidae, excluding Drosophila melanogaster
#   2. text words: "chromosome X", but neither "PREDICTED" nor "gene"
#   3. sequence length between 15,000,000 and 75,000,000
query = (
    "(Drosophilidae[ORGN] NOT Drosophila melanogaster[ORGN])"
    " AND (chromosome X[WORD] NOT PREDICTED[WORD] NOT gene[WORD])"
    " AND 15000000:75000000[SLEN]"
)

In [9]:
# Run the search. The result is sliced and iterated as a sequence of record
# ids in the cells below; presumably NCBI nucleotide ids — confirm in
# entrez.nucl_search.
id_list_test = nucl_search(query)

In [63]:
# Save only the first three search results. NOTE(review): "Drosophilidae" is
# presumably the output sub-directory name — confirm in
# entrez.save_esearch_results.
save_esearch_results(id_list_test[:3], "Drosophilidae")

In [13]:
# Fetch the first three records as FASTA and concatenate them into one file.

with open("3_species.fa", "w") as ouf:
    for rec in id_list_test[:3]:
        # One efetch request per record id; the response body is FASTA text.
        handle = Entrez.efetch(
            db="nucleotide", id=rec, retmode="text", rettype="fasta"
        )
        try:
            lne = handle.read()
        finally:
            # Always release the response handle, even if reading fails.
            handle.close()
        ouf.write(lne + "\n")

# Local blast+

Databases available for NCBI `blastn -remote`:

https://rc.dartmouth.edu/index.php/blast-introduction/blast-databases/

Установка blast+ на linux (https://www.ncbi.nlm.nih.gov/books/NBK52640/):

1. https://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/ (ncbi-blast-2.16.0+-x64-linux.tar.gz)
2. `mv ~/Downloads/ncbi-blast-2.16.0+-x64-linux.tar.gz ~`
3. `tar zxvpf ncbi-blast-2.16.0+-x64-linux.tar.gz`
4. `export PATH=$PATH:$HOME/ncbi-blast-2.16.0+/bin`
5. `mkdir $HOME/blastdb`
6. `export BLASTDB=$HOME/blastdb`
7. Add both `export` lines (steps 4 and 6) to `~/.bash_profile` (`nano ~/.bash_profile`) so they persist across shell sessions

## Commands for local db

`$ makeblastdb -in Master_degree/Diploma/References/sbr_RA_gene.fa -dbtype nucl -out blastdb/sbr_ra/sbr_ra`

`$ blastn -db blastdb/sbr_ra/sbr_ra -query Master_degree/Diploma/References/Drosophilidae/1797095071.fa`

`$ makeblastdb -in Master_degree/Diploma/References/3_species.fa -dbtype nucl -out blastdb/3_sp/3_sp`

`$ blastn -db blastdb/3_sp/3_sp -query Master_degree/Diploma/References/sbr_RA_gene.fa`

AE - NCBI - Genome project

AJ - EBI - Direct submissions

NM - curated RefSeq

# sbr gene

https://www.alliancegenome.org/gene/FB:FBgn0003321

Sequence Details (Mode: cDNA)

---

# Parsing BLAST XML2 and calculating QC

Drosophilidae (taxid:7214)

Drosophila melanogaster (taxid:7227)

https://biochem.slu.edu/bchm628/handouts/2013/Entrez_boolian_searches.pdf

https://biopython.org/docs/dev/Tutorial/chapter_blast.html

https://biopython.org/docs/dev/Tutorial/chapter_blast.html#the-blast-records-record-and-hit-classes

In [282]:
# Path to the saved BLAST output; the file must be in XML2 format.
name_of_blast_res = "../Blast_res/full_sbr_RA_wgs_megablast_250_16_xml2.xml" # XML2 !!!
# Opened in binary mode. The stream is left open by this cell, so it has to be
# closed once parsing is finished.
result_stream = open(name_of_blast_res, "rb")

In [283]:
# Parse the single BLAST record from the stream opened in the previous cell,
# then close that stream so the file handle is not leaked. The record is
# fully loaded by Blast.read, so nothing reads from the stream afterwards.
try:
    blast_record = Blast.read(result_stream)
finally:
    result_stream.close()

In [48]:
# Slice copy of the record, kept as a backup so the original hit list survives
# any in-place filtering of blast_record (e.g. the commented-out filter below).
original_blast_record = blast_record[:]

In [284]:
# Take the first hit of the record for closer inspection.
hit = blast_record[0]
# hsp = hit[0] # HSPs -- “High-scoring Segment Pairs”
# hit.target  # hit.targets

In [285]:
# Print the hit summary: query, target and a table with one row per HSP.
print(hit)  # span -- the alignment length including gaps

Query: Query_719443
       sbr-RA-gene
  Hit: gi|2735068466|gb|JBBODO010000732.1| (length=1492563)
       Drosophila simulans strain SZ45 tig00001109, whole genome shotgun
       sequence
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0         0    9745.89    8606     [5873:14341]        [165809:174040]
          1         0    6167.08    4460      [1113:5494]        [161115:165507]
          2         0    1055.56     761       [360:1116]        [160331:161089]
          3   7.9e-81     315.05     231      [5500:5728]        [165482:165708]
          4     5e-43     189.48     217          [0:217]        [160127:160331]


In [287]:
# First HSP of the hit (row 0 of the summary table printed above).
hsp = hit[0]

In [288]:
# Print the HSP header, its score statistics and the pairwise alignment.
print(hsp)

Query : Query_719443 Length: 14341 Strand: Plus
        sbr-RA-gene
Target: gi|2735068466|gb|JBBODO010000732.1| Length: 1492563 Strand: Plus
        Drosophila simulans strain SZ45 tig00001109, whole genome shotgun
        sequence

Score:9745 bits(5277), Expect:0,
Identities:7582/8606(88%),  Gaps:513.8606(6%)

gi|273506    165809 AGTTGAAAAGCAACTA-A-ATACCGCCGTCTAGAGTTCTAAATGCTTAATGGTAATTGGC
                  0 ||||||||||||||||-|-|||||-.|||||||||||.|||||.||||..|||||||||.
Query_719      5873 AGTTGAAAAGCAACTAGAGATACC-TCGTCTAGAGTTTTAAATTCTTATCGGTAATTGGG

gi|273506    165867 TTTACTTAATTACTCACTGCTACAATTACTTTGCCTGCCTTCAGGCGTTGCTAAAAAGC-
                 60 .|||.|||||||||||||||||||||||||||||||||||||||||||||||||||-||-
Query_719      5932 CTTATTTAATTACTCACTGCTACAATTACTTTGCCTGCCTTCAGGCGTTGCTAAAA-GCt

gi|273506    165926 -TTTTTCATAAACAAATTGC-ATAGCATAGTCTAAGTTGTAGTGCGCGACGAATCGGTGT
                120 -......|||||||||||||-||||||||||||||.|||||||||..|.|||||..||||
Query_719      5991 tttttttATAAACAAAT

In [289]:
# filter_func = lambda hit: len(hit) > 1
# len(blast_record) # 250
# blast_record[:] = filter(filter_func, blast_record)
# len(blast_record) # 228

# for hit in blast_record[:5]:  # quick check for the hit lengths
#     print(f"{hit.target.id} {len(hit)}")

In [290]:
# Returns a two-element list; for this hit it equals the lowest start and the
# highest end among the HSP hit ranges printed above. Presumably the overall
# target span of the hit — confirm in parse_blast_results.extract_target_range.
extract_target_range(hit)

[160127, 174040]

In [291]:
# Per-hit QC values for the whole record; the result is displayed in the next
# cell. NOTE(review): "QC" is presumably query coverage — confirm in
# parse_blast_results.calculate_qc.
qcs = calculate_qc(blast_record)

In [292]:
# Display the full mapping of target id -> QC value (one entry per hit, so the
# output is long).
qcs

{'gi|2735068466|gb|JBBODO010000732.1|': 0.9797,
 'gi|2735070232|gb|JBBODR010000286.1|': 0.9764,
 'gi|2733175125|gb|JBAMBY010000011.1|': 0.9729,
 'gi|2733173904|gb|JBAMBW010000153.1|': 0.9757,
 'gi|2735068651|gb|JBBODP010000434.1|': 0.9773,
 'gi|2733173966|gb|JBAMBV010000350.1|': 0.9738,
 'gi|2735067492|gb|JBBODN010000294.1|': 0.9761,
 'gi|2053678612|gb|JAEIGY010000049.1|': 0.9705,
 'gi|1601089141|gb|NIGA01000006.1|': 0.9665,
 'gi|1601087464|gb|NIFY01000006.1|': 0.9752,
 'gi|2105612828|gb|NGVV02000001.1|': 0.9752,
 'gi|2736901983|emb|CAKMKS020000146.1|': 0.9768,
 'gi|2182812339|emb|CAKMKU010000128.1|': 0.9752,
 'gi|2733174183|gb|JBAMBX010000322.1|': 0.9674,
 'gi|2182812831|emb|CAKMKT010000087.1|': 0.9761,
 'gi|650401384|gb|JMCE01000012.1|': 0.9745,
 'gi|2182812156|emb|CAKMKR010000141.1|': 0.9761,
 'gi|2053675408|gb|JAEIGU010000044.1|': 0.9773,
 'gi|2735070348|gb|JBBODQ010000284.1|': 0.9757,
 'gi|672917037|gb|JPYS01000001.1|': 0.9713,
 'gi|2053677094|gb|JAEIGV010000073.1|': 0.9649,
 'gi|

In [293]:
# A hit can also be looked up by its target id instead of a positional index.
blast_record["gi|2735068466|gb|JBBODO010000732.1|"]

<Bio.Blast.Hit target.id='gi|2735068466|gb|JBBODO010000732.1|' query.id='Query_719443'; 5 HSPs>

In [294]:
# QC value of the same hit, looked up by its target id.
qcs["gi|2735068466|gb|JBBODO010000732.1|"]

0.9797

# TODOs

1. Найти референсные последовательности для sbr - https://www.alliancegenome.org/gene/FB:FBgn0003321 (Sequence Details (Mode: cDNA)):
   - найти координаты гена на X хромосоме
   - полная последовательность гена
   - сплайсированный вариант
   - консервативная кассета
2. Написать скрипт, который:
   - ищет ген sbr / nxf1 в базе данных NCBI для семейства Drosophilidae
   - выбирает из найденных только адекватные варианты и формирует из них БД для бласта
3. Сделать бласт вариантов из 1-го пункта на варианты из второго пункта