# Setup our environment

In [73]:
import ncbi_genome_download as ngd
import numpy as np
import os, sys
from os import listdir
import pandas as pd
from Bio import SeqIO
from array import array
import gzip
import shutil
import random
from random import randint

# Download the data

Here we download genomes for a specific taxon, *Escherichia coli*, but any data in NCBI can be downloaded using the package [NCBI Genome Download](https://github.com/kblin/ncbi-genome-download). The code below downloads **6,848** genomic sequences associated with *E. coli*.

In [74]:
# Settings for the NCBI download: what to fetch, in which format, how many
# parallel workers to use, and where to put the result.
num_cores = 32
f_format = 'fasta'
group = 'bacteria'
bug = 'Escherichia coli'
out_directory = os.path.join('/local_data/atitus', 'data')

# The download itself is left disabled; uncomment the call below to fetch the data.
#ngd.download(genus = bug, group=group, file_format = f_format, output = out_directory, parallel = num_cores)

After we download the data, it's stored in '.gz' files that must be unzipped. The code below unzips all the files into a single directory for subsequent processing. 

In [75]:
# Decompress each downloaded '<assembly>/<assembly>..._genomic.fna.gz' archive
# into one flat directory (`out_dir`) of '.fasta' files used by the cells below.
out_dir = os.path.join(out_directory, group)
os.makedirs(out_dir, exist_ok=True)

file_dir = 'refseq/bacteria/'
directory = os.path.join(out_directory, file_dir)

files = os.listdir(directory)

# Suffix removed from the archive name before appending '.fasta'
# (7 characters, matching the slice the cell used previously).
gz_suffix = '.fna.gz'

for file in files:
    new_dir = os.path.join(directory, file)
    if not os.path.isdir(new_dir):
        # Ignore stray files sitting next to the per-assembly folders.
        continue

    new_files = os.listdir(new_dir)
    rel_files = [f for f in new_files if file in f and f.endswith(gz_suffix)]
    if not rel_files:
        print('No "' + gz_suffix + '" archive found in "' + new_dir + '"; skipping')
        continue

    data_zip_file = os.path.join(new_dir, rel_files[0])
    file_base = rel_files[0][:-len(gz_suffix)]
    new_file = file_base + '.fasta'
    # Write next to the other outputs instead of a second hard-coded copy of the path.
    data_file = os.path.join(out_dir, new_file)

    # Decompression is slow, so files that already exist are skipped; this makes
    # the cell cheap to re-run while still working from a fresh download.
    if os.path.exists(data_file):
        continue

    try:
        with gzip.open(data_zip_file, 'rb') as f_in, open(data_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # Remove a truncated output so the exists-check above does not
        # mistake it for a finished file on the next run.
        if os.path.exists(data_file):
            os.remove(data_file)
        raise

Next we need to generate a number of queries for our analyses. We want a query that is a 100% match, a query that is a 0% match, a completely random sequence, and a few queries with intermediate levels of matching. In this case, we generate two such queries, with ~33% and ~66% match.

## Query 1 - 100% match

In [76]:
# Query 1: pick one of the decompressed genomes at random (seeded) and join
# all of its FASTA records into a single sequence.
random.seed(1)
files = os.listdir(out_dir)
num_files = len(files)
if num_files == 0:
    raise FileNotFoundError('No files found in "' + out_dir + '"')

# randrange excludes its upper bound. randint(0, num_files) is inclusive and
# could return num_files itself, indexing one past the end of `files`.
# NOTE: os.listdir returns entries in arbitrary order, so the seed reproduces
# the same pick only for the same directory listing.
query_file_ind = random.randrange(num_files)

query_file = files[query_file_ind]
print('Selected "' + query_file + '" as the query base for query 1')

query_path = os.path.join(out_dir, query_file)
query_base = list(SeqIO.parse(query_path, "fasta"))

# Reuse the records parsed above rather than reading the file a second time.
query = ''
for record in query_base:
    query += record.seq

query_len = len(query)
print('Query length: %s' %str(query_len))

Selected "GCF_001606515.1_ASM160651v1_genomic.fasta" as the query base for query 1
Query length: 5445722


## Query 2 - 0% match

In [77]:
# Query 2: the placeholder 'X' at every position, same length as query 1.
query_no_match = query_len * 'X'
# Show the length and the first 100 characters as a quick check.
print(len(query_no_match), query_no_match[0:100])

5445722 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


## Query 3 - completely random sequence

In [78]:
# Query 3: a random string over the four bases, same length as query 1.
random.seed(3)
bases = ['T', 'C', 'G', 'A']

# One random.choice call per position, in order, so the seeded draw sequence
# is the same as with an explicit append loop.
query_random_bases = ''.join(random.choice(bases) for _ in range(0, query_len))

# Show the length and the first 100 bases as a quick check.
print(len(query_random_bases), query_random_bases[0:100])

5445722 CCGATTAGCCAAACCCATTCTGTGAAAAACGTTCACGAGAAGACGTGCGTCGGTTAATGTACTGAATTTAGGCTGTTTTCAGGCTGGGCAAAATGACGAG


## Query 4 - approximately 33% match

In [None]:
# Query 4: keep each base of `query` with probability 0.33 and replace it
# with 'X' with probability 0.67, giving a ~33% match.
#
# The draws use the stdlib `random` generator so that the seed below actually
# controls them; seeding `random` has no effect on np.random.choice, which
# left this cell non-reproducible (and needed one NumPy call per base).
random.seed(4)

# ''.join over a generator avoids growing the string one character at a time.
query_33_match = ''.join(
    'X' if random.random() < 0.67 else base
    for base in query
)

print(len(query_33_match), query_33_match[:100])

## Query 5 - approximately 66% match

In [None]:
# Query 5: keep each base of `query` with probability 0.67 and replace it
# with 'X' with probability 0.33, giving a ~66% match.
#
# The draws use the stdlib `random` generator so that the seed below actually
# controls them; seeding `random` has no effect on np.random.choice, which
# left this cell non-reproducible (and needed one NumPy call per base).
random.seed(5)

# ''.join over a generator avoids growing the string one character at a time.
query_66_match = ''.join(
    'X' if random.random() < 0.33 else base
    for base in query
)

print(len(query_66_match), query_66_match[:100])

In [82]:
# Draws a single 0/1 sample with weights 0.33/0.67, the same weights used for
# the ~33% match query. Nothing is assigned, so the cell only displays the draw.
# NOTE(review): looks like a leftover scratch cell; confirm whether it is still needed.
np.random.choice([0, 1], replace=True, p=[0.33, 0.67])

1