# Data mining in biological engineering
Doug White

## Sequence alignment with BioPython

### loading sequence into BioPython

In [None]:
#Import statements
from Bio import SeqIO

#Load the sequence
file_path = "E:/Data Science Biology Tutorial/sequence 1.fasta" #<- INSERT YOUR PATH HERE
num_sequences = 0
#Stays None when the file contains no FASTA records at all
sequence = None
for seq_record in SeqIO.parse(file_path, "fasta"):
    #Each record overwrites the previous one, so with a multi-record file
    #`sequence` ends up holding the last record's sequence
    sequence = seq_record.seq
    num_sequences += 1

print(num_sequences)
#Fail with a clear message instead of a NameError when nothing was parsed
if sequence is None:
    raise ValueError("No FASTA records found in " + file_path)
print(len(sequence))

### performing sequence alignment

In [None]:
#Subset the sequence
from Bio import pairwise2
#First 1000 bases of the sequence loaded in the previous cell
seq_1 = sequence[0:1000]
#Last 1000 bases: the slice is left open-ended so the final base is included
#(a stop index of -1 would drop it and return only 999 bases)
seq_2 = sequence[-1000:]

#Perform the alignment (global alignment using pairwise2's "xx" scoring
#preset - see the Biopython pairwise2 documentation for the exact scores)
alignments = pairwise2.align.globalxx(seq_1, seq_2)
#Number of alignments returned, followed by the first one in raw form
print(len(alignments))
print(alignments[0])

#Human-readable rendering of the first alignment
print(pairwise2.format_alignment(*alignments[0]))

#### Use case
- Companion diagnostic:
    - Patient selection
      * by sequencing
        - e.g. a boil
        - could tell you if a sample is antibiotic resistant
- Find "probes" for antibiotic resistance; the same approach applies to cancer, infectious disease, flu, and viral disease: all of these require sequence alignment.

## Accessing the NCBI's online API with the Python requests library

### executing a GET request

In [None]:
#Import statements
import requests

#Fetch the NCBI home page with a GET request and show the status and body
web_page = "https://www.ncbi.nlm.nih.gov/"
response = requests.get(web_page)
print(response, response.text, sep="\n")

#Send a POST request with an empty payload to the same address
response = requests.post(web_page, data=[])
print(response, response.text, sep="\n")

### performing a search

In [None]:
#Assemble the esearch query: database, search term and maximum result count
web_page = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
db = 'db=nuccore'
term = "term=mrsa"
retmax = "retmax=100"
query_url = web_page + '&'.join([db, term, retmax])

#Run the search and show the status and the raw response body
response = requests.get(query_url)
print(response, response.text, sep="\n")

### parsing the html response

In [None]:
#Repeat the esearch request so there is a fresh response to parse below
web_page = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
db = 'db=nuccore'
term = "term=mrsa"
retmax = "retmax=100"
search_parameters = (db, term, retmax)
response = requests.get(web_page + '&'.join(search_parameters))

#Show the status line followed by the raw response body
print(response, response.text, sep="\n")

In [None]:
#Parse the response using lxml
from lxml import html

#Build an element tree from the raw bytes of the most recent response
tree = html.fromstring(response.content)
#Collect the text content of every <id> element found anywhere in the tree
#NOTE(review): presumably these are the record UIDs returned by esearch, and
#the lowercase "id" depends on how lxml's HTML parser treats the tag names
#of the returned document - confirm
ids = tree.xpath('//id/text()')

### retrieving the FASTA files using efetch

In [None]:
#Fetch the data using efetch
web_page = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
db = 'db=nuccore'
#Comma-separate the ids collected from the search response. join() never
#leaves a trailing comma, so nothing has to be trimmed afterwards (trimming
#the last character would cut off the "=" when ids is empty)
id_lookup = "id=" + ",".join(ids)

ret_type = "rettype=fasta&retmode=text"
#Now fetch the data
response = requests.post(web_page + db + '&' + id_lookup + '&' + ret_type)
print(response)
#Only show the first 1000 characters of the response body
print(response.text[0:1000])

## Storing our data in a database in the cloud with MongoDB

### connecting to the database

In [None]:
#Import statements
from pymongo import MongoClient

#Connect to the database
#NOTE(review): the user name and password are hard-coded in this URI -
#consider loading the connection string from configuration instead
mongo_uri = 'mongodb://test_user:Emory18!@ds023455.mlab.com:23455/heroku_kn66qwrg'
client = MongoClient(mongo_uri)
database = client["heroku_kn66qwrg"]

#Make a new collection named after you by inserting one empty document
your_name = "Doug" #<- INSERT YOUR NAME HERE
data = {}
collection = database[your_name + "_sequences"]
collection.insert_one(data)

#Show the collection object that was just written to
print(collection)

### store the first 100 FASTA files from NCBI

In [None]:
#Import statements
import requests
from lxml import html
from Bio import SeqIO

#Search the nuccore database for "mrsa", asking for at most 100 results
web_page = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
db = 'db=nuccore'
term = "term=mrsa"
retmax = "retmax=100"
response = requests.get(web_page + db + '&' + term + '&' + retmax)
print(response)
print(response.text)

#Parse the response using lxml and collect the text of every <id> element
tree = html.fromstring(response.content)
ids = tree.xpath('//id/text()')

#Fetch the data using efetch
web_page = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
db = 'db=nuccore'
#Comma-separate the ids. join() never leaves a trailing comma, so nothing
#has to be trimmed afterwards (trimming the last character would cut off
#the "=" when ids is empty)
id_lookup = "id=" + ",".join(ids)

ret_type = "rettype=fasta&retmode=text"
#Now fetch the data
response = requests.post(web_page + db + '&' + id_lookup + '&' + ret_type)
print(response)

#Create a temporary local file to hold the information
#The with-block closes the file even if the write raises
with open('temp.fasta', 'w') as f:
    f.write(response.text)

#Load the FASTA records using biopython and save each one to mongoDB
#(`database` and `your_name` are defined in the database connection cell)
for seq_record in SeqIO.parse('temp.fasta', "fasta"):
    seq_data = {
        "sequence": str(seq_record.seq),
        "record_id": seq_record.id,
    }
    result = database[your_name + "_sequences"].insert_one(seq_data)
    #Show the id MongoDB assigned to the stored document
    print(result.inserted_id)

### accessing the same files

In [None]:
#Import statements
from pymongo import MongoClient

#Connect to the database
#NOTE(review): the user name and password are hard-coded in this URI -
#consider loading the connection string from configuration instead
mongo_uri = 'mongodb://test_user:Emory18!@ds023455.mlab.com:23455/heroku_kn66qwrg'
client = MongoClient(mongo_uri)
database = client["heroku_kn66qwrg"]

#NOTE(review): this cell repeats the connection cell - it inserts another
#empty document and reads nothing back, although the section heading is
#"accessing the same files" - confirm this is intended
your_name = "Doug" #<- INSERT YOUR NAME HERE
data = {}
collection = database[your_name + "_sequences"]
collection.insert_one(data)

#Show the collection object that was just written to
print(collection)


## setting up a secure database with the Flask library

### building a simple Flask application

In [None]:
#Import statements
from pymongo import MongoClient
from flask import Flask
from flask import jsonify

#Define the app
app = Flask(__name__)

#Create our sequence method
@app.route('/sequences')
def sequences():
    """Placeholder handler for the /sequences route.

    Returns an empty JSON list with status 200. A view that returns None
    makes Flask raise a TypeError on every request, so the stub answers
    with a valid (empty) response until the database is wired in.
    """
    return jsonify([]), 200

#run the app locally
if __name__ == "__main__":
    app.run()

### connect Flask to MongoDB database

In [None]:
#Import statements
from pymongo import MongoClient
from flask import Flask
from flask import jsonify

#Connect to the database
#NOTE(review): the user name and password are hard-coded in this URI -
#consider loading the connection string from configuration instead
mongo_uri = 'mongodb://test_user:Emory18!@ds023455.mlab.com:23455/heroku_kn66qwrg'
client = MongoClient(mongo_uri)
database = client["heroku_kn66qwrg"]

#Define the app
app = Flask(__name__)

#Create our sequence method
@app.route('/sequences')
def sequences():
    """Return every document of the "Doug_sequences" collection as JSON.

    Each document has its "_id" field removed before it is added to the
    response (presumably because that value cannot be serialised to JSON -
    confirm). Responds with status code 200.
    """
    your_name = "Doug"
    collection = database[your_name + "_sequences"]
    data = []
    #An empty filter matches every document in the collection
    for record in collection.find({}):
        record.pop("_id")
        data.append(record)
    return jsonify(data), 200

#run the app locally
if __name__ == "__main__":
    app.run()