Prepare the BRENDA sequence data (./prepare_data/data_preparation.ipynb): filter the 1.1.3 sequences down to EC 1.1.3.15, run an all-vs-all BLAST, cluster the sequences with seqsample, and build per-cluster fasta files and alignments.<br/><br/>Copyright (C) 2017  Martin Engqvist Lab<br/>This program is free software: you can redistribute it and/or modify<br/>it under the terms of the GNU General Public License as published by<br/>the Free Software Foundation, either version 3 of the License, or<br/>(at your option) any later version.<br/>This program is distributed in the hope that it will be useful,<br/>but WITHOUT ANY WARRANTY; without even the implied warranty of<br/>MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the<br/>GNU General Public License for more details.<br/>You should have received a copy of the GNU General Public License<br/>along with this program.  If not, see <http://www.gnu.org/licenses/>.

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Environment: read the project's .env file ###
# find_dotenv() walks up the directory tree looking for a .env file; the
# returned path is reused further down to derive the project root.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Optional network drive, configured through the NETWORK_URL variable.
# NOTE(review): os.environ.get() returns None (not the string 'None') when the
# variable is unset, so an unset variable also reaches the branch below.
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE != 'None':
    # mount network drive here (not implemented yet, so nothing happens)
    pass

### Directory layout ###
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path)  # project root = folder that holds the .env file

# data directory and its raw-external / raw-internal / intermediate / final parts
DATA = join(PROJ, 'data')
RAW_EXTERNAL, RAW_INTERNAL, INTERMEDIATE, FINAL = (
    join(DATA, sub) for sub in ('raw_external', 'raw_internal', 'intermediate', 'final')
)

# output directory with separate places for figures and pictures
RESULTS = join(PROJ, 'results')
FIGURES = join(RESULTS, 'figures')
PICTURES = join(RESULTS, 'pictures')


### Optional data-specific sub-folders ###
# Set folder_name to a non-empty string to get a folder of that name inside
# the raw_external, intermediate and final data directories.
folder_name = ''
if folder_name:
    for parent in (RAW_EXTERNAL, INTERMEDIATE, FINAL):
        target = join(parent, folder_name)
        # only create what is missing
        if not exists(target):
            os.makedirs(target)

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [2]:
from dnapy.resources import fasta

import subprocess
from seqsample import seqsample
from seqsample import distance
import logging
import sys
import multiprocessing
import types

from Bio import SeqIO
from Bio.Seq import Seq

#### First I need to filter out the sequences that do not belong to 1.1.3.15

In [3]:
# Keep only the BRENDA sequences annotated with the target EC number and
# write them to their own fasta file (fasta_filepath is reused by later cells).
TARGET_EC = '1.1.3.15'
brenda_fasta_filepath = join(INTERMEDIATE, 'BRENDA', '1_1_3__BRENDA_sequences_filtered.fasta')
fasta_filepath = join(INTERMEDIATE, 'BRENDA', '1_1_3_15_BRENDA_sequences_filtered.fasta')

seq_data = []
# Parse from the handle opened here so the input file is closed as soon as the
# loop is done. The parser is deliberately not bound to the name `fasta`,
# which would shadow the `fasta` module imported from dnapy.resources.
with open(brenda_fasta_filepath, 'r') as f:
    for record in SeqIO.parse(f, "fasta"):
        header, seq = record.description, record.seq

        # the EC number is read from the third ';'-separated header field
        ec = header.split(';')[2]

        if ec == TARGET_EC:
            seq_data.append('>%s\n%s' % (header, seq))

# one '>header' line followed by one sequence line per kept record
with open(fasta_filepath, 'w') as f:
    f.write('\n'.join(seq_data))



#### Now perform the clustering 

In [19]:
# Where the BLAST database and the all-vs-all result table are written
brenda_dir = join(INTERMEDIATE, 'BRENDA')
db_filepath = join(brenda_dir, 'my_prot_blast_db')
blast_outfile = join(brenda_dir, 'all-vs-all.tsv')

# seqsample preparation, in this order: log file, initial information,
# file paths, folders
seqsample.setup_logfile()
seqsample.initial_info()
seqsample.define_paths()
seqsample.setup_folders()

# build the blast db from the filtered fasta file
seqsample.build_blast_db(infile=fasta_filepath, outfile=db_filepath)

# blast every sequence of the fasta file against that db (all vs all)
seqsample.blast_fasta_file(
    db_file=db_filepath,
    fasta_file=fasta_filepath,
    outfile=blast_outfile,
    cores=4,
    outformat=6,
)


#### Use the BLAST data to build a network and cluster the sequences

In [23]:
# All intermediate network/cluster files live in the BRENDA folder
brenda_dir = join(INTERMEDIATE, 'BRENDA')
abc_file = join(brenda_dir, 'all-vs-all.abc')
mci_file = join(brenda_dir, 'seq.mci')
tab_file = join(brenda_dir, 'seq.tab')

# convert the blast table into an abc file based on evalue
seqsample.blast_to_abc_evalue(infile=blast_outfile, outfile=abc_file)

# turn the abc file into a network (mci + tab files)
seqsample.build_network(infile=abc_file, mci_file=mci_file, tab_file=tab_file)

# cluster the network once per granularity setting
seqsample.cluster(
    mci_file=mci_file,
    tab_file=tab_file,
    out_folder=brenda_dir,
    granularity_levels=('1.4', '2', '4', '6'),
    cores=4,
)


#### Build fasta files and make multiple sequence alignments from the sequences 

In [7]:
# Input: one cluster dump from the clustering step; output: the final BRENDA folder
# NOTE(review): 'I40' presumably corresponds to the granularity level '4' - confirm
cluster_dump = join(INTERMEDIATE, 'BRENDA', 'dump.seq.mci.I40')
cluster_fasta_dir = join(FINAL, 'BRENDA')

# write the fasta files for the clusters
seqsample.make_cluster_fasta_files(
    cluster_infile=cluster_dump,
    sequence_infile=fasta_filepath,
    fasta_outdir=cluster_fasta_dir,
)

# align the clusters in that folder (presumably multiple sequence alignments,
# as announced in the section heading)
seqsample.align_all_clusters(folder_path=cluster_fasta_dir)