In [1]:
import os
import sys
import time
import tempfile
import logging
import argparse
from lib import ApytramNeeds
from lib import BlastPlus
from lib import Trinity


start_time = time.time()

In [2]:
### Option parsing
parser = argparse.ArgumentParser(prog = "apytram.py",
                                 description='''
    Run apytram on a fastq file to retrieve
    homologous sequences of bait sequences.''')

requiredOptions = parser.add_argument_group('required arguments')
requiredOptions.add_argument('-d', '--database', nargs='?', type=str,
                             help='Database preffix name', required=True)

parser.add_argument('--version', action='version', version='%(prog)s 1.0')
parser.add_argument('-log', nargs='?', type=str, default="apytram.log")
parser.add_argument('-t', '--tmp',  type=str,
                    help = "Directory to stock intermediary files for the apytram run. (default: a directory in /tmp which will be removed at the end)",
                    default = "" )


parser.add_argument('-fa', '--fasta',  type=str)

parser.add_argument('-q', '--query',  type=str,
                    help = "Fasta file (nt) with bait sequence for the apytram run." )
parser.add_argument('-i', '--iteration_max',  type=int,
                    help = "Maximum number of iteration. (Default 5)",
                    default = 5 )
parser.add_argument('-e', '--evalue',  type=float,
                    help = "Evalue. (Default 1e-3)",
                    default = 1e-3 )

parser.add_argument('--threads',  type=int,
                    help = "Available threads. (Default 1)",
                    default = 1 )


#args = parser.parse_args()
args = parser.parse_args('-d example_exec/db/example -fa example/example_db.fasta -q example/ref_gene.fasta -t example_exec/tmp3 -i 5'.split())

MaxIteration = args.iteration_max
Threads = args.threads
Evalue = args.evalue


In [3]:
### Set up the logger
LogFile = args.log
# create logger with 'spam_application'
logger = logging.getLogger('apytram')
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler(LogFile)
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

In [4]:
### Set up the working directory
if args.tmp:
    if os.path.isdir(args.tmp):
        logger.info("The temporary directory %s exists" %(args.tmp) )
    else:
        logger.info("The temporary directory %s does not exist, it will be created" % (args.tmp))
        os.makedirs(args.tmp)
    TmpDirName = args.tmp
else:
    TmpDirName = tempfile.mkdtemp()

# Remove only empty directory
#os.rmdir(TmpDirName)

INFO:apytram:The temporary directory example_exec/tmp3 exists


In [5]:
### Check that there is a database else built it
DatabaseName = args.database
if not os.path.isfile(DatabaseName+".nhr"):
    logger.info(DatabaseName+".nhr does not exist")
    #Build blast formated database from a fasta file
    if args.fasta:
        InputFasta = args.fasta
        if not os.path.isfile(InputFasta):
            logger.error("The fasta file (-fa) does not exist.")
            sys.exit(1)
        if os.path.isdir(os.path.dirname(DatabaseName)):
            logger.info("Database directory exists")
        else:
            logger.info("Database directory does not exist, we create it")
            os.makedirs(os.path.dirname(DatabaseName))
        # database building
        logger.info(DatabaseName + " database building")
        MakeblastdbProcess = BlastPlus.Makeblastdb(InputFasta,DatabaseName)
        ExitCode = MakeblastdbProcess.launch()
    else :
        logger.error("The database is not formatted ! A fasta file (-fa) is required !")
        sys.exit(1)
if not os.path.isfile(DatabaseName+".nhr"):
    logger.error("Problem in the database building")
    logger.info(DatabaseName+".nhr does not exist")
    sys.exit(1)
else:
    logger.info(DatabaseName+".nhr exists")

INFO:apytram:example_exec/db/example.nhr exists


In [6]:
### If there is a query continue, else stop
if not args.query:
    logger.info("There is no query (-q), apytram have finished.")
    quit()
elif not os.path.isfile(args.query):
    logger.error(args.query+" (-q) is not a file.")
    sys.exit(1)
else:
    queryFile = args.query
    QueryDatabaseName = TmpDirName + "/" + queryFile.split(".")[0]
    BlastdbcmdProcess = BlastPlus.Makeblastdb(queryFile,QueryDatabaseName)
    command = BlastdbcmdProcess.launch()
    print command
    logger.info("apytram will run with \"%s\" as reads database and \"%s\" as bait sequences" %(DatabaseName,queryFile))
    

makeblastdb -in example/ref_gene.fasta -out example_exec/tmp3/example/ref_gene -dbtype nucl
0

INFO:apytram:apytram will run with "example_exec/db/example" as reads database and "example/ref_gene.fasta" as bait sequences





In [7]:
### Make iterations
# Initialisation
i = 0
Stop = False
BaitSequences = queryFile

logger.info("Iterations begin")
while (i < MaxIteration) and (Stop == False):
    i+=1
    logger.info("Iteration %d/%d" %(i,MaxIteration))
    # Blast bait seqeunce on database of reads
    logger.info("Blast bait sequences on reads database")
    ReadNamesFile = TmpDirName + "/ReadNames.%d.txt" % (i)
    BlastnProcess = BlastPlus.Blast("blastn", DatabaseName, BaitSequences)
    BlastnProcess.Evalue = Evalue
    BlastnProcess.Threads = Threads
    BlastnProcess.OutFormat = "6 sacc"
    # Write read names in ReadNamesFile
    ExitCode = BlastnProcess.launch(ReadNamesFile)
    BlastnProcess.OutFormat = "6"
    ExitCode = BlastnProcess.launch(ReadNamesFile+".completetable")
    # Get paired reads names
    ExitCode = ApytramNeeds.add_paired_read_names(ReadNamesFile)
    # Retrieve sequences
    logger.info("Retrieve sequences")
    ReadFasta = TmpDirName + "/Reads.%d.fasta" % (i)
    BlastdbcmdProcess = BlastPlus.Blastdbcmd(DatabaseName, ReadNamesFile, ReadFasta)
    BlastdbcmdProcess.launch()
    # Launch Trinity
    logger.info("Launch Trinity")
    TrinityFasta = TmpDirName + "/Trinity_iter_%d" % (i)
    TrinityProcess = Trinity.Trinity(ReadFasta,TrinityFasta)
    # Use the  --full_cleanup Trinity option to keep only the contig file
    exitCode = TrinityProcess.launch("full_cleanup")
    if exitCode != 0: # Trinity found nothing
        logger.info("Trinity found nothing (ExitCode: %d)" %exitCode)
        Stop = True
    else:
        # Filter Trinity contigs to keep only homologous sequences of the reference genes
        logger.info("Compare Trinity results with query sequences")
        TrinityFasta = TrinityFasta + ".Trinity.fasta"
        TrinityBlast = TmpDirName + "/Trinity_iter_%d.blast" % (i)
        BlastnProcess = BlastPlus.Blast("blastn", QueryDatabaseName, TrinityFasta)
        BlastnProcess.OutFormat = "6"
        BlastnProcess.Evalue = 1e-8
        BlastnProcess.perc_identity = 80
        BlastnProcess.launch(TrinityBlast)
        # Filter hit
        FileteredTrinityFasta = TrinityFasta
        # Validated sequences become bait sequences
        BaitSequences = FileteredTrinityFasta
    
# Write output
logger.info("End of Iterations")






INFO:apytram:Iterations begin
INFO:apytram:Iteration 1/5
INFO:apytram:Blast bait sequences on reads database


blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example/ref_gene.fasta -evalue 0.001 -outfmt 6 sacc -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.1.txt -perc_identity 20 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example/ref_gene.fasta -evalue 0.001 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.1.txt.completetable -perc_identity 20 -max_target_seqs 1000000000 -num_threads 1
0

INFO:apytram:Retrieve sequences
INFO:apytram:Launch Trinity



blastdbcmd -db example_exec/db/example -entry_batch example_exec/tmp3/ReadNames.1.txt -dbtype nucl -out example_exec/tmp3/Reads.1.fasta
0

INFO:apytram:Compare Trinity results with query sequences
INFO:apytram:Iteration 2/5
INFO:apytram:Blast bait sequences on reads database



blastn -db example_exec/tmp3/example/ref_gene -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_1.Trinity.fasta -evalue 1e-08 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_1.blast -perc_identity 80 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_1.Trinity.fasta -evalue 0.001 -outfmt 6 sacc -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.2.txt -perc_identity 20 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_1.Trinity.fasta -evalue 0.001 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.2.txt.completetable -perc_identity 20 -m

INFO:apytram:Retrieve sequences
INFO:apytram:Launch Trinity



blastdbcmd -db example_exec/db/example -entry_batch example_exec/tmp3/ReadNames.2.txt -dbtype nucl -out example_exec/tmp3/Reads.2.fasta
0

INFO:apytram:Compare Trinity results with query sequences
INFO:apytram:Iteration 3/5
INFO:apytram:Blast bait sequences on reads database



blastn -db example_exec/tmp3/example/ref_gene -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_2.Trinity.fasta -evalue 1e-08 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_2.blast -perc_identity 80 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_2.Trinity.fasta -evalue 0.001 -outfmt 6 sacc -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.3.txt -perc_identity 20 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_2.Trinity.fasta -evalue 0.001 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.3.txt.completetable -perc_identity 20 -m

INFO:apytram:Retrieve sequences
INFO:apytram:Launch Trinity



blastdbcmd -db example_exec/db/example -entry_batch example_exec/tmp3/ReadNames.3.txt -dbtype nucl -out example_exec/tmp3/Reads.3.fasta
0

INFO:apytram:Compare Trinity results with query sequences
INFO:apytram:Iteration 4/5
INFO:apytram:Blast bait sequences on reads database



blastn -db example_exec/tmp3/example/ref_gene -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_3.Trinity.fasta -evalue 1e-08 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_3.blast -perc_identity 80 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_3.Trinity.fasta -evalue 0.001 -outfmt 6 sacc -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.4.txt -perc_identity 20 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_3.Trinity.fasta -evalue 0.001 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.4.txt.completetable -perc_identity 20 -m

INFO:apytram:Retrieve sequences
INFO:apytram:Launch Trinity



blastdbcmd -db example_exec/db/example -entry_batch example_exec/tmp3/ReadNames.4.txt -dbtype nucl -out example_exec/tmp3/Reads.4.fasta
0

INFO:apytram:Compare Trinity results with query sequences
INFO:apytram:Iteration 5/5
INFO:apytram:Blast bait sequences on reads database



blastn -db example_exec/tmp3/example/ref_gene -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_4.Trinity.fasta -evalue 1e-08 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_4.blast -perc_identity 80 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_4.Trinity.fasta -evalue 0.001 -outfmt 6 sacc -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.5.txt -perc_identity 20 -max_target_seqs 1000000000 -num_threads 1
blastn -db example_exec/db/example -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_4.Trinity.fasta -evalue 0.001 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/ReadNames.5.txt.completetable -perc_identity 20 -m

INFO:apytram:Retrieve sequences
INFO:apytram:Launch Trinity



blastdbcmd -db example_exec/db/example -entry_batch example_exec/tmp3/ReadNames.5.txt -dbtype nucl -out example_exec/tmp3/Reads.5.fasta
0

INFO:apytram:Compare Trinity results with query sequences
INFO:apytram:End of Iterations



blastn -db example_exec/tmp3/example/ref_gene -query /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_5.Trinity.fasta -evalue 1e-08 -outfmt 6 -out /home/crey02/Documents/Projets/Convergences/Pipeline/apytram/example_exec/tmp3/Trinity_iter_5.blast -perc_identity 80 -max_target_seqs 1000000000 -num_threads 1


In [8]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 53.1535749435 seconds ---


In [9]:
system("ls")

['apytram.ipynb', 'apytram.log', 'data_test', 'example', 'example_exec', 'lib']