diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index fa2521e..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/.github/workflows/github_build.yml b/.github/workflows/github_build.yml
index 14f1a20..b92da27 100644
--- a/.github/workflows/github_build.yml
+++ b/.github/workflows/github_build.yml
@@ -5,9 +5,11 @@ name: build
 on:
   push:
-    branches: [ master ]
+    branches:
+      - master
+      - 0.1.0
     tags:
-      - '*'
+      - '*'
   pull_request:
     branches: [ master ]
@@ -38,10 +40,17 @@ jobs:
       run: |
        pwd
        pip install .
-        fdog.setup -o /home/runner/work/fDOG/fDOG/dt --lib
-        fdog.setup -o /home/runner/work/fDOG/fDOG/dt
+        path=$(fdog.setup -d ./ --getSourcepath); for i in $(less $path/data/dependencies.txt); do sudo apt-get install -y -qq $i; done
+        fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS
+        fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast
         fdog.showTaxa
-        fdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3 --fasoff
+        fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 --fasOff
+        mkdir seeds
+        path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done
+        fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3
+        head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa
+        fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a
+        ls
     - name: Deploy
       if: startsWith(github.event.ref, 'refs/tags')
       uses: casperdcl/deploy-pypi@v2
diff --git a/fdog/.DS_Store b/fdog/.DS_Store
deleted file mode 100644
index f638c26..0000000
Binary files a/fdog/.DS_Store and /dev/null differ
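A note on the dependency step added to the workflow above: fdog.setup --getSourcepath is used to locate the installed package, and the loop then installs one apt package per line of data/dependencies.txt. Because less behaves like cat when its output is not a terminal, the loop works as written, but an equivalent and more conventional form of that one-liner would be:

    path=$(fdog.setup -d ./ --getSourcepath)
    xargs -a "$path/data/dependencies.txt" sudo apt-get install -y -qq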
diff --git a/fdog/addTaxa.py b/fdog/addTaxa.py
index fa4a3a1..3bf7356 100644
--- a/fdog/addTaxa.py
+++ b/fdog/addTaxa.py
@@ -1,17 +1,13 @@
 # -*- coding: utf-8 -*-
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
 # This script is used to prepare data for fdog.
-# For each given genome FASTA file, It will create a folder within genome_dir
+# For each given genome FASTA file, it will create a folder within searchTaxa_dir
 # with the naming scheme of fdog ([Species acronym]@[NCBI ID]@[Proteome version]
-# e.g HUMAN@9606@3), a annotation file in JSON format in weight_dir and
-# a blast DB in blast_dir folder (optional).
-# For a long header of original FASTA sequence, only the first word
-# will be taken as the ID of new fasta file, everything after the
-# first whitespace will be removed. If this first word is not unique,
-# an automatically increasing index will be added.
+# e.g. HUMAN@9606@3), an annotation file in JSON format in annotation_dir and
+# a blast DB in the coreTaxa_dir folder (optional).
 #
 # This script is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -26,98 +22,74 @@
 import sys
 import os
 import argparse
-from os import listdir
-from os.path import isfile, join
 from pathlib import Path
-import subprocess
+from Bio import SeqIO
 import multiprocessing as mp
+from tqdm import tqdm
 from ete3 import NCBITaxa
-import csv
-from io import StringIO
 import re
 import shutil
-from tqdm import tqdm
 from datetime import datetime
+import time
+from pkg_resources import get_distribution
+from collections import OrderedDict
-def checkFileExist(file):
-    if not os.path.exists(os.path.abspath(file)):
-        sys.exit('%s not found' % file)
+import fdog.libs.zzz as general_fn
+import fdog.libs.tree as tree_fn
+import fdog.libs.addtaxon as add_taxon_fn
-def getTaxName(taxId):
-    ncbi = NCBITaxa()
-    try:
-        ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)]
-        ncbiName = re.sub('[^a-zA-Z1-9\s]+', '', ncbiName)
-        taxName = ncbiName.split()
-        name = taxName[0][:3].upper()+taxName[1][:2].upper()
-    except:
-        name = "UNK" + taxId
-    return(name)
-def parseMapFile(mappingFile):
-    nameDict = {}
-    with open(mappingFile) as f:
+def parse_map_file(mapping_file, folIn):
+    """ Create spec name from mapping file
+    And also check if given input files in mapping file exist
+    """
+    name_dict = {}
+    with open(mapping_file) as f:
         for line in f:
             if not '#' in line:
                 tmp = line.split('\t')
-                fileName = tmp[0]
-                taxId = tmp[1].strip()
+                file_name = tmp[0]
+                file_in = '%s/%s' % (folIn, file_name)
+                general_fn.check_file_exist(file_in)
+                tax_id = tmp[1].strip()
                 try:
-                    taxName = tmp[2].strip()
+                    tax_name = tmp[2].strip()
                 except:
-                    taxName = getTaxName(taxId)
+                    tax_name = ''
                 try:
                     ver = tmp[3].strip()
                 except:
-                    ver = datetime.today().strftime('%y%m%d') #1
-                # print(taxName+"@"+str(taxId)+"@"+str(ver))
-                nameDict[fileName] = (taxName, str(taxId), str(ver))
-    return(nameDict)
+                    ver = datetime.today().strftime('%y%m%d')
+                spec_name = add_taxon_fn.generate_spec_name(tax_id, tax_name, ver)
+                name_dict[file_in] = spec_name
+    return(name_dict)
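To make the expected input concrete, a minimal mapping file for the rewritten parse_map_file could look like this (file names are hypothetical; columns are tab-separated and the last two are optional):

    human.fa    9606    HUMAN    3
    yeast.fa    4932

The first row yields the species name HUMAN@9606@3. For the second row, assuming generate_spec_name keeps the scheme of the removed getTaxName (first three letters of the genus plus the first two of the species epithet, looked up via ete3 in the NCBI taxonomy), name and version are filled in automatically, giving something like SACCE@4932@230101.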
-def runAddTaxon(args):
-    (f,n,i,o,c,v,a,cpus,replace,delete) = args
-    cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus)
-    if c == True:
-        cmd = cmd + ' -c'
-    if a == True:
-        cmd = cmd + ' -a'
-    if replace == True:
-        cmd = cmd + ' --replace'
-    if delete == True:
-        cmd = cmd + ' --delete'
-    # print(cmd)
-    logFile = o + '/addTaxa2fDog.log'
-    cmd = cmd + ' >> ' + logFile
-    try:
-        subprocess.call([cmd], shell = True)
-    except:
-        sys.exit('Problem running\n%s' % (cmd))
 def main():
-    version = '0.0.9'
-    parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.')
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
     required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True)
     required.add_argument('-m', '--mapping',
-            help='Tab-delimited text file containing <fasta file name><tab><taxon ID><tab><taxon name><tab><version>. The last 2 columns are optional.',
+            help='Tab-delimited text file containing <fasta file name><tab><taxon ID><tab><taxon name><tab><version>. The last 2 columns are optional.', action='store', default='', required=True)
     optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
-    optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
-    optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False)
+    optional.add_argument('-c', '--coreTaxa', help='Include these taxa in the core taxa (i.e. taxa in coreTaxa_dir folder)', action='store_true', default=False)
+    optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False)
     optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
     optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
     optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
-    optional.add_argument('-f', '--force', help='Force overwrite existing data', action='store_true', default=False)
+    optional.add_argument('--force', help='Force overwrite existing data', action='store_true', default=False)
-    ### get arguments
     args = parser.parse_args()
     folIn = args.input
+    folIn = os.path.abspath(folIn)
     mapping = args.mapping
-    checkFileExist(mapping)
+    general_fn.check_file_exist(mapping)
     outPath = args.outPath
     if outPath == '':
        fdogPath = os.path.realpath(__file__).replace('/addTaxa.py','')
        pathconfigFile = fdogPath + '/bin/pathconfig.txt'
        if not os.path.exists(pathconfigFile):
            sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
@@ -131,61 +103,63 @@ def main():
     cpus = mp.cpu_count()-2
     replace = args.replace
     delete = args.delete
+    add_taxon_fn.check_conflict_opts(replace, delete)
     force = args.force
+    start = time.time()
+    ### parse mapping file
+    name_dict = parse_map_file(mapping, folIn)
-    ### get existing genomes
-    Path(outPath + "/genome_dir").mkdir(parents = True, exist_ok = True)
-    Path(outPath + "/weight_dir").mkdir(parents = True, exist_ok = True)
-    genomeFiles = listdir(outPath + "/genome_dir")
-
-    ### generate taxon names from mapping file
-    nameDict = parseMapFile(mapping)
-
-    ### read all input fasta files and create addTaxon jobs
-    jobs = []
-    dupList = {}
-    faFiles = [f for f in listdir(folIn) if isfile(join(folIn, f))]
-    for f in faFiles:
-        # tmp = f.split('.')
-        if f in nameDict:
-            # check duplicated taxon name in existing data
-            taxName = '@'.join(nameDict[f])
-            flag = 1
-            if taxName in genomeFiles:
-                if force:
-                    shutil.rmtree(outPath + "/genome_dir/" + taxName)
-                    if not noAnno:
-                        shutil.rmtree(outPath + "/weight_dir/" + taxName)
-                else:
-                    flag = 0
-                    dupList[f] = taxName
+    ### initiate paths
+    Path(outPath + '/searchTaxa_dir').mkdir(parents = True, exist_ok = True)
-            if flag == 1:
-                fasta = folIn + '/' + f
-                name = nameDict[f][0]
-                taxid = nameDict[f][1]
-                verProt = nameDict[f][2]
-                jobs.append([
-                    folIn + '/' + f, nameDict[f][0], nameDict[f][1],
-                    outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete
-                ])
-
-    if len(dupList) > 0:
-        print("These taxa are probably already present in %s:" % (outPath + "/genome_dir"))
-        for f in dupList:
-            print('\t'+f+'\t'+dupList[f])
+    ### create file in searchTaxa_dir [and coreTaxa_dir]
+    genome_jobs = []
+    blast_jobs = []
+    for f in name_dict:
+        spec_name = name_dict[f]
+        ## remove old folder if force is set
         if force:
-            print('They will be deleted and re-compiled!')
-        else:
-            sys.exit("Please remove them from the mapping file or use different Name/ID/Version!")
-
-    print('Parsing...')
-    for job in tqdm(jobs):
-        # print('@'.join([job[1],job[2],job[5]]) + '\t' + job[0])
-        runAddTaxon(job)
-
-    print('Output can be found in %s' % outPath)
+            if os.path.exists(outPath + '/searchTaxa_dir/' + spec_name):
+                shutil.rmtree(outPath + '/searchTaxa_dir/' + spec_name)
+            if os.path.exists(outPath + '/coreTaxa_dir/' + spec_name):
+                shutil.rmtree(outPath + '/coreTaxa_dir/' + spec_name)
+        ## create jobs
+        genome_path = '%s/searchTaxa_dir/%s' % (outPath, spec_name)
+        Path(genome_path).mkdir(parents = True, exist_ok = True)
+        genome_jobs.append([f, genome_path, spec_name, force, replace, delete])
+        if coreTaxa:
+            genome_file = '%s/%s.fa' % (genome_path, spec_name)
+            blast_jobs.append([outPath, spec_name, genome_file, force, True])
+    pool = mp.Pool(cpus)
+
+    print('Parsing genome for %s species...' % len(genome_jobs))
+    genome_out = []
+    for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_genome, genome_jobs),
+                  total=len(genome_jobs)):
+        genome_out.append(_)
+    out_msg = 'Output can be found in %s within searchTaxa_dir' % outPath
+    if len(blast_jobs) > 0:
+        print('\nCreating Blast DB for %s species...' % len(blast_jobs))
+        blast_out = []
+        for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_blastdb, blast_jobs),
+                      total=len(blast_jobs)):
+            blast_out.append(_)
+        out_msg = '%s, coreTaxa_dir' % out_msg
+
+    ### create annotation
+    if not noAnno:
+        Path(outPath + '/annotation_dir').mkdir(parents = True, exist_ok = True)
+        for f in name_dict:
+            genome_file = '%s/searchTaxa_dir/%s/%s.fa' % (outPath, name_dict[f], name_dict[f])
+            add_taxon_fn.create_annoFile(outPath, genome_file, cpus, force)
+        if os.path.exists('%s/annotation_dir/tmp' % outPath):
+            shutil.rmtree('%s/annotation_dir/tmp' % outPath)
+        out_msg = '%s, annotation_dir' % out_msg
+
+    end = time.time()
+    print('==> Adding %s taxa finished in %s' % (len(name_dict), '{:5.3f}s'.format(end - start)))
+    print('==> %s' % out_msg)
 if __name__ == '__main__':
     main()
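Before the single-taxon counterpart of this script, a typical standalone call, mirroring the CI step in the workflow above (the directory layout follows the rewritten scripts; the annotation file name is produced by fas.doAnno, so the last line is illustrative):

    fdog.addTaxon -f hm.fa -i 9606 -o ./ -c
    # expected layout afterwards (version defaults to today's yymmdd):
    #   searchTaxa_dir/HUMAN@9606@<yymmdd>/HUMAN@9606@<yymmdd>.fa   parsed genome
    #   coreTaxa_dir/HUMAN@9606@<yymmdd>/                           BLAST DB, created because of -c
    #   annotation_dir/                                             FAS annotations, skipped only with -a/--noAnno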
diff --git a/fdog/addTaxon.py b/fdog/addTaxon.py
index 17ac018..2d5fb56 100755
--- a/fdog/addTaxon.py
+++ b/fdog/addTaxon.py
@@ -1,17 +1,13 @@
 # -*- coding: utf-8 -*-
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
 # This script is used to prepare data for fdog.
-# It will create a folder within genome_dir with the naming scheme of
-# fdog ([Species acronym]@[NCBI ID]@[Proteome version], e.g
-# HUMAN@9606@3) and a annotation file in JSON format in weight_dir
-# (optional).
-# For a long header of original FASTA sequence, only the first word
-# will be taken as the ID of new fasta file, everything after the
-# first whitespace will be removed. If this first word is not unique,
-# an automatically increasing index will be added.
+# For each given genome FASTA file, it will create a folder within searchTaxa_dir
+# with the naming scheme of fdog ([Species acronym]@[NCBI ID]@[Proteome version]
+# e.g. HUMAN@9606@3), an annotation file in JSON format in annotation_dir and
+# a blast DB in the coreTaxa_dir folder (optional).
 #
 # This script is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -27,65 +23,19 @@
 import os
 import argparse
 from pathlib import Path
-from Bio import SeqIO
-import subprocess
-import multiprocessing as mp
-from ete3 import NCBITaxa
-import re
 import shutil
+import multiprocessing as mp
 from datetime import datetime
+from pkg_resources import get_distribution
-def checkFileExist(file):
-    if not os.path.exists(os.path.abspath(file)):
-        sys.exit('%s not found' % file)
-
-def checkOptConflict(replace, delete):
-    if delete:
-        if replace:
-            sys.exit('*** ERROR: only one option can be choose between "--replace" and "--delete"')
-    if replace:
-        if delete:
-            sys.exit('*** ERROR: only one option can be choose between "--replace" and "--delete"')
+import fdog.libs.zzz as general_fn
+import fdog.libs.tree as tree_fn
+import fdog.libs.addtaxon as add_taxon_fn
-def checkTaxId(taxId):
-    ncbi = NCBITaxa()
-    tmp = ncbi.get_rank([taxId])
-    try:
-        tmp = ncbi.get_rank([taxId])
-        rank = tmp[int(taxId)]
-        if not rank == 'species':
-            print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (taxId, rank))
-        else:
-            print('\033[92mNCBI taxon info: %s %s\033[0m' % (taxId, ncbi.get_taxid_translator([taxId])[int(taxId)]))
-    except:
-        print('\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m' % taxId)
-
-def getTaxName(taxId):
-    ncbi = NCBITaxa()
-    try:
-        ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)]
-        ncbiName = re.sub('[^a-zA-Z1-9\s]+', '', ncbiName)
-        taxName = ncbiName.split()
-        name = taxName[0][:3].upper()+taxName[1][:2].upper()
-    except:
-        name = "UNK" + taxId
-    return(name)
-
-def runBlast(args):
-    (specName, specFile, outPath) = args
-    blastCmd = 'makeblastdb -dbtype prot -in %s -out %s/blast_dir/%s/%s' % (specFile, outPath, specName, specName)
-    try:
-        subprocess.call([blastCmd], shell = True)
-    except:
-        sys.exit('Problem with running %s' % blastCmd)
-    fileInGenome = "../../genome_dir/%s/%s.fa" % (specName, specName)
-    fileInBlast = "%s/blast_dir/%s/%s.fa" % (outPath, specName, specName)
-    if not Path(fileInBlast).exists():
-        os.symlink(fileInGenome, fileInBlast)
 def main():
-    version = '0.0.11'
-    parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.')
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
     required.add_argument('-f', '--fasta', help='FASTA file of input taxon', action='store', default='', required=True)
@@ -93,7 +43,7 @@ def main():
     optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
     optional.add_argument('-n', '--name', help='Acronym name of input taxon', action='store', default='', type=str)
     optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default='', type=str)
-    optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
+    optional.add_argument('-c', '--coreTaxa', help='Include this taxon in the core taxa (i.e. taxa in coreTaxa_dir folder)', action='store_true', default=False)
     optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False)
     optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
     optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
@@ -102,12 +52,10 @@ def main():
     args = parser.parse_args()
-    checkFileExist(args.fasta)
+    general_fn.check_file_exist(args.fasta)
     faIn = args.fasta
-    name = args.name.upper()
     taxId = str(args.taxid)
-    # outPath = str(Path(args.outPath).resolve())
-    outPath = args.outPath #str(Path(args.outPath).resolve())
+    outPath = args.outPath
     if outPath == '':
        fdogPath = os.path.realpath(__file__).replace('/addTaxon.py','')
        pathconfigFile = fdogPath + '/bin/pathconfig.txt'
@@ -116,119 +64,53 @@
        with open(pathconfigFile) as f:
            outPath = f.readline().strip()
        outPath = os.path.abspath(outPath)
-    noAnno = args.noAnno
-    coreTaxa = args.coreTaxa
+    name = args.name.upper()
     ver = str(args.verProt)
     if ver == '':
        ver = datetime.today().strftime('%y%m%d')
+    noAnno = args.noAnno
+    coreTaxa = args.coreTaxa
     cpus = args.cpus
     if cpus == 0:
        cpus = mp.cpu_count()-2
     replace = args.replace
     delete = args.delete
-    checkOptConflict(replace, delete)
+    add_taxon_fn.check_conflict_opts(replace, delete)
     force = args.force
     ### species name after fdog naming scheme
-    checkTaxId(taxId)
-    if name == "":
-        name = getTaxName(taxId)
-    specName = name+'@'+taxId+'@'+ver
-    print('Species name\t%s' % specName)
+    spec_name = add_taxon_fn.generate_spec_name(taxId, name, ver)
+    print('Species name\t%s' % spec_name)
     ### remove old folder if force is set
-    if force:
-        if os.path.exists(outPath + '/genome_dir/' + specName):
-            shutil.rmtree(outPath + '/genome_dir/' + specName)
-        if os.path.exists(outPath + '/blast_dir/' + specName):
-            shutil.rmtree(outPath + '/blast_dir/' + specName)
+    if force == True:
+        if os.path.exists(outPath + '/searchTaxa_dir/' + spec_name):
+            shutil.rmtree(outPath + '/searchTaxa_dir/' + spec_name)
+        if os.path.exists(outPath + '/coreTaxa_dir/' + spec_name):
+            shutil.rmtree(outPath + '/coreTaxa_dir/' + spec_name)
+
+    ### initiate paths
+    genome_path = add_taxon_fn.create_folders(outPath, spec_name, coreTaxa, noAnno)
-    ### create file in genome_dir
+    ### create file in searchTaxa_dir
     print('Parsing FASTA file...')
-    Path(outPath + '/genome_dir').mkdir(parents = True, exist_ok = True)
-    genomePath = outPath + '/genome_dir/' + specName
-    Path(genomePath).mkdir(parents = True, exist_ok = True)
-    # load fasta seq
-    inSeq = SeqIO.to_dict((SeqIO.parse(open(faIn), 'fasta')))
-    specFile = genomePath + '/' + specName + '.fa'
-    if (not os.path.exists(os.path.abspath(specFile))) or (os.stat(specFile).st_size == 0) or force:
-        f = open(specFile, 'w')
-        index = 0
-        modIdIndex = 0
-        # longId = 'no'
-        tmpDict = {}
-        # with open(specFile + '.mapping', 'a') as mappingFile:
-        for id in inSeq:
-            seq = str(inSeq[id].seq)
-            # check ID
-            # oriId = id
-            if ' ' in id:
-                sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id)
-            else:
-                if '|' in id:
-                    print('\033[91mWARNING: Sequence IDs contain pipe(s).
They will be replaced by "_"!\033[0m') - id = re.sub('\|', '_', id) - # if len(id) > 20: - # modIdIndex = modIdIndex + 1 - # id = modIdIndex - # longId = 'yes' - # if not id in tmpDict: - # tmpDict[id] = 1 - # else: - # index = index + 1 - # id = str(index) - # tmpDict[id] = 1 - # mappingFile.write('%s\t%s\n' % (id, oriId)) - # check seq - if seq[-1] == '*': - seq = seq[:-1] - specialChr = 'no' - if any(c for c in seq if not c.isalpha()): - specialChr = 'yes' - if specialChr == 'yes': - if replace or delete: - if replace: - seq = re.sub('[^a-zA-Z]', 'X', seq) - if delete: - seq = re.sub('[^a-zA-Z]', '', seq) - else: - sys.exit('\033[91mERROR: %s sequence contains special character!\033[0m\nYou can use --replace or --delete to solve it.' % (id)) - f.write('>%s\n%s\n' % (id, seq)) - f.close() - # write .checked file - cf = open(specFile+'.checked', 'w') - cf.write(str(datetime.now())) - cf.close() - # warning about long header - # if longId == 'yes': - # print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile) - else: - print(genomePath + '/' + specName + '.fa already exists!') + genome_file = add_taxon_fn.create_genome([faIn, genome_path, spec_name, force, replace, delete]) + out_msg = 'Output for %s can be found in %s within searchTaxa_dir' % (spec_name, outPath) ### create blast db if coreTaxa: - print('Creating Blast DB...') - Path(outPath + '/blast_dir').mkdir(parents = True, exist_ok = True) - if (not os.path.exists(os.path.abspath(outPath + '/blast_dir/' + specName + '/' + specName + '.phr'))) or force: - try: - runBlast([specName, specFile, outPath]) - except: - print('\033[91mProblem with creating BlastDB.\033[0m') - else: - print('Blast DB already exists!') + print('\nCreating Blast DB...') + add_taxon_fn.create_blastdb([outPath, spec_name, genome_file, force, False]) + out_msg = '%s, coreTaxa_dir' % out_msg ### create annotation if not noAnno: - Path(outPath + '/weight_dir').mkdir(parents = True, exist_ok = True) - annoCmd = 'fas.doAnno -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus) - if force: - annoCmd = annoCmd + " --force" - try: - subprocess.call([annoCmd], shell = True) - except: - print('\033[91mProblem with running fas.doAnno. You can check it with this command:\n%s\033[0m' % annoCmd) + add_taxon_fn.create_annoFile(outPath, genome_file, cpus, force) + if os.path.exists('%s/annotation_dir/tmp' % outPath): + shutil.rmtree('%s/annotation_dir/tmp' % outPath) + out_msg = '%s, annotation_dir' % out_msg - print('Output for %s can be found in %s within genome_dir [and blast_dir, weight_dir] folder[s]' % (specName, outPath)) + print('\n==> %s' % out_msg) if __name__ == '__main__': main() diff --git a/fdog/bin/Filehandler.pm b/fdog/bin/Filehandler.pm deleted file mode 100755 index 5f1e08e..0000000 --- a/fdog/bin/Filehandler.pm +++ /dev/null @@ -1,45 +0,0 @@ -package Filehandler; -use strict; -# PROGRAM NAME: Filehandler.pm - -# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at - -# PROGRAM DESCRIPTION: A module that retrieves a filename, path and -# input separator. It opens a file and hands back an object where via -# the command 'next' the next line of the file is fetched. 
- -# DATE: 19.08.2003 - - -# DATE LAST MODIFIED: - - -##################### start subroutine ####################### -## blessing the variable: -## constructor that returns a file handle -sub TIEHANDLE { - my $class = shift; - my $name = shift; - my $path = shift; - $/ = shift; - $path =~ s/\/$//; - my $self; - open ($self, "$path/$name") or die "could not open $path/$name\n"; - bless($self, $class); - return ($self); -} - -sub READLINE { - my ($self) = shift; - return <$self>; -} -sub CLOSE { - my $self = shift; - close ($self) or die "could not close filehandle\n"; -} - -sub PRINT { - my $self = shift; - print $self @_; -} -1; diff --git a/fdog/bin/getSearchTaxa.pl b/fdog/bin/getSearchTaxa.pl deleted file mode 100644 index 33b7b30..0000000 --- a/fdog/bin/getSearchTaxa.pl +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use Bio::DB::Taxonomy; -use Bio::Tree::Tree; -use Bio::TreeIO; -use Getopt::Std; -use Cwd 'abs_path'; - -sub usage { - my $msg = shift; - print "example: perl getSearchTaxa.pl -i genome_dir -b 0.00005 -h 0.00005 -r 10 -n mammalia -t taxonomy -o searchList.txt\n"; - print "-i\tFolder contains all search species (e.g. genome_dir)\n"; - die $msg."\n"; -} - -# global variables -our($opt_i,$opt_b,$opt_h,$opt_r,$opt_n,$opt_t,$opt_o); -getopts('i:b:h:r:n:t:o:'); - -# sanity checks; -my $genome_dir = ($opt_i) ? $opt_i : usage("ERROR: No input folder given\n"); -my $eval_blast = ($opt_b) ? $opt_b : usage("ERROR: No eval_blast given\n"); -my $eval_hmmer = ($opt_h) ? $opt_h : usage("ERROR: No eval_hmmer given\n"); -my $eval_relaxfac = ($opt_r) ? $opt_r : usage("ERROR: No eval_relaxfac given\n"); -my $group = ($opt_n) ? $opt_n : usage("ERROR: No group given\n"); -my $idx_dir = ($opt_t) ? $opt_t : usage("ERROR: No taxonomy dir given\n"); -my $output = ($opt_o) ? $opt_o : usage("ERROR: No output given\n"); - -open(OUT, ">$output") || die "Cannot create $output\n"; -my $groupNode; -my %taxa; -my $db; - -if($group ne "all") { - $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . 'nodes.dmp', - -namesfile => $idx_dir . 'names.dmp', - -directory => $idx_dir); - checkGroup($group); - # get tree - %taxa = getTaxa($genome_dir); - my $tree = getTree(); - my $final_eval_blast = $eval_blast*$eval_relaxfac; - my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - if($groupNode) { - foreach($tree->get_nodes()) { - if($_->id == $groupNode->id) { - $groupNode = $_; - } - } - $tree->set_root_node($groupNode); - } - foreach (get_leaves($tree)) { - my $tmp = @{$_->name('supplied')}[0]; - print OUT $tmp,"\n"; - } -} else { - %taxa = getTaxa($genome_dir); - foreach my $tax (keys %taxa) { - print OUT $tax,"\n"; - } -} -exit; - -sub checkGroup { - my ($group) = $_[0]; - my $node = $db->get_taxon(-name => $group); - if($node) { - $groupNode = $node; - } else { - print "Your selected group " . $group . " was not found in the taxonomic tree... 
TERMINATING\n"; - exit; - } -} - -sub getTaxa { - my ($genome_dir) = $_[0]; - ## removal of misplaced files in genome_dir - if (-e "$genome_dir/query.sql"){ - unlink("$genome_dir/query.sql"); - } - if (-e "$genome_dir/@@.fa"){ - unlink("$genome_dir/@@.fa"); - } - my @taxonlist = `ls $genome_dir`; - chomp @taxonlist; - for (my $i = 0; $i < @taxonlist; $i++) { - my ($taxon_name, $ncbi_id, $src_id) = split /@/, $taxonlist[$i]; - if (!$src_id) { - $src_id = ''; - } - $taxon_name = $taxonlist[$i]; - $taxa{$taxon_name} = $ncbi_id; - } - my $hashcount = keys(%taxa); - return(%taxa); -} - -sub getTree { - # the full lineages of the species are merged into a single tree - my $tree; - foreach my $key (sort {lc $a cmp lc $b} keys %taxa) { - my $node = $db->get_taxon(-taxonid => $taxa{$key}); - if (!defined $node){ - print "ISSUE in sub getTree. No correspodence found in taxonomy file for $key and taxid $taxa{$key}. Skipping...\n"; - next; - } - else { - $node->name('supplied', $key); - if($tree) { - $tree->merge_lineage($node); - } - else { - $tree = Bio::Tree::Tree->new(-verbose => $db->verbose, -node => $node); - } - } - } - return $tree; -} - -sub get_leaves { - my $tree = $_[0]; - my $delFlag = 0; - if(defined($_[1])){ - $delFlag = $_[1]; - } - - my $node = $tree->get_root_node; - my @leaves; - my @children = ($node); - for (@children) { - push @children, $_->each_Descendent(); - } - for (@children) { - push @leaves, $_ if defined($_->name('supplied')); - } - # if the tree is set to be deleted - if ($delFlag){ - @leaves = qw(); - return @leaves; - }else{ - return @leaves; - } -} diff --git a/fdog/bin/hamstr.pl b/fdog/bin/hamstr.pl deleted file mode 100755 index b762854..0000000 --- a/fdog/bin/hamstr.pl +++ /dev/null @@ -1,2357 +0,0 @@ -#!/usr/bin/perl -use strict; -use Getopt::Long; -use Parallel::ForkManager; -use Bio::SearchIO; -use Bio::Search::Hit::BlastHit; -use Bio::SeqIO; -use Bio::Align::ProteinStatistics; -use Bio::AlignIO; -use Term::Cap; -use POSIX; -use Cwd; -use Cwd 'abs_path'; -use Statistics::R; -use File::Basename; -use lib dirname(__FILE__); -use run_genewise_hamstr; - -# PROGRAMNAME: hamstr.pl - -# Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License -# or any later version. - -# This program is distributed in the hope that it will be useful -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# You should have received a copy of the GNU General Public License -# along with this program; If not, see http://www.gnu.org/licenses - -# PROGRAM DESCRIPTION: HaMStR is a program for targeted ortholog search in both EST/RNAseq -# and protein sequence data. - -# DATE: Wed Dec 19 10:41:09 CEST 2007 - -# PROGRAM HISTORY -##23. 07. 2010: found a bug in the extraction of the -## hmm hit sequence from the sequnence_file. A end-of-line char was missing. - -##09.08.2010: added the option to choose the new blastp program from ncbi. Just comment -##out line 45 in the script and uncomment line 46. Note, in order to make this work I have -##to slightly modify the blast output since otherwise it will not be parsed by the Bioperl -##Blast parser. Currently this is a pretty dirty $sedprog hack. 
It will also take care of removin -##the string lcl| that is added in some instances to the hit id in the blast output. - -## I added the option that one can now provide a comma-separated string of phmm names as an -## argument for the option -hmm - -## 08.03.2011: -## 1) BUG-FIX: Hamstr will now remove automatically newlines from the input sequence file -## 2) BUG-FIX: The sequence header remains now the same whether or not the flag -representative -## has been chosen. - -## 10.04.2011 -## 1) added some information to the log file. - -## 20.05.2011 -## 1) BUG-FIX: The grep for the EST sequence in the sub-routine predictORF received also a hit when -## the search pattern was only a substring of the EST sequence identifier. In some cases the wrong EST -## was then used to predict the ORF. This has been fixed. - -## 30.05.2011 -## 1) Extension: a command line option -longhead has been added. The user can now specify that the -## full sequence id including whitespaces will considered throughout the hamstr search. Note, the -## whitespaces will be replaced by the string specified in the variabel $idsep. -## 2) Modification from the bug fix from 20.05.2011. In the grep for the original EST it is no longer -## necessary that the search string and the EST id are identical over their entire length. Instead the -## search string may be a prefix of the EST id ending with a whitespace. - -## 27.06.2011 -## 1) Extension: I added the option to run a true reciprocal best hit search. Only the best hit from the -## hmmer search is used to check for reciprocity. - -## 06.12.2011 -## 1) Extension: I added the option -hit_limit to set the number of hmmsearch hits that HaMStR uses for -## the re-blast. - -## 10.02.2012 -## 1) Extension: I added checks for the appropriate hmmsearch version (HMMER 3) and for genewise and -## its environmental variable WISECONFIGDIR. -## 2) Bug fix in the -rbh option. - -## 11.09.2012 -## 1) Bug fix: -hitlimit, even if not set explicitely has been invoked resulting in a more stringent -## behaviour of HaMStR. This has been fixed resulting in longer run-times. -## 18.12.2012 -## 1) Bug fix: There was a bug in the CDS extraction for reverse complemented -## sequences. A new line was moved to the beginning of the sequence -## leading to index errors. - -## 18.12.2013 -## 1) Bug fix: I have now adapted the script such that it no longer requires the default directory structure -## 2) Extension: Hamstr is now capable of identifying co-orthologs (sub routine IdentifyCoorthologsProt) -## 3) The re-blast for EST sequences is now a BlastX solving the problem of duplicated output for contigs with a pHMM -## hit in more than one frame. - -## 08.01.2014 -## Extension: check for co-orthology between ref-protein and best blast hit in cases both are not identical. -## Bug fix: option -rbh was disfunctional due to a missing function in new sub-routine parseHmmer3pm -## Bug fix: sortRef actually did not sort anything as it was accessing the unsorted file - -## 09.01.2014 -## Extension: Add the possibility to sort the hits by the hmmersearch Score rather than an alignment score -## This will make the best hmmersearch hit surviving the re-blast automatically the representative - -## 10.01.2014 -## Bug fix (minor): modified option -outpath to accept non-default value -## modification of the translate_tc call. 
- -## 17.01.2014 -## Bug fix (minor): added the option --anysymbol to the mafft-linsi command to avoid crash when protein sequences -## contain non-standard amino acids (U = selenocystein) - -## 14.02.2014 -## Extension: added the option to use ublast rather than blast for the reciprocity check - -## 25.02.2014 -## Extension: added the option -reuse. By default, old results will now be deleted unless flag -reuse has been set -## Extension: added syntax required for running fact implemented into Hamstr2.0. Option -fact does not occur in help -## as this works only in context with Hamstr2.0 -## Extension: Hamstr now outputs a results summary at the end of the run. -## Extension: added the option -cleartmp to faciliate automatic removal of the tmp-dir prior to the hamstr run - -## 05.03.2014 -## Bug fix (minor): Variable $grepprog was not used throughout the script. In some routines 'grep' was hard coded. -## On MAC OS this could lead to unwanted call of BSD grep resulting in an error during re-blast. - -## 16.03.2014 -## Bug fix (major): There was a problem in translating ESTs in the correct frame. This has been fixed. -## Modification (minor): The alignment positions together with the score are no longer sorted externally. - -## 02.04.2014 -## Bug fix (minor): Flag $runublast was not changed to 1 when configuring hamstr -## with the ublast option. - -## 05.08.2014 -## Exentsion: Update to version 13.2.3. New features include the option to run hamstr in silent mode and the option -## to parallelize the hamstr search for individual core orthologs using the Parallel::ForkManager package - -## 14.08.2014 -## Bug fix (minor): corrected typo in sub routine call 'printOUT' - -## 31.07.2015 -## Extension: Update to version 13.2.4. New feaure provides the option to entirely remove intron sequences and incomplete codons -## from transcripts - -## 03.07.2015 -## Minor extension: Selected behavior with respect to introns in transcripts will be printed to hamstrsearch.log - -## 05.07.2015 -## Minor bug fix: A no-blast hit was not reported properly in the sub routine check4reciprocity resulting in rare -## cases in the acceptance of a spurious ortholog. - -## 14.08.2015 -## Change of output file naming. Upon selection of the strict option the reference species is no longer appended -## to the output file name. - -## 30.01.2016 -## Minor changes including the better integration of the onseq.pl script. Among others the blast files are no longer -## expected to have the '_prot' appendix. - -## 12.02.2016 -## Minor bug fix: In some instances the representative protein was not chosen correctly due to a bug in the subroutine -## sortRef. Analyses of transcript data are not affected at all. - -## 19.12.2017 -## Extension: HaMStR can now automatically determine the hit limit up to which candidates from the intial -## hmm search are evaluated as potential orthologs. Two options are available, either an hmm score driven -## cutoff determination, or alternatively, a lagPhase-based estimator. - -## 02.02.2018 -## Bug fix (solved): using grep within the checkcoorthologsref routine could cause an incomplete alignment of the reference gene, -## the candidate ortholog and the best blast hit. The resulting distance (kimura) calculation may caused an overoptimistic -## acceptence of co-ortholgy relations. The bug onyl occured while using the option checkCoOrthologsRef. -## HaMStR keeps original gene sets in FASTA format and *.fa.mod will link to the original FASTA file (no linebreaks within a sequence). 
- -## 28.02.2018 -## Minor Bug fix (solved): HaMStR is not longer asking infinite times for the replacement of already existing output files. -## Minor Bug fix (solved): Backward compatibility extended. Naming of reference fasta files: (*.fa and *_prot.fa) - -## 20.07.2019 -## fixed the issue of long proteins with best total hmm bit score but very poor domain scores. Allow now the option to sort -## hmmsearch output according to the best domain bit score. The current routine assumes that neither query nor -## hit has whitespaces in their names - -## 14.04.2020 (Vinh) -## Bug fix (solved): existing symbolic link cannot recognized while checking the reference fasta file - -## 10.07.2020 (v13.2.12 - vinh) solved problem when gene ID contains PIPE -## 13.07.2020 (v13.3.0 - vinh) solved problem when gene ID contains PIPE -## 22.07.2020 (v13.4.0 - vinh) moved tmp blast files to output folder and delete them when finished -## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef -## 21.01.2021 (v13.4.2 - vinh) fiexed bug when refspec has "dot" in its name -## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory -## 19.03.2021 (v13.4.5 - vinh) do not replace space by @ for hmm output in parseHmmer4pm -## 12.01.2022 (v13.4.6 - vinh) change aligner from MUSCLE to MAFFT if the sequence is longer than 12,000 aa - -######################## start main ########################################### -my $version = "HaMStR v.13.4.6"; -######################## checking whether the configure script has been run ### -my $configure = 0; -if ($configure == 0){ - die "\n\n$version\n\nPLEASE RUN setup1s BEFORE USING HAMSTR\n\n"; -} -########## EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT ################## -my $prog = 'hmmsearch'; #program for the hmm search -my $eval = 1; # default evalue cutoff for the hmm search -my $sedprog = 'sed'; -my $grepprog = 'grep'; -my $readlinkprog = 'readlink'; -my $alignmentprog = 'clustalw'; -my $alignmentprog_co = 'muscle'; -########## EDIT THE FOLLOWING TWO LINES TO CHOOSE YOUR BLAST PROGRAM ########## -my $blast_prog = 'blastp'; -my $filter = 'F'; # low complexity filter switch. Default 'on'. Set of 'F' to turn off permanently. 
-my $eval_blast = 10; # default evalue cutoff for the blast search -########## EDIT THE FOLLOWING LINES TO MODIFY DEFAULT PATHS ################### -# my $path = abs_path(dirname(__FILE__)); -# $path =~ s/\/bin//; -my $path = getcwd; -my $hmmpath = "$path/core_orthologs"; #path where the hmms are located -my $blastpath = "$path/blast_dir"; #path to the blast-dbs -my $outpath = '.'; -my $idsep = '__'; #character used to replace whitespaces in the sequence header with (flag -longhead) -my $hmm_dir = 'hmm_dir'; -my $fa_dir = 'fa_dir'; -############################## -# my $termios = new POSIX::Termios; $termios->getattr; -# my $ospeed = $termios->getospeed; -# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; - -############################## Variables ############## -my $fileobj; -## The main variable storing most of the results; -## $fileobj->{$taxon}->{prot}->[$hitcounter] -## $fileobj->{$taxon}->{ids}->[$hitcounter] -## $fileobj->{$taxon}->{cds}->[$hitcounter] -## $fileobj->{$taxon}->{hmmscore}->[$hitcounter] -####################################################### -my $pid = $$; -my $help; -my $debug; -my $seq2store_file=''; -my $cds2store_file=''; -my $hmm; -my @hmms; -my $fa; -my $fafile; -my @seqs2store; -my @cds2store; -my $dbpath; -my $dboutpath; -my $ep2eg; -my $dbfile_base; -my $aln; -my $idfile; -my $taxon_check = 0; -my $hmmset; -my $show_coreortholog_sets; -my $hmmsearch_dir; -my $dbfile; # the file hmmsearch is run against -my $dbfile_short; -my $taxon_file; -my $refspec_string; -my @refspec = qw(); -my @primer_taxa; -my $refspec_name = ''; -my $taxon_global; -my $fa_dir_neu = ''; -my $gwrefprot; -my $seqtype; -my $align; -my $rep; -my $estflag; -my $proteinflag; -my $refseq; -my $strict; -my $relaxed; -my $refspec_final = ''; -my $central; -my $concat; -my $seqs2store_file; -my $append; -my $longhead; -my $check = 1; -my @log = qw(); -my $bhh; -my $hitlimit; -my $autoLimit; -my $scoreThreshold; -my $scoreCutoff = 10; -my $nonoverlappingCO; -my $algorithm = 'blastp'; -my $frame; -my $checkCoRef; -my $sortalign; -my $check_genewise = 1; -my $outputfmt = 'blastxml'; -my $fact; -my $runFACTparameter; -my $hmmcount; -my $reuse; -my $cleartmp; -my $ver; -my $silent; -my $cpu = 1; -my $force; -my $keepintron = 'k'; -my $blastapp = ''; -my $blastdbend = '.pin'; -######### ublast options ######### -my $runublast = 1; -my $ublast = 0; -my $accel = 0.8; -#####determine the hostname####### -# push @log, "VERSION:\t$version\n"; -my $hostname = `hostname`; -chomp $hostname; -push @log, "HOSTNAME\t$hostname\n"; -################################# -if (@ARGV==0) { - $help = 1; -} -## help message -my $helpmessage = " -YOU ARE RUNNING $version on $hostname - -This program is freely distributed under a GPL. -Copyright (c) GRL limited: portions of the code are from separate copyrights - -\nUSAGE: hamstr -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [OPTIONS] - -OPTIONS: - -REQUIRED --sequence_file=<> - path and name of the file containing the sequences hmmer is run against. --hmmset=<> - specifies the name of the core-ortholog set. - The program will look for the files in the default directory 'core-orthologs' unless you specify - a different path via the option -hmmpath. --refspec=<> - sets the reference species. Note, it has to be a species that contributed sequences - to the hmms you are using. NO DEFAULT IS SET! 
For a list of possible reference - taxa you can have a look at the speclist.txt file in the default core-ortholog sets - that come with this distribution. Please use the abreviations in this list. If you choose - to use core-orthologs where not every taxon is represented in all core-orthologs, you - can provide a comma-separated list with the preferred refspec first. The lower-ranking - reference species will only be used if a certain gene is not present in the preferred - refspecies due to alternative paths in the transitive closure to define the core-orthologs. - CURRENTLY NO CHECK IS IMPLEMENTED! - NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED! --taxon - You need to specify a default taxon name from which your ESTs or protein sequences are derived. --est - set this flag if you are searching in ESTs. Note, if neither the -est nor the -protein flag is set, HaMStR will - guess the sequence type. If you select this flag, make sure to specify how to deal with introns retained in the - ESTs. Check option -intron! --protein - set this flag if you are searching in protein sequences. Note, if neither the -est nor the -protein flag is set, HaMStR will - guess the sequence type. - -USING NON-DEFAULT PATHS - --blastpath=<> - Lets you specify the absolute or relative path to the blast databases. DEFAULT: $blastpath --hmmpath=<> - Lets you specify the absolute or relative path to the core ortholog set. DEFAULT: $hmmpath --outpath=<> - You can determine the path to the HaMStR output. Default: current directory. - -ADDITIONAL OPTIONS - --append - set this flag if the output should be appended to the files *.out and *_cds.out. This becomes relevant when running - hamstrsearch with individual hmms and you want to combine the results. --central - set this flag to store the modified infile in the same directory as the infile rather than in the output dir. --checkCoorthologsRef - If the re-blast does not identify the original reference protein sequence as best hit, HaMStR will check whether the best blast - hit is likely a co-ortholog of the reference protein relative to the search taxon. NOTE: Setting this flag will substantially increase - the sensitivity of HaMStR but most likely affect also the specificity, especially when the search taxon is evolutionarily only very - distantly related to the reference taxon. --cleartmp - set this flag to remove existing tmp dir in the HaMStR output directory. --concat - set this flag if you want hamstr to concatenate sequences that align to non-overlapping parts of the reference protein. - If you choose this flag, no co-orthologs will be predicted. --cpu - You can specify the number of parallel jobs in the HaMStR search. HaMStR uses the Parallel::ForkManager module for this purpose. --eval_blast=<> - This option allows to set the e-value cut-off for the Blast search. Default: 10 --eval_hmmer=<> - This options allows to set the e-value cut-off for the HMM search.Default: 1 --filter= - Set this flag to F if the re-blast should be performed without low-complexity filtering. Default is T. --force - Setting this flag forces hamstr to overwrite existing output files (files ending with .out) without further asking. --hit_limit=<> - By default, HaMStR will re-blast all hmmsearch hits against the reference proteome. Reduce the number - of hits for reblast with this option. --autoLimit - Setting this flag will invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically - a hit_limit for each query. 
--scoreThreshold - Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less - than x percent of the hmm score of the best hit are further evaluated. Default is x = 10. - You can change this cutoff with the option -scoreCutoff. Note, when setting this lag, it will be effective for - both the core ortholog compilation and the final ortholog search. --scoreCutoff=<> - In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a - candidate of the hmmsearch will be subjected for further evaluation. Default: 10%. --hmm - Option to provide only a single hmm to be used for the search. - Note, this file has to end with .hmm --intron= - Specify how to deal with introns that may occur in transcript sequences. Default: keep - Introns will be retained in the transcript - but will be identified by lower case letters. --longhead - Set this flag in the case your sequence identifier contain whitespaces and you whish to keep - the entire sequence identifier throughout your analysis. HaMStR will then replace the whitespaces with - a '__'. If this flag is not set, HaMStR will truncate the sequence - Identifier at the first whitespace, however if and only if the sequence identifier then remain unique. - NOTE: too long sequence headers (~ > 30 chars) will cause trouble in the hmmsearch as the program will truncate - the output! --nonoverlapping_cos - If you set this flag, non-overlapping co-orthologs will be reported as well. NOTE: this flag is still experimental --rbh - set this flag if you want to use a reciprocal best hit criterion. Only the highest scoring - hit from the hmmer search will be used for re-blast. --relaxed - set this flag if the reciprocity criterion is fulfilled when the re-blast against - any of the primer taxa was successfull. Note that setting this flag will substantially decrease the - stringency of the ortholog assignment with the consequence of an increased number of false positives. --representative - From all sequences that fulfill the reciprocity criterion the one showing the highest similarity to the - core ortholog sequence in the reference species is identified and selected as representative. --reuse - Set this flag if you want to prevent HaMStR from overwriting previous results. --show_hmmsets - setting this flag will list all available core ortholog sets in the specified path. Can be combined with -hmmpath. --silent - Supresses (almost) all print statements to the screen. --debug - Get some additional meta information as print out to the screen. --sort_global_align - Setting this flag will tell hamstr to sort ortholog candidates according to their global alignment score to the reference - sequence rather than according to the score they have achieved in the hmmer search (local). NOTE: In the case of searching - EST data this flag is automatically set. --strict - Set this flag if the reciprocity criterion is only fulfilled when the re-blast against - all primer taxa was successfull --aligner - Choose between muscle or mafft-linsi for the alignment of multiple sequences. 
DEFAULT: muscle - \n\n"; - -GetOptions ( - "append" => \$append, - "autoLimit" => \$autoLimit, - "aligner=s" => \$alignmentprog_co, - "blastpath=s" => \$blastpath, - "checkCoorthologsRef" => \$checkCoRef, - "concat" => \$concat, - "cpu=s" => \$cpu, - "central" => \$central, - "debug" => \$debug, - "est" => \$estflag, - "eval_blast=s" => \$eval_blast, - "eval_hmmer=s" => \$eval, - "fasta_file=s" => \$fafile, - "filter=s" => \$filter, - "force" => \$force, - "h" => \$help, - "hit_limit=s" => \$hitlimit, - "hmm=s" => \$hmm, - "hmmset=s" => \$hmmset, - "hmmpath=s" => \$hmmpath, - "intron=s" => \$keepintron, - "longhead" => \$longhead, - "nonoverlapping_cos" => \$nonoverlappingCO, - "outpath=s" => \$outpath, - "protein"=> \$proteinflag, - "rbh" => \$bhh, - "refspec=s" => \$refspec_string, - "relaxed" => \$relaxed, - "representative" => \$rep, - "reuse" => \$reuse, - "sequence_file=s" => \$dbfile, - "scoreCutoff=s" => \$scoreCutoff, - "scoreThreshold" => \$scoreThreshold, - "show_hmmsets" => \$show_coreortholog_sets, - "silent" => \$silent, - "sort_global_align" => \$sortalign, - "strict" => \$strict, - "taxon_file=s" => \$taxon_file, - "taxon=s" => \$taxon_global, - "ublast" => \$ublast, - "v" => \$ver, - "accel=s" => \$accel, - "fact" => \$fact, - "cleartmp" => \$cleartmp -); - -if ($help) { - print $helpmessage; - exit; -} -elsif($ver){ - print "$version\n"; - exit; -} - -## 1) check if all information is available to run HaMStR -($check, @log) = &checkInput(); -if ($check == 0) { - print "\n\nThere was an error running $version\n\n"; - print join "\n", @log; - exit; -} -else { - open (OUT, ">$outpath/fdog.log") or die "could not open logfile\n"; - print OUT join "\n", @log; - close OUT; -} -my $tmpdir = "$outpath/tmp"; - -### read in of the core-ortholog sequences -my $co_seqs = parseSeqfile("$fafile"); - -## initialize the forking procedure -my $pm = new Parallel::ForkManager($cpu); - -## collect all the entries of the final output file -#my ($spid, $exit_code, $ident, $exit_signal, $core_dump, $data); -#$pm->run_on_finish(sub { -# ($spid, $exit_code, $ident, $exit_signal, $core_dump, $data) = @_; -# $core_dump = undef; -# if ($seqderef){ -# push @seqs2store, @$seqderef; -# if ($estflag) { -# my $estderef = $data->[1]; -# push @cds2store, @$estderef; -# } -# } -#}); - -## 2) loop through the hmms -## process each hmm file separately -$hmmcount = scalar(@hmms); - -for (my $i = 0; $i < @hmms; $i++) { - my $pid = $pm->start and next; - my $localid = $$; - $frame = undef; - $fileobj = undef; - my @seqs = qw(); - my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster - my @newcds = qw(); - my $hmm = $hmms[$i]; - printOUT("Processing $hmm\n"); - my $hmmout = $hmm; - $hmmout =~ s/\.hmm/\.out/; - ## 3) run the hmm search - if (!(-e "$hmmsearch_dir/$hmmout")) { - printOUT("\n\nnow running $prog using $hmm\n"); - my $hmmOutFile = "$hmmsearch_dir/$hmmout"; - my $hmmModel = "$hmm_dir/$hmm"; - my $hmmInfile = "$dboutpath/$dbfile"; - `$prog --noali --tblout \"$hmmOutFile\" -E $eval \"$hmmModel\" \"$hmmInfile\"` or die "Problem running hmmsearch as $prog --noali --tblout \"$hmmOutFile\" -E $eval \"$hmmModel\" \"$hmmInfile\". No output $hmmsearch_dir/$hmmout\n"; - } - else { - printOUT("an hmmresult $hmmout already exists. 
Using this one!\n"); - } - - ## 4) process the hmm search result - my $hitcount = 0; - ## 4a) loop through the individual results - ## now the modified version for hmmer3 comes - my $hitlimit_local = $hitlimit; - my ($query_name, $results, $hitlimit_local, $criticalValue) = parseHmmer4pm($hmmout, $hmmsearch_dir); - if (! $results) { - printOUT("no hit found for $query_name\n"); - $pm->finish; - next; - } - ## Automatic hit limit information - if (defined $autoLimit) { - printDebug("Automatic cutoff estimation via a lag Phase analysis was selected. Estimated lag point is $criticalValue. Limiting the number of hits for the evaluation from " . scalar(@$results) . " to $hitlimit_local"); - } - elsif (defined $scoreThreshold) { - printDebug("Automatic cutoff estimation via a minimal score was selected. Cutoff: $scoreCutoff percent of the best hmm score. Hits with an hmm score below $criticalValue are not considered. Limiting the number of hits for the evaluation from " . scalar(@$results) . " to $hitlimit_local"); - } - ## - printOUT("Results for $query_name\n"); - my ($check, $refspec_final) = &determineRefspecFinal($query_name, @refspec); - if ($check == 0) { - die "error in retrieving refspec data\n"; - } - if (!defined $hitlimit_local or $hitlimit_local > scalar(@$results)) { - $hitlimit_local = scalar(@$results); - } - for (my $k = 0; $k < $hitlimit_local; $k++) { - my $hitname = $results->[$k]->{id}; - my $hithmmscore = $results->[$k]->{hmmscore}; - printOUT("$hitname\n"); - my $keep = 0; - my $hitseq = ''; - $refseq = ''; - ## 4b) test for the reciprocity criterion fulfilled - ($keep, $hitseq, $frame) = &check4reciprocity($localid, $query_name, $hitname, $refspec_final, @refspec); - if ($keep == 1) { - ## blast search with the hmm hit identifies the core-ortholog sequence of the reference species - my $taxon = $taxon_global; - ## put the info about the hits into an object for later post-processing - ### HERE COMES THE NEW STUFF THAT DEALS WITH THE DIFFERENT POSSIBILITIES: STRICT, RELAXED OR WHATEVER... - $fileobj = &determineReferences ($localid, $fileobj, $taxon, $refspec_final, $hitname, $hithmmscore, $hitseq, $hitcount); - $hitcount++; - } - else { - printOUT("Reciprocity not fulfilled!\n\n"); - } - } - ## 5) do the rest only if at least one hit was obtained - if (defined $fileobj) { - ## 5a) if the hits are derived from ESTs, get the best ORF - if ($estflag) { - $fileobj = &predictORF($frame); - } - &processHits($localid, $fileobj); - if (!$rep and !$concat) { - ## identify co-orothologs only for protein sequences. This adds a key 'coorthologs' to the $fileobj->{$taxon} that - ## holds the index values for the sequences in the $fileobj->{$taxon}->{ids} and the corresponding {prot} array ref - ## that made it into the co-ortholog field - &identifyCoorthologsProt($localid, $taxon_global); - } - ## 6) prepare the output - my @taxa = keys(%$fileobj); - for (my $i = 0; $i< @taxa; $i++) { - push @newseqs, ">$query_name|$fileobj->{$taxa[$i]}->{refspec_final}|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}|1"; - push @newseqs, $fileobj->{$taxa[$i]}->{refprot}; - if ($estflag) { - push @newcds, ">$query_name|$fileobj->{$taxa[$i]}->{refspec_final}|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}|1"; - push @newcds, $fileobj->{$taxa[$i]}->{refcds}; - } - if (!$rep and !$concat){ - ## print the remaining sequences only when the -representative option has not been chosen. 
- my $coorthologsobj = $fileobj->{$taxa[$i]}->{coorthologs}; - my $idobj = $fileobj->{$taxa[$i]}->{ids}; - my $protobj = $fileobj->{$taxa[$i]}->{prot}; - my $cdsobj = $fileobj->{$taxa[$i]}->{cds}; - my $refspecobj = $fileobj->{$taxa[$i]}->{refspec}; - for (my $j = 0; $j < @$coorthologsobj; $j++) { - my $index = $coorthologsobj->[$j]; - push @newseqs, ">$query_name|$refspecobj->[$index]|$taxa[$i]|$idobj->[$index]|0"; - push @newseqs, $protobj->[$index]; - if ($estflag) { - push @newcds, ">$query_name|$refspecobj->[$index]|$taxa[$i]|$idobj->[$index]|0"; - push @newcds, $cdsobj->[$index]; - } - } - } - my $refs = $co_seqs->{$query_name}; - for (keys %$refs) { - my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq}; - push @seqs, $line; - } - chomp @seqs; - printOUT("\n"); - @seqs = (@seqs, @newseqs); - open (OUT, ">$fa_dir_neu/$query_name.fa"); - print OUT join "\n", @seqs; - print OUT "\n"; - close OUT; - if ($estflag) { - open (OUT, ">$fa_dir_neu/$query_name.cds.fa"); - print OUT join "\n", @newcds; - close OUT; - } - open (OUT, ">>$seqs2store_file") or die "failed to open output file\n"; - if ($estflag){ - open (OUT2, ">>$cds2store_file") or die "failed to open output file for cds\n"; - } - for (my $i = 0; $i < @newseqs; $i+= 2) { - my $line = $newseqs[$i] . "|" . $newseqs[$i+1]; - $line =~ s/>//; - - print OUT $line; - print OUT "\n"; - # push @seqs2store, $line; - if ($estflag) { - my $cdsline = $newcds[$i] . "|" . $newcds[$i+1]; - $cdsline =~ s/>//; - print OUT2 $cdsline; - print OUT2 "\n"; - push @cds2store, $cdsline; - } - } - close OUT; - close OUT2; - } - } - if (@seqs2store > 0) { - my $seqref = \@seqs2store; - my $estref = \@cds2store; - $pm->finish; - } - else { - $pm->finish; - } -} - -$pm->wait_all_children; - -### The following bit of code has been out-commented as shared memory between forked child -### processes does not exist. The handing back of return values from the child to the parent -### does work, however leads to memory problems. -### all HaMStR searches have been completed and all children have finished. 
Do the output - -#if (@seqs2store > 0) { -# if ($append) { -# open (OUT, ">>$seqs2store_file") or die "failed to open output file\n"; -# } -# else { -# open (OUT, ">$seqs2store_file") or die "failed to open output file\n"; -# } -# print OUT join "\n", @seqs2store; -# print OUT "\n"; -# close OUT; -# if ($estflag) { -# if ($append) { -# open (OUT, ">>$cds2store_file") or die "failed to open output file\n"; -# } -# else { -# open (OUT, ">$cds2store_file") or die "failed to open output file\n"; -# } -# print OUT join "\n", @cds2store; -# print OUT "\n"; -# close OUT; -# } -#} -########################################################################################### - -my $orthologs = 0; -if (-e $seqs2store_file) { - $orthologs = `less $seqs2store_file |wc -l`; - if ($fact){ - ## starting funFACT.pl - system("perl runFact.pl $runFACTparameter $outpath $blastpath $taxon_global $refspec_string"); - } -} -else { - printOUT("no hits found\n\n"); -} -### WRAP UP ##### -my $fa_dir_neu_tmp = $fa_dir_neu; $fa_dir_neu_tmp =~ s/\|/\\\|/g; -my $ortholog_groups = `ls $fa_dir_neu_tmp |$grepprog -v 'cds.fa' |wc -l`; -my $hmmsearch_dir_tmp = $hmmsearch_dir; $hmmsearch_dir_tmp =~ s/\|/\\\|/g; -my $hmmsearched = `ls $hmmsearch_dir_tmp |wc -l`; -chomp ($ortholog_groups, $hmmsearched, $orthologs); - -if (!defined $silent) { - print "\n\n -####HaMStR completed!######### -Results of HaMStR search in $taxon_global -Number of core_orthologs searched: $hmmcount -Number of core_orthologs with hmmsearch output: $hmmsearched -Number of ortholog_groups extended: $ortholog_groups -Number of orthologous sequences: $orthologs -##############################\n\n"; -} else { - # print "$taxon_global done\n"; -} -exit; - - -##################### start sub ############### - -####### checkInput performs a number of checks whether sufficient information -### and all data are available to run HaMStR -sub checkInput { - ######### check a number of flags that only serve for providing the user with some information - if (defined $show_coreortholog_sets) { - ## Do nothing but just list all available core ortholog sets in $hmmpath - my @coresets = (`ls $hmmpath`); - chomp @coresets; - if (scalar(@coresets) > 0){ - print "\nTHE FOLLOWING CORE ORTHOLOG SETS ARE AVAILABLE IN $hmmpath:\n\n"; - for (my $i = 0; $i < @coresets; $i++){ - my @available = qw(); - my @unavailable = qw(); - print "\n$coresets[$i]\n\n"; - my @refspec = `head -n 20 $hmmpath/$coresets[$i]/$coresets[$i].fa |$grepprog '>' |cut -d '|' -f 2 |sort |uniq`; - chomp @refspec; - for (my $j = 0; $j < @refspec; $j++){ - if (-e "$blastpath/$refspec[$j]"){ - push @available, "\t$refspec[$j]"; - } - else { - push @unavailable, "\t$refspec[$j]"; - } - } - print "\tAvailable reference taxa:\n"; - print join "\n", @available; - if (@unavailable > 0){ - print "\n\n\tUnavailable reference taxa (no Blast db at $blastpath)\n"; - print join "\n", @unavailable; - } - } - } - else { - print "\nNO CORE ORTHOLOG SETS ARE AVAILABLE! CHECK $hmmpath!\n\n"; - } - print "\n\n"; - exit; - } - ######### push all user defined variables into the log file ################ - push @log, "\nUSER DEFINED PARAMETERS (inc.
default values)\n"; - my %parameters = (append => $append, - blastpath => $blastpath, - checkCoorthologsRef => $checkCoRef, - cleartmp => $cleartmp, - concat => $concat, - est => $estflag, - eval_blast => $eval_blast, - eval_hmmer => $eval, - filter => $filter, - hit_limit => $hitlimit, - hmm => $hmm, - hmmset => $hmmset, - hmmpath => $hmmpath, - intron => $keepintron, - longhead => $longhead, - nonoverlapping_cos => $nonoverlappingCO, - outpath => $outpath, - protein => $proteinflag, - rbh => $bhh, - refspec => $refspec_string, - relaxed => $relaxed, - representative => $rep, - reuse => $reuse, - sequence_file => $dbfile, - show_hmmsets => $show_coreortholog_sets, - sort_global_align => $sortalign, - strict => $strict, - taxon => $taxon_global, - ublast => $ublast); - - foreach ( sort keys %parameters) { - if (defined $parameters{$_}) { - push @log, "\t -$_:\t$parameters{$_}"; - } - else { - push @log, "\t -$_:\tnot set"; - } - } - - ############################################################################# - my $check = 1; - - if (!defined $dbfile) { - push @log, "You need to specify a valid infile with the option -sequence_file\n\n"; - $check = 0; - return($check, @log); - } - ### for FACT use the unmodified value of $dbfile - $runFACTparameter = $dbfile; - ## extract the path from the dbpath if available and prune of trailing '/' - if ($dbfile =~ /(.*\/)/) { - $dbpath = $1; - $dbpath =~ s/\/$//; - } - else { - $dbpath = '.'; - - } - $dbfile =~ s/.*\///; - # $dbfile_short = $dbfile; - # $dbfile_short =~ s/\..*//; - my @dbfileTMP = split(/\./, $dbfile); pop @dbfileTMP; - $dbfile_short = join(".", @dbfileTMP); - if ($central) { - $dboutpath = $dbpath; - # print "setting dboutpath to $dboutpath"; - } - - # print "HERERERERERERERERER $dbfile #################\n"; - # print "THENNNNNNNNNNNNNNNN $dbfile_short #################\n"; - ## - ## 0) Check for presence of the file with the sequences that should be hamstered - if (-e "$dbpath/$dbfile") { - push @log, "\t$dbfile ready"; - } - else { - #the provided infile does not exist: - push @log, "FATAL: The specified infile $dbpath/$dbfile does not exist. PLEASE PROVIDE A VALID INFILE!\n"; - $check = 0; - return ($check, @log); - } - ## 1) check for filetype - printOUT("checking for sequence type:\n"); - if (!defined $estflag and !defined $proteinflag) { - push @log, "\nCHECKING SEQUENCE TYPE\n"; - push @log, "\tNo file sequence type was determined. HaMStR will guess whether EST or protein sequences are analyzed"; - my $seq = `head -n 2 $dboutpath/$dbfile |tail -n 1`; - my $orilength = length($seq); - $seq =~ s/[AGCTN]//ig; - if (length($seq) / $orilength >0.1) { - $proteinflag = 1; - printOUT("Guessing sequence type: Protein\n"); - push @log, "\tMore than 10% of the first sequence in the file are non-AGCTN. Guessing sequence type: Protein"; - } - else { - $estflag = 1; - printOUT("Guessing sequence type: DNA\n"); - push @log, "\tLess than 10% of the first sequence in the file are non-AGCTN. Guessing sequence type: DNA\n"; - } - $check = 1; - } - if ($estflag and !$check_genewise) { - push @log, "\n\nHaMStR has been configured with the flag --protein_only and will not accept DNA sequences as input. I am stopping tests here! 
If you really want to analyse DNA sequence data please reconfigure.\n"; - $check = 0; - return ($check, @log); - } - ## $dbfile_base holds the original file name, regardless of whether it is EST or protein data - $dbfile_base = $dbfile; - - if ($ublast){ - if ($runublast){ - $blast_prog = 'usearch'; - $algorithm = 'ublast'; - $outputfmt = 'blasttable'; - $blastdbend = '.udb'; - } - else { - push @log, "\n\nHaMStR has been configured with the --noublast option. Either re-start without the -ublast flag or reconfigure\n"; - $check = 0; - return($check, @log); - } - } - if ($estflag) { - $dbfile = "$dbfile.tc"; - $algorithm = 'blastx'; - if ($blast_prog eq 'blastp'){ - $blast_prog = 'blastx'; - } - $sortalign = 1; - push @log, "HaMStR will run on the ESTs in $dbfile_base"; - push @log, "\nTRANSLATING ESTs\n"; - if (!(-e "$dboutpath/$dbfile")) { - printOUT("translating $dbfile_base, this may take a while\n"); - `$path/bin/translate.pl -infile=$dboutpath/$dbfile_base -outfile=$dbfile -outpath=$dboutpath`; - open (LOG, "$outpath/hamstrsearch.log"); - my @info = <LOG>; - @log = (@log, @info); - close LOG; - } - else { - push @log, "Translated file already exists, using this one"; - } - if (! -e "$dboutpath/$dbfile") { - push @log, "FATAL: The translation of $dbfile_base failed. Check the script translate.pl"; - print "failed\n"; - $check = 0; - } - else { - ## file type is protein - printOUT("succeeded\n"); - } - } - ## 2) Check for presence of the blast program - push @log, "\nCHECKING FOR PROGRAMS\n"; - printOUT("checking for the blast program:\t"); - if (`which $blast_prog` =~ / no /) { - push @log, "FATAL: could not execute $blast_prog. Please check if this program is installed and executable"; - print "failed\n"; - $check = 0; - } - else { - push @log, "\tcheck for $blast_prog succeeded"; - unless ($silent) { - print "succeeded\n"; - } - } - ## 3) Check for presence of hmmsearch - printOUT("checking for hmmsearch:\t"); - my $hmmcheck = `$prog -h |$grepprog -c 'HMMER 3'`; - if (! `$prog -h`) { - push @log, "FATAL: could not execute $prog. Please check if this program is installed and executable"; - print "failed: $prog is not installed or not executable\n"; - $check = 0; - } - elsif ($hmmcheck != 1) { - push @log, "FATAL: It seems that $prog is not from the HMMER 3 package. Please check!"; - print "failed: $prog is not from the HMMER 3 package\n"; - $check = 0; - } - else { - push @log, "\tcheck for $prog succeeded"; - printOUT("succeeded\n"); - } - ## 3b) Check for genewise - if ($check_genewise) { - printOUT("checking for genewise:\t"); - if (! `genewise -help`) { - push @log, "FATAL: Could not execute genewise. Please check if this program is installed and executable"; - print "failed: genewise is not executable\n"; - $check = 0; - } - else { - my $gwcheck = `echo \$WISECONFIGDIR`; - if (length($gwcheck) < 1) { - push @log, "FATAL: The environmental variable WISECONFIGDIR has not been set. I am expecting troubles when invoking genewise. - Please consult the installation manual for genewise and set this variable"; - print "failed: the environmental variable WISECONFIGDIR has not been set.\n"; - $check = 0; - } - else { - printOUT("\tsucceeded\n"); - } - } - } - else { - push @log, "GENEWISE-CHECK skipped: The hamstr-script has been configured with the option --protein_only.
To override this setting set reconfigure the script or set the variable $check_genewise to 1"; - } - ## 4) Check for presence of the directory structure - - push @log, "\nCHECKING FOR HMMs\n"; - printOUT("checking for presence of the hmm files:\t"); - if ( ! defined $hmmset or ! -e "$hmmpath/$hmmset") { - push @log, "FATAL: You need to specify a valid core ortholog set. Make also sure that you provide the path to this set if it is not in the default location $hmmpath. You can check available core ortholog sets using the option -show_hmmsets."; - print "failed\n"; - $check = 0; - } - else { - $hmmpath = "$hmmpath/$hmmset"; - $fafile = "$hmmpath/$hmmset" . '.fa'; - $hmm_dir = "$hmmpath/$hmm_dir"; - $hmmsearch_dir = $outpath .'/hmm_search_' . $dbfile_short . '_' . $hmmset; - - ## 4b) check for the presence of the hmm-files and the fasta-file - if (!(-e "$hmm_dir")) { - push @log, "FATAL: Could not find $hmm_dir"; - print "failed\n"; - $check = 0; - } else { - if (defined $hmm) { - @hmms = split ',', $hmm; - chomp @hmms; - ### check for the presence of all hmms - for (my $k = 0; $k < @hmms; $k++) { - if (! -e "$hmm_dir/$hmms[$k]") { - push @log, "FATAL: $hmms[$k] has been defined but could not be found in $hmm_dir/$hmms[$k]"; - $check = 0; - last; - } else { - push @log, "\t$hmms[$k] has been found"; - } - } - } else { - push @log, "\trunning fDOG with all hmms in $hmm_dir"; - my $hmm_dir_tmp = $hmm_dir; $hmm_dir_tmp =~ s/\|/\\\|/g; - @hmms = `ls $hmm_dir_tmp`; - } - chomp @hmms; - printOUT("\tsucceeded\n"); - } - } - ## 6) Test for presence of the fasta file containing the sequences of the core-ortholog cluster - printOUT("checking for presence of the core-ortholog file:\t"); - if (defined $fafile) { - if (! -e "$fafile") { - push @log, "Fatal: Could not find the file $fafile"; - print "failed\n"; - $check = 0; - } - else { - push @log, "\tcheck for $fafile succeeded"; - printOUT("\tsucceeded\n"); - } - } - else { - push @log, "FATAL: Please provide path and name of fasta file containing the core-ortholog sequences"; - $check = 0; - print "failed\n"; - } - ## 7) Checks for the taxon_file - push @log, "\nCHECKING TAXON NAME\n"; - printOUT("testing whether the taxon has been determined:\t"); - if (defined $taxon_global) { - push @log, "\tusing default taxon $taxon_global for all sequences"; - printOUT("succeeded\n"); - $taxon_check = 2; - } - else { - push @log, "FATAL: No taxon_file found. Please provide a global taxon name using the option -taxon"; - print "failed\n"; - $check = 0; - } - ## 8) Check for reference taxon - push @log, "\nCHECKING FOR REFERENCE TAXON\n"; - printOUT("checking for reference species and blast-dbs:\t"); - if (!(defined $refspec_string) and (! defined $strict and ! defined $relaxed)) { - push @log, "FATAL: Please provide a reference species for the reblast!"; - print "failed\n"; - $check = 0; - } - elsif (defined $strict or defined $relaxed) { - if (! defined $refspec_string) { - ## The user has not provided a string of reference taxa. Chose all from the fasta file containing - ## the core orthologs. - @refspec = `$grepprog '>' $fafile |cut -d '|' -f 2 |sort |uniq`; - chomp @refspec; - $refspec_string = join ',', @refspec; - } - else { - @refspec = split (/,/, $refspec_string); - } - if ($strict) { - push @log, "\tStrict flag has been set. Reference species for the reblast: All of $refspec_string"; - } - else { - push @log, "\tRelaxed flag has been set. 
Reference species for the reblast: Any of $refspec_string"; - } - if (@refspec == 0) { - print "failed\n"; - $check = 0; - } - else { - printOUT("succeeded\n"); - } - } - else { - push @log, "\t Reference species for the re-blast: $refspec_string"; - @refspec = split(/,/, $refspec_string); - $refspec_name = $refspec[0]; - printOUT("succeeded\n"); - } - ## 9) Check for presence of the required blast dbs - printOUT("checking for blast-dbs:\t"); - push @log, "\nCHECKING FOR BLAST DATABASES\n"; - for (my $i = 0; $i < @refspec; $i++) { - my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]"; - if (-e $blastpathtmp . $blastdbend) { - push @log, "\tcheck for $blastpathtmp succeeded"; - printOUT("succeeded\n"); - } - elsif (-e $blastpathtmp . '_prot' . $blastdbend){ - ## the check for the file naming '_prot' is only to maintain backward compatibility - $blastapp = '_prot'; - $blastpathtmp = $blastpathtmp . $blastapp; - push @log, "\tcheck for $blastpathtmp succeeded"; - printOUT("succeeded\n"); - } - else { - push @log, "FATAL: please edit the blastpath. Could not find $blastpathtmp or blast database blastpathtmp.pin does not exist."; - print "$blastpathtmp failed\n"; - $check = 0; - } - } - ## 9.1) Check for presence of the required FASTA file of reference species - printOUT("checking for reference fasta files:\t"); - push @log, "\nCHECKING FOR REFERENCE FASTA FILES\n"; - for (my $i = 0; $i < @refspec; $i++) { - my $referencedb = "$blastpath/$refspec[$i]/$refspec[$i]".".fa"; - my $referencedb_prot = "$blastpath/$refspec[$i]/$refspec[$i]"."_prot.fa"; # backward compatibility - my $ref_dir = "$blastpath/$refspec[$i]"; - my $link = $referencedb; - unless (-e $referencedb) { - $link = `$readlinkprog $referencedb`; - unless ($link =~ /^\./ || $link =~ /^\//) { - my $cwd = cwd(); - die "Linked source for $referencedb not found in $cwd!"; - } - } - # my $ref_location = $referencedb; # not used anywhere else - chomp($link); - if (-e $referencedb || -e $link) { - push @log, "\tinfile ready"; - } elsif (-e "$referencedb_prot"){ - push @log, "\tinfile ready"; - } else { - #the provided reference fasta file does not exist or link to file does not exist: - push @log, "FATAL: FASTA file for the specified reference $refspec[$i] does not exist. PLEASE PROVIDE A VALID REFERENCE SPECIES!\n"; - $check = 0; - return ($check, @log); - } - } - - ## 10) Set the file where the matched seqs are found - my $strictstring = ''; - if (defined $strict) { - $strictstring = '.strict'; - } - $seqs2store_file = $outpath . '/hamstrsearch_' . $dbfile_short . '_' . $hmmset . $strictstring . '.out'; - $cds2store_file = $outpath . '/hamstrsearch_' . $dbfile_short . '_' . $hmmset . '_cds' . $strictstring . '.out'; - - if (! $append){ - if (-e "$seqs2store_file") { - my $answer = 'Y'; - my $breaker = 0; - if (!$force){ - print "A HaMStR outfile $seqs2store_file already exists and option -force has not been chosen! 
Shall I overwrite this file [Y|N]: "; - $answer = <STDIN>; - chomp $answer; - while ($answer !~ /[YN]/i and ($breaker < 4)) { - $breaker ++; - print "Please answer with 'Y' or 'N':\t"; - $answer = <STDIN>; - chomp $answer; - if (($breaker > 3) and ($answer !~ /[YN]/i)){ - print "No proper answer is given: exiting.\nPlease re-start HaMStR with the -append option, or alternatively remove the file manually, or force the replacement of existing files with option -force.\n"; - exit; - } - } - } - if ($answer =~ /Y/i) { - open (OUT, ">$seqs2store_file") or die "failed to open $seqs2store_file\n"; - print OUT ''; - close OUT; - if ($estflag){ - open (OUT, ">$cds2store_file") or die "failed to open $cds2store_file\n"; - print OUT ''; - close OUT; - } - } - else { - print "You chose to not overwrite the existing output files. Please re-start HaMStR with the -append option, or alternatively remove the file manually.\n"; - exit; - } - } - } - ## 11) apply the evalue-cut-off to the hmmsearch program - push @log, "\nPROGRAM OPTIONS\n"; - push @log, "\thmmsearch will run with an e-value limit of $eval"; - - ## 11b) hit limit for the re-blast - if ($hitlimit) { - push @log, "\tre-blast hit_limit: $hitlimit"; - } - else { - push @log, "\tre-blast hit_limit: none applied"; - } - ## 11c) The blast evalue limit - push @log, "\tBlast will run with an evalue limit of $eval_blast\n"; - - ## 12) check for filter setting for BLAST - printOUT("checking for low complexity filter setting:\t"); - $filter =~ tr/ft/FT/; - if ($filter ne 'T' and $filter ne 'F') { - push @log, "FATAL: Filter is set to $filter. Please set the low complexity filter either to F or T."; - print "low complexity filter check failed\n"; - $check = 0; - } - else { - push @log, "\tcheck for low complexity filter setting succeeded. Chosen value is $filter"; - if ($blast_prog ne 'blastall'){ - $filter = 'yes' if $filter eq 'T'; - $filter = 'no' if $filter eq 'F'; - } - printOUT("succeeded\n"); - } - - ## 13) setting up the directories where the output files will be put into. - $fa_dir_neu = $outpath . '/fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec[0]; - $tmpdir = $outpath . '/tmp'; # . $tmpdir; - if (!$strict) { - $fa_dir_neu = $outpath . '/fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec[0]; - } - if ($strict) { - $fa_dir_neu = $outpath . '/fa_dir_' . $dbfile_short . '_' . $hmmset; - $fa_dir_neu .= '_strict'; - } - - if ($relaxed) { - $fa_dir_neu .= '_relaxed'; - } - if ($check == 1) { - if (!(-e "$hmmsearch_dir")) { - `mkdir "$hmmsearch_dir"`; - } - elsif (-e "$hmmsearch_dir" and ! $reuse) { - `rm -rf "$hmmsearch_dir"`; - `mkdir "$hmmsearch_dir"`; - } - if (!(-e "$fa_dir_neu")) { - `mkdir "$fa_dir_neu"`; - } - elsif (-e "$fa_dir_neu" and ! $reuse) { - `rm -rf "$fa_dir_neu"`; - `mkdir "$fa_dir_neu"`; - } - mkdir "$tmpdir" unless -d "$tmpdir"; - if (-d "$tmpdir" and $cleartmp) { - `rm -rf "$tmpdir"`; - mkdir "$tmpdir" unless -d "$tmpdir"; - } - } - ## 14) determine whether or not the -representative flag has been set - if (defined $rep) { - push @log, "\tfDOG will run with the -representative option"; - } - else { - push @log, "\tfDOG was called without the -representative option. More than one ortholog may be identified per core-ortholog group!"; - } - - ## check further options - if (defined $nonoverlappingCO){ - push @log, "\tThe flag -nonoverlapping_cos has been set.
HaMStR will output co-orthologs even when they align to non-overlapping parts of the reference sequence"; - } - if (defined $checkCoRef){ - push @log, "\tThe flag -CheckCoorthologsRef has been set."; - } - if (defined $bhh){ - push @log, "\tThe flag -rbh has been set. HaMStR will run with the reciprocal best hit option."; - } - if ($sortalign){ - push @log, "\tThe flag -sort_global_align has been set. HaMStR will sort hits according to the global alignment score against the reference sequence. (Default for EST data)."; - } - - ## check how hamstr should deal with possible introns in transcripts: - if ($estflag) { - my $breaker = 0; - while ($keepintron !~ /^[kmr]/i and ($breaker < 4)){ - $breaker ++; - print "option intron was set to $keepintron: Please answer either with 'k(eep)', 'm(ask)', or 'r(emove)':\t"; - $keepintron = <STDIN>; - chomp $keepintron; - if (($breaker > 3) and ($keepintron !~ /^[kmr]/i)){ - print "No proper answer is given: exiting.\nPlease re-start HaMStR with the option -intron=[kmr].\nOptions are 'k(eep)', 'm(ask)', or 'r(emove)'. Default is 'k(eep)' introns.\n"; - exit; - } - } - if ($keepintron =~ /^k/i) { - push @log, "\tKeep introns (Default) has been chosen. HaMStR will keep any introns in lower case in the reported CDS. Thus, CDS cannot be directly translated into the aa sequence."; - } - elsif ($keepintron =~ /^m/i) { - push @log, "\tMask introns has been chosen. HaMStR will keep any introns but masks them as 'N' in the reported CDS. Thus, CDS cannot be directly translated into the aa sequence."; - } - elsif ($keepintron =~ /^r/i) { - push @log, "\tRemove introns has been chosen. HaMStR will remove any position that genewise could not align to the reference protein rendering the CDS consistent with the amino acid sequence"; - } - - } - - return ($check, @log); -} -################# -## check4reciprocity is the second major part of the program. It checks -## whether the protein sequence that has been identified by the hmmsearch -## identifies in turn the protein from the reference taxon that was used to -## build the hmm. -sub check4reciprocity { - my $frame; - my ($localid, $query_name, $hitname, $refspec_final, @refspec) = @_; - my $searchdb; - my $strict_suc = -1; # keeps track of success for all taxa - my $relaxed_suc = 0; # keeps track of success for at least one taxon - ## get the sequence that was identified as hit in the pHMM search from the db_file - my $hitseq = `$grepprog -m 1 -A 1 ">$hitname\$" $dboutpath/$dbfile_base | tail -n 1`; - if (!defined $hitseq) { - print "could not retrieve a sequence for $hitname. Skipping...\n"; - return(0, '', '', ''); - } - ## continue with the blast - chomp $hitseq; - ## now run the blast - open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n"; - print OUT ">$hitname\n$hitseq"; - close OUT; - ## now comes the new part that does one-to-many blast searches. We need to iterate through all - ## entries in $refspec_final and perform the Blast against each reftaxon. Note, unless the - ## $strict or $relaxed flags are set, there will be only a single reftaxon. If $relaxed is chosen - ## then we can stop the blast searches as soon as the reciprocity is fulfilled.
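## Reduced to its essentials, the strict/relaxed logic implemented by the loop below is a
## simple aggregation over the per-reftaxon re-blast results. A standalone sketch (not part
## of the original script; $mode and @confirmed are hypothetical inputs, one 0/1 flag per
## reference taxon):
sub reciprocity_fulfilled {
    my ($mode, @confirmed) = @_;
    if ($mode eq 'strict') {
        # every reference taxon must return the core-ortholog sequence
        for my $c (@confirmed) { return 0 unless $c; }
        return 1;
    }
    # relaxed (and the default single-reftaxon case): one confirming taxon suffices
    for my $c (@confirmed) { return 1 if $c; }
    return 0;
}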
- for (my $k = 0; $k < @$refspec_final; $k++) { - my $orthocount = $refspec_final->[$k]->{orthocount}; - ## 1) Perform the blast search with the k-th reftaxon - printOUT("Reftaxon: $refspec_final->[$k]->{refspec}\n"); - $tmpdir =~ s/\|/\\\|/g; - if ($blast_prog =~ /blast[px]/) { - !`$blast_prog -db $refspec_final->[$k]->{searchdb} -seg '$filter' -max_target_seqs 10 -evalue $eval_blast -outfmt 5 -query $tmpdir/$$.fa -out $tmpdir/$$.blast` or die "Problem running $blast_prog\n"; - ### postprocess the outfile - } - elsif ($blast_prog =~ /blastall/) { - !`blastall -p $algorithm -d $refspec_final->[$k]->{searchdb} -F $filter -e $eval_blast -m7 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running $blast_prog\n" - } - else { - if ($estflag){ - `$blast_prog -ublast $tmpdir/$$.fa -db $refspec_final->[$k]->{searchdb}.udb -strand both -accel $accel -evalue $eval_blast -blast6out $tmpdir/$$.blast` or die "Problem running $blast_prog\n;" - } - else { - `$blast_prog -ublast $tmpdir/$$.fa -db $refspec_final->[$k]->{searchdb}.udb -accel $accel -evalue $eval_blast -blast6out $tmpdir/$$.blast` or die "Problem running $blast_prog\n;" - } - ## sort the output as ublast does not do it (at least not for ESTs) - `sort -n -r -k 12 $tmpdir/$$.blast >$tmpdir/blastsort.tmp`; - `mv $tmpdir/blastsort.tmp $tmpdir/$$.blast`; - #################### - } - ## 2) now parse the best blast hit - my $hits = &getBestBlasthit("$tmpdir/$$.blast"); - if (defined $hits and @$hits > 0) { - ## at least one blast hit - $frame = $hits->[0]->{frame}; - my $idsref = $refspec_final->[$k]->{refid}; - my @original_ids = @$idsref; - my $suc = 0; # keeps track of success for a single taxon - if ($checkCoRef == 0) { - ## the user does not want to check further in case that id of best blast hit and of reference species differ - printOUT("core_orthologs: @original_ids\n"); - ## now loop through the best hits with the same score and check whether - ## among these I find the same seq as in $original - my $i = 0; - while ($suc == 0 and $i <@$hits) { - printOUT("blast-hit: $hits->[$i]->{name}\n"); - ## now loop through all the refspec-sequences in the hmm file; this is the case when co-orthologs have been determine in the core-ortholog - my $j = 0; - while ($suc == 0 and $j < @original_ids) { - if ($original_ids[$j] eq $hits->[$i]->{name}) { - printOUT("hitting $original_ids[$j]\n"); - $refspec_final->[$k]->{hit} = $j; - $suc = 1; - $relaxed_suc = 1; - } - else { - printOUT("not hitting $original_ids[$j]\n"); - $j ++; - } - if ($suc == 1) { - $relaxed_suc = 1; - if ($strict_suc == -1) { - $strict_suc = 1; - } - } - } - $i++; - } - if ($suc == 0) { - # none of the blast hits matched against the the reftaxon seq - $strict_suc = 0; - } - } - - else { - ## The user has chosen to search more sensitive, asking whether the best blast hit might be a co-ortholog to the reference sequence - my $qhdistance; - my $rhdistance; - printOUT("core_orthologs: $original_ids[0]\n"); - ## we will check only the best blast hit and impose a distance criterion - ## in case of an EST, we will have to predict the reading frame and translate it... - my $bestid = $hits->[0]->{name}; - my $refid = $original_ids[0]; - ## get the sequences from the blast db. 
Currently, I'm using a simple grep - my $bestseq = `$grepprog -m 1 -A 1 ">$bestid" $refspec_final->[$k]->{searchdb}.fa |tail -n 1` or die "Could not retrieve original sequence for besthit\n"; - my $refseq = `$grepprog -m 1 -A 1 ">$refid" $refspec_final->[$k]->{searchdb}.fa |tail -n 1` or die "Could not retrieve original sequence for refseq\n"; - chomp ($bestseq, $refseq); - printOUT("blast-hit: $bestid"); - my $queryseq = $hitseq; - if ($bestid eq $refid) { - printOUT("\thitting\n"); - $refspec_final->[$k]->{hit} = 0; - $suc = 1; - $relaxed_suc = 1; - } - else { - printOUT("\nBest hit $bestid differs from reference sequence $refid! Doing further checks\n"); - if ($estflag){ - printOUT("Frame is $hits->[0]->{frame} or $frame\n"); - my ($hitseqtr) = &findORF($hitseq, $bestseq, $frame); - ($suc, $qhdistance, $rhdistance) = &checkCoorthologRef($localid, $hitseqtr, $bestseq, $refseq); - } - else { - ($suc, $qhdistance, $rhdistance) = &checkCoorthologRef($localid, $hitseq, $bestseq, $refseq); - } - ## print distances (debug mode) - if ($debug){ - my $distDebugFile = $outpath . "/" . $taxon_global . ".debug.dist"; #$path . "/output/" . $taxon_global . ".debug.dist"; - unless (-e $distDebugFile){ - open (my $DISTDEBUG, ">>$distDebugFile") or die "Error, could not create file: ". "$distDebugFile"; - print $DISTDEBUG "hmmset\trefid\tbestid\tqueryid\tqhdist\trhdist\n"; - close $DISTDEBUG; - } - if (-e $distDebugFile){ - open (my $DISTDEBUG, ">>$distDebugFile") or die "Error, could not create file: ". "$distDebugFile"; - print $DISTDEBUG "$query_name\t$refid\t$bestid\t$hitname\t$qhdistance\t$rhdistance\n"; - close $DISTDEBUG; - } - } - - if ($suc == 1) { - printOUT("\t Distance query - blast hit: $qhdistance, Distance blast hit - reference: $rhdistance\tAccepting\n"); - $refspec_final->[$k]->{hit} = 0; - } - else { - printOUT("\t Distance query - blast hit: $qhdistance; Distance blast hit - reference: $rhdistance Rejecting\n"); - } - } - if ($suc == 1){ - $relaxed_suc = 1; - if ($strict_suc == -1) { - $strict_suc = 1; - } - } - else { - $strict_suc = 0; - } - } - } - else { - printOUT("no hit obtained\n"); - $strict_suc = 0; - } - ## when the user has chosen the strict flag, there is no reason to continue when $suc - ## has remained 0 (reciprocity criterion not fulfilled). Thus, return to main. 
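## The co-ortholog check invoked above boils down to comparing two Kimura distances (see
## sub checkCoorthologRef further below): the best blast hit is accepted when it is closer
## to the reference sequence than to the query. A sketch of that acceptance rule with
## hypothetical argument names:
sub accept_best_hit_as_coortholog {
    my ($dist_query_best, $dist_best_ref) = @_;   # pairwise Kimura distances
    return 1 if $dist_query_best == 0 and $dist_best_ref == 0;   # identical sequences
    return $dist_query_best > $dist_best_ref ? 1 : 0;
}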
- if ($strict and $strict_suc == 0) { - return (0, $hitseq); - } - } - if ($relaxed_suc == 1) { - if ($estflag and $frame eq '-') { - ## reverse sequence - $hitseq = &revComp($hitseq); - } - return (1, $hitseq, $frame); - } - else { - return (0, $hitseq); - } -} - -############# -sub getBestBlasthit { - my $hits; - my $count = 0; - my ($file) = @_; - $file =~ s/\\//g; - my $searchio = Bio::SearchIO->new( - -file => "$file", - -format => $outputfmt) - or die "parse failed"; - while(my $result = $searchio->next_result){ - my $sig; - my $sig_old; - while( my $hit = $result->next_hit) { - my $frameval = $hit->strand('query'); - if ($frameval >0){ - $frame = '+'; - } - elsif ($frameval <0 ) { - $frame = '-'; - } - elsif (!defined $frameval and $estflag) { - die "error in obtaining frame in sub getBestBlasthit\n"; - } - else { - $frame = 'na'; - } - - ## now I enter all top hits having the same evalue into the result - $sig = $hit->score; - if (!defined $sig_old) { - $sig_old = $sig; - } - if ($sig == $sig_old) { - if ($estflag){ - printOUT("frame is $frame\n"); - $hits->[$count]->{frame} = $frame; - } - $hits->[$count]->{name} = $hit->name; - $count ++; - } - else { - ## there is no lower ranking hit with the same score as the best hit. End the loop. - last; - } - } - } - return($hits); -} -################## -sub getTaxon { - my ($hitname) = @_; - if ($hitname =~ /\D/) { - $hitname =~ s/_.*//; - } - my $taxon = `$grepprog -m 1 "^$hitname," $taxon_file | $sedprog -e 's/^.*,//'`; - chomp $taxon; - $taxon =~ s/^[0-9]+,//; - $taxon =~ s/\s*$//; - $taxon =~ s/\s/_/g; - if ($taxon) { - return ($taxon); - } - else { - return(); - } -} -############### -sub determineReferences { - my ($localid, $fileobj, $taxon, $refspec_final, $hitname, $hithmmscore, $hitseq, $hitcount) = @_; - my $refseq = ''; - my $refspec; - ## now we have to distinguish between three cases: - ## 1) hamstr is running in normal mode and one refspec has been determined. In this case, $refspec_final - ## contains data only from a single species. - ## 2) hamstr is running in normal mode and alternative refspecs have been determined by the user. - ## $refspec_final may contain results from more than one species, but we need to consider only the first - ## entry. - ## 3) hamstr is running in the strict mode. In this case $refspec_final contains data from several taxa and we need - ## to select the taxon and sequence that is most similar to the hamstered sequence. - ## 4) hamstr is running in the relaxed mode. In this case $refspec_final may contain data from several taxa and - ## we need to select the taxon and the sequence that is most similar to the hamstered sequence. - if (defined $strict or defined $relaxed) { - ## more than one refspec. 
Now find the one that fits best - my $max_score = 0; - for (my $i = 0; $i < @$refspec_final; $i++) { - ## first, check whether the reciprocity criterion has been fulfilled - if (defined $refspec_final->[$i]->{hit}) { - my $rcn = $refspec_final->[$i]->{hit}; - my $refseq_cand = $refspec_final->[$i]->{sequence}->[$rcn]; - my $refspec_cand_id = $refspec_final->[$i]->{refid}->[$rcn]; - my $refspec_cand = $refspec_final->[$i]->{refspec}; - my $score = &getAlignmentScore($localid, $refseq_cand, $hitseq); - if ($score > $max_score) { - $refspec = $refspec_cand; - $refseq = $refseq_cand; - $max_score = $score; - } - } - } - } - else { ## no choice, just one refspec - my $rcn = $refspec_final->[0]->{hit}; - $refseq = $refspec_final->[0]->{sequence}->[$rcn]; - $refspec = $refspec_final->[0]->{refspec}; -} -$fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq; -$fileobj->{$taxon}->{ids}->[$hitcount] = $hitname; -$fileobj->{$taxon}->{hmmscore}->[$hitcount] = $hithmmscore; -$fileobj->{$taxon}->{refseq}->[$hitcount]= $refseq; -$fileobj->{$taxon}->{refspec}->[$hitcount] = $refspec; -return($fileobj); -} -############### -sub processHits { - my ($localid, $fileobj) = @_; - ## 1) align all hit sequences for a taxon against the reference species - my @taxa = keys(%$fileobj); - for (my $i = 0; $i < @taxa; $i++) { - &orfRanking($localid, $taxa[$i]); - } -} - - -################ -sub predictORF { - my $fileobj_new; - my @taxa = keys(%$fileobj); - for (my $i = 0; $i < @taxa; $i++) { - my $protobj = $fileobj->{$taxa[$i]}->{prot}; - my $idobj = $fileobj->{$taxa[$i]}->{ids}; - my $refseqobj = $fileobj->{$taxa[$i]}->{refseq}; - my $refspecobj = $fileobj->{$taxa[$i]}->{refspec}; - my @ids = @$idobj; - for (my $j = 0; $j < @ids; $j++) { - my $refseq = $refseqobj->[$j]; - my $refspec = $refspecobj->[$j]; - my $est = $protobj->[$j]; - if (! $est) { - die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n"; - } - ### debuggin IUB code - if ($est =~ /[^AGCT]/i) { - $est =~ s/[^AGCTagct]/n/g; - } - printOUT("running genewise using frame $frame\n"); - my $gw = run_genewise_hamstr->new($est, $refseq, $tmpdir, $keepintron); - my $translation = $gw->translation; - my $cds = $gw->codons; - $translation =~ s/[-!]//g; - $fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j]; - $fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation; - $fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds; - $fileobj_new->{$taxa[$i]}->{refseq}->[$j] = $refseq; - $fileobj_new->{$taxa[$i]}->{refspec}->[$j] = $refspec; - } - } - return($fileobj_new); -} -############################ -sub orfRanking { - my ($localid, $spec) = @_; - my $result; - my $refprot; - my $refcds; - my @toalign; - my $protobj = $fileobj->{$spec}->{prot}; - my $idobj = $fileobj->{$spec}->{ids}; - my $refcluster; ## variables to take the cluster and its id for later analysis - my $refid; - if (@$protobj == 1) { - ## nothing to chose from - $refprot = $protobj->[0]; - $refcds = $fileobj->{$spec}->{cds}->[0]; - my $length = length($refprot); - $refid = $idobj->[0]; - } - else { - ## more than one cluster - ## note, I set the refseq fix to the first entry. This is to avoid that in this routine - ## sequences from different taxa are used. 
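## The ranking done in the remainder of this routine (and in sub determineRef below) is a
## greedy sweep over alignment coordinates: candidates are sorted by their start position in
## the alignment against the fixed reference, and within each overlapping stretch only the
## best-scoring candidate survives. A condensed, standalone sketch of that idea; the
## astart/aend/score keys mirror the fields used in sortRef:
sub pick_best_per_region {
    my @hits = sort { $a->{astart} <=> $b->{astart} } @_;
    my (@kept, $best, $lastend);
    for my $h (@hits) {
        if (!defined $best or $h->{astart} < $lastend) {
            # overlaps the current region: keep the better-scoring candidate
            $best = $h if !defined $best or $h->{score} > $best->{score};
            $lastend = $h->{aend} if !defined $lastend or $h->{aend} > $lastend;
        }
        else {
            push @kept, $best;                     # region closed
            ($best, $lastend) = ($h, $h->{aend});  # open a new one
        }
    }
    push @kept, $best if defined $best;
    return @kept;
}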
- push @toalign, ">$fileobj->{$spec}->{refspec}->[0]"; - push @toalign, $fileobj->{$spec}->{refseq}->[0]; - ## now walk through all the contigs - for (my $i = 0; $i < @$protobj; $i++) { - my @testseq = (">$idobj->[$i]", $protobj->[$i]); - @testseq = (@testseq, @toalign); - open (OUT, ">$tmpdir/$localid.ref.fa") or die "could not open file for writing refseqs\n"; - print OUT join "\n", @testseq; - close OUT; - ## run clustalw - !(`$alignmentprog -infile=$tmpdir/$localid.ref.fa -output=fasta -outfile=$tmpdir/$localid.ref.aln 2>&1 >$tmpdir/$localid.ref.log`) or die "error running clustalw\n"; - ## get the alignment score - $result->[$i]->{score} = `$grepprog "Alignment Score" $tmpdir/$localid.ref.log |$sedprog -e 's/[^0-9]//g'`; - if (!$result->[$i]->{score}) { - die "error in determining alignment score\n"; - } - chomp $result->[$i]->{score}; - ## get the aligned sequence - open (ALN, "$tmpdir/$localid.ref.aln") or die "failed to open alignment file\n"; - my @aln = <ALN>; - close ALN; - my $aseq = extractSeq($idobj->[$i], @aln); - ## remove the terminal gaps - $aseq =~ s/-*$//; - $result->[$i]->{aend} = length $aseq; - my ($head) = $aseq =~ /^(-*).*/; - ($result->[$i]->{astart}) = length($head)+1; - ## add the hmmscore to $result - $result->[$i]->{hmmscore} = $fileobj->{$spec}->{hmmscore}->[$i]; - } - ### the results for all seqs have been gathered, now order them according to alignment start in the refseq - $result = &sortRef($result); - ($refprot, $refcds, $refid) = &determineRef($result,$spec); - } - $fileobj->{$spec}->{refprot} = $refprot; - $fileobj->{$spec}->{refcds} = $refcds; - $fileobj->{$spec}->{refid} = $refid; - $fileobj->{$spec}->{refspec_final} = $fileobj->{$spec}->{refspec}->[0]; - return(); -} -########################### -sub sortRef { - my $result = shift; - my @sortref; - for (my $i = 0; $i < @$result; $i++) { - $sortref[$i]->{index} = $i; - $sortref[$i]->{astart} = $result->[$i]->{astart}; - $sortref[$i]->{aend} = $result->[$i]->{aend}; - $sortref[$i]->{score} = $result->[$i]->{score}; - $sortref[$i]->{hmmscore} = $result->[$i]->{hmmscore}; - } - @sortref = sort { $a->{astart} <=> $b->{astart} } @sortref; - for (my $i = 0; $i < @sortref; $i++) { - ($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}, $result->[$i]->{hmmscore}) = ($sortref[$i]->{index}, $sortref[$i]->{astart}, $sortref[$i]->{aend}, $sortref[$i]->{score}, $sortref[$i]->{hmmscore}); - } - return($result); -} -######################## -sub determineRef { - my ($result, $spec) = @_; - my $lastend = 0; - my $lastscore = 0; - my $final; - my $count = 0; - my $id = ''; - my $scorekey = 'hmmscore'; - if ($sortalign){ - $scorekey = 'score'; - } - for (my $i = 0; $i < @$result; $i++) { - if ($result->[$i]->{start} < $lastend or $lastend == 0) { - if ($result->[$i]->{$scorekey} > $lastscore) { - $lastend = $result->[$i]->{end}; - $lastscore = $result->[$i]->{$scorekey}; - $id = $result->[$i]->{id}; - printOUT("ref is $id with score $lastscore\n"); - } - } - elsif ($result->[$i]->{start} > $lastend) { - ## a new part of the alignment is covered. Fix the results obtained so far - $final->[$count]->{id} = $id; - $lastend = $result->[$i]->{end}; - $id = $result->[$i]->{id}; - $count++; - } - } - $final->[$count]->{id} = $id; - ## now concatenate the results - my $refprot = ''; - my $refid = ''; - my $refcds = ''; - - ## now comes a dirty hack.
The user has the chance to maximize phylogenetic information by concatenating - ## orthologous sequences that do not align to the same part of the reference protein (option -concat). If so, - ## the co-ortholog-detection at a later step will not work and will be disabled. - my $looplimit = 1; - if ($concat) { - $looplimit = scalar(@$final); - } - for (my $i = 0; $i < $looplimit; $i++) { - my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}]; - my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}]; - my $length = length($seq); - if ($concat){ - $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . "PP"; - } - else { - $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]"; - } - $refprot .= $seq; - if ($estflag) { - $refcds .= $cdsseq; - } - } - $refid =~ s/PP$//; - return($refprot, $refcds, $refid); -} -############################# -sub extractSeq { - my ($id, @aln) = @_; - my $seq = ''; - my $start = 0; - for (my $i = 0; $i < @aln; $i++) { - if ($aln[$i] =~ $id) { - $start = 1; - } - elsif ($aln[$i] =~ />/ and $start == 1) { - last; - } - elsif ($start == 1) { - $seq .= $aln[$i]; - } - } - $seq =~ s/\s//g; - return ($seq); -} -############################## -sub revComp { - my ($seq) = @_; - chomp($seq); - $seq =~ tr/AGCTYRKMWSagct/TCGARYMKWStcga/; - $seq = reverse($seq); - return($seq); -} -############################## -# sub parseHmmer3pm { -# my ($file, $path) = @_; -# my $hits; -# my $query; -# my %tmphash; -# if (!defined $path){ -# $path = '.'; -# } -# $file = $path . '/' . $file; -# my $in = Bio::SearchIO->new( -# -format => 'hmmer', -# -file => $file -# ); -# while( my $result = $in->next_result ) { -# # this is a Bio::Search::Result::HMMERResult object -# if (!defined $query){ -# $query = $result->query_name(); -# printOUT("query is $query\n"); -# } -# my $hitcount = 0; -# while( my $hit = $result->next_hit ) { -# my $tmp = $hit->name(); -# my $tmpscore = $hit->score(); -# $tmp =~ s/_RF.*//; -# if (!defined $tmphash{$tmp}){ -# $hits->[$hitcount]->{id} = $tmp; -# $hits->[$hitcount]->{hmmscore} = $tmpscore; -# $hitcount++; -# $tmphash{$tmp}=1; -# if (defined $bhh){ -# last; -# } -# } -# } -# -# if (defined $hits->[0]) { -# ####### a quick hack to obtain the lagPhase value -# my $criticalValue; # takes the value used for candidate discrimination -# my $hitLimitLoc = $hitlimit; -# if (defined $autoLimit) { -# printDebug("Entering getLag Routine\n"); -# ## the user has invoked the automated inference of a hit limit -# ($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount); -# if (!defined $criticalValue) { -# ## there was a problem in the computation of the lagPhase -# print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n"; -# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); -# } -# } -# elsif (defined $scoreThreshold) { -# printDebug("entering the scoreThreshold routine"); -# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); -# printDebug("hitlimitloc is now $hitLimitLoc"); -# } -# -# return ($query, $hits, $hitLimitLoc, $criticalValue); -# } -# else { -# return ($query); -# } -# } -# } -############################## -sub parseHmmer4pm { - my ($file, $path) = @_; - my $hmmhits; - my $hits; - my $query; - my @rest; - my %tmphash; - my $hitcount = 0; - if (!defined $path){ - $path = '.'; - } - $file = $path . '/' .
$file; - - $file =~ s/\|/\\\|/g; - my @hmmout = `$grepprog -v '#' $file |sort -rnk 9`; - for (my $i = 0; $i < @hmmout; $i++) { - ($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/\s+/, $hmmout[$i]); - - if (!defined $query){ - $query = $hmmhits->[$i]->{query_name}; - printOUT("query is $query\n"); - } - my $tmp = $hmmhits->[$i]->{target_name}; - my $tmpscore = $hmmhits->[$i]->{domain_score}; - $tmp =~ s/_RF.*//; - if (!defined $tmphash{$tmp}){ - $hits->[$hitcount]->{id} = $tmp; - $hits->[$hitcount]->{hmmscore} = $tmpscore; - $hitcount++; - $tmphash{$tmp}=1; - if (defined $bhh){ - last; - } - } - - } - if (defined $hits->[0]) { - ####### limit the list of hmm hits - my $criticalValue; # takes the value used for candidate discrimination - my $hitLimitLoc = $hitlimit; - if (defined $scoreThreshold) { - printDebug("entering the scoreThreshold routine"); - ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount); - printDebug("hitlimitloc is now $hitLimitLoc"); - } - - return ($query, $hits, $hitLimitLoc, $criticalValue); - } - else { - return ($query); - } - -} -############################## -sub parseSeqfile { - my $seqref; - my $id; - my $spec; - my $seqid; - my $seq; - my $file = shift; - open (IN, "$file") or die "failed to open $file\n"; - my @seqs = <IN>; - close IN; - chomp @seqs; - for (my $i = 0; $i < @seqs; $i++) { - if ($seqs[$i] =~ />/) { - $seqs[$i] =~ s/>//; - if (defined $id and defined $seq) { - $seqref->{$id}->{$spec}->{seqid} = $seqid; - $seqref->{$id}->{$spec}->{seq} = $seq; - $seq = undef; - } - ($id, $spec, $seqid) = split (/\|/, $seqs[$i]); - } - else { - $seq .= $seqs[$i]; - } - } - if (defined $id and defined $seq) { - $seqref->{$id}->{$spec}->{seqid} = $seqid; - $seqref->{$id}->{$spec}->{seq} = $seq; - $seq = undef; - } - return ($seqref); -} -################## -sub getAlignmentScore{ - my ($localid, $refseq_cand, $hitseq) = @_; - my @testseq = ('>hitseq', $hitseq, '>refseq', $refseq_cand); - open (OUT, ">$tmpdir/$localid.ref.fa") or die "could not open file for writing refseqs\n"; - print OUT join "\n", @testseq; - close OUT; - ## run clustalw - !(`$alignmentprog -infile=$tmpdir/$localid.ref.fa -output=fasta -outfile=$tmpdir/$localid.ref.aln 2>&1 >$tmpdir/$localid.ref.log`) or die "error running clustalw\n"; - ## get the alignment score - my $score = `$grepprog "Alignment Score" $tmpdir/$localid.ref.log |$sedprog -e 's/[^0-9]//g'`; - if (!$score) { - die "error in determining alignment score! Problem with ClustalW\n"; - } - chomp $score; - return ($score); -} -####################### -sub determineRefspecFinal { - my ($query_name, @refspec) = @_; - my $refspec_final; - ## now get the id and the sequence used for building the hmm. Note, the latter will be - ## needed at a later step to determine the best hit - my @original; - my $ac = 0; - for (my $i = 0; $i < @refspec; $i++) { - $fafile =~ s/\|/\\\|/g; - @original = `$grepprog -A 1 "^>$query_name|$refspec[$i]" $fafile | grep -v "^\-\-\$" |$sedprog -e "s/.*$refspec[$i]\|//"`; - chomp @original; - if (@original > 0) { - $refspec_final->[$ac]->{refspec} = $refspec[$i]; - $refspec_final->[$ac]->{searchdb} = "$blastpath/$refspec[$i]/$refspec[$i]" .
$blastapp; - ## now allow for more than one sequence per core-ortholog cluster and species - $refspec_final->[$ac]->{orthocount} = 0; - for (my $j = 0; $j < @original; $j+= 2) { - $refspec_final->[$ac]->{refid}->[$refspec_final->[$ac]->{orthocount}] = $original[$j]; - $refspec_final->[$ac]->{sequence}->[$refspec_final->[$ac]->{orthocount}] = $original[$j+1]; - $refspec_final->[$ac]->{orthocount} += 1; - } - $ac++; - @original = qw(); - if (!defined $strict and !defined $relaxed) { - ## one reftaxon is enough - last; - } - } - else { - printOUT("original sequence could not be found by grepping for ^>$query_name|$refspec[$i]. Proceeding with next refspec\n"); - } - } - if (! defined $refspec_final->[0]->{refid}) { - print "original sequence not found\n"; - return (0, $refspec_final); - } - ## now print some wordy information... - if (!defined $strict and !defined $relaxed) { - printOUT("REFSPEC is $refspec_final->[0]->{refspec}\n"); - } - return(1, $refspec_final); -} - -############## co-ortholog prediction using an alignment score criterion as in InParanoid. -sub identifyCoorthologsProt{ - my ($localid, $spec) = @_; - my @get; - my @infofile; - my $protobject = $fileobj->{$spec}->{prot}; #this is an array ref - my $idobject = $fileobj->{$spec}->{ids}; - my @genes2check = @$idobject; - my $refseq = $fileobj->{$spec}->{refprot}; - my $refid = $fileobj->{$spec}->{refid}; - if ($estflag) { - $refid =~ s/-[0-9]+$//; - } - my $refspec_final = $fileobj->{$spec}->{refspec_final}; - ## initialize the array with the sequences to be aligned with the reference sequence - my @out = qw(); - my @hitids = qw(); - push @out, ">$refspec_final"; - push @out, $fileobj->{$spec}->{refseq}->[0]; - for (my $i = 0; $i < @genes2check; $i++) { - my $seq = $protobject->[$i]; - chomp $seq; - push @out, ">" . $spec .'|' . $genes2check[$i]; - push @out, $seq; - } - ## writing sequences to file - my $tmpdirTmp = $tmpdir; $tmpdirTmp =~ s/\\//g; - open (OUT, ">$tmpdirTmp/$localid.orth.fa") or die "failed to open $localid.orth.fa\n"; - print OUT join "\n", @out; - close OUT; - - ## check sequence length - my $alignmentprog_co_tmp = $alignmentprog_co; - my $tooLong = checkSeqLen("$tmpdir/$localid.orth.fa"); - if ($tooLong == 1) { - $alignmentprog_co_tmp = "mafft-linsi"; - } - printOUT("Aligner: $alignmentprog_co_tmp\n"); - - ## aligning sequences - if ($alignmentprog_co_tmp eq 'mafft-linsi'){ - `mafft --maxiterate 1000 --localpair --anysymbol --quiet $tmpdir/$localid.orth.fa > "$tmpdirTmp/$localid.orth.aln"`; - } - elsif ($alignmentprog_co_tmp eq 'muscle') { - `muscle -quiet -in $tmpdir/$localid.orth.fa -out "$tmpdirTmp/$localid.orth.aln"`; - } - else { - die "$alignmentprog_co_tmp is neither mafft-linsi nor muscle\n"; - } - if (!
-e "$tmpdirTmp/$localid.orth.aln") { - die "something wrong running $alignmentprog_co_tmp\n"; - } - ## do the matrix caluclation - my $in = Bio::AlignIO->new(-format => 'fasta', - -file => "$tmpdirTmp/$localid.orth.aln"); - my $aln = $in->next_aln; - my $pepstats = Bio::Align::ProteinStatistics->new(); - my $kimura = $pepstats->distance(-align => $aln, - -method => 'Kimura'); - ## do the evaluation - ### get the pairwise distances to the yeast sequences - #### get the represenative id - my $smallestdist = $kimura->get_entry("$refspec_final", "$spec|$refid"); - push @get, $spec.'|'.$refid; - push @infofile, $spec.'|'.$refid.'|'.$smallestdist.'|'.1; - printOUT("smalles dist is $smallestdist, besthit is $refid from $spec\n"); - ## now get any other hit protein that is closer to besthit than the representative seq - ## is to the refspec - my $count = 0; #this counter keeps track of the entries in the coorthologs field of $fileobj->{$spec} - for (my $i = 0; $i < @genes2check; $i++) { - if ($genes2check[$i] ne $refid) { - my $dist = $kimura->get_entry("$spec|$refid", "$spec|$genes2check[$i]"); - if ($dist <= $smallestdist or ($dist =~ /NaN/ and defined $nonoverlappingCO)) { - printOUT("co-ortholog detected: $genes2check[$i] with distance $dist compared to $smallestdist of $refid\n"); - $fileobj->{$spec}->{coorthologs}->[$count] = $i; - $count++; - push @infofile, $spec.'|'.$genes2check[$i].'|'.$dist.'|'.0; - } - else { - printOUT("co-ortholog rejected: $genes2check[$i] with distance $dist compared to $smallestdist of $refid\n"); - } - } - } - my $counter = 0; - while (defined $fileobj->{$spec}->{coorthologs}->[$counter]){ - my $index = $fileobj->{$spec}->{coorthologs}->[$counter]; - $counter ++; - } - printOUT(join "\n", @infofile); -} - -######## sub check sequence length. If a sequence is longer than 12.000 aa, -######## change MUSCLE to MAFFT-LINSI (due to Segmentation fault issue of MUSCLE) -sub checkSeqLen { - my $file =$_[0]; - my $out = `awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length(\$0)}END{print seqlen}' "$file"`; - my @out = split("\n", $out); - foreach my $line (@out) { - if ($line !~ />/ & $line > 12000) { - return(1) - } - } - return(0) -} - -######## sub checkCoorthologRef -sub checkCoorthologRef { - ## relevant steps are - ## 1) get query sequence and query id, - ## 2) get refseq - ## 3) get seq for best blast hit - ## compute the distance query - best blast hit and best blast hit - reference seq - ## return '1' if d(q,b)>d(r,b), else return '0'; - my ($localid, $query, $best, $ref) = @_; - open (OUT, ">$tmpdir/$localid.co.fa") or die "failed to open $localid.co.fa\n"; - print OUT ">query\n$query\n>best\n$best\n>ref\n$ref\n"; - close OUT; - - ## check sequence length - my $alignmentprog_co_tmp = $alignmentprog_co; - my $tooLong = checkSeqLen("$tmpdir/$localid.co.fa"); - if ($tooLong == 1) { - $alignmentprog_co_tmp = "mafft-linsi"; - } - printOUT("Aligner: $alignmentprog_co_tmp\n"); - - ## aligning sequences - if ($alignmentprog_co_tmp eq 'mafft-linsi'){ - `mafft --maxiterate 1000 --localpair --anysymbol --quiet $tmpdir/$localid.co.fa > "$tmpdir/$localid.co.aln"`; - } - elsif ($alignmentprog_co_tmp eq 'muscle') { - `muscle -quiet -in $tmpdir/$localid.co.fa -out "$tmpdir/$localid.co.aln"`; - } - else { - die "$alignmentprog_co_tmp is neither mafft-linsi nor muscle\n"; - } - if (! 
-e "$tmpdir/$localid.co.aln") { - die "something wrong running $alignmentprog_co_tmp in routine checkCoorthologRef\n"; - } - ## do the matrix caluclation - my $in = Bio::AlignIO->new(-format => 'fasta', - -file => "$tmpdir/$localid.co.aln"); - my $aln = $in->next_aln; - my $pepstats = Bio::Align::ProteinStatistics->new(); - my $kimura = $pepstats->distance(-align => $aln, - -method => 'Kimura'); - ## do the evaluation - ### get the pairwise distances to the yeast sequences - #### get the represenative id - my $querydist = $kimura->get_entry('query', 'best'); - my $refdist = $kimura->get_entry('best','ref'); - if (($querydist > $refdist) or ($querydist == 0 and $refdist == 0)){ - return(1, $querydist, $refdist); - } - else { - return(0, $querydist, $refdist); - } -} -####### sub findORF -sub findORF{ - my ($est, $prot, $frame) = @_; - if ($frame eq '-') { - $est = revComp($est); - } - ### debuggin IUB code - if ($est =~ /[^AGCT]/i) { - $est =~ s/[^AGCTagct]/n/g; - } - printOUT("\trunning genewise using frame $frame\n"); - my $gw = run_genewise_hamstr->new($est, $prot, "$tmpdir"); - my $translation = $gw->translation; - return ($translation, $est); -} -####### sub printOUT -sub printOUT { - my $message = shift; - if (!defined $silent) { - print $message; - } - return(); -} -###### sub getLag -sub getLag { - print "\nInside getlag\n"; - my ($hits, $hitcount) = @_; - my $minScore = $hits->[$hitcount-1]->{hmmscore}; - my $maxScore = $hits->[0]->{hmmscore}; - if ($minScore == $maxScore) { - ## there is nothing to do, since there is either only one hit, or all hits have the same - ## hmmscore. Return the value of $hitcount. - return($hitcount, 1); - } - ## debug - else { - print "hitcount is $hitcount, max is $maxScore, min is $minScore\n"; - my @yData = qw(); - my @xData = qw(); - my @xDataLog = qw(); - ## now we generate a reversed list of the normalized bitscores - for (my $i = 0; $i < $hitcount; $i++) { - push(@yData, 1 - ($hits->[$i]->{hmmscore} - $minScore)/($maxScore - $minScore)); - push(@xDataLog, log(0.1*($i+1))); - push(@xData, (0.1*($i+1))); - } - ## The module requires a sufficient amount of trailing 1 to measure the lag point, - ## so we just append them - for (my $i = $hitcount; $i < ($hitcount+20); $i++) { - push(@yData, 1); - push(@xData, 0.1*($i)); - push(@xDataLog, 0.1*($i)); - } - ### calculate end point of lag phase - my $R = Statistics::R->new(); - # set variables for R - my $lagPoint = computeLagPoint($R, \@xDataLog, \@yData); - if ($lagPoint eq 'NA'){ - print "Least square fit to data failed! Trying log-transformed data.\n"; - my $lagPoint = computeLagPoint($R, \@xDataLog, \@yData); - } - ### compute the cutoff - if ($lagPoint eq 'NA') { - return(); - } - else { - my $hitLimitGetLag; - print "limit is $lagPoint. Abs is " . abs($lagPoint) . 
"\n"; - for (my $i = 0; $i < @xData; $i++) { - if ($xData[$i] > abs($lagPoint)) { - $hitLimitGetLag = $i + 1; - print "Setting hl to $hitLimitGetLag\n"; - last; - } - } - print "hitlimit in getLag is $hitLimitGetLag\n"; - return ($hitLimitGetLag, $lagPoint); - } - } -} -########################## -sub getHitLimit { - my ($hits, $hitcount) = @_; - my $hitLimitLoc = 0; - my $maxScore = $hits->[0]->{hmmscore}; - my $limit = $maxScore / 100 * (100 - $scoreCutoff); - for (my $i = 0; $i < $hitcount; $i++) { - if ($hits->[$i]->{hmmscore} >= $limit) { - $hitLimitLoc++; - } - else { - last; - } - } - return ($hitLimitLoc, $limit); -} -## debug -########################## -sub printDebug{ - my @message = @_; - if ($debug){ - print join "\n", @message; - print "\n"; - } -} -########################## -sub computeLagPoint { - my ($R, $xdata, $ydata) = @_; - $R->set( 'x', \@$xdata); - $R->set( 'y', \@$ydata ); - # define function - $R->run( q`func = function(t,params){ 1/(1 + exp(4 * params[1] * (params[2] - x) + 2)) }`); - # do Nonlinear Least Squares Fitting - $R->run(q`try <- try(nls(y ~ 1/(1 + exp(4 * mean * (lamda - x) + 2)), - start = list(mean=1.4, lamda=0.5), - control = list(maxiter=500)), TRUE)`); - $R->run(q`if(class(try) != "try-error"){ - f = nls(y ~ 1/(1 + exp(4 * mean * (lamda - x) + 2)), - start = list(mean=1.4, lamda=0.5), - control = list(maxiter=500)) - p = coef(f) - lagPoint = p[2] - } else { - lagPoint = "NA" - }`); - - - ### return lag point - my $lagPoint = $R->get('lagPoint'); - return($lagPoint); -} diff --git a/fdog/bin/oneSeq.pl b/fdog/bin/oneSeq.pl deleted file mode 100755 index e0d6aa7..0000000 --- a/fdog/bin/oneSeq.pl +++ /dev/null @@ -1,2855 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use File::Copy; -use File::Copy qw(move); -use File::Basename; -use File::Path; -use File::Path qw/make_path/; -use File::Path 'rmtree'; -use File::Which; -use lib dirname(__FILE__); -use Parallel::ForkManager; -use IO::Handle; -use Getopt::Long; -use Bio::DB::Taxonomy; -use Bio::Tree::Tree; -use Bio::TreeIO; -use Bio::Tools::Run::StandAloneBlast; -use Bio::Seq; -use Bio::SeqIO; -use Bio::SearchIO; -use Term::Cap; -use POSIX; - -use Capture::Tiny qw/capture/; -use IPC::Run qw( run timeout ); -use Time::HiRes; -use List::Util qw(shuffle); -use Cwd; -use Cwd 'abs_path'; -use Array::Utils qw(:all); -use Try::Tiny; - -my $startTime = gettime(); - -# Copyright (C) 2009 INGO EBERSBERGER, ebersberger@bio.uni-frankfurt.de -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published -# by the Free Software Foundation; either version 3 of the License -# or any later version. - -# This program is distributed in the hope that it will be useful -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# You should have received a copy of the GNU General Public License -# along with this program; If not, see http://www.gnu.org/licenses - -# PROGRAM DESCRIPTION: oneSeq.pl is a program for targeted ortholog search in protein sequence data. - -# PROGRAM HISTORY -## This script is based on a perl script authored by Peter Schmitzberger in the course -## of his Master's project at the CIBIV, MFPL, Vienna, Austria - -## MODIFIED: 13. Aug. 2015 - solved path issues. Script will now work together with -## HaMStR 13.2.5 - -## Modified 14. Aug. 
2015 - added the options -outpath and -hmmpath and a more refined -## clean up after the search, in cases where a custom outpath has been chosen. - -## Modified: 01. Feb. 2016: restructured major parts - -## Modified: 04. Feb. 2016: - Additions - feature architecture similarity (fas) score support -## - alterations in the program flow -## - global/local option for alignments -## - additional options -## - autocleanup -## - ENV SWATDIR for alignment support (local copy required or use SWATDIR=/home/holger/appz/phredphrap) -## - if you run oneSeq.pl in DB mode, please adapt /bin/run-query.sh to your username and password -## - - -## Modified 07. Aug. 2017: - Changes: - change of alignment program, swat replaced by -## ssearch (local:local) and glsearch (global:local) and ggsearch (global:global) -## - selection of best fitting ortholog candidate modified -## - coreFilter: strict, relaxed and none -## - -## Modified 19. Jan. 2018: - Additions - added option to prioritize closer taxon if two taxa have a similar score -## - after a taxon has been chosen, every closer taxon gets discarded in the next cycles -## - added commandline parameter to choose the deviation allowed for two taxa to be considered similar -## - -## Modified 09. Feb. 2018: - Changes - Now the HaMStR candidate search climbs the tree and evaluates only one taxon at a time -## - The FAS score for a candidate will now only be calculated, if the alignment score is high enough, -## to top the current best candidate -## - If a candidate reaches the maximum score the search stops and a new round starts -## - If a candidate is within deviation range of the maximum score only the taxa, which are on the same tree branch, -## will get evaluated and then the search gets canceled and a new round starts - -## Modified 24. Nov. 2018: Release - release oneSeq v1.3.1 -## - Not included feature/feature-updated-fas-util - -## Modified 19. July 2019: - Changes - added option to run muscle instead of mafft - -## Modified 22. July 2019: - invoked priority mode for the fas score computation if t = 30 - -## Modified 2. Dec. 2019 -## Bug Fix: Check for taxa with invalid NCBI Taxonomy Id runs now properly and crashes are avoided -## Implemented cleanup of the core ortholog directory to avoid accumulation of feature annotations - -## Modified 05. Feb. 2020 (Vinh): - added option to set number of CPUs for FAS annotation -## - input fasta file must not be present in data folder or working directory -## - output files will be stored either in user defined directory set via -outpath option, or in working directory by default - -## Bug fix 14. April 2020 (Ingo): - fixed bug that inactivated the -append option - -## Modified 14. April 2020 (Vinh): - added option for using user-defined blast_dir, genome_dir and weight_dir -## - reference species (and taxa for core-set compilation) specified from blast_dir - -## Modified 16. June 2020 (Vinh): major change in FAS score calculation (v1.7.0) -## - no need for profile_prog, architecture_prog and visualsPath -## - final FAS score calculation is done using hamstrFAS - -## Modified 16. June 2020 v1.7.1 (Vinh) - replace greedyFAS by calcFAS -## Modified 07. July 2020 v1.7.2 (Vinh) - check if FAS executable -## Modified 10. July 2020 v1.7.3 (Vinh) - solved problem when gene ID contains PIPE -## Modified 13. July 2020 v1.8.0 (Vinh) - added initial check, no longer use .mod files -## Modified 22. July 2020 v1.9.0 (Vinh) - moved tmp blast files to output folder and delete them when finished -## Modified 27.
Aug 2020 v2.1.0 (Vinh) - option to input newick tree for search taxa -## Modified 07. Sep 2020 v2.2.0 (Vinh) - append seed sequence to output extended.fa if no ortholog was found in refspec -## Modified 22. Sep 2020 v2.2.1 (Vinh) - make sure that seed sequence always at the beginning of extended.fa output -## Modified 23. Sep 2020 v2.2.3 (Vinh) - use full taxonomy name instead of abbr taxon name for LOG -## Modified 01. Dec 2020 v2.2.4 (Vinh) - fixed bug while creating final extended.fa (and replaced grep and sed by bioperl) -## Modified 16. Feb 2021 v2.2.5 (Vinh) - core compilation works with fasoff -## Modified 18. Feb 2021 v2.2.6 (Vinh) - fixed searchTaxa and coreTaxa options -## Modified 19. March 2021 v2.2.7 (Vinh) - check for long sequence ID -## Modified 24. March 2021 v2.2.8 (Vinh) - skip fa.mapping while checking genome_dir -## Modified 29. March 2021 v2.2.9 (Vinh) - check for zero $maxAlnScore -## - solved problem with long input path for fasta36 tools -## Modified 23. April 2021 v2.3.0 (Vinh) - parse fasta36 output for long IDs (longer than 60 chars) -## Modified 31. May 2021 v2.3.1 (Vinh) - added auto annotation for fdogFas -## Modified 11. June 2021 v2.3.2 (Vinh) - fixed --append option -## Modified 16. June 2021 v2.4.0 (Vinh) - add checkOff option -## Modified 10. Mar 2022 v2.4.1 (Vinh) - fixed bug missing results in multiprocessing - -############ General settings -my $version = 'oneSeq v.2.4.1'; -##### configure for checking if the setup.sh script already run -my $configure = 0; -if ($configure == 0){ - die "\n\nPLEASE RUN fdog.setup BEFORE USING fdog\n\n"; -} -##### hostname -my $hostname = `hostname`; -chomp $hostname; -############# -# my $termios = new POSIX::Termios; $termios->getattr; -# my $ospeed = $termios->getospeed; -# my $t = Tgetent Term::Cap { TERM => undef, OSPEED => $ospeed }; -# my ($norm, $under, $bold) = map { $t->Tputs($_,1) } qw/me md us/; -#### Paths -my $path = abs_path(dirname(__FILE__)); -$path =~ s/\/bin//; -$path =~ s/\/$//; -printDebug("Path is $path"); - -#### Programs and output -my $sedprog = 'sed'; -my $grepprog = 'grep'; -my $readlinkprog = 'readlink'; - -my $globalaligner = 'ggsearch36'; -my $glocalaligner = 'glsearch36'; -my $localaligner = 'ssearch36'; -my $fasta36Path = which('fasta36'); -if ( !(defined $fasta36Path) || $fasta36Path eq "") { - $globalaligner = $path.'/bin/aligner/bin/'.'ggsearch36'; - $glocalaligner = $path.'/bin/aligner/bin/'.'glsearch36'; - $localaligner = $path.'/bin/aligner/bin/'.'ssearch36'; - unless (-e $globalaligner) { - print "fasta36 not found! 
Please install it before using fdog!\n"; - exit(); - } -} - -my $algorithm = "blastp"; -my $blast_prog = 'blastp'; -my $outputfmt = 'blastxml'; -my $eval_blast_query = 0.0001; -my $filter = 'F'; # default for blastp -my $annotation_prog = "fas.doAnno"; -my $fas_prog = "fas.run"; -my $fdogFAS_prog = "fas.runFdogFas"; - -##### ublast Baustelle: not implemented yet -my $runublast = 0; -my $ublast = 0; -my $accel = 0.8; - -############ database connection details -my $dbname=""; -my $username=""; -my $pw=""; -my $database = "DBI:mysql:database=dbdmpng;host=$dbname"; -my $getThemAll = 0; -my $updateBlast_dir = 0; - -############ directory paths -my $currDir = getcwd; -my $coreOrthologsPath = "$path/core_orthologs/"; -my $outputPath = $currDir; #"$path/output"; ## DEFAULT OUTPUT PATH -my $hamstrPath = "$path/bin/hamstr"; -my $homeDir = $path; -my $alignmentscoreMatrix = "BP62"; ## opt given by ssearch and glsearch [codaa.mat idnaa.mat P250 P120 BL50 MD40 MD20 MD10 BL62 BL80 BP62 VT160 OPT5] -my $genome_dir = "$path/genome_dir"; -my $taxaPath = "$genome_dir/"; -my $blastPath = "$path/blast_dir/"; -my $idx_dir = "$path/taxonomy/"; -my $dataDir = $path . '/data'; -my $weightPath = "$path/weight_dir/"; - -my @defaultRanks = ( - 'superkingdom', 'kingdom', - 'superphylum', 'phylum', 'subphylum', - 'superclass', 'class', 'subclass', 'infraclass', - 'superorder', 'order', 'suborder', 'parvorder', 'infraorder', - 'superfamily', 'family', 'subfamily', - 'tribe', 'subtribe', - 'genus', 'subgenus', - 'species group', 'species subgroup', 'species' -); - -################## some variables -my $finalOutput; -my $dbHandle; -my $core_hitlimit = 3; # number of hmm hits to consider for reblast during core set generation -# number of hmm hits to consider for reblast during final ortholog search. -# Note, this limits the number of co-orthologs that can be found. -my $hitlimit = 10; -## lagPhase test. Setting the autolimit option to decide from the score distribution how many hits to evaluate. -my $autoLimit; -my $scoreThreshold = 1; # evaluate only hmmsearch hits whose score is within the 10% margin of the best hmmsearch hit -my $scoreCutoff = 10; #value in percent of the hmmscore of the best hit -# Setup for FAS score support (FAS support is used by default) -# Note, fas_t is set to 0.75 by default. 
Changes will influence sensitivity and selectivity -my $fas_support = 1; -my $countercheck = 0; -my $fasoff = 0; -my $fasstrict = 0; -my $fas_T = 0.75; -my $priThreshold = '-t 30'; -my %profile = (); -my %fas_score_keeper = (); -my $eval_filter = 0.001; -my $inst_eval_filter = 0.01; - -my $help; -my @profile = qw(); -# my $showTaxa; -my $refSpec; -my $seqFile = ''; -my $seqId= ''; -my $seqName; -my $minDist; -my $maxDist; -my $minCoreOrthologs; -my $coreTaxa; -my $strict; -my $force = 0; -my $group; -my $groupNode; -my $blast; -my $batch; -my $blastNode; -my $representative; -my $core_rep; -my $checkOff; -my $debug; -my $corestrict; -my $inputSeq = ""; -my $rbh; -my $append = 0; -# Note, the evalue defaults ($eval_blast, $eval_hmmer) will be relaxed for final ortholog run by $eval_relaxfac -my $eval_blast = 0.00001; #1E-5 -my $eval_hmmer = 0.00001; #1E-5 -my $eval_relaxfac = 10; #checked in checkInput -my $coreOnly; -my $cpu = 1; #sets number of forks for final ortholog search (can be set via option -cpu=<>) -my $corecpu = 1; #sets number of forks for core-ortholog assembly (MUST BE 1, due to directed search process through the tree) -my $hyperthread; -my $silent; -my $checkcoorthologsref; -my $cccr; -my $tree; -my $wholeTree; -my $treeDelFlag; -my $currentNoRankDistNode; -my $currentChildsToIgnoreNode; -my $currentDistNode; -my @logOUT = qw(); -### Details about the alignment strategy -### Note, the alignment strategy can be local, glocal, or global -### Default: local -my $local; -my $global; -my $glocal; -my $core_filter_mode; -my $dbmode = 0; ## default run in dbmode. consider setting this in the configure step -my $vlevel = 2; ## verbosity level -my @taxonlist = qw(); -my @refTaxonlist = qw(); -my $seqio_object; -my %taxa; -my %refTaxa; -my $autoclean; -my $getversion; -my $coreex; ## flag to set when the core set already exists -my $addenv; -my $ignoreDistance = 0; ## flag to normalise the score by the distance in the tree -my $distDeviation = 0.05; ## Span in which a score is consideren similar -my $breakAfter = 5; ## Number of Significantly bad candidates after which the current run cancels -my %hashTree; -my $aln = 'muscle'; -my $searchTaxa; -################# Command line options -GetOptions ( - "h" => \$help, - "append" => \$append, - # "showTaxa" => \$showTaxa, - "refSpec=s" => \$refSpec, - "db" => \$dbmode, - "filter=s" => \$filter, - "seqFile=s" => \$seqFile, - "seqId=s" => \$seqId, - "seqName=s" => \$seqName, - "silent" => \$silent, - "minDist=s" => \$minDist, - "maxDist=s" => \$maxDist, - "coreOrth=i" => \$minCoreOrthologs, - "coreTaxa=s" => \$coreTaxa, - "strict" => \$strict, - "rbh" => \$rbh, - "evalBlast=s" => \$eval_blast, - "evalHmmer=s" => \$eval_hmmer, - "evalRelaxfac=s" => \$eval_relaxfac, - "checkCoorthologsRef" => \$checkcoorthologsref, - "coreCheckCoorthologsRef" => \$cccr, - "hitlimitHamstr=s" => \$hitlimit, - "coreHitlimitHamstr=s" => \$core_hitlimit, - "autoLimitHamstr" => \$autoLimit, - "scoreCutoff=s" => \$scoreCutoff, - "scoreThreshold" => \$scoreThreshold, - "coreRep" => \$core_rep, - "coreStrict" => \$corestrict, - "coreOnly" => \$coreOnly, - "group=s" => \$group, - "blast" => \$blast, - "batch=s" => \$batch, - "fas" => \$fas_support, - "countercheck" => \$countercheck, - "fasoff" => \$fasoff, - "coreFilter=s" => \$core_filter_mode, - "minScore=s" => \$fas_T, - "local" => \$local, - "global" => \$global, - "glocal" => \$glocal, - "rep" => \$representative, - "cpu=s" => \$cpu, - "outpath=s" => \$outputPath, - "hmmpath=s" => \$coreOrthologsPath, - 
"blastpath=s" => \$blastPath, - "searchpath=s" => \$genome_dir, - "weightpath=s" => \$weightPath, - "checkOff" => \$checkOff, - "debug" => \$debug, - "coreHitlimit=s" => \$core_hitlimit, - "hitlimit=s" => \$hitlimit, - "force" => \$force, - "cleanup" => \$autoclean, - "addenv=s" => \$addenv, - "version" => \$getversion, - "reuseCore" => \$coreex, - "ignoreDistance" => \$ignoreDistance, - "distDeviation=s" => \$distDeviation, - "aligner=s" => \$aln, - "hyperthread" => \$hyperthread, - "searchTaxa=s" => \$searchTaxa -); - -$outputPath = abs_path($outputPath); -unless (-d $coreOrthologsPath) { - make_path($coreOrthologsPath); -} -$coreOrthologsPath = abs_path($coreOrthologsPath)."/"; -$blastPath = abs_path($blastPath)."/"; -$weightPath = abs_path($weightPath)."/"; -$genome_dir = abs_path($genome_dir)."/"; -$taxaPath = $genome_dir; - -############# do initial check -if (!defined $help && !defined $getversion) { #} && !defined $showTaxa) { - print "Validity checking....\n"; - my $checkStTime = gettime(); - unless($checkOff) { - initialCheck($seqFile, $seqName, $blastPath, $taxaPath, $weightPath, $fasoff); - } - - if (!defined $coreex) { - if (!grep(/$minDist/, @defaultRanks)) { - die "ERROR: minDist $minDist invalid!\n"; - } - - if (!grep(/$maxDist/, @defaultRanks)) { - die "ERROR: maxDist $maxDist invalid!\n"; - } - - if (!defined $minCoreOrthologs) { - die "ERROR: coreOrth not defined (must be integer)!"; - } - } - print "Check finished in " . roundtime(gettime() - $checkStTime). " sec!\n"; -} - -############# show version -if ($getversion){ - print "You are running $version\n"; - print "This version supports FAS comparison.\n"; - exit; -} - -############# show help -if($help) { - my $helpmessage = helpMessage(); - print $helpmessage; - exit; -} - -############# connect to the database -if ($dbmode) { - $dbHandle = DBI->connect($database, $username, $pw) - or die "Can not open the database!"; -} - -############# show all taxa -# if ($showTaxa) { -# #get all taxa from database -# #hash example: sacce_2336 -> NCBI ID for sacce_2336 -# printTaxa(); -# exit; -# } - -#switched from online version to flatfile because it is much faster -#taxon files can be downloaded from: ftp://ftp.ncbi.nih.gov/pub/taxonomy/ -my $indexStart = gettime(); -print "Please wait while the taxonomy database is indexing...\n"; -my $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . 'nodes.dmp', - -namesfile => $idx_dir . 'names.dmp', - -directory => $idx_dir); -my $indexTime = gettime() - $indexStart; -my $db_bkp = $db; -print "Indexing done in ",roundtime($indexTime)," sec!\n"; - -%taxa = getTaxa(); -%refTaxa = getRefTaxa(); -## debugging message -my $taxcount = keys(%taxa); -printDebug("receiving hash of taxa with $taxcount elements from sub getTaxa"); -### -for (keys %taxa){ - printDebug("value of $_ is $taxa{$_}"); -} - -my $outputFa = $coreOrthologsPath . $seqName . "/" . $seqName . ".fa"; -my $outputAln = $coreOrthologsPath . $seqName . "/" . $seqName . ".aln"; -my $tmpdir = $outputPath . '/' . $seqName . 
'/tmp';
-make_path($tmpdir);
-checkOptions();
-createFoldersAndFiles($outputFa, $seqName, $inputSeq, $refSpec);
-
-my $curCoreOrthologs = 0;
-my $hamstrSpecies = $refSpec;
-my $addedTaxon = $refSpec;
-my $noMoreOrthologs = 0;
-my $coremode;
-my %finalcontent;
-my %candicontent;
-my $maxAlnScore = 0;
-
-# create weight_dir in oneseq's home dir (used for annotations,weighting,feature extraction)
-# get annotations for seed sequence if fas support is on
-if ($fas_support){
-    if (!$weightPath) {
-        createWeightFolder();
-    }
-    getAnnotation($outputFa);
-}
-
-my $coreStTime = gettime(); #time;
-#core-ortholog search
-if (!$coreex) {
-    print "\nCore compiling...\n";
-    $coremode = 1;
-    $taxaPath = $blastPath;
-    #### moved from above
-    my $taxBuildSt = gettime();
-    unless ($silent) {
-        print "Building up the taxonomy tree...\n";
-    }
-    push @logOUT, "Building up the taxonomy tree...\n";
-    $tree = getRefTree();
-    $treeDelFlag = 0;
-    if($group) {
-        foreach($tree->get_nodes()) {
-            if($_->id == $groupNode->id) {
-                $groupNode = $_;
-            }
-        }
-        $tree->set_root_node($groupNode);
-    }
-    unless ($silent) {
-        print "Finished building the taxonomy tree in ". roundtime(gettime() - $taxBuildSt) ." sec\n";
-    }
-    push @logOUT, "Finished building the taxonomy tree in ". roundtime(gettime() - $taxBuildSt) ." sec\n";
-    ## Tree without deletions
-    $wholeTree = getRefTree();
-    if($group) {
-        foreach($wholeTree->get_nodes()) {
-            if($_->id == $groupNode->id) {
-                $groupNode = $_;
-            }
-        }
-        $wholeTree->set_root_node($groupNode);
-    }
-    ## initialise control nodes
-    $currentDistNode = $wholeTree->find_node(-ncbi_taxid => $refTaxa{$refSpec});
-    $currentNoRankDistNode = $currentDistNode->ancestor; ## the node from which the distance to other species will be calculated
-    $currentChildsToIgnoreNode = $currentDistNode; ## the node containing all child species which will not be included in the candidates file
-
-    %hashTree = buildHashTree();
-    removeMaxDist();
-    printDebug("Subroutine call removeMinDist\nRefspec is $refSpec\nTaxon is $refTaxa{$refSpec}\n");
-    $treeDelFlag = removeMinDist($refTaxa{$refSpec});
-    #### end moved from above
-
-    if ($ignoreDistance){
-        $distDeviation = 0;
-        $breakAfter = -1;
-    }
-
-    ## some variables used later
-    my $firstRun = 1;
-
-    while (get_leaves($tree, $treeDelFlag) > 0 && $curCoreOrthologs < $minCoreOrthologs && $noMoreOrthologs == 0) {
-
-        # checking the tree which determines the taxa that are going to be searched for hits
-        # printDebug("Subroutine call from core-ortholog compilation\nNumber of leaves is ".get_leaves($tree)."\nCurrent core-orthologs: $curCoreOrthologs\nVar \$noMoreOrthologs is set to $noMoreOrthologs\n");
-        if ($debug){
-            print "\nTaxonomic Tree as text:\n";
-            my $tree_as_string = $tree->as_text("tabtree");
-            print $tree_as_string;
-            print "\n";
-        }
-
-        #generate new aln
-        if($curCoreOrthologs > 0) {
-            createAlnMsf();
-        }
-
-        unless ($silent) {
-            print "In round $curCoreOrthologs running hmmbuild on $outputAln\n";
-        }
-        hmmbuild($coreOrthologsPath.$seqName."/hmm_dir/".$seqName.".hmm", $outputAln);
-
-        ## get the max alignment score per ortholog
-        printDebug("Discovering maximum alignmentscore");
-
-        ## Align every current core ortholog against all current core orthologs
-        ## the maximum found in this alignment is the maximum any other sequence can reach
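-        ## Illustration (invented numbers, not part of the original control flow): if the
-        ## best self-alignment of the current core set reaches a cumulative score of 500,
-        ## a candidate with a cumulative score of 400 is later normalised in getAlnScores()
-        ## to 400 / 500 = 0.8, so alignment scores enter the rank score on a [0,1] scale.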
".extended\n"; - - ## get the max alnscore - my %maxAlnScores = getCumulativeAlnScores(); - foreach my $score (values %maxAlnScores){ - if ($score > $maxAlnScore){ - $maxAlnScore = $score; - } - } - printDebug("The maximum alignmentscore is: $maxAlnScore"); - if ($maxAlnScore == 0) { - die("Maximum alignment score is Zero! Something went wrong with fasta36 functions!\n") - } - clearTmpFiles(); - - my $addedTaxon = getBestOrtholog(); - my $addedTaxonName = getTaxonName($addedTaxon); - print "Added TAXON: $addedTaxon\t$addedTaxonName\n"; - #if a new core ortholog was found - if($addedTaxon ne "") { - $hamstrSpecies = $hamstrSpecies . "," . $addedTaxon; - - clearTmpFiles(); - - ++$curCoreOrthologs; - printDebug("Subroutine call from core-ortholog compilation\nTaxon is $addedTaxon\nNCBI Id is $refTaxa{$addedTaxon}\n"); - $treeDelFlag = removeMinDist($refTaxa{$addedTaxon}); - } - else { - #there are no more core orthologs - $noMoreOrthologs = 1; - print "\nThe desired number of core orthologs could not be reached.\n"; - } - } - - ## This is now the final round of alignment and profile hidden Markov model building - ## It concludes the core ortholog set compilation - if ($curCoreOrthologs < $minCoreOrthologs ){ - print "\nWARNING: The desired number of core orthologs could not be reached. Training with only $curCoreOrthologs sequences\n"; - } - createAlnMsf(); - hmmbuild($coreOrthologsPath.$seqName."/hmm_dir/".$seqName.".hmm", $outputAln); -} -print "==> Core set compilation finished in " . roundtime(gettime() - $coreStTime). " sec!\n"; -push @logOUT, "Core set compilation finished in " . roundtime(gettime() - $coreStTime). " sec!"; - -#after having calculated the core orthologous set, -#start fdog to find all orthologs -# my $finalOutput = $outputPath . '/' . $seqName . '.extended.fa'; -my $orthoStTime = gettime(); -if (!$coreOnly) { - $coremode = 0; - push @logOUT, "Performing the final ortholog search..."; - print "\nPerforming the final ortholog search...\n"; - my $startTmp = gettime(); - #using $eval_relaxfac to relax the evalues for final search - my $final_eval_blast = $eval_blast*$eval_relaxfac; - my $final_eval_hmmer = $eval_hmmer*$eval_relaxfac; - - $taxaPath = $genome_dir; - my @searchTaxa; - unless ($searchTaxa) { - unless($groupNode) { - @searchTaxa = keys %taxa; - } else { - # %taxa = getTaxa(); - # print "GET TAXA TIME: ", roundtime(gettime() - $startTmp),"\n"; - my $tree = getTree(); - # print "GET TREE TIME: ", roundtime(gettime() - $startTmp),"\n"; - if($groupNode) { - foreach($tree->get_nodes()) { - if($_->id == $groupNode->id) { - $groupNode = $_; - } - } - $tree->set_root_node($groupNode); - } - foreach (get_leaves($tree)) { - push(@searchTaxa, @{$_->name('supplied')}[0]); - } - } - } else { - open(SEARCH, $searchTaxa) || die "Cannot open $searchTaxa file!\n"; - @searchTaxa = ; - close (SEARCH); - } - # print "PREPARE TIME: ", roundtime(gettime() - $startTmp),"\n"; - - my $pm = new Parallel::ForkManager($cpu); - if ($hyperthread) { - $pm = new Parallel::ForkManager($cpu*2); - } - - foreach (sort @searchTaxa) { - chomp(my $searchTaxon = $_); - my $pid = $pm->start and next; - if ($coreex) { - $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . 'nodes.dmp', - -namesfile => $idx_dir . 
-    foreach (sort @searchTaxa) {
-        chomp(my $searchTaxon = $_);
-        my $pid = $pm->start and next;
-        if ($coreex) {
-            $db = Bio::DB::Taxonomy->new(-source => 'flatfile',
-                -nodesfile => $idx_dir . 'nodes.dmp',
-                -namesfile => $idx_dir . 'names.dmp',
-                -directory => $idx_dir);
-            $db_bkp = $db;
-        }
-        my $searchTaxonName = getTaxonName($searchTaxon);
-        if (defined($searchTaxonName)) {
-            unless ($silent) {
-                print $searchTaxon, "\t", $searchTaxonName, "\n";
-            } else {
-                unless ($searchTaxonName eq "Unk") {
-                    print $searchTaxonName, "\n";
-                } else {
-                    print $searchTaxon, "\n";
-                }
-            }
-        }
-        runHamstr($searchTaxon, $seqName, $finalOutput, $refSpec, $hitlimit, $representative, $strict, $coremode, $final_eval_blast, $final_eval_hmmer, $aln, 0);
-        $pm->finish;
-    }
-    $pm->wait_all_children;
-
-    ### join result files
-    unless (-e $finalOutput) {
-        open(EXTENDEDFA, ">$finalOutput") or die "Cannot create $finalOutput\n";
-    } else {
-        open(EXTENDEDFA, ">>$finalOutput") or die "Cannot open $finalOutput\n";
-    }
-    opendir(my $dh, $outputPath) || die "Cannot open $outputPath: $!";
-    while (readdir $dh) {
-        if ($_ =~ /hamstrsearch_(.)+_$seqName(\.strict)*\.out$/) {
-            open(RESULT, "<$outputPath/$_") or warn "Cannot open $outputPath/$_!";
-            while (my $line = <RESULT>) {
-                chomp $line;
-                my @tmp = split(/\|/, $line);
-                my $seq = pop(@tmp);
-                splice(@tmp, 1, 1);
-                my $id = join("|", @tmp);
-                print EXTENDEDFA ">$id\n$seq\n";
-            }
-            close(RESULT);
-            unlink("$outputPath/$_") or warn "Cannot delete $outputPath/$_!"
-        }
-    }
-    closedir $dh;
-    close(EXTENDEDFA);
-}
-### remove duplicated seq in extended.fa
-if (-e $finalOutput) {
-    addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
-}
-push @logOUT, "Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!";
-print "==> Ortholog search completed in ". roundtime(gettime() - $orthoStTime) ." sec!\n";
-
-## Evaluation of all orthologs that are predicted by the final run
-if(!$coreOnly){
-    my $fasStTime = gettime();
-    my $processID = $$;
-
-    # check if final extended.fa exists
-    unless (-e $finalOutput) {
-        die "ERROR: Could not find $finalOutput\n";
-    }
-    # check and add seed to final extended.fa if needed
-    addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $finalOutput);
-
-    # calculate FAS scores for final extended.fa
-    if ($fas_support) {
-        print "Starting the feature architecture similarity score computation...\n";
-        my $fdogFAScmd = "$fdogFAS_prog -i $finalOutput -w $weightPath -t $tmpdir -o $outputPath --cores $cpu --redo_anno";
-        unless ($countercheck) {
-            $fdogFAScmd .= " --bidirectional"
-        }
-        system($fdogFAScmd)
-        # print $fdogFAScmd,"\n";
-    } else {
-        fasta2profile($finalOutput, $seqName)
-    }
-    push @logOUT, "FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
-    print "==> FAS calculation completed in " . roundtime(gettime() - $fasStTime). " sec!\n";
-    if($autoclean){
-        print "Cleaning up...\n";
-        runAutoCleanUp($processID);
-    }
-}
-
-## Delete tmp folder
-unless ($debug) {
-    my $delTmp = "rm -rf $tmpdir";
-    system ($delTmp) == 0 or die "Error deleting tmp files in $tmpdir\n";
-    my $delcommandTmp = "rm -rf $outputPath/tmp";
-    system ($delcommandTmp) == 0 or die "Error deleting tmp files in $outputPath/tmp\n";
-}
-print "==> fdog finished after " . roundtime(gettime() - $startTime) . " sec!\n";
-push @logOUT, "fdog finished after " . roundtime(gettime() - $startTime) . " sec!\n";
" sec!\n"; - -#### writing the log -open (LOGOUT, ">>$outputPath/fdog.log") or die "Could not open $outputPath/fdog.log for writing\n"; -print LOGOUT "\n\n"; -my $fdogVersion = `fdog.run --version`; -print LOGOUT "fDOG v$fdogVersion\n"; -print LOGOUT join "\n", @logOUT; -close LOGOUT; -exit; - - -######################## SUBROUTINES ######################## - -################################# -## Clears Temporary files -sub clearTmpFiles { - #clear temporary result file - if(-e $outputFa.".extended") { - unlink($outputFa.".extended"); - } - - #clear all alignment files - my @scorefiles = glob("*.scorefile"); - foreach my $file (@scorefiles) { - unlink($file); - } - my @fastaInfiles = glob("*_fasta36.fa"); - foreach my $file (@fastaInfiles) { - unlink($file); - } -} - -sub getCandicontent{ - my %candicontent; - my $candidatesFile = $outputFa . ".extended"; - if (-e $candidatesFile) { - - ######################## - ## step: 2 - ## setup - ## candidates to hash - ## %candicontent keeps info about all candidates (header and sequence) - open (CANDI, "<".$candidatesFile) or die "Error: Could not find $candidatesFile\n"; - my $head; - %candicontent = (); - while(){ - my $line = $_; - chomp($line); - if ($line =~ m/^>/){ - $line =~ s/>//; # clip '>' character - $head = $line; - }else{ - $candicontent{$head} = $line; - } - } - close (CANDI); - } - return %candicontent; -} - -################################# -## Get the alinment score for the current candidate file -## only works for files holding only one candidate -sub getCumulativeAlnScores{ - chdir($coreOrthologsPath . $seqName); - my $candidatesFile = $outputFa . ".extended"; - my $fileId = $$; - my $scorefile = $fileId . ".scorefile"; - my $fasta36file1 = $fileId . ".1_fasta36.fa"; - my $fasta36file2 = $fileId . ".2_fasta36.fa"; - my %scores; - - ######################## - ## step: 1 - ## set alignment parameters for fasta36 - my $fasta36cmd = $fasta36file1 . "\" \"" . $fasta36file2 . "\" -s " . $alignmentscoreMatrix . " -m 9 -d 0 -z -1 -E 100" . " > " . $scorefile; - - ######################## - ## step: 2 - ## candidates to hash - ## %candicontent keeps info about all candidates (header and sequence) - my %candicontent = getCandicontent(); - - ######################## - ## step: 3 - ## get alignment scores - chdir($coreOrthologsPath . $seqName); - symlink($outputFa, $fasta36file1); - symlink($candidatesFile, $fasta36file2); - if ($glocal){ - #glocal global:local glsearch36 Needleman-Wunsch - my $globlocCommand = "$glocalaligner \"" . $fasta36cmd; - printDebug($globlocCommand); - # print $globlocCommand,"\n";<>; - system($globlocCommand); - }elsif ($global){ - #global global:global ggsearch36 Needleman-Wunsch - my $globglobCommand = "$globalaligner \"" . $fasta36cmd; - printDebug($globglobCommand); - # print $globglobCommand,"\n";<>; - system($globglobCommand); - }elsif ($local){ - #local local:local ssearch36 Smith-Waterman - my $loclocCommand = "$localaligner \"" . $fasta36cmd; - printDebug($loclocCommand); - # print $loclocCommand,"\n";<>; - system($loclocCommand); - } - ######################## - ## step: 4 - ## collect alignment score - ## keep track about min and max for each query/coreortholog vs candidate set - my $max = -10000000; - my $min = 10000000; - - %scores = cumulativeAlnScore($scorefile, \%candicontent); - return %scores; -} - -################################# -## Get the alinment scores for the current candidate file -sub getAlnScores{ - chdir($coreOrthologsPath . 
-
-#################################
-## Get the alignment scores for the current candidate file
-sub getAlnScores{
-    chdir($coreOrthologsPath . $seqName);
-    my %scores = getCumulativeAlnScores();
-    ## Normalize Alignment scores (unity-based)
-    printDebug("Normalize alignment scores:\n");
-    foreach my $key (keys %scores){
-        my $score = $scores{$key};
-        unless ($silent) {
-            print "Cumulative alignmentscore is: $score\n";
-        }
-        $scores{$key} = $scores{$key} / $maxAlnScore;
-        $score = $scores{$key};
-        unless ($silent) {
-            print "Normalised alignmentscore is: $score\n";
-        }
-    }
-    return %scores;
-}
-
-#################################
-## Get the fas scores for the current candidate file
-sub getFasScore{
-    printDebug("Changing to $coreOrthologsPath$seqName", "Candidate file is $outputFa".'.extended');
-    chdir($coreOrthologsPath . $seqName);
-    my %fas_box;
-    my $scorefile = $$ . ".scorefile";
-    my $rankscore;
-
-    ########################
-    ## step: 1
-    ## setup
-    ## candidates to hash
-    ## %candicontent keeps info about all candidates (header and sequence)
-    my %candicontent = getCandicontent();
-
-    ########################
-    ## step: 2
-    ## get FAS score
-    ## fas support: on/off
-    my @candidateIds = keys(%candicontent);
-    if ($fas_support){
-        my ($name,$gene_set,$gene_id,$rep_id) = split(/\|/, $candidateIds[0]);
-        unless (-e "$weightPath/$gene_set.json") {
-            print "ERROR: $weightPath/$gene_set.json not found! FAS Score will be set as zero.\n";
-            $fas_box{$candidateIds[0]} = 0.0;
-        } else {
-            my $lnCmd = "ln -fs $weightPath/$gene_set.json \"$coreOrthologsPath$seqName/fas_dir/annotation_dir/\"";
-            system($lnCmd);
-            my $fasOutTmp = `$fas_prog -s \"$coreOrthologsPath$seqName/$seqName.fa\" -q $blastPath/$gene_set/$gene_set.fa --query_id \"$gene_id\" -a \"$coreOrthologsPath$seqName/fas_dir/annotation_dir/\" -o \"$coreOrthologsPath$seqName/fas_dir/annotation_dir/\" --raw --tsv --domain --cpus 1 | grep "#" | cut -f 3,4`;
-            my @fasOutTmp = split(/\t/,$fasOutTmp);
-            $fas_box{$candidateIds[0]} = $fasOutTmp[1];
-        }
-    } else {
-        $fas_box{$candidateIds[0]} = 1;
-    }
-    return %fas_box;
-}
-
-#################################
-## Add fas and alignment score together while using specified filters (coreFilter option)
-sub getFilteredRankScore{
-    my $alnScore = $_[0];
-    my $fasScore = $_[1];
-    my $rankscore = 0;
-    # $rankscore: keeps alignment and fas score, decider about $bestTaxon
-    if ($core_filter_mode){
-        if ($core_filter_mode eq "strict"){
-            # case 1: filter
-            if ($fasScore < $fas_T){
-                #eliminate candidate $key
-                print "Deleting candidate from list due to insufficient FAS score.\n";
-                $rankscore = 0;
-            }else{
-                #keep
-                if ($alnScore){
-                    $rankscore = $fasScore + $alnScore;
-                }else{
-                    $rankscore = $fasScore;
-                }
-            }
-        }elsif ($core_filter_mode eq "relaxed"){
-            # case 2: disadvantage
-            if ($fasScore < $fas_T){
-                # ignore FAS score for rankscore
-                printDebug("Candidate will be disadvantaged.\n");
-                if ($alnScore){
-                    $rankscore = $alnScore;
-                }else{
-                    $rankscore = 0;
-                }
-            }
-            else{
-                #keep
-                if ($alnScore){
-                    $rankscore = $fasScore + $alnScore;
-                }else{
-                    $rankscore = $fasScore;
-                }
-            }
-        }
-    }else{
-        # case 3: no filter
-        if($fasScore) {
-            if ($alnScore){
-                $rankscore = $fasScore + $alnScore;
-            }else{
-                $rankscore = $fasScore;
-            }
-        }
-    }
-    return $rankscore;
-}
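-
-## Worked example for the three coreFilter modes (numbers invented), assuming the
-## default FAS threshold $fas_T = 0.75, an alignment score of 0.85 and a FAS score of 0.60:
-##   strict  : 0.60 < 0.75 -> rankscore = 0    (candidate is discarded)
-##   relaxed : 0.60 < 0.75 -> rankscore = 0.85 (the FAS score is ignored)
-##   none    :                rankscore = 0.60 + 0.85 = 1.45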
-
-sub getHeaderSeq{
-    my $bestTaxon = $_[0];
-    open (EXTFA, $outputFa.".extended");
-    my $sequenceLine = 0;
-    my $bestSequence = "";
-
-    ########################
-    ## step: 7
-    ## get best sequence from candidate file
-    ## (will be added to the model)
-    while(<EXTFA>) {
-        my $line = $_;
-        chomp($line);
-        if($sequenceLine == 1) {
-            $bestSequence = $line;
-            $sequenceLine = -1;
-        }
-
-        if($line eq $bestTaxon) {
-            $sequenceLine = 1;
-        }
-    }
-    close EXTFA;
-    my @best = split '\|', $bestTaxon;
-    my $header = $best[0] . '|' . $best[1] . '|' . $best[2];
-    return ($header, $bestSequence);
-}
-
-## create profile from extended.fa
-sub fasta2profile{
-    my ($file, $out) = ($_[0], $_[1]);
-    my ($fO_base, $fO_path, $fO_suffix) = fileparse( $file, qr/\.[^.]*/ );
-    my $outFile = $fO_path.$out.".phyloprofile";
-    open(FA, $file);
-    open(PPOUT, ">$outFile");
-    print PPOUT "geneID\tncbiID\torthoID\n";
-    foreach my $line (<FA>) {
-        if ($line =~ /^>/) {
-            chomp($line); # test|ANOGA@7165@1|Q7Q3C2|1
-            $line =~ s/>//;
-            my @lineTMP = split(/\|/, $line);
-            my $geneID = $lineTMP[0];
-            my @orthoTMP = split(/@/, $lineTMP[1]);
-            my $ncbiID = "ncbi".$orthoTMP[1];
-            print PPOUT $geneID, "\t", $ncbiID, "\t", $line,"\n";
-        }
-    }
-    close(FA);
-    close(PPOUT);
-}
-
-## auto clean up can be invoked via the "-cleanup" option
-# $processID: given process ID
-sub runAutoCleanUp {
-    my $processID = $_[0];
-    unless ($silent) {
-        print "Deleting $outputPath/tmp\n";
-    }
-    my $delCommandTmp = "rm -rf \"$outputPath/tmp\"";
-    system ($delCommandTmp) == 0 or die "Error deleting result files\n";
-    my $seedName = $seqName . '_seed';
-    my $annopath = $coreOrthologsPath.$seqName."/fas_dir/annotation_dir";
-    if( -l "$currDir/$seqFile" ) {
-        my $delLnSeedFile = "rm $currDir/$seqFile";
-        system ($delLnSeedFile);
-    }
-    unless ($silent) {
-        print "Deleting $annopath\n";
-    }
-    if (!$fasoff) {
-        opendir(ANNODIR, $annopath) or warn "Could not open $annopath in sub runAutoCleanup\n";
-        my @annodirs = grep (!/$seedName/, readdir(ANNODIR));
-        unless ($silent) {
-            print scalar(@annodirs) . " content of $annopath\n";
-        }
-        for (my $anno = 0; $anno < @annodirs; $anno++){
-            if ($annodirs[$anno] ne '..' and $annodirs[$anno] ne '.' and $annodirs[$anno] ne $seqName.".json") {
-                unless ($silent) {
-                    print "Deleting $annopath/$annodirs[$anno]\n";
-                }
-                rmtree ($annopath."/".$annodirs[$anno]);
-            }
-        }
-        closedir (ANNODIR);
-    }
-}
-
-## starting annotation_prog for given seed sequence file
-# $seedseqFile: fasta file with seed sequence
-sub getAnnotation {
-    my ($seedseqFile) = ($_[0]);
-    my $inputAnno = $seedseqFile;
-    $inputAnno =~ s/\|/\\\|/g;
-    my $outputAnno = $coreOrthologsPath . $seqName . "/fas_dir/annotation_dir";
-    $outputAnno =~ s/\|/\\\|/g;
-    my $annotationCommand = "$annotation_prog" . " -i $inputAnno" . " -o $outputAnno --cpus 1" . " --name \"$seqName\""; #" --name " . $seqName . "_seed" . " --cpus 1";
-    system($annotationCommand);
-}
-
-## determines the reference species and/or the sequence id of the input sequence.
-sub determineRef {
-    my ($infile, @refspec) = @_;
-    #run blast for all available proteomes if the given sequence is not in the database
-    unless ($silent) {
-        print "One moment please!\nLooking for the most similar sequence to your input sequence.\n";
-    }
-    my $bestHit->{score} = 1;
-    $bestHit->{evalue} = 10;
-    my $outname = $$;
-    ## Baustelle (work in progress): currently we need to loop through all possible taxa to identify the best matching one
-    for (my $i = 0; $i < scalar(@refspec); $i++) {
-        my $curTaxon = $refspec[$i];
-        ## run the blast search
-        printDebug("running blast on $curTaxon");
-        my $resultFile = runBlast($seqFile, $dataDir, $outname, $tmpdir, "$blastPath/$curTaxon/$curTaxon");
-        my $hits = &getBestBlasthit($tmpdir, $resultFile);
-        if (defined $hits and @$hits > 0) {
-            #only use the best hit with the index [0]. Note, $hits is an array ref of hashrefs.
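-            ## Descriptive note: $bestHit->{score} starts at 1 (see above), so the first
-            ## real blast hit always wins the comparison below; afterwards only a strictly
-            ## higher score from another reference proteome can replace it.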
- if($hits->[0]->{score} > $bestHit->{score}) { - $bestHit->{name} = $hits->[0]->{name}; - $bestHit->{score} = $hits->[0]->{score}; - $bestHit->{evalue} = $hits->[0]->{evalue}; - $bestHit->{species} = $curTaxon; - } - } - } - return($bestHit); -} - -sub checkGroup { - my $group = shift; - my $node = $db->get_taxon(-name => $group); - if($node) { - $groupNode = $node; - } - else { - print "Your selected group " . $group . " was not found in the taxonomic tree... TERMINATING\n"; - exit; - } -} - -################################# -sub checkOptions { - if($eval_relaxfac < 1){ - # rethink - if($eval_relaxfac <= 0){ - printOut("\nThe specified factor for evalue relaxation is <= 0. Please see the help text for option -eval_relaxfac. We recommend a factor > 1. Default is 10.\n",1); - my $answer = ''; - my $breaker = 0; - while (($answer !~ /[0-9]/i) and ($breaker < 4)) { - $breaker++; - $answer = getInput("Please choose a new factor (Integer) for evalue relaxation. [1,100]"); - if (($breaker > 3) and ($answer !~ /[0-9]/i)){ - print "No proper factor given ... exiting.\n"; - exit; - } - } - if ($answer =~ /[0-9]/i) { - $eval_relaxfac = $answer; - } - } - } - ### check for colision of force and append. Change in favor of append - if ($force == 1 and $append ==1) { - $force = 0; - } - ### check the presence of the pre-computed core set - if ($coreex) { - if (! -e "$coreOrthologsPath/$seqName/$seqName.fa") { - print "You selected the option -reuseCore, but the core ortholog group $coreOrthologsPath/$seqName/hmm_dir/$seqName.hmm does not exist\n"; - exit; - } - } - ### begin move up - ### checking reference species - my $optbreaker = 0; - while ((!$refSpec or !$refTaxa{$refSpec}) && !$blast) { - if ($optbreaker >= 3){ - print "No proper refspec given ... exiting.\n"; - exit; - } - my $output = ''; - for (my $i = 0; $i < @refTaxonlist; $i++) { - $output = $output . "[$i]" . "\t" . $refTaxonlist[$i] . "\n"; - } - ### for debug? - # for (keys %taxa){ - # print "value of $_ is \'$taxa{$_}\'"; - # } - # printDebug("taxa contains $taxa{$refSpec}"); # cannot print this if $taxa{$refSpec} not exists! - my $refSpecIdx = getInput("\n" . $output . "\n" . "You did not provide a valid reference species ($refSpec). Please choose the number for the reference species your input sequence is derived from", 1); - $optbreaker++; - $refSpec = $refTaxonlist[$refSpecIdx]; - checkBlastDb($refSpec, $refSpec); - } - ### end move up - ### adding new routine to generate the input sequence if -reuseCore has been set - if ($coreex) { - my @refseq=`$grepprog -A 1 ">$seqName|$refSpec" $coreOrthologsPath/$seqName/$seqName.fa | grep -v "^\-\-\$"`; - chomp @refseq; - unless ($silent) { - print "$refseq[0]\n"; - } - (my $tmp1, my $tmp2, $seqId) = split '\|', $refseq[0]; - if (length($seqId) == 0){ - die "error in retrieving sequence while using -reuseCore\n"; - } - print "overruling the provided seed sequence since you used the option -reuseCore. Setting seed id to $seqId\n"; - open OUT, (">$currDir/$seqName.fa") or die "could not open $currDir/$seqFile for writing in retrieve refseq\n"; - print OUT join "\n", @refseq; - close OUT; - $seqFile = "$seqName.fa"; - } - ### end mod - ### check input file - $optbreaker = 0; - while ((length $seqFile == 0) or ((! -e "$currDir/$seqFile") and (! -e "$dataDir/$seqFile"))) { - if ($optbreaker >= 3){ - print "No proper file given ... 
exiting.\n"; - exit; - } - if (length $seqFile > 0){ - if (-e $seqFile) { - my @seqFileTMP = split(/\//, $seqFile); - unless (-e "$currDir/$seqFileTMP[@seqFileTMP-1]") { - system("ln -fs \"$seqFile\" \"$currDir/$seqFileTMP[@seqFileTMP-1]\""); - } - $seqFile = $seqFileTMP[@seqFileTMP-1]; - } else { - printOut("\nThe specified file $seqFile does not exist!\n",1); - } - } - } - if (-e "$currDir/$seqFile"){ - $dataDir = $currDir; - printDebug("Setting datadir to $currDir in sub checkOptions"); - } - - ### checking the number of core orthologs. Omit this check if the option -reuseCore has been selected - $optbreaker = 0; - while(!$minCoreOrthologs and !$coreex) { - if ($optbreaker >= 3){ - print "No proper number given ... exiting.\n"; - exit; - } - $minCoreOrthologs = getInput("Please specify the desired number of core orthologs!", 1); - $minCoreOrthologs = checkInt($minCoreOrthologs); - $optbreaker++; - } - ## check for blast filter - if ($blast_prog ne 'blastall'){ - $filter = 'yes' if $filter eq 'T'; - $filter = 'no' if $filter eq 'F'; - } - - $inputSeq = fetchSequence($seqFile, $dataDir); - - ## the user has not provided a sequence id, however, the refspec is determined. - if($seqId eq '') { - my $besthit; - if (!$blast){ - ## a refspec has been determined - #use blast to search in the proteome of the specified reference species for the input sequence - #in order to obtain a valid sequence id - $besthit = determineRef($seqFile, ($refSpec)); - } - else { - $besthit = determineRef($seqFile, @refTaxonlist); - } - $seqId = $besthit->{name}; - $refSpec = $besthit->{species}; - my $details = "Evalue: " . $besthit->{evalue}; - printOut("Seq id has been determined as $seqId in $refSpec with $details", 2); - # if(length("$seqName|$refSpec|$seqId") > 60) { - # die "Output file will have header longer than 60 characters ($seqName|$refSpec|$seqId). Please consider shorten the sequence IDs! More at https://github.com/BIONF/fDOG/wiki/Check-data-validity\n"; - # } - if($seqId eq '') { - print "There was no significant hit for your sequence in " . $refSpec . ".\nPlease specify a sequence id on your own.\n"; - exit; - } - } - - if($coreTaxa) { - if(! -e $coreTaxa) { - print "Please specify a valid file with taxa for the core orthologs search\n"; - exit; - } - my @userTaxa = parseTaxaFile($coreTaxa); - my %newTaxa = (); - foreach (@userTaxa) { - $newTaxa{$_} = $taxa{$_}; - } - $newTaxa{$refSpec} = $refTaxa{$refSpec}; - %refTaxa = %newTaxa; - } - - if($group) { - checkGroup($group); - } - - if(!$seqName) { - my $i = 0; - while($i < 7) { - my $j = chr(int(rand(127))); - if($j =~ /[a-zA-Z]/) { - $seqName .=$j; - $i++; - } - } - print "Your sequence was named: " . $seqName . "\n\n"; - } - $outputPath = $outputPath . "/$seqName"; - if (! -d "$outputPath"){ - mkdir "$outputPath", 0777 or die "could not create the output directory $outputPath"; - } - ## check whether a result file already exists: - $finalOutput = $outputPath . '/' . $seqName . '.extended.fa'; - if ($outputPath && -e "$finalOutput"){ - ## an ouput file is already existing - if (!$force && !$append){ - ## The user was not aware of an existing output file. Let's ask him - my $input = ''; - my $breaker = 0; - - while (($input !~ /^[aor]/i) and ($breaker < 4)) { - $breaker++; - die "\nAn outputfile $finalOutput already exists. 
Please consider option --force for overwriting it or option --append for appending to it.\n"
-            }
-        }
-        if ($force){
-            ## the user wants to overwrite
-            printOut("Removing existing output directory $outputPath", 1);
-            rmtree ([ "$outputPath" ]) or die "could not remove existing output directory $outputPath\n";
-            mkdir $outputPath or die "could not re-create the output directory $outputPath\n";
-        }
-        elsif ($append) {
-            if (-e "$outputPath/$seqName.extended.fa") {
-                ## read in the content for later appending
-                printOut("Appending output to $outputPath/$seqName.extended.fa", 1);
-                open (IN, "<$outputPath/$seqName.extended.fa") or die "failed to open $outputPath/$seqName.extended.fa after selection of option -append\n";
-                while (<IN>) {
-                    my $line = $_;
-                    if ($line =~ /\|/) {
-                        chomp $line;
-                        my @keys = split '\|', $line;
-                        $profile{$keys[1]} = 1;
-                    }
-                }
-            }
-            elsif ($fasoff) {
-                ## no extended.profile file exists, but none is needed because the user switched off FAS support -> do nothing
-            }
-            else {
-                printOut("Option --append was selected, but the existing output was incomplete. Please restart with the --force option to overwrite the output", 1);
-                exit;
-            }
-        }
-        else {
-            printOut("Renaming existing output file to $finalOutput.old", 2);
-            my $bu_dir = $outputPath.'_bkp';
-            !`mv $outputPath $bu_dir` or die "Could not rename existing output file $outputPath to $bu_dir\n";
-            mkdir $outputPath or die "could not recreate $outputPath after renaming the old output\n"
-        }
-    }
-
-    #### checking for the min and max distance for the core set compilation
-    #### omit this check, if the option reuseCore has been selected (added 2019-02-04)
-    $optbreaker = 0;
-    if (!$coreex) {
-        my $node;
-        $node = $db->get_taxon(-taxonid => $refTaxa{$refSpec});
-        $node->name('supplied', $refSpec);
-        if (lc($maxDist) eq "root"){
-            $maxDist = 'no rank';
-        }
-        while (!$maxDist or (checkRank($maxDist, $node) == 0)) {
-            if ($optbreaker >= 3){
-                print "No proper maxDist given ... exiting.\n";
-                exit;
-            }
-            print "You have not defined a valid maximum distance rank!\n";
-            printTaxonomy($node);
-            my $in = getInput('Please choose a rank by giving the number in square brackets', 1);
-            $optbreaker++;
-            $maxDist = parseInput($node, $in);
-            print "You selected ". $maxDist . " as maximum rank\n\n";
-        }
-        while (!$minDist or (checkRank($minDist, $node) == 0)) {
-            if ($optbreaker >= 3){
-                print "No proper minDist given ... exiting.\n";
-                exit;
-            }
-            print "You have not defined a valid minimum distance rank!\n";
-            printTaxonomy($node);
-            my $in = getInput('Please choose a rank by giving the number in square brackets', 1);
-            $optbreaker++;
-            $minDist = parseInput($node, $in);
-            print "You selected " . $minDist . " as minimum rank\n\n";
-        }
-    }
-    $optbreaker = 0;
-
-    #### checking the FAS options
-    if($fasoff){
-        print "You have turned FAS support off. Candidate orthologs will not be evaluated by their FAS.\n";
-        # turn FAS support off
-        $fas_support = 0;
-    }
-    ## check if user defined fas_T is off limits
-    if ($fas_T < 0 or $fas_T > 1){
-        print "You chose an odd FAS score filter (-minScore), default is 0.75.\n";
-        my $answer = '';
-        $optbreaker = 0;
-        while ($answer < 0 or $answer > 1) {
-            if ($optbreaker >= 3){
-                print "No proper fas filter given ... exiting.\n";
-                exit;
-            }
-            $answer = getInput("Please choose a FAS score filter [0,1] between 0 (relaxed) and 1 (stringent):");
-            $optbreaker++;
-        }
-        if ($answer > 0 and $answer < 1) {
-            $fas_T = $answer;
-        }
-    }
-    ### rather strict fas filter for core orthologs: OFF
-    if(!$core_filter_mode){
-        unless ($silent) {
-            print "No FAS filter for core-orthologs set.\n";
-        }
-    }elsif($core_filter_mode eq "relaxed"){
-        #core ortholog candidates with a FAS score below the threshold will be disadvantaged
-    }elsif($core_filter_mode eq "strict"){
-        #core ortholog candidates with a FAS score below the threshold will not be considered any more
-    }else{
-        print "No known filter mode for core-orthologs specified. Continuing with default settings\n";
-        $core_filter_mode = 0;
-    }
-
-    ### check alignment strategy
-    if (($local && $global) or ($local && $glocal) or ($global && $glocal)){
-        print "Please specify only one alignment strategy!\n";
-        print "Possible options are: -glocal, -local, or -global\n";
-        print "... exiting.\n";
-        exit;
-    }elsif(!$local && !$global && !$glocal){
-        unless ($silent) {
-            print "No specific alignment strategy set. Continuing with local alignments (Smith-Waterman-Algorithm).\n";
-        }
-        $local = 1;
-    }
-}
-
-####################### sub check the systematic rank
-sub checkRank {
-    my $rank = $_[0];
-    my $node = $_[1];
-    my $rankExists = 0;
-    while($node->ancestor && $rankExists == 0) {
-        if($node->rank eq $rank) {
-            $rankExists = 1;
-        }
-        $node = $node->ancestor;
-    }
-
-    if($node->rank eq $rank) {
-        $rankExists = 1;
-    }
-
-    return $rankExists;
-}
-
-############
-## modified by Ingo - Added Option to run Muscle
-sub createAlnMsf {
-    my $linsiCommand = '';
-    if (!defined $aln or $aln eq 'mafft-linsi') {
-        $linsiCommand = "mafft --maxiterate 1000 --localpair --anysymbol --quiet \"" . $outputFa . "\" > \"" . $outputAln . "\"";
-    }
-    elsif ($aln eq 'muscle') {
-        $linsiCommand = "muscle -quiet -in \"" . $outputFa . "\" -out \"" .$outputAln. "\"";
-    }
-    else {
-        die "issues with the msa. You need to select either mafft-linsi or muscle\n";
-    }
-    system($linsiCommand) == 0 or die "Could not run alignment\n$linsiCommand\n";
-}
-
-################ creating folders for fas support usage
-sub createWeightFolder{
-    #create weight_dir in hamstr1seq home dir
-    my $weightdir = $path."/"."weight_dir";
-    mkdir "$weightdir", 0777 unless -d "$weightdir";
-}
-
-################
-sub createFoldersAndFiles {
-    my ($outputFa, $seqName, $inputSeq, $refSpec) = (@_);
-    #create core orthologs directory
-    my $dir = $coreOrthologsPath . $seqName;
-    if (!$coreex){
-        mkdir "$dir", 0755 unless -d "$dir";
-        my $header = $seqName . "|" . $refSpec . "|" . $seqId;
-
-        #create FA file
-        open (OUTPUT, ">$outputFa") or die "Error creating fa file $outputFa\n";
-        print OUTPUT ">" . $header . "\n";
-        print OUTPUT $inputSeq;
-        close OUTPUT;
-
-        #create the Aln file initially only with a single species in there
-        open (OUTPUT, ">$outputAln") or die "Error creating aln file $outputAln\n";
-        print OUTPUT ">" . $header . "\n";
-        print OUTPUT $inputSeq;
-        close OUTPUT;
-
-        #create the folder for the hmm output
-        my $hmmdir = $dir . "/hmm_dir";
-        mkdir "$hmmdir", 0755 unless -d "$hmmdir";
-    }
-    #create the fas_dir for core orthologs if fas support is ON
-    if ($fas_support){
-        my $fasdir = $dir. "/fas_dir";
-        mkdir "$fasdir", 0777 unless -d "$fasdir";
-
-        my $annodir = $fasdir."/annotation_dir";
-        mkdir "$annodir", 0777 unless -d "$annodir";
-    }
-}
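-
-## Resulting layout under core_orthologs/ for a job named e.g. "myJob" (job name
-## invented, directory names taken from the code above):
-##   core_orthologs/myJob/myJob.fa                  seed plus accepted core orthologs
-##   core_orthologs/myJob/myJob.aln                 alignment of the above
-##   core_orthologs/myJob/hmm_dir/myJob.hmm         pHMM built from that alignment
-##   core_orthologs/myJob/fas_dir/annotation_dir/   FAS annotations (only with FAS support on)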
-#################
-sub fetchSequence {
-    my ($file, $filepath) = @_;
-    if (!defined $filepath){
-        $filepath = '.';
-    }
-    my $seq = "";
-    open (INPUT, "<$filepath/$file") or die "Error opening seq file\n";
-    while(<INPUT>) {
-        my $line = $_;
-        chomp($line);
-        unless($line =~ /^\>.*/) {
-            $seq = $seq . $line;
-        }
-    }
-    close INPUT;
-    $seq =~ s/\s*//g;
-    unless ($silent) {
-        printOut($seq, 2);
-    }
-    return $seq;
-}
-#################################
-## choose the ortholog which reaches the highest score
-sub getBestOrtholog {
-    ## max possible score is either one or two
-    my $maxScore = 1;
-    if ($fas_support){
-        $maxScore += 1;
-    }
-
-    ## get leaves to evaluate
-    my @leaves = get_leaves($tree, $treeDelFlag);
-    ## sort by distance in taxonomy tree
-    if (!$ignoreDistance){
-        @leaves = sort_leaves(@leaves);
-    }
-    ## don't sort by distance
-    else{
-        my @unsortedLeaves = @leaves;
-        @leaves = qw();
-        push @leaves, \@unsortedLeaves;
-    }
-
-    ## create needed variables
-    my $bestTaxon = '';
-    my $rankScore = 0;
-    my $header = '';
-    my $seq = '';
-    my $newNoRankDistNode; ## this will be the new Distance node, after a new candidate has been chosen
-    my $newChildsToIgnoreNode; ## all leaves under this node will be ignored in future runs, after a new candidate has been chosen
-    my $sufficientlyClose = 0; ## flag to break outer loop
-    my $candidatesFile = $outputFa . ".extended";
-
-    ## iterate over each array with leaves of same distance
-    foreach my $array (@leaves) {
-        ## break loop if a candidate was close to the max score and no more candidates remain with the same distance
-        if ($sufficientlyClose){
-            unless ($silent) {
-                print "Best Taxon is sufficiently close to max score and no more candidates with same distance remain.\nStopping evaluation.\n";
-            }
-            last;
-        }
-        ## iterate over each leaf with the same distance
-        foreach my $key (@$array){
-            my $keyName = @{$key->name('supplied')}[0];
-            my $nodeId = $wholeTree->find_node(-ncbi_taxid => $refTaxa{$keyName})->id;
-            unless ($silent) {
-                print "fdog species: " . $key->scientific_name . " - " . @{$key->name('supplied')}[0] . "\n";
-            }
-            my $coreTaxon = @{$key->name('supplied')}[0];
-            my $coreTaxonName = getTaxonName($coreTaxon);
-            if (defined($coreTaxonName)) {
-                unless ($silent) {
-                    print $coreTaxon, "\t", $coreTaxonName, "\n";
-                } else {
-                    print $coreTaxonName, "\n";
-                }
-            }
-            runHamstr($coreTaxon, $seqName, $outputFa, $refSpec, $core_hitlimit, $core_rep, $corestrict, $coremode, $eval_blast, $eval_hmmer, $aln, 1);
-            ## check whether a candidate was found in the searched taxon
-            if(-e $candidatesFile) {
-
-                ## get found candidates for one taxon in hash to iterate over
-                my %candicontent = getCandicontent();
-
-                ## get scores in hashes because there might be more than one candidate sequence per taxon
-                my %alnScores = getAlnScores();
-                my %fas_box;
-                my $gotFasScore = 0;
-                ## iterate over found candidates
-                foreach my $candiKey (keys %candicontent){
-                    ## candidate's alnScore is high enough that it would be better with a fasScore of one
-                    ## -> evaluate
-                    if ($alnScores{$candiKey} > $rankScore * (1 + $distDeviation) - 1){
-                        %fas_box = getFasScore();
-                        if (!$gotFasScore and $fas_support){
-                            # %fas_box = getFasScore();
-                            $gotFasScore = 1;
-                        }
-                        ## get rankscore
-                        my $newRankScore = getFilteredRankScore($alnScores{$candiKey}, $fas_box{$candiKey});
-                        ## candidate is significantly better than the last one
-                        if ($newRankScore > $rankScore * (1 + $distDeviation)){ #uninit
-                            $bestTaxon = ">" .
$candiKey; - $rankScore = $newRankScore; - ($header, $seq) = getHeaderSeq($bestTaxon); - $newNoRankDistNode = $currentNoRankDistNode; - $newChildsToIgnoreNode = $currentChildsToIgnoreNode; - my $newNodeId = $key->id; - ## set new distance nodes, which will replace the old ones given, that this candidate will remain the best - while (!defined $hashTree{$newNoRankDistNode}{$newNodeId}){ - $newNoRankDistNode = $newNoRankDistNode->ancestor; - $newChildsToIgnoreNode = $newChildsToIgnoreNode->ancestor; - } - unless ($silent) { - print "New Best Taxon: $bestTaxon\n"; - } - } - } - ## candidate has the same distance, as the last one and could be better, with a fasScore of one - elsif (defined $hashTree{$newNoRankDistNode}{$key->id} and $alnScores{$candiKey} > $rankScore - 1){ - %fas_box = getFasScore(); - if (!$gotFasScore and $fas_support){ - # %fas_box = getFasScore(); - $gotFasScore = 1; - } - ## get rankscore - my $newRankScore = getFilteredRankScore($alnScores{$candiKey}, $fas_box{$candiKey}); - ## candidate is better, than the last one - if ($newRankScore > $rankScore){ - $bestTaxon = ">" . $candiKey; - $rankScore = $newRankScore; - ($header, $seq) = getHeaderSeq($bestTaxon); - printDebug("New Taxon has the same distance, choosing the one with higher score"); - unless ($silent) { - print "New Best Taxon: $bestTaxon\n"; - } - } - } - } - ## candidate reached the maximum score, no need to evaluate further - if ($rankScore >= $maxScore){ - $sufficientlyClose = 1; - printDebug("Rankscore is at maximum. Breaking loop..."); - last; - } - ## rankscore got sufficiently close to the maximum, only evaluate candidates with the same distance now - elsif ($rankScore >= $maxScore * (1 - $distDeviation) and !$ignoreDistance){ - printDebug("Sufficiently close to max score. Only evaluating leafs with same distance now."); - unless ($silent) { - print "MaxScore: $maxScore\n"; - print "RankScore: $rankScore\n"; - } - $sufficientlyClose = 1; - } - clearTmpFiles(); - } - ## no candidate file was created -> so no candidate was found - else{ - unless ($silent) { - print "No Candidate was found for $keyName\n"; - } - } - } -} - -my @best = (split '\|', $bestTaxon); -$currentNoRankDistNode = $newNoRankDistNode; -$currentChildsToIgnoreNode = $newChildsToIgnoreNode; -clearTmpFiles(); - -if ($bestTaxon ne ''){ - open (COREORTHOLOGS, ">>$outputFa") or die "Error: Could not open file: " . $outputFa . "\n"; - print COREORTHOLOGS "\n" . $header . "\n" . 
$seq;
-    close COREORTHOLOGS;
-    return $best[1];
-}else{
-    return '';
-}
-}
-
-######################
-## param: %candicontent - hashed information about candidates (id-> sequence)
-## param: $scorefile - filename with alignment tool output
-## cumulative alignment scores
-## candidates vs sofar core ortholog set
-## return: hash of scores (id->score)
-sub cumulativeAlnScore{
-    my $file = $_[0];
-    my %content = %{$_[1]};
-
-    my %cumscores;
-    foreach my $key(keys%content) {
-        my $gotScore = 0;
-        open (RESULT, $file) or die "Error: Could not open file with candidate taxa\n";
-        while(<RESULT>) {
-            my $line = $_;
-            $line =~ s/[\(\)]//g;
-            my @line = split('\s+',$line);
-            my $shortedId = substr($key, 0, 60);
-            # if($line[0] && ($line[0] eq $key)){
-            if($line[0] && ($line[0] eq $shortedId)){
-                if(exists $cumscores{$key}) {
-                    $gotScore = 1;
-                    $cumscores{$key} = $cumscores{$key} + $line[2];
-                }else{
-                    $gotScore = 1;
-                    $cumscores{$key} = $line[2];
-                }
-            }
-        }
-        close RESULT;
-        if ($gotScore == 0){
-            $cumscores{$key} = 0;
-        }
-    }
-    return %cumscores;
-}
-
-######################
-sub get_leaves {
-    my $tree = $_[0];
-    my $delFlag = 0;
-    if(defined($_[1])){
-        $delFlag = $_[1];
-    }
-
-    my $node = $tree->get_root_node;
-    my @leaves;
-    my @children = ($node);
-    for (@children) {
-        push @children, $_->each_Descendent();
-    }
-    for (@children) {
-        push @leaves, $_ if defined($_->name('supplied'));
-    }
-    # if the tree is set to be deleted
-    if ($delFlag){
-        @leaves = qw();
-        return @leaves;
-    }else{
-        return @leaves;
-    }
-}
-
-#################################
-## sorts given leaves by distance
-## and deletes all leaves too close to the current core
-sub sort_leaves {
-    my @leaves = @_;
-    my $distNode = $currentChildsToIgnoreNode;
-    my @candiLeaves;
-    my @finalLeaves;
-
-    for (@leaves) {
-        if (!defined $hashTree{$distNode}{$_->id}){
-            push @candiLeaves, $_ if defined($_->name('supplied'));
-        }
-    }
-    while ($distNode->id != $tree->get_root_node->id and scalar @candiLeaves != 0){
-        $distNode = $distNode->ancestor;
-        my @nextCandiLeaves;
-        my @sameDistLeaves;
-        for (@candiLeaves){
-            if (defined $hashTree{$distNode}{$_->id}){
-                push @sameDistLeaves, $_ if defined($_->name('supplied'));
-            }
-            else{
-                push @nextCandiLeaves, $_ if defined($_->name('supplied'));
-            }
-        }
-        @sameDistLeaves = shuffle @sameDistLeaves;
-        if (scalar @sameDistLeaves != 0){
-            push @finalLeaves, \@sameDistLeaves;
-        }
-        @candiLeaves = @nextCandiLeaves;
-    }
-    return @finalLeaves;
-}
-####### get all taxa from the database (or the $genome_dir) where a genome is available
-sub getTaxa {
-    if ($dbmode) {
-        my ($sql) = "select l.taxon_id, l.taxon_db, l.max_source_id, t.ncbi_id from cproteome_list.list l, taxon t where t.taxon_id = l.taxon_id and t.ncbi_id != 0";
-        my ($query) = $dbHandle->prepare($sql);
-        $query->execute();
-        while(my @result = $query->fetchrow_array) {
-            ## modified by ingo: make sure to capture the max_source_id
-            my $tax_src = $result[1] . '@' . $result[3] . '@' .
$result[2]; - push @taxonlist, $tax_src; - $taxa{$tax_src} = $result[3]; - printDebug("ncbiid of $tax_src is $taxa{$tax_src}"); - if ($getThemAll){ - getProteome($tax_src); - } - } - } - else { - ## removal of misplaced files in genome_dir - if (-e "$genome_dir/query.sql"){ - unlink("$genome_dir/query.sql"); - } - if (-e "$genome_dir/@@.fa"){ - unlink("$genome_dir/@@.fa"); - } - @taxonlist = `ls $genome_dir`; - chomp @taxonlist; - for (my $i = 0; $i < @taxonlist; $i++) { - my ($taxon_name, $ncbi_id, $src_id) = split /@/, $taxonlist[$i]; - if (!$src_id) { - $src_id = ''; - } - $taxon_name = $taxonlist[$i]; - $taxa{$taxon_name} = $ncbi_id; - } - } - ### if the blast option is chosen, we will need blast databases for all taxa - ### Baustelle: have one database including all taxa to run just a single instead of n blast searches - if ($blast or $updateBlast_dir){ - for (my $i = 0; $i < @taxonlist; $i++){ - checkBlastDb($taxonlist[$i], $taxonlist[$i]); - } - if ($updateBlast_dir){ - print "\nMissing blast databases updated. Exiting.\n"; - exit; - } - } - my $hashcount = keys(%taxa); - printDebug("Returning $hashcount taxa from subroutine getTaxa"); - return(%taxa); -} -####### get all available reference taxa -sub getRefTaxa { - @refTaxonlist = `ls $blastPath`; - chomp @refTaxonlist; - for (my $i = 0; $i < @refTaxonlist; $i++) { - my ($taxon_name, $ncbi_id, $src_id) = split /@/, $refTaxonlist[$i]; - if (!$src_id) { - $src_id = ''; - } - $taxon_name = $refTaxonlist[$i]; - $refTaxa{$taxon_name} = $ncbi_id; - } - return(%refTaxa); -} -#################### -sub getTree { - # the full lineages of the species are merged into a single tree - my $tree; - foreach my $key (sort {lc $a cmp lc $b} keys %taxa) { - my $node = $db->get_taxon(-taxonid => $taxa{$key}); - printDebug("\$key in sub getTree is $key and taxid is $taxa{$key}\n"); - if (!defined $node){ - print "ISSUE in sub getTree. No correspodence found in taxonomy file for $key and taxid $taxa{$key}. Skipping...\n"; - next; - } - else { - $node->name('supplied', $key); - if($tree) { - $tree->merge_lineage($node); - } - else { - $tree = Bio::Tree::Tree->new(-verbose => $db->verbose, -node => $node); - } - } - } - if ($debug){ - print "\nTaxonomic Tree as text:\n"; - my $tree_as_string = $tree->as_text("tabtree"); - print $tree_as_string; - print "\n"; - } - return $tree; -} - -sub getRefTree { - # the full lineages of the species are merged into a single tree - my $tree; - foreach my $key (sort {lc $a cmp lc $b} keys %refTaxa) { - my $node = $db->get_taxon(-taxonid => $refTaxa{$key}); - printDebug("\$key in sub getRefTree is $key and taxid is $refTaxa{$key}\n"); - if (!defined $node){ - print "ISSUE in sub getRefTree. No correspodence found in taxonomy file for $key and taxid $refTaxa{$key}. 
Skipping...\n"; - next; - } - else { - $node->name('supplied', $key); - if($tree) { - $tree->merge_lineage($node); - } - else { - $tree = Bio::Tree::Tree->new(-verbose => $db->verbose, -node => $node); - } - } - } - if ($debug){ - print "\nTaxonomic Tree as text:\n"; - my $tree_as_string = $tree->as_text("tabtree"); - print $tree_as_string; - print "\n"; - } - return $tree; -} - -sub getTaxonName { - my $taxAbbr = $_[0]; - my @tmp = split(/@/,$taxAbbr); - my $taxon = $db_bkp->get_taxon($tmp[1]); - if (defined($taxon)) { - return($taxon->scientific_name); - } else { - return("Unk"); - } -} - -##################### perform the search for orthologs -# using the core-orthologs found in the previous steps -sub runHamstr { - my ($taxon, $seqName, $outputFa, $refSpec, $hitlimit, $rep, $sub_strict, $subcoremode, $ev_blst, $ev_hmm, $aln, $core) = (@_); - my $taxaDir = $taxaPath . $taxon; - printDebug("Running fdog: $taxon\t$seqName\t$outputFa\t$refSpec\t$taxaDir"); - if (! -e $taxaDir) { - ## backward compatibility. I used to name the dirs with the ending .dir - if (-e "$taxaDir.dir"){ - $taxaDir = $taxaDir . '.dir'; - } - } - $taxaDir =~ s/\s*//g; - if(! -e $taxaDir and $dbmode) { - getProteome($taxon); - } - if (-e $taxaDir) { - unless ($silent) { - print "fdog for taxon: " . $taxon . "\n"; - } - chdir($taxaDir) or die "Error: Directory for " . $taxon . " does not exist!\n"; - my $seqfile = $taxon . ".fa"; - - if(! -e $seqfile) { - printOut("Could not find $seqfile. Check naming conventions of the files. Exiting..."); - exit; - } - - if($seqFile ne "") { - my $taxon_id = substr($taxon, 6, length($taxon)); - my @hamstr = ($hamstrPath, "-sequence_file=".$seqfile, "-fasta_file=".$outputFa, "-hmmpath=".$coreOrthologsPath , "-outpath=".$outputPath, - "-blastpath=".$blastPath , "-protein", "-hmmset=".$seqName, "-taxon=".$taxon, "-force", - "-eval_blast=".$ev_blst, "-eval_hmmer=".$ev_hmm, "-central", "-aligner=".$aln); - - my $resultFile; - if (defined $autoLimit) { - push(@hamstr, "-autoLimit"); - } - elsif (defined $scoreThreshold) { - push(@hamstr, "-scoreThreshold"); - push(@hamstr, "-scoreCutoff=$scoreCutoff"); - } - elsif (defined $hitlimit) { - push(@hamstr, "-hit_limit=$hitlimit"); - } - if($sub_strict) { - push(@hamstr, "-strict"); - $resultFile = $outputPath . "/fa_dir_" . $taxon . '_' . $seqName . "_strict/" . $seqName . ".fa"; - } - else { - push(@hamstr, "-refspec=".$refSpec); - $resultFile = $outputPath . "/fa_dir_" . $taxon . '_' . $seqName . "_" . $refSpec . "/" . $seqName . ".fa"; - } - if($rep) { - push(@hamstr, "-representative"); - } - if ($checkcoorthologsref and $subcoremode==0){ - push @hamstr, '-checkCoorthologsRef'; - } - if ($cccr and $subcoremode==1){ - push @hamstr, '-checkCoorthologsRef'; - } - if ($rbh) { - push @hamstr, "-rbh"; - } - ## added 2019-11-19 - if ($append) { - push @hamstr, "-append"; - } - ## - if ($silent) { - push @hamstr, "-silent"; - } - if ($debug) { - push @hamstr, "-debug"; - } - printDebug(@hamstr); - system(@hamstr) == 0 or die "Error: fdog failed for " . $taxon . 
"\n"; - - if ($core == 1) { - if ($outputFa !~ /extended/){ - $outputFa .= '.extended'; - } - if(-e $resultFile) { - unless (-e $outputFa) { - open(EXTENDEDFA, ">$outputFa") or die "Cannot create $outputFa\n"; - } else { - open(EXTENDEDFA, ">>$outputFa") or die "Cannot create $outputFa\n"; - } - my $resultFa = Bio::SeqIO->new(-file => $resultFile, '-format' => 'Fasta'); - while(my $resultSeq = $resultFa->next_seq) { - if ($resultSeq->id =~ /$taxon\|(.)+\|[01]$/) { - my @tmpId = split("\\|", $resultSeq->id); - print EXTENDEDFA ">$tmpId[0]\|$tmpId[-3]\|$tmpId[-2]\|$tmpId[-1]\n",$resultSeq->seq,"\n"; - } - } - # addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa); - } else { - # add seed sequence to output extended.fa if no ortholog was found in refSpec - if ($taxon eq $refSpec) { - addSeedSeq($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa); - } - printDebug("$resultFile not found"); - } - } - } - #remove the created folders and files - #delete fa_dir - my $delCommandFa; - my $delCommandHmm; - my $delCommandHam; - # my $outputPathTmp = $outputPath; $outputPathTmp =~ s/\|/\\\|/g; - # my $taxonTmp = $taxon; $taxonTmp =~ s/\|/\\\|/g; - # my $seqNameTmp = $seqName; $seqNameTmp =~ s/\|/\\\|/g; - if (!$strict) { - $delCommandFa = "rm -rf \"" . $outputPath . "/fa_dir_" . $taxon . "_" . $seqName . "_" . $refSpec . "\""; - $delCommandHmm = "rm -rf \"" . $outputPath . "/hmm_search_" . $taxon . "_" . $seqName . "\""; - if ($core == 1) { - $delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".out" . "\""; - } - } else { - $delCommandFa = "rm -rf \"" . $outputPath . "/fa_dir_" . $taxon . "_" . $seqName . "_strict" . "\""; - $delCommandHmm = "rm -rf \"" . $outputPath . "/hmm_search_" . $taxon . "_" . $seqName . "\""; - if ($core == 1) { - $delCommandHam = "rm -f \"" . $outputPath . "/hamstrsearch_" . $taxon . "_" . $seqName . ".strict.out" . "\""; - } - } - printDebug("executing $delCommandFa", "executing $delCommandHmm"); - if ($core == 1) { - printDebug("executing $delCommandHam"); - } - if ($autoclean) { - system ($delCommandFa) == 0 or die "Error deleting result files\n"; - system ($delCommandHmm) == 0 or die "Error deleting result files\n"; - if ($core == 1) { - system ($delCommandHam) == 0 or die "Error deleting result files\n"; - } - } - } - else { - print "No protein set available for $taxon. Failed to fetch it from database and nothing at $taxaDir. 
Skipping!\n"; - } -} - -# add seed sequence to output file if not exists -sub addSeedSeq { - my ($seqId, $seqName, $coreOrthologsPath, $refSpec, $outputFa) = @_; - unless (-e $outputFa) { - system("touch $outputFa"); - } - # get seed sequence and add it to the beginning of the fasta output - open(TEMP, ">$outputFa.temp") or die "Cannot create $outputFa.temp!\n"; - my $seqio = Bio::SeqIO->new(-file => "$coreOrthologsPath/$seqName/$seqName.fa", '-format' => 'Fasta'); - my %idTmp = (); # used to check which seq has already been written to output - while(my $seq = $seqio->next_seq) { - my $id = $seq->id; - if ($id =~ /$refSpec/) { - $idTmp{"$id|1"} = 1; - print TEMP ">$id|1\n", $seq->seq, "\n"; - #last; - } - } - # then write other sequences - my $seqio2 = Bio::SeqIO->new(-file => "$outputFa", '-format' => 'Fasta'); - while(my $seq = $seqio2->next_seq) { - my $id = $seq->id; - unless ($id =~ /$refSpec\|$seqId/) { # /$refSpec/) { - unless ($idTmp{$id}) { - print TEMP ">$id\n", $seq->seq, "\n"; - $idTmp{$id} = 1; - } - } - } - close(TEMP); - system("mv $outputFa.temp $outputFa") -} - -########################## -sub hmmbuild { - # my @hmmbuild = ("hmmbuild", $_[0], $_[1]); - # system(@hmmbuild) == 0 or die "hmmbuild failed"; - my $hmmbuild = `hmmbuild $_[0] $_[1] > /dev/null 2>&1`; -} - -sub parseInput { - my $node = $_[0]; - my $level = $_[1]; - my $rank = $node->rank; - printDebug("\nLEVEL:".$level."\n"); - printDebug("\nRANK:".$rank."\n"); - while($level > 0) { - $node = $node->ancestor; - $rank = $node->rank; - --$level; - } - print "\nRETURN RANK: ".$rank."\n"; - return $rank; -} -########################## -sub parseTaxaFile { - my $coreTaxaFile = $_[0]; - open (INPUT, "<$coreTaxaFile") or die print "Error opening file with taxa for core orthologs search\n"; - my @userTaxa; - while() { - my $line = $_; - chomp($line); - if (length($line) > 0) { - if(!$taxa{$line}) { - print "You specified " . $line . " in your core orthologs file but the taxon is not in the database!\n"; - exit; - } else { - push(@userTaxa, $line); - } - } - } - close INPUT; - return @userTaxa; -} -########################## -# sub printTaxa { -# my @result = qw(); -# if ($dbmode) { -# print "taxon_schema\tsource_id\ttaxon name\n"; -# print "------------\t---------\t----------\n"; -# my ($sql) = "select t.name, c.taxon_db, c.max_source_id from taxon t, cproteome_list.list c where t.taxon_id=c.taxon_id"; -# my ($query) = $dbHandle->prepare($sql); -# $query->execute(); -# @result = $query->fetchrow_array; -# while(my @result = $query->fetchrow_array) { -# print $result[1] . " \t" . $result[2] . "\t" . $result[0] . "\n"; -# } -# } -# else { -# print "Taxon_Name\tNCBI_ID\n"; -# print "-------------\t------------\n"; -# my $taxacall= "ls $genome_dir |$sedprog -e 's/@/\t/'"; -# @result = `$taxacall`; -# chomp @result; -# print join "\n", @result; -# print "\n"; -# } -# } -########################### -sub printTaxonomy { - my $node = $_[0]; - my $i = 0; - if($node->rank eq "species") { - print "[" . $i . "]: " . $node->rank . " (" . $node->scientific_name . ")\n"; - while($node->ancestor) { - $node = $node->ancestor; - ++$i; - print "[" . $i . "]: " . $node->rank . " (" . $node->scientific_name . 
")\n"; - } - } -} -############################ -sub remove_branch { - my $node = $_[0]; - my $delFlag = 0; - printDebug("Subroutine remove_branch\nNode is $node\nRank of node: ".$node->rank."\nNumber of leaves before removing branch ".get_leaves($tree)."\n\n"); - - # undef the tree if there is only one leave left which must be removed - if (get_leaves($tree) == 1){ - $delFlag = 1; - }else{ - while (defined $node->ancestor) { - last if $node->ancestor->each_Descendent > 1; - $node = $node->ancestor; - } - $node->remove_all_Descendents; - if(defined $node->ancestor) { - $node->ancestor->remove_Descendent($node); - } - } - printDebug("Subroutine remove_branch\nNode is $node\nRank of node: ".$node->rank."\nNumber of leaves after removing branch ".get_leaves($tree, $delFlag)."\n\n"); - return $delFlag; -} -############################ -sub removeMaxDist { - my $node = $tree->find_node(-ncbi_taxid => $refTaxa{$refSpec}); - my $root = $tree->get_root_node(); - - if ($maxDist eq "no rank"){ - $tree->set_root_node($root); - }else{ - while($node->rank ne $maxDist && $node != $root) { - $node = $node->ancestor; - } - $tree->set_root_node($node); - } -} -############################ -# node determines the node in the tree in accordance to the given ncbi taxon id -sub removeMinDist { - my $ncbiId = $_[0]; - my $node = $tree->find_node(-ncbi_taxid => $ncbiId); - my $root = $tree->get_root_node(); - my $delFlag; - - printDebug("Subroutine removeMinDist\nncbiID is $ncbiId\nNode is $node\nRank of node is ".$node->rank."\nroot is $root\nMinimal distance is $minDist\n"); - - # increasing the rank of the node - while($node->rank ne $minDist && $node != $root && defined($node->ancestor)) { - if ($debug){ - print "Increasig the rank\nRank: ".$node->rank."\nNode: ".$node."\n\n"; - } - - $node = $node->ancestor; - } - - #if the species has the same ranks as the references species - if($node == $root) { - my @toCompare = (); - my $i = @defaultRanks - 1; - while($i >= 0 && $defaultRanks[$i] ne $minDist) { - push(@toCompare, $defaultRanks[$i]); - --$i; - } - $node = $tree->find_node(-ncbi_taxid => $ncbiId); - my $lastToCompare = $toCompare[$#toCompare]; - foreach(@toCompare) { - while($node->rank eq "no rank") { - $node = $node->ancestor; - } - if($node->rank ne $lastToCompare && $node->rank eq $_) { - $node = $node->ancestor; - } - } - } - $delFlag = remove_branch($node); - return $delFlag; -} - -############################ -## builds a 2 dimensional hash in which you can check for a node, -## wheather there is a path down the tree to a given species -sub buildHashTree { - unless ($silent) { - print "Building hash tree\n"; - } - - printDebug("Creating variables..."); - my %hashTree; - my %nextNodes; - my %processed; - my @ancestors; - my $rootNode = $wholeTree->get_root_node(); - - unless ($silent ){ - print "Processing leafs...\n"; - } - ## create entry for leafes - foreach my $leaf (get_leaves($wholeTree)){ - my $key = $leaf->id; - my %leafHash; - $leafHash{$key} = "exists"; - $hashTree{$leaf}{$key} = "exists"; - my $nextNode = $leaf->ancestor; - my $nextNodeKey = $nextNode->id; - my $test = $hashTree{$leaf}{$key}; - my $nodeTest = $nextNodeKey; - printDebug("Leaf $key set to $test"); - ## queue ancestor node for processing, if it hasn't been queued already - if (!$nextNodes{$nextNodeKey}){ - $nextNodes{$nextNodeKey} = $nextNode; - push @ancestors, $nextNode; - printDebug("Queuing ancestor $nextNodeKey for processing...\n"); - } - $processed{$leaf} = 1; - } - unless ($silent) { - print "Finished leafs\n"; - } - 
- ## create entries for all other nodes - unless ($silent) { - print "Processing ancestor nodes\n"; - } - foreach my $node (@ancestors){ - my $test = $node->id; - printDebug("Processing node: $test\n"); - my $bool = 1; - ## check whether all children have already been processed - foreach my $child ($node->each_Descendent()){ - if (!defined $processed{$child}){ - $bool = 0; - } - } - ## if all children have been processed, process this node - if ($bool == 1){ - printDebug("All children processed for node: $test"); - ## node is not root - if ($node != $rootNode){ - printDebug("Node $test is not root"); - foreach my $child ($node->each_Descendent()){ - while (my ($key, $value) = each %{$hashTree{$child}}){ - $hashTree{$node}{$key} = $value; - printDebug("Node $key $value in node $test"); - } - } - my $nextNode = $node->ancestor; - my $nextNodeKey = $nextNode->id; - ## queue ancestor node for processing, if it hasn't been queued already - if (!$nextNodes{$nextNodeKey}){ - $nextNodes{$nextNodeKey} = $nextNode; - push @ancestors, $nextNode; - printDebug("Queuing ancestor $nextNodeKey for processing..."); - } - } - ## node is root - else{ - printDebug("Node $test is root"); - foreach my $child ($node->each_Descendent()){ - while (my ($key, $value) = each %{$hashTree{$child}}){ - $hashTree{$node}{$key} = $value; - printDebug("Node $key $value in node $test"); - } - } - } - ## mark node as processed - $processed{$node} = 1; - printDebug("Node $test has been processed\n\n"); - } - ## not all children have been processed - ## queue node again - else{ - push @ancestors, $node; - printDebug("Not all children processed for node: $test"); - printDebug("Queuing $test again...\n\n"); - } - } - unless ($silent) { - print "Finished processing ancestor nodes\n"; - print "Finished building hash tree\n"; - print "Returning hash tree...\n"; - } - return %hashTree; -} - -########################## -sub getProteome { - my $taxstring = shift; - my $outdir = $taxstring; - my $outfile = $taxstring; - my @outfile; - $taxstring =~ /(.*)@([0-9]+)@([0-9]+)/; - my ($schema, $ncbi_id, $src_id) = ($1, $2, $3); - print "\n\nAttempting to fetch information for $schema using source id $src_id\n\n"; - - ## create the relevant directory - if (!-e "$taxaPath/$outdir"){ - print "creating directory $taxaPath/$outdir\n"; - mkpath($taxaPath."/".$outdir); - if (-e "$taxaPath/$outdir") { - print "succeeded\n"; - } - else { - print "create directory failed\n"; - } - } - - ### This is the sql statement required for fetching the sequence information from the database - ## Using Here Documents ####### - my $sql = <<"________END_OF_STATEMENT"; - use $schema; - select concat('>',i.id, '\\n', p.seq) from ids i, protein p - where - i.protein_id = p.id and - i.representative = 1 and - i.src_id = $src_id and - length(p.seq) > 30; -________END_OF_STATEMENT - ## the previous line must be exactly like this to match the Here Document terminator - - printDebug("$sql\n"); - open (OUTQUERY, ">$taxaPath/$outdir/query.sql") or die "Could neither find nor create query.sql in $taxaPath/$outdir"; - print OUTQUERY $sql; - close OUTQUERY; - print "attempting to enter $taxaPath/$outdir\n"; - chdir("$taxaPath/$outdir") or die "could not enter $taxaPath/$outdir"; - `$homeDir/bin/run-query.sh $schema $ncbi_id $src_id`; -} -############ -# TODO: run generation of the BlastDb in a subroutine -## now create the relevant blast directories if necessary -sub checkBlastDb { - my ($taxstring, $filename) = @_; - ## $taxstring identifies the species directory, $filename identifies 
the name of the file containing the protein set - my $count = 0; ## avoid an endless loop if the proteome cannot be retrieved - while (! -e "$taxaPath/$taxstring/$filename.fa"){ - $count++; - printDebug("could not find $taxaPath/$taxstring/$filename.fa\n"); - getProteome($taxstring); - if ($count == 5){ - die "could not find $taxaPath/$taxstring/$filename.fa and could not retrieve this information from the database.\nTerminating...\n\n"; - } - } - if (! -e "$blastPath/$taxstring" or $updateBlast_dir){ - `mkdir $blastPath/$taxstring`; - } - if (! -e "$blastPath/$taxstring/$filename.fa" or $updateBlast_dir){ - `ln -s $taxaPath/$taxstring/$filename.fa $blastPath/$taxstring/$filename.fa`; - } - if (! -e "$blastPath/$taxstring/$filename.pin" or $updateBlast_dir){ - chdir("$blastPath/$taxstring") or die "failed to change to dir\n"; - if ($blast_prog eq 'blastall'){ - `formatdb -i $filename.fa -t $filename -n $filename`; - } - elsif ($blast_prog eq 'blastp'){ - printOut("attempting to run makeblastdb", 2); - `makeblastdb -in $filename.fa -dbtype prot -title $filename -out $filename`; - } - } -} -################# -sub printDebug{ - my @message = @_; - if ($debug){ - print join "\n", @message; - print "\n"; - } -} -sub printVariableDebug{ - my @values = @_; - print "\n\nDEBUG\n"; - foreach (@values){ - print $_."\n"; - } - print "\nEND OF DEBUG\n\n"; -} -################# -sub getInput { - my ($message, $dieopt) = @_; - if ($dieopt){ - $message .= ', or type \'q\' to quit'; - } - print ("\n" . $message . ": "); - my $input = <STDIN>; - chomp $input; - if ($input =~ /^q$/i and $dieopt) { - die "Quitting!\n"; - } - else { - return ($input); - } -} -################# -sub runBlast { - my ($query, $inpath, $outname, $outpath, $blastdb) = @_; - printDebug("running $blast_prog on database $blastdb using input $inpath/$query and writing to $outpath/$outname.blast"); - - if ($blast_prog =~ /blast[px]/) { - !`$blast_prog -db $blastdb -seg $filter -max_target_seqs 10 -evalue $eval_blast_query -outfmt 5 -query $inpath/$query -out $outpath/$outname.blast` or die "Problem running $blast_prog\n"; - } - elsif ($blast_prog =~ /blastall/) { - !`$blast_prog -p $algorithm -d $blastdb -F $filter -e $eval_blast_query -m7 -i $inpath/$query -o $outpath/$outname.blast` or die "Problem running $blast_prog\n" - } - else { - `$blast_prog -ublast $inpath/$query -db $blastdb -accel $accel -evalue $eval_blast_query -blast6out $outpath/$outname.blast` or die "Problem running $blast_prog\n"; - - ## sort the output as ublast does not do it (at least not for ESTs) - `sort -n -r -k 12 $outpath/$outname.blast >$outpath/blastsort.tmp`; - `mv $outpath/blastsort.tmp $outpath/$outname.blast`; - } - printDebug("returning $outname.blast for subroutine runBlast\n"); - return("$outname.blast"); -} -############# -sub getBestBlasthit { - my $hits; - my $frame; - my $count = 0; - my ($inpath, $resultfile) = @_; - printDebug("Sub getBestBlasthit running on $inpath/$resultfile"); - my $searchio = Bio::SearchIO->new( - -file => "$inpath/$resultfile", - -format => $outputfmt) - or die "parse failed"; - while(my $result = $searchio->next_result){ - my $sig; - my $sig_old; - while( my $hit = $result->next_hit) { - my $frameval = $hit->strand('query'); - if ($frameval >0){ - $frame = '+'; - } - elsif ($frameval <0 ) { - $frame = '-'; - } - else { - $frame = 'na'; - } - ## now I enter all top hits having the same score into the result - $sig = $hit->score; - if (!defined $sig_old) { - $sig_old = $sig; - } - if ($sig == $sig_old) { - $hits->[$count]->{name} = $hit->name; - $hits->[$count]->{score} = $sig; - 
$hits->[$count]->{evalue} = $hit->significance; - $count ++; - } - else { - ## there is no lower ranking hit with the same score as the best hit. End the loop. - last; - } - } - } - return($hits); -} -########################## -sub printOut { - my ($message, $mlevel) = @_; - if ($mlevel <= $vlevel){ - print "$message\n"; - } - ###################### - sub checkInt { - my $number = shift; - if ($number =~ /[^0-9]/){ - return(); - } - else{ - return($number); - } - } -} - -########################### -sub initialCheck { - my ($seed, $ogName, $blastDir, $genomeDir, $weightDir, $fasoff) = @_; - # check tools exist - my @tools = ("hmmsearch", "muscle", "mafft", $globalaligner, $localaligner, $glocalaligner); - if ($^O eq "darwin") { - push(@tools, "clustalw2") - } else { - push(@tools, "clustalw") - } - my $flag = 1; - foreach my $tool (@tools) { - my $check = `which $tool`; - if (length($check) < 1) { - print "$tool not found\n"; - $flag = 0; - } - } - if ($flag < 1) { - die "ERROR: Some required tools not found! Please install fdog again!\n"; - } - - # check executable FAS - my $fasCheckMsg = `fas.setup -t ./ -c 2>&1`; - if ($fasoff != 1 && $fasCheckMsg =~ /ERROR/) { - die "ERROR: FAS not ready to use! Please check https://github.com/BIONF/FAS/wiki/setup\n"; - } - - # check seed fasta file - unless (-e $seed) { - $seed = "$dataDir/$seed"; - } - my $seqio = Bio::SeqIO->new(-file => $seed, '-format' => 'Fasta'); - while(my $seq = $seqio->next_seq) { - my $string = $seq->seq; - if ($string =~ /[^a-zA-Z]/) { - die "ERROR: $seed contains special characters!\n"; - } - } - - # check ortholog group name - if (!defined $ogName) { - die "ERROR: Ortholog group name (-seqName) invalid!\n"; - } else { - if ($ogName =~ /[\|\s+\"\'\`\´\!\^]/) { - die "ERROR: Ortholog group name (-seqName) cannot contain PIPE|space or \" \' \` \´ \! \^\n"; - } - } - - # check genome_dir - my @genomeDir = checkValidFolderName($genomeDir); - foreach my $genomeFd (@genomeDir) { - unless ($genomeFd =~ /^\./) { - my $genome = getGenomeFile("$genomeDir/$genomeFd", $genomeFd); - unless (-e "$genome.checked") { - die "ERROR: $genome.checked not found!\nPlease run fdog.checkData before running fdog!\n"; - } - } - } - # check blast_dir - my @blastDir = checkValidFolderName($blastDir); - foreach my $blastFd (@blastDir) { - unless ($blastFd =~ /^\./) { - my $genome = getGenomeFile("$blastDir/$blastFd", $blastFd); - unless (-e "$genome.checked") { - die "ERROR: $genome.checked not found!\nPlease run fdog.checkData before running fdog!"; - } - } - } - # check weight_dir - if ($fasoff != 1) { - my %seen; - my @allTaxa = grep( !$seen{$_}++, @genomeDir, @blastDir); - my @notFolder; - for (my $i = 0;$i < scalar(@allTaxa); $i++){ - if (-f "$blastDir/$allTaxa[$i]" || -f "$genomeDir/$allTaxa[$i]") { - push(@notFolder, $allTaxa[$i]); - splice(@allTaxa, $i, 1); - } - } - if (scalar(@notFolder) > 0) { - print "*** WARNING: Found files in $genomeDir or $blastDir:\t@notFolder\n"; - } - chomp(my $allAnno = `ls $weightDir | $sedprog \'s/\\.json//\'`); - my @allAnno = split(/\n/, $allAnno); - my @missingAnno = array_minus(@allTaxa, @allAnno); - if (scalar @missingAnno > 0) { - my $missingAnno = join("\n", @missingAnno); - die "ERROR: Some taxa do not have annotation! 
Please turn off FAS calculation (with -fasoff), or annotate their genomes before continuing.\n$missingAnno\n"; - } - } -} - -sub getGenomeFile { - my ($folder, $filename) = @_; - chomp(my $faFile = `ls $folder/$filename.fa* | $grepprog -v \"\\.checked\\|\\.mod\\|\\.mapping\\|\\.tmp\"`); - my $out = $faFile; - chomp(my $link = `$readlinkprog -f $faFile`); - if ($link ne "") { - $out = $link; - } - return($out); -} - -sub checkValidFolderName { - my $folder = $_[0]; - # check if folder and its subfolders contain illegal character (e.g. pipe) - opendir(my $dh, $folder) || die "Can't open $folder: $!"; - if ($folder =~ /[\|\s+]/) { - die "ERROR: $folder contains illegal character (e.g. PIPE or space)!\n"; - } - my @folders = readdir($dh); - foreach my $fd (@folders) { - next if ($fd eq "." or $fd eq ".."); - if ($fd =~ /[\|\s+]/) { - die "ERROR: $folder/$fd contains illegal character (e.g. PIPE or space)!\n"; - } - } - closedir $dh; - my @notFd = (".", ".."); - return(array_minus(@folders, @notFd)); -} - -sub gettime { sprintf"%d.%03d",Time::HiRes::gettimeofday } -sub roundtime { sprintf("%.2f", $_[0]); } - -########################### -sub helpMessage { - my $helpmessage = " -YOU ARE RUNNING $version on $hostname - -This program is freely distributed under a GPL. -Copyright (c) GRL limited: portions of the code are from separate copyrights - -\nUSAGE: oneSeq.pl -seqFile=<> -seqId=<> -seqName=<> -refSpec=<> -minDist=<> -maxDist=<> [OPTIONS] - -OPTIONS: - -GENERAL - --h - Invoke this help message --version - Print the program version - -REQUIRED - --seqFile=<> - Specifies the file containing the seed sequence (protein only) in fasta format. - If not provided the program will ask for it. --seqId=<> - Specifies the sequence identifier of the seed sequence in the reference protein set. - If not provided, the program will attempt to determine it automatically. --refSpec=<> - Determines the reference species for the ortholog search. It should be the species the seed sequence was derived from. - If not provided, the program will ask for it. --minDist=<> - Specify the minimum systematic distance of primer taxa for the core set compilation. - If not provided, the program will ask for it. --maxDist=<> - Specify the maximum systematic distance of primer taxa to be considered for core set compilation. - If not provided, the program will ask for it. --coreOrth=<> - Specify the number of orthologs added to the core set. - -USING NON-DEFAULT PATHS - --outpath=<> - Specifies the path for the output directory. Default is $outputPath; --hmmpath=<> - Specifies the path for the core ortholog directory. Default is $coreOrthologsPath; --blastpath=<> - Specifies the path for the blastDB directory. Default is $blastPath; --searchpath=<> - Specifies the path for the search taxa directory. Default is $genome_dir; --weightpath=<> - Specifies the path for the pre-calculated feature annotation directory. Default is $weightPath; - -ADDITIONAL OPTIONS - --append - Set this flag to append the output to existing output files --seqName=<> - Specifies a name for the search. If not set, a random name will be used. --db - Run in database mode. Requires a mySql database. Only for internal use. --filter=[T|F] - Switch on or off the low complexity filter for the blast search. Default: T --silent - Suppress output to the command line --coreTaxa=<> - You can provide a list of primer taxa that should exclusively be used for the compilation - of the core ortholog set --strict - Run the final ortholog search in 'strict mode'. 
An ortholog is only accepted when the reciprocity is fulfilled - for each sequence in the core set. --force - Force the final ortholog search to create the output file. Existing files will be overwritten. --coreStrict - Run the compilation of the core set in strict mode. --checkCoorthologsRef - During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the - core ortholog itself, but a co-ortholog of it. --CorecheckCoorthologsRef - Invokes the 'checkCoorthologsRef' behavior in the course of the core set compilation. --rbh - Requires a reciprocal best hit during the ortholog search to accept a new ortholog. --evalBlast=<> - This option allows you to set the e-value cut-off for the Blast search. Default: 1E-5 --evalHmmer=<> - This option allows you to set the e-value cut-off for the HMM search. Default: 1E-5 --evalRelaxfac=<> - This option allows you to set the factor to relax the e-value cut-off (Blast search and HMM search) for the final ortholog run. Default: 10 --hitLimit=<> - Provide an integer specifying the number of hits of the initial pHMM based search that should be evaluated - via a reverse search. Default: 10 --coreHitLimit=<> - Provide an integer specifying the number of hits of the initial pHMM based search that should be evaluated - via a reverse search. Default: 3 --autoLimit - Setting this flag will invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically - a hit limit for each query. Note, when setting this flag, it will be effective for both the core ortholog compilation - and the final ortholog search. --scoreThreshold - Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less - than x percent of the hmm score of the best hit are further evaluated. Default is x = 10. - You can change this cutoff with the option -scoreCutoff. Note, when setting this flag, it will be effective for - both the core ortholog compilation and the final ortholog search. --scoreCutoff=<> - In combination with -scoreThreshold you can define the percent range of the hmm score of the best hit up to which a - candidate of the hmmsearch will be subjected for further evaluation. Default: 10%. --coreOnly - Set this flag to compile only the core orthologs. These sets can later be used for a stand-alone ortholog search. --reuseCore - Set this flag if the core set for your sequence already exists. No check currently implemented. --ignoreDistance - Set this flag to ignore the distance between taxa and to choose orthologs only based on score --distDeviation=<> - Specify the deviation in score in percent (1=100%, 0=0%) allowed for two taxa to be considered similar --blast - Set this flag to determine sequence id and refspec automatically. Note, the chosen sequence id and reference species - do not necessarily reflect the species the sequence was derived from. --rep - Set this flag to obtain only the sequence being most similar to the corresponding sequence in the core set rather - than all putative co-orthologs. --coreRep - Set this flag to invoke the '-rep' behaviour for the core ortholog compilation. --cpu - Determine the number of threads to be run in parallel --hyperthread - Set this flag to use hyperthreading --batch=<> - Currently has NO functionality. --group=<> - Allows you to limit the search to a certain systematic group --cleanup - Temporary output will be deleted. 
--aligner - Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle --local - Specify the alignment strategy during core ortholog compilation. Default is local. --glocal - Set the alignment strategy during core ortholog compilation to glocal. --searchTaxa - Input file containing list of search taxa. -SPECIFYING FAS SUPPORT OPTIONS - --fasoff - Turn OFF FAS support. Default is ON. --coreFilter=[relaxed|strict] - Specify mode for filtering core orthologs by FAS score. In 'relaxed' mode candidates with insufficient FAS score will be disadvantaged. - In 'strict' mode candidates with insufficient FAS score will be deleted from the candidates list. Default is None. - The option '-minScore=<>' specifies the cut-off of the FAS score. --minScore=<> - Specify the threshold for coreFilter. Default is 0.75. --countercheck - Set this flag to counter-check your final profile. The FAS score will be computed in two ways (seed vs. hit and hit vs. seed). - -SPECIFYING EXTENT OF OUTPUT TO SCREEN - --debug - Set this flag to obtain more detailed information about the program's actions --silent - Suppress output to screen as much as possible -\n\n"; - return($helpmessage); -} diff --git a/fdog/bin/run_genewise_hamstr.pm b/fdog/bin/run_genewise_hamstr.pm deleted file mode 100755 index b631e64..0000000 --- a/fdog/bin/run_genewise_hamstr.pm +++ /dev/null @@ -1,260 +0,0 @@ -package run_genewise_hamstr; -use strict; -#$ENV{'WISECONFIGDIR'} = "/usr/local/src/wise2.2.0/wisecf/"; -# this module runs genewise on a DNA sequence and a protein sequence -# and then allows this result to be parsed. -# the constructor creates an object containing a reference to an array -# containing the file content - -# Modified 11.01.2010 renamed the file names for the genewise run to avoid overwriting of files when multiple runs are performed in parallel on the same sequence file -# LAST Modified: 31.07.2015. Added the option to keep, mask or remove partial codons and introns from -# the transcript. - -1; -sub new { - my $self_tmp = []; - my $self; - my ($class, $dna, $prot, $path, $keepintron) = @_; - if (!defined $path) { - $path = '/tmp'; - } - if (!defined $keepintron) { - $keepintron = 2; - } - my $pid=$$; - # the file names - my $protname = $pid.'_protein'; - my $dnaname = $pid . '_dna'; - ## print the two sequences to default path /tmp/ - open (DNA, ">$path/$dnaname") or die "could not open $path/$dnaname for writing\n"; - print DNA ">$dnaname\n$dna"; - close DNA; - open (PROTEIN, ">$path/$protname") or die "could not open $path/$protname for writing\n"; - print PROTEIN ">$protname\n$prot"; - close PROTEIN; - - ## run genewise on the two sequences - `echo \$WISECONFIGDIR`; - - $self_tmp = [`genewise -trans -cdna -pep -sum $path/$protname $path/$dnaname`]; - for (my $i = 0; $i < @$self_tmp; $i++) { - $self_tmp->[$i] =~ s/\s{1,}$//; - } - $self->{gw} = $self_tmp; - $self->{nt_seq} = $dna; - $self->{prot_seq} = $prot; - $self->{protname} = $protname; - $self->{dnaname} = $dnaname; - $self->{gw_count} = @$self_tmp; - - if ($keepintron =~ /^k/i ) { - $self->{get_indel} = 2; ## by default the indel-part is recovered in lower case letters rather than masked or removed. See code for details - } - elsif ($keepintron =~ /^m/i) { - $self->{get_indel} = 1; ## The indel-part is masked. See code for details; - } - else { - $self->{get_indel} = 0; ## the indel-part is removed making the cDNA consistent with the translation. 
See code for details; - } - print "intron is $self->{get_indel}\n"; - - $self->{indels} = _GetIndels($self_tmp); - bless ($self, $class); - return $self;} -################# -## sub score extract the score for the alignment -sub score { - my $self = shift; - my $score; - for (my $i = 0; $i < $self->{gw_count}; $i ++) { - if ($self->{gw}->[$i] =~ /^(\d{1,}\.{0,1}\d{0,}).*/) { - $score = $1; - last; - } - } - return ($score); -} -################## -sub protein { - my $self = shift; - my $gw = $self->{gw}; - my $prot = ''; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ />.*\.pep/) { #the protein seq starts - my $count = 1; - while ($gw->[$i+$count] ne '//') { - my $protpart = $gw->[$i+$count]; - chomp $protpart; - $prot .= $protpart; - $count ++; - } - } - elsif (length $prot > 0) { - last; - } - } - return($prot); - } -################## -sub translation { - my $self = shift; - my $finish = 0; - my $translated_seq = ''; - my @transtmp; - - ## step 1: extract the relevant info from the genewise output - for (my $i = 0; $i < $self->{gw_count}; $i++) { - if ($self->{gw}->[$i] =~ />.*.tr/) {# a translated bit starts - while ($self->{gw}->[$i] !~ '//') { - push @transtmp, $self->{gw}->[$i]; - $i++; - } - last; # end the for loop since nothing left to be done - } - } - - ## step two: get the sequences - my $count = -1; - my $trans; - for (my $i = 0; $i < @transtmp; $i++) { - if ($transtmp[$i] =~ />/) { - $count++; - $trans->[$count]->{seq} = ''; # initialize - if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { - $trans->[$count]->{start} = $1; - $trans->[$count]->{end} = $2; - } - } - else { - $trans->[$count]->{seq} .= $transtmp[$i]; - } - } - - ## step 3: connect the fragments - if (@$trans == 1) { - $translated_seq = $trans->[0]->{seq}; - } - else { - for (my $i = 0; $i < @$trans; $i++) { - $translated_seq .= $trans->[$i]->{seq}; - if ($i < (@$trans - 1)) { - my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; - if ($self->{get_indel} > 0) { - $translated_seq .= 'X'; - } - } - } - } - return($translated_seq); - } - -################## -sub codons { - my $self = shift; - my $finish = 0; - my $codon_seq = ''; - my @transtmp; - - ## step 1: extract the relevant info from the genewise output - for (my $i = 0; $i < $self->{gw_count}; $i++) { - if ($self->{gw}->[$i] =~ />.*sp$/) {# the codons set starts - while ($self->{gw}->[$i] !~ '//') { - push @transtmp, $self->{gw}->[$i]; - $i++; - } - last; # end the for loop since nothing left to be done - } - } - - ## step two: get the sequences - my $count = -1; - my $trans; - for (my $i = 0; $i < @transtmp; $i++) { - if ($transtmp[$i] =~ />/) { - $count++; - $trans->[$count]->{seq} = ''; # initialize - if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { - $trans->[$count]->{start} = $1; - $trans->[$count]->{end} = $2; - } - } - else { - $transtmp[$i] =~ tr/a-z/A-Z/; - $trans->[$count]->{seq} .= $transtmp[$i]; - } - } - - ## step 3: connect the fragments - if (@$trans == 1) { - $codon_seq = $trans->[0]->{seq}; - } - else { - for (my $i = 0; $i < @$trans; $i++) { - $codon_seq .= $trans->[$i]->{seq}; - if ($i < (@$trans - 1)) { - my $indel = ''; - my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; - ## now decide whether the nts that did not got translated are masked by - ## 'N' or whether they will be represented as lower case letters - if ($self->{get_indel} == 2) { - $indel = substr($self->{nt_seq}, $trans->[$i]->{end}, $missing); - $indel =~ tr/A-Z/a-z/; - } - elsif ($self->{get_indel} == 1) { - $indel = 'N' 
x $missing; - } - else { - $indel = ''; - } - ## now append gap characters until the frame is recovered. Note that the gap - ## characters are added to the end of the indel-part. Thus, the codons are - ## not considered. - while (length($indel)%3 != 0) { - $indel .= '-'; - } - - $codon_seq .= $indel; - } - } - } - return ($codon_seq); - } -########################### -sub protein_borders { - my $self = shift; - my $gw = $self->{gw}; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ /Bits.*introns$/) { - my ($start, $end) = $gw->[$i+1] =~ /.*$self->{protname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; - return($start, $end); - } - else { - die "protein start and end could not be determined. Check genewise command\n"; - } - } -} -########################## -sub cdna_borders { - my $self = shift; - my $gw = $self->{gw}; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ /Bits.*introns$/) { - my ($start, $end) = $gw->[$i+1] =~ /.*$self->{dnaname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; - return($start, $end); - } - else { - die "cdna start and end could not be determined. Check genewise command\n"; - } - } -} -########################## -sub _GetIndels { - my $gw = shift; - my $indel; - for (my $i = 0; $i < @$gw; $i++) { - if ($gw->[$i] =~ /Bits/) { - $indel = $gw->[$i+1] =~ /.*([0-9]{1,})/; - return($indel); - } - } -} diff --git a/fdog/bin/translate.pl b/fdog/bin/translate.pl deleted file mode 100755 index 68ee444..0000000 --- a/fdog/bin/translate.pl +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/perl -use strict; -use File::Basename; -use lib dirname(__FILE__); -use Getopt::Long; -use Bio::Perl; -use File::Copy; - -# PROGRAMNAME: translate.pl - -# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at - -# PROGRAM DESCRIPTION: - -# DATE: Tue May 12 14:03:34 CEST 2009 - - -# DATE LAST MODIFIED: 03.11.2010: Bug fix suggested by Todd Oakley. -# BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT -# CAUSING AN ERROR IN THE TRANSLATE_6FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO - -## Last modified: 10.01.2014 -## added option -outpath -######################## start main ############################# -my $help; -my @out; -my @estout; -my $infile; -my $trunc = 1; -my $outfile = "translate_tc.out"; -my $outpath = '.'; -my $limit = 20; ## this sets the maximum length for the sequence identifier. If sequence identifiers are -## too long, one can run into trouble with the parsing of the hmmsearch results. -######### -my $usage = "Name:\n\ttranslate.pl\n -Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n -Description:\n\tThis program takes a batch fasta-file with DNA -\tsequences as an input and translates the individual DNA sequences in -\tall six reading frames. -\t-infile: provide the relative or absolute path of the infile\n -\t-outfile: provide the relative or absolute path of the outfile -\tDefault is: translate_tc.out\n -\t-outpath: provide the path to the -\toutfile. Default is '.'\n -\ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below). -\t-h: prints this help-message\n -NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n -NOTE: the script has an automated routine to check for unique sequence names in the input file. 
This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included."; -########## - -GetOptions ( - "h" => \$help, - "infile=s" => \$infile, - "outfile=s" => \$outfile, - "outpath=s" => \$outpath, - "trunc=s" => \$trunc); -if ($help) { - print "$usage"; - exit; -} -if (-e "$outfile") { - print LOG "an outfile $outfile already exists. Renaming to $outfile.old\n\n"; - my $newname = "$outfile.old"; - rename($outfile, $newname); -} -#my @seq_object = read_all_sequences($infile, 'fasta'); - -open (LOG, ">>$outpath/hamstrsearch.log") or warn "could not open logfile for writing\n"; -print LOG "\n### TRANSLATE.PL: \n"; - -### changes suggested by Todd Oakley -my $tempseqio; -$tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta'); -my @seq_object; - -while( my $seq = $tempseqio->next_seq() ) { - $seq->alphabet('dna'); - push(@seq_object,$seq); -} -### End changes Todd Oakley - -## determine whether the seq-ids are unique given the chosen value for $trunc -my ($message, $cont, $check) = &checkIds(); -if ($cont == 1) { - ## the check for unique identifiers has failed and the program is exiting - print LOG "$message\n"; - close LOG; - exit; -} -else { - print LOG "All sequence identifiers are unique!\n"; - if ($check == 2) { - my $newname = "$infile.original"; - rename($infile, $newname); - print LOG "Sequence description was needed to make seq-id unique. The original version of the infile was stored in $infile.original\n"; - } - for (my $j = 0; $j < @seq_object; $j++) { - my $finalid = $seq_object[$j]->{finalid}; - my $estseq = $seq_object[$j]->seq; - my $inid = $seq_object[$j]->display_id; - my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]); - for (my $i = 0; $i < @all_trans; $i++) { - my $count = $i+1; - my $id = $all_trans[$i]->display_id; - my $seq = $all_trans[$i]->seq; - $id =~ s/$inid/$finalid/; - $id =~ s/-[0-9][RF]/_RF$count.0/; - push @out, ">$id\n$seq"; - } - push @estout, ">$finalid\n$estseq"; - if ($j%100 == 0) { - print "$j sequences processed\n"; - open (OUT, ">>$outpath/$outfile") or die "failed to open outfile\n"; - print OUT join "\n", @out; - print OUT "\n"; - @out = qw(); - close OUT; - if ($check == 2) { - ## part of the description was added to the seq-id - open (OUT, ">>$infile"); - print OUT join "\n", @estout; - print OUT "\n"; - @estout = qw(); - } - } - } - open (OUT, ">>$outpath/$outfile") or die "failed to open outfile\n"; - print OUT join "\n", @out; - print OUT "\n"; - @out = qw(); - close OUT; - if ($check == 2) { - ## part of the description was added to the seq-id - open (OUT, ">>$infile"); - print OUT join "\n", @estout; - print OUT "\n"; - close OUT; - @estout = qw(); - } -} -close LOG; -exit; -########################## start sub ################ -sub checkIds { - my $message; - my $check = 1; - my $cont = 1; - my $counter; - ## Everything up to the first whitespace - ## in the fasta header will be taken as sequence id by bioperl. If this - ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer - ## be unique. 
This will be checked and if necessary the id will not be truncated - ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1) - ## for $check == 1, the complete id will be checked - ## for $check == 2, the first 20 characters of the concatenated id and description - ## will be checked - if ($trunc == 1) { - $check = 0; - } - - while ($check < 3 and $cont == 1) { - $cont = 0; - for (my $i=0; $i < @seq_object; $i++) { - my $id = $seq_object[$i]->display_id; - $id =~ s/(.{0,$limit}).*/$1/; - if ($check == 0) { - $id =~ s/\|.*//; - } - elsif ($check == 2) { - $id = $id . '_' . $seq_object[$i]->desc; - $id =~ s/(.{0,$limit}).*/$1/; - } - if (defined $counter->{$id}) { - if ($check == 0) { - $message = "trying next without truncating the id"; - } - elsif ($check == 1) { - $message = 'trying next to include sequence description'; - } - else { - $message = "Sequence identifiers are not unique, using the first 20 characters. Aborting..."; - } - print LOG "sequence ids are not unique in the file $infile, $message. The offending identifier is $id\n\n"; - $check ++; - $cont = 1; - $counter = undef; - last; - } - else { - $counter->{$id} = 1; - $seq_object[$i]->{finalid} = $id; - } - } - } - ## return the value of $cont. If this is 1, then the sequence id check has failed. - return($message, $cont, $check); -} diff --git a/fdog/checkData.py b/fdog/checkData.py index 3aafe44..e513665 100644 --- a/fdog/checkData.py +++ b/fdog/checkData.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to check fdog data which are present in -# genome_dir, blast_dir and weight_dir +# searchTaxa_dir, coreTaxa_dir and annotation_dir # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -23,33 +23,21 @@ from os.path import isfile, join from pathlib import Path import subprocess +import shutil from Bio import SeqIO +from ete3 import NCBITaxa import re from datetime import datetime -import csv +import multiprocessing as mp +from tqdm import tqdm +from pkg_resources import get_distribution +from Bio.Blast.Applications import NcbiblastpCommandline -def checkFileExist(file): - if not os.path.exists(os.path.abspath(file)): - sys.exit('%s not found' % file) -def countLine(file,pattern,contain): - nline = 0 - with open(file, 'r') as f: - for line in f: - if contain: - if pattern in line: - nline = nline + 1 - else: - if not pattern in line: - nline = nline + 1 - return(nline) +import fdog.libs.zzz as general_fn +import fdog.libs.blast as blast_fn +import fdog.libs.fasta as fasta_fn -def join2Lists(first_list, second_list): - in_first = set(first_list) - in_second = set(second_list) - in_second_but_not_in_first = in_second - in_first - out = first_list + list(in_second_but_not_in_first) - return(out) def checkOptConflict(concat, replace, delete): if concat: @@ -62,51 +50,65 @@ def checkOptConflict(concat, replace, delete): if delete: sys.exit('*** ERROR: only one option can be chosen between "--replace" and "--delete"') -def checkValidFasta(file): + +def check_valid_fasta(file): + """ Check if valid fasta file + Check if: + (1) Input file is a fasta file + (2) If headers are longer than 30 characters + (3) If headers and sequences contain any space/tab + (4) If sequences are written in a single line + """ spaceChr = (' ', '\t') with open(file, 'r') as f: f_bkp = f - # 
check if input file a FASTA file + # check if input file is a FASTA file fasta = SeqIO.parse(f, 'fasta') if not any(fasta): - return('notFasta') + return({'notFasta': 1}) else: # check for long header inSeq = SeqIO.to_dict((SeqIO.parse(open(file), 'fasta'))) for id in inSeq: if len(id) > 30: - return('longHeader') + return({'longHeader': id}) # check space or tab if any(s in f.read() for s in spaceChr): return('space') # check single line - nHeader = countLine(file, '>', True) - nSeq = countLine(file, '>', False) + nHeader = general_fn.count_line(file, '>', True) + nSeq = general_fn.count_line(file, '>', False) if not nHeader == nSeq: - return('multiLine') - return('ok') + return({'multiLine': 1}) + return({'ok': 1}) -def checkValidFolderName(folder): + +def check_valid_folder_name(folder): + """ Check if folder name contains any special characters """ invalidChr = (' ','|','\t','\'','"','`','´','^','!','$','%','&') if any(e in folder for e in invalidChr): sys.exit('*** ERROR: Invalid character found in %s' % folder) -def checkValidSeqs(faFile): + +def check_valid_seqs(fa_file): + """ Check if any sequence contains space/tab or special characters """ spaceChr = (' ', '\t') - faSeq = SeqIO.parse(open(faFile),'fasta') + faSeq = SeqIO.parse(open(fa_file),'fasta') for fa in faSeq: id, seq = fa.description, str(fa.seq) c = '' if any(e in id for e in spaceChr): - sys.exit('*** ERROR: Invalid character found in \">%s\" in %s' % (id, faFile)) + sys.exit('*** ERROR: Invalid character found in \">%s\" in %s' % (id, fa_file)) if any(c for c in seq if not c.isalpha()): - print('*** ERROR: Invalid character "%s" found in the sequence of gene \"%s\" in %s' % (c, id, faFile)) + print('*** ERROR: Invalid character "%s" found in the sequence of gene \"%s\" in %s' % (c, id, fa_file)) sys.exit('You can use "--replace" or "--delete" to solve this issue!') -def rewriteSeqs(faFile, replace, delete): + +def rewrite_seqs(fa_file, replace, delete): + """ Rewrite fasta sequence by replacing or deleting special characters """ spaceChr = (' ', '\t') - faSeq = SeqIO.parse(open(faFile),'fasta') - with open(faFile + '.mod', 'w') as tmpOut: + faSeq = SeqIO.parse(open(fa_file),'fasta') + with open(fa_file + '.mod', 'w') as tmpOut: for fa in faSeq: id, seq = fa.description, str(fa.seq) if replace: @@ -114,174 +116,338 @@ def rewriteSeqs(faFile, replace, delete): if delete: seq = re.sub('[^a-zA-Z]', '', seq) tmpOut.write('>%s\n%s\n' % (id, seq)) - os.replace(faFile + '.mod', faFile) + os.replace(fa_file + '.mod', fa_file) + -def writeCheckedFile(faFile): - with open(faFile+'.checked', 'w') as f: +def write_faChecked(fa_file): + """ Add fa.checked file in searchTaxa_dir """ + with open(fa_file+'.checked', 'w') as f: f.write(str(datetime.now())) -def checkDataFolder(checkDir, replace, delete, concat): - taxaList = [] - for fd in listdir(checkDir): - if not fd.startswith('.'): - taxon = fd - checkValidFolderName(checkDir+'/'+taxon) - getFaCmd = 'ls %s/%s/%s.fa*' % (checkDir, taxon, taxon) - try: - faFiles = subprocess.check_output([getFaCmd], shell=True).decode(sys.stdout.encoding).strip().split('\n') - for faFile in faFiles: - if os.path.islink(faFile): - faFile = os.path.realpath(faFile) - checkFileExist(faFile) - if not '.mapping' in faFile: - if not '.checked' in faFile: - if not os.path.exists(faFile+".checked"): - checkFaFile = checkValidFasta(faFile) - if checkFaFile == 'notFasta': - sys.exit('*** ERROR: %s does not look like a fasta file!' 
% faFile) - elif checkFaFile == 'longHeader': - sys.exit('*** ERROR: %s contains long headers!' % faFile) - elif checkFaFile == 'space': - sys.exit('*** ERROR: %s contains spaces/tabs!' % faFile) - elif checkFaFile == 'multiLine': - if not concat: - print('*** ERROR: %s contains multiple-line sequences!' % faFile) - sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') - else: - rewriteSeqs(faFile, replace, delete) - elif checkFaFile == 'ok': - if not (delete or replace): - checkValidSeqs(faFile) - else: - rewriteSeqs(faFile, replace, delete) - writeCheckedFile(faFile) - print(fd) - taxaList.append(fd) - except subprocess.CalledProcessError as e: - print('*** ERROR: Problem while searching for fasta file') - print(e.output.decode(sys.stdout.encoding)) - sys.exit() - return(taxaList) - -def checkMissingJson(weightDir, taxaList): - allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] - taxaAnno = [s + '.json' for s in taxaList] - s = set(allAnno) - missingAnno = [x for x in taxaAnno if x not in s] - return(missingAnno) - -def checkCompleteAnno(weightDir, genomeDir): - allAnno = [f for f in listdir(weightDir) if isfile(join(weightDir, f))] - for f in allAnno: + +def check_fasta(args): + """ Check fasta file in searchTaxa_dir and coreTaxa_dir """ + # taxon_list = [] + # for fd in general_fn.read_dir(checkDir): + # taxon = fd + # check_valid_folder_name('%s/%s' % (checkDir, taxon)) + # for file in listdir('%s/%s' % (checkDir, taxon)): + # if file.endswith('.fa'): + # fa_file = '%s/%s/%s' % (checkDir, taxon, file) + # if os.path.islink(fa_file): + # fa_file = os.path.realpath(fa_file) + # general_fn.check_file_exist(fa_file) + # checkfa_file = check_valid_fasta(fa_file) + # taxon_list.append(taxon) + # if not os.path.exists('%s.checked' % fa_file): + # print(taxon) + # if list(checkfa_file.keys())[0] == 'notFasta': + # sys.exit('*** ERROR: %s does not look like a fasta file!' % fa_file) + # elif list(checkfa_file.keys())[0] == 'longHeader': + # sys.exit('*** ERROR: %s contains long headers! E.g. %s' % (fa_file, list(checkfa_file.values())[0])) + # elif list(checkfa_file.keys())[0] == 'space': + # sys.exit('*** ERROR: %s contains spaces/tabs!' % fa_file) + # elif list(checkfa_file.keys())[0] == 'multiLine': + # if not concat: + # print('*** ERROR: %s contains multiple-line sequences!' % fa_file) + # sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') + # else: + # rewrite_seqs(fa_file, replace, delete) + # elif list(checkfa_file.keys())[0] == 'ok': + # if not (delete or replace): + # check_valid_seqs(fa_file) + # else: + # rewrite_seqs(fa_file, replace, delete) + # write_faChecked(fa_file) + # if not os.path.exists('%s.fai' % fa_file): + # fasta_fn.read_fasta(fa_file) + # return(taxon_list) + (taxon, file, checkDir, replace, delete, concat) = args + fa_file = '%s/%s/%s' % (checkDir, taxon, file) + if os.path.islink(fa_file): + fa_file = os.path.realpath(fa_file) + general_fn.check_file_exist(fa_file) + checkfa_file = check_valid_fasta(fa_file) + # taxon_list.append(taxon) + if not os.path.exists('%s.checked' % fa_file): + print(taxon) + if list(checkfa_file.keys())[0] == 'notFasta': + sys.exit('*** ERROR: %s does not look like a fasta file!' % fa_file) + elif list(checkfa_file.keys())[0] == 'longHeader': + sys.exit('*** ERROR: %s contains long headers! E.g. 
%s' % (fa_file, list(checkfa_file.values())[0])) + elif list(checkfa_file.keys())[0] == 'space': + sys.exit('*** ERROR: %s contains spaces/tabs!' % fa_file) + elif list(checkfa_file.keys())[0] == 'multiLine': + if not concat: + print('*** ERROR: %s contains multiple-line sequences!' % fa_file) + sys.exit('Please use "--concat" with "--replace" or "--delete" to join them into single lines') + else: + rewrite_seqs(fa_file, replace, delete) + elif list(checkfa_file.keys())[0] == 'ok': + if not (delete or replace): + check_valid_seqs(fa_file) + else: + rewrite_seqs(fa_file, replace, delete) + write_faChecked(fa_file) + if not os.path.exists('%s.fai' % fa_file): + fasta_fn.read_fasta(fa_file) + return(taxon) + + +def run_check_fasta(checkDir, replace, delete, concat): + """ Run check_fasta fn """ + jobs = [] + for taxon in general_fn.read_dir(checkDir): + check_valid_folder_name('%s/%s' % (checkDir, taxon)) + for file in listdir('%s/%s' % (checkDir, taxon)): + if file.endswith('.fa'): + jobs.append([taxon, file, checkDir, replace, delete, concat]) + cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + taxon_list = [] + for _ in tqdm(pool.imap_unordered(check_fasta, jobs), total=len(jobs)): + taxon_list.append(_) + return(taxon_list) + + +def check_blastdb(args): + """ Check for outdated blastdb """ + (query, taxon, coreTaxa_dir) = args + blast_db = '%s/%s/%s' % (coreTaxa_dir, taxon, taxon) + try: + blastp_cline = NcbiblastpCommandline(query = query, db = blast_db) + stdout, stderr = blastp_cline() + except: + return([query, blast_db]) + if not os.path.exists('%s/%s/%s.fa.fai' % (coreTaxa_dir, taxon, taxon)): + fai_in_genome = "../../searchTaxa_dir/%s/%s.fa.fai" % (taxon, taxon) + fai_in_blast = "%s/%s/%s.fa.fai" % (coreTaxa_dir, taxon, taxon) + os.symlink(fai_in_genome, fai_in_blast) + + +def run_check_blastdb(coreTaxa_dir, fdogPath): + """ Run check_blastdb fn """ + query = '%s/data/infile.fa' % fdogPath + jobs = [] + for fd in general_fn.read_dir(coreTaxa_dir): + jobs.append([query, fd, coreTaxa_dir]) + cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + out = [] + for _ in tqdm(pool.imap_unordered(check_blastdb, jobs), total=len(jobs)): + out.append(_) + return([1]) + + +def create_blastdb(args): + """ Redo (or update) blastdb """ + (taxon, coreTaxa_dir, searchTaxa_dir, outPath) = args + fa_file = '%s/%s/%s.fa' % (coreTaxa_dir, taxon, taxon) + if os.path.islink(fa_file): + fa_file = os.path.realpath(fa_file) + if not os.path.exists(fa_file): + fa_file = '%s/%s/%s.fa' % (searchTaxa_dir, taxon, taxon) + if os.path.exists(fa_file): + ### remove old files + blast_path = '%s/%s' % (coreTaxa_dir, taxon) + shutil.rmtree(blast_path) + ### Redo blastdb + Path(blast_path).mkdir(parents = True, exist_ok = True) + blast_fn.make_blastdb([taxon, fa_file, outPath, True]) + ### make symlink to fasta files + fa_in_genome = "../../searchTaxa_dir/%s/%s.fa" % (taxon, taxon) + fai_in_genome = "../../searchTaxa_dir/%s/%s.fa.fai" % (taxon, taxon) + fa_in_blast = "%s/%s.fa" % (blast_path, taxon) + fai_in_blast = "%s/%s.fa.fai" % (blast_path, taxon) + if not os.path.exists(fa_in_blast): + os.symlink(fa_in_genome, fa_in_blast) + if not os.path.exists(fai_in_blast): + os.symlink(fai_in_genome, fai_in_blast) + return None + else: + return(taxon) + + +def run_create_blastdb(coreTaxa_dir, searchTaxa_dir): + """ Run create_blastdb fn """ + outPath = '/'.join(coreTaxa_dir.split('/')[0:-1]) + jobs = [] + for fd in general_fn.read_dir(coreTaxa_dir): + jobs.append([fd, coreTaxa_dir, searchTaxa_dir, outPath]) + cpus = 
+ cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + out = [] + for _ in tqdm(pool.imap_unordered(create_blastdb, jobs), total=len(jobs)): + out.append(_) + return([i for i in out if i is not None]) + + +def check_missing_json(annotation_dir, taxon_list): + """ Check missing annotation for any taxa in coreTaxa_dir and searchTaxa_dir """ + all_anno = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))] + taxa_anno = [s + '.json' for s in taxon_list] + s = set(all_anno) + missing_anno = [x for x in taxa_anno if x not in s] + return(missing_anno) + + +def check_complete_anno(args): + """ Check if an annotation is complete + I.e. if it contains annotation for all proteins of a species + """ + (gf,jf, annotation_dir) = args + cmd = 'fas.checkAnno -s %s -a %s -o %s' % (gf, jf, annotation_dir) + try: + subprocess.call([cmd], shell = True, stdout=subprocess.DEVNULL) + except subprocess.CalledProcessError as e: + print('*** ERROR: Problem while checking annotation file using fas.checkAnno!') + print(e.output.decode(sys.stdout.encoding)) + sys.exit() + + +def run_check_complete_anno(annotation_dir, searchTaxa_dir): + """ Run check_complete_anno fn """ + all_anno = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))] + jobs = [] + for f in all_anno: tax = f.replace('.json', '') - print('...check annotations for %s' % tax) - jf = '%s/%s.json' % (weightDir, tax) - gf = '%s/%s/%s.fa' % (genomeDir, tax, tax) - cmd = 'fas.checkAnno -s %s -a %s -o %s' % (gf, jf, weightDir) + # print('...check annotations for %s' % tax) + jf = '%s/%s.json' % (annotation_dir, tax) + gf = '%s/%s/%s.fa' % (searchTaxa_dir, tax, tax) + jobs.append([gf,jf, annotation_dir]) + cpus = mp.cpu_count()-1 + pool = mp.Pool(cpus) + out = [] + for _ in tqdm(pool.imap_unordered(check_complete_anno, jobs), total=len(jobs)): + out.append(_) + return None + + +def check_missing_ncbiID(taxon_list): + """ Check all taxa in searchTaxa_dir and coreTaxa_dir + if they have valid NCBI taxonomy IDs + """ + ncbi = NCBITaxa() + missing_taxa = {} + present_taxa = {} + dup_taxa = [] + for t in taxon_list: + tax_id = t.split('@')[1] try: - subprocess.call([cmd], shell = True) - except subprocess.CalledProcessError as e: - print('*** ERROR: Problem while checking annotation file using fas.checkAnno!') - print(e.output.decode(sys.stdout.encoding)) - sys.exit() - -def checkMissingNcbiID(namesDmp, taxaList): - ncbiId = {} - with open(namesDmp, 'r') as f: - lines = f.readlines() - for x in lines: - taxId = x.split('\t')[0] - if not taxId in ncbiId: - ncbiId[taxId] = 1 - f.close() - missingTaxa = {} - presentTaxa = {} - dupTaxa = [] - for t in taxaList: - taxId = t.split('@')[1] - if not taxId in ncbiId: - if not t+'\t'+str(taxId) in missingTaxa: - missingTaxa[t+'\t'+str(taxId)] = 1 - if not taxId in presentTaxa: - presentTaxa[taxId] = t + taxid2name = ncbi.get_taxid_translator([tax_id]) + if len(taxid2name) < 1: + if not t+'\t'+str(tax_id) in missing_taxa: + missing_taxa[t+'\t'+str(tax_id)] = 1 + except: + if not t+'\t'+str(tax_id) in missing_taxa: + missing_taxa[t+'\t'+str(tax_id)] = 1 + if not tax_id in present_taxa: + present_taxa[tax_id] = t else: - dupTaxa.append('%s\t%s' % (t, presentTaxa[taxId])) - return(missingTaxa.keys(), dupTaxa) + dup_taxa.append('%s\t%s' % (t, present_taxa[tax_id])) + return(missing_taxa.keys(), dup_taxa) + + +def run_check_missing_ncbiID(): + """ Run check_missing_ncbiID fn """ + pass + def main(): - version = '0.0.6' - parser = argparse.ArgumentParser(description='You are running fdog.checkData 
version ' + str(version) + '.') - parser.add_argument('-g', '--genomeDir', help='Path to search taxa directory (e.g. fdog_dataPath/genome_dir)', action='store', default='') - parser.add_argument('-b', '--blastDir', help='Path to blastDB directory (e.g. fdog_dataPath/blast_dir)', action='store', default='') - parser.add_argument('-w', '--weightDir', help='Path to feature annotation directory (e.g. fdog_dataPath/weight_dir)', action='store', default='') + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.') + parser.add_argument('-s', '--searchTaxa_dir', help='Path to search taxa directory (e.g. fdog_dataPath/searchTaxa_dir)', action='store', default='') + parser.add_argument('-c', '--coreTaxa_dir', help='Path to core taxa (blastDB) directory (e.g. fdog_dataPath/coreTaxa_dir)', action='store', default='') + parser.add_argument('-a', '--annotation_dir', help='Path to feature annotation directory (e.g. fdog_dataPath/annotation_dir)', action='store', default='') parser.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False) parser.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False) parser.add_argument('--concat', help='Concatenate multiple-line sequences into single-line', action='store_true', default=False) + parser.add_argument('--reblast', help='Re-create blast databases', action='store_true', default=False) ### get arguments args = parser.parse_args() - genomeDir = args.genomeDir - blastDir = args.blastDir - weightDir = args.weightDir + searchTaxa_dir = args.searchTaxa_dir + coreTaxa_dir = args.coreTaxa_dir + annotation_dir = args.annotation_dir replace = args.replace delete = args.delete concat = args.concat + reblast = args.reblast checkOptConflict(concat, replace, delete) caution = 0 - ### get fdog dir and assign genomeDir, blastDir, weightDir if not given + ### get fdog dir and assign searchTaxa_dir, coreTaxa_dir, annotation_dir if not given fdogPath = os.path.realpath(__file__).replace('/checkData.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - with open(pathconfigFile) as f: - dataPath = f.readline().strip() - if not genomeDir: - genomeDir = dataPath + "/genome_dir" - if not blastDir: - blastDir = dataPath + "/blast_dir" - if not weightDir: - weightDir = dataPath + "/weight_dir" - - ### check genomeDir and blastDir - print('=> Checking %s...' % genomeDir) - genomeTaxa = checkDataFolder(os.path.abspath(genomeDir), replace, delete, concat) - print('=> Checking %s...' % blastDir) - blastTaxa = checkDataFolder(os.path.abspath(blastDir), replace, delete, concat) - - ### check weightDir - print('=> Checking %s...' % weightDir) - missingAnno = checkMissingJson(weightDir, join2Lists(genomeTaxa, blastTaxa)) - if len(missingAnno) > 0: + if not searchTaxa_dir or not coreTaxa_dir or not annotation_dir: + pathconfigFile = fdogPath + '/bin/pathconfig.txt' + if not os.path.exists(pathconfigFile): + sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + with open(pathconfigFile) as f: + dataPath = f.readline().strip() + if not searchTaxa_dir: + searchTaxa_dir = dataPath + "/searchTaxa_dir" + if not coreTaxa_dir: + coreTaxa_dir = dataPath + "/coreTaxa_dir" + if not annotation_dir: + annotation_dir = dataPath + "/annotation_dir" + + searchTaxa_dir = os.path.abspath(searchTaxa_dir) + coreTaxa_dir = os.path.abspath(coreTaxa_dir) + annotation_dir = os.path.abspath(annotation_dir) + + ### check searchTaxa_dir + print('=> Checking %s...' % searchTaxa_dir) + genomeTaxa = run_check_fasta(searchTaxa_dir, replace, delete, concat) + + ### check coreTaxa_dir + if reblast: + print('=> (Re-)Creating blastDBs...') + failed_blast = run_create_blastdb(coreTaxa_dir, searchTaxa_dir) + if len(failed_blast) > 0: + print('*** WARNING: Some BlastDBs could not be created:\n%s' % ', '.join(failed_blast)) + else: + print('All old BlastDBs have been updated!') + print('=> Checking %s...' % coreTaxa_dir) + blastTaxa = run_check_fasta(coreTaxa_dir, replace, delete, concat) + check_blast = run_check_blastdb(coreTaxa_dir, fdogPath) + + if not check_blast[0] == 1: + print('*** ERROR: Incompatible versions between BlastDB and the BLAST program!') + print('For checking, run: blastp -query %s -db %s' % (check_blast[0], check_blast[1])) + print('Consider using --reblast option to update old BlastDBs!') + sys.exit() + + ### check annotation_dir + print('=> Checking %s...' % annotation_dir) + missing_anno = check_missing_json(annotation_dir, general_fn.join_2lists(genomeTaxa, blastTaxa)) + if len(missing_anno) > 0: print('\033[92m*** WARNING: Annotation files not found for:\033[0m') - print(*missingAnno, sep = "\n") + print(*missing_anno, sep = "\n") print('NOTE: You still can run fdog without FAS using the option "-fasoff"') caution = 1 - checkCompleteAnno(weightDir, genomeDir) + run_check_complete_anno(annotation_dir, searchTaxa_dir) ### check ncbi IDs print('=> Checking NCBI taxonomy IDs...') - namesDmp = fdogPath + '/taxonomy/names.dmp' - checkFileExist(namesDmp) - missingTaxa, dupTaxa = checkMissingNcbiID(namesDmp, join2Lists(genomeTaxa, blastTaxa)) - if (len(missingTaxa) > 0): - print('\033[92m*** WARNING: Taxa not found in current fdog\'s NCBI taxonomy database:\033[0m') - print(*missingTaxa, sep = "\n") - print('NOTE: You still can run fdog, but they will not be included in the core set compilation!') + missing_taxa, dup_taxa = check_missing_ncbiID(general_fn.join_2lists(genomeTaxa, blastTaxa)) + if (len(missing_taxa) > 0): + print('\033[92m*** WARNING: Taxa not found in current local NCBI taxonomy database:\033[0m') + print(*missing_taxa, sep = "\n") + print('==> NOTE: You can still run fDOG with those taxa, but they will not be included in the core set compilation!') caution = 1 - if (len(dupTaxa) > 0): + if (len(dup_taxa) > 0): print('\033[92m*** WARNING: These taxa have the same NCBI taxonomy IDs:\033[0m') - print(*dupTaxa, sep = "\n") - print('NOTE: This could lead to some conflicts!') + print(*dup_taxa, sep = "\n") + print('==> NOTE: This could lead to some conflicts!') caution = 1 print('---------------------------------') if caution == 1: - print('Done! Data are ready to use with caution!') + print('==> Done! Data are ready to use WITH CAUTION!') else: - print('Done! Data are ready to use!') + print('==> Done! 
Data are ready to use!') if __name__ == '__main__': main() diff --git a/fdog/data/conda_requirements.yml b/fdog/data/conda_requirements.yml new file mode 100644 index 0000000..1b9a34c --- /dev/null +++ b/fdog/data/conda_requirements.yml @@ -0,0 +1,6 @@ +blast +hmmer +fasta3 +clustalw +mafft +muscle diff --git a/fdog/data/dependencies.txt b/fdog/data/dependencies.txt new file mode 100644 index 0000000..8a7853b --- /dev/null +++ b/fdog/data/dependencies.txt @@ -0,0 +1,5 @@ +ncbi-blast+ +hmmer +clustalw +mafft +muscle diff --git a/fdog/bin/__init__.py b/fdog/libs/__init__.py similarity index 100% rename from fdog/bin/__init__.py rename to fdog/libs/__init__.py diff --git a/fdog/libs/addtaxon.py b/fdog/libs/addtaxon.py new file mode 100644 index 0000000..8e38620 --- /dev/null +++ b/fdog/libs/addtaxon.py @@ -0,0 +1,164 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from pathlib import Path +from Bio import SeqIO +import subprocess +from ete3 import NCBITaxa +import re +from datetime import datetime +from collections import OrderedDict + +import fdog.libs.blast as blast_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.tree as tree_fn + +##### FUNCTIONS RELATED TO ADDING NEW TAXON TO FDOG DATABASE ##### + +def check_conflict_opts(replace, delete): + """ Check if both replace and delete option are specified """ + if replace and delete: + sys.exit('*** ERROR: only one option can be chosen between "--replace" and "--delete"') + + +def create_folders(outPath, spec_name, coreTaxa, noAnno): + """ Create searchTaxa_dir, coreTaxa_dir and annotation_dir in output folder """ + Path(outPath + '/searchTaxa_dir').mkdir(parents = True, exist_ok = True) + genome_path = outPath + '/searchTaxa_dir/' + spec_name + Path(genome_path).mkdir(parents = True, exist_ok = True) + if coreTaxa: + Path(outPath + '/coreTaxa_dir').mkdir(parents = True, exist_ok = True) + if not noAnno: + Path(outPath + '/annotation_dir').mkdir(parents = True, exist_ok = True) + return(genome_path) + + +def generate_spec_name(tax_id, name, ver): + """ Generate species name in the form [name]@[tax_id]@[ver] """ + if name == "": + ncbi_name = tree_fn.check_tax_id(tax_id) + name = tree_fn.abbr_ncbi_name(ncbi_name) + return(name+'@'+tax_id+'@'+ver) + + +def create_genome(args): + """ Create fa and fai in searchTaxa_dir """ + (faIn, genome_path, spec_name, force, replace, delete) = args + ### load fasta seq + in_seq = SeqIO.to_dict((SeqIO.parse(open(faIn), 'fasta'))) + if not os.path.exists(genome_path): + Path(genome_path).mkdir(parents = True, exist_ok = True) + genome_file = '%s/%s.fa' % (genome_path, spec_name) + if (not os.path.exists(os.path.abspath(genome_file))) or (os.stat(genome_file).st_size == 0) or force: + f = open(genome_file, 'w') + pipe = 0 + long_id = 0 + mod_id_index = 0 + id_dict = {} # id_dict[ori_id] = mod_id
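+ # Illustration of the header rewriting below, using an assumed input header + # 'tr|Q9W1K5|Q9W1K5_DROME': the ID is split on '|' and '_', duplicates are + # dropped and the parts are rejoined with '_', giving 'tr_Q9W1K5_DROME'; IDs + # still longer than 20 characters are replaced by '<taxid>_<index>' (e.g. + # '9606_1' for HUMAN@9606@3), and every change is recorded in the *.mapping file.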
+ for id in in_seq: + ori_id = id + seq = str(in_seq[id].seq) + ### check if ID contains empty char or pipe + if ' ' in id: + sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id) + else: + if '|' in id: + tmp = re.split('[_|]', id) + tmp = list(OrderedDict.fromkeys(tmp)) + pipe = 1 + id = '_'.join(tmp) + if not ori_id in id_dict: + id_dict[ori_id] = id + ### check if id is longer than 20 characters + if len(id) > 20: + long_id = 1 + mod_id_index = mod_id_index + 1 + id = '%s_%s' % (spec_name.split('@')[1], mod_id_index) + if not ori_id in id_dict: + id_dict[ori_id] = id + ### check if seq contains special characters + if seq[-1] == '*': + seq = seq[:-1] + specialChr = 'no' + if any(c for c in seq if not c.isalpha()): + specialChr = 'yes' + if specialChr == 'yes': + if replace or delete: + if replace: + seq = re.sub('[^a-zA-Z]', 'X', seq) + if delete: + seq = re.sub('[^a-zA-Z]', '', seq) + else: + sys.exit('\033[91mERROR: %s sequence contains special character!\033[0m\nYou can use --replace or --delete to solve it.' % (id)) + f.write('>%s\n%s\n' % (id, seq)) + f.close() + ### create index file + fasta_fn.read_fasta(genome_file) + ### write .checked file + cf = open(genome_file+'.checked', 'w') + cf.write(str(datetime.now())) + cf.close() + ### write ID mapping file and give warning if ID changed + if len(id_dict) > 0: + mapping_file = '%s.mapping' % genome_file + with open(mapping_file, 'w') as mp: + for o,n in id_dict.items(): + mp.write('%s\t%s\n' % (o,n)) + if pipe == 1: + print('\033[94mWARNING: Sequence IDs contain pipe(s). They will be replaced by "_"!\033[0m') + if long_id == 1: + print('\033[94mWARNING: Some headers longer than 20 characters have been automatically shortened. Please check the %s.mapping file for details!\033[0m' % genome_file) + if pipe == 1: + print('\033[94mWARNING: Please check the %s file for details!\033[0m' % mapping_file) + else: + print(genome_path + '/' + spec_name + '.fa already exists!') + return(genome_file) + + +def create_blastdb(args): + """ Create blastdb for a given fasta genome_file """ + (outPath, spec_name, genome_file, force, silent) = args + blast_path = '%s/coreTaxa_dir/%s' % (outPath, spec_name) + if (not os.path.exists(os.path.abspath('%s/%s.phr' % (blast_path, spec_name)))) or force: + blast_fn.make_blastdb([spec_name, genome_file, outPath, silent]) + ### make symlink to fasta files + fa_in_genome = "../../searchTaxa_dir/%s/%s.fa" % (spec_name, spec_name) + fai_in_genome = "../../searchTaxa_dir/%s/%s.fa.fai" % (spec_name, spec_name) + fa_in_blast = "%s/%s.fa" % (blast_path, spec_name) + fai_in_blast = "%s/%s.fa.fai" % (blast_path, spec_name) + if not os.path.exists(fa_in_blast): + os.symlink(fa_in_genome, fa_in_blast) + if not os.path.exists(fai_in_blast): + os.symlink(fai_in_genome, fai_in_blast) + else: + print('Blast DB already exists!') + + +def create_annoFile(outPath, genome_file, cpus, force): + """ Create annotation json for a given genome_file """ + annoCmd = 'fas.doAnno -i %s -o %s --cpus %s' % (genome_file, outPath+'/annotation_dir', cpus) + if force: + annoCmd = annoCmd + " --force" + try: + subprocess.call([annoCmd], shell = True) + except: + print('\033[91mERROR: Problem with running fas.doAnno. 
You can check it with this command:\n%s\033[0m' % annoCmd) diff --git a/fdog/libs/alignment.py b/fdog/libs/alignment.py new file mode 100644 index 0000000..507eaa5 --- /dev/null +++ b/fdog/libs/alignment.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import subprocess +import math +import re +from Bio import SeqIO +from Bio.Align.Applications import MuscleCommandline +from Bio.Align.Applications import MafftCommandline +from io import StringIO + +import fdog.libs.fasta as fasta_fn +import fdog.libs.output as output_fn + +##### FUNCTIONS RELATED TO SEQ ALIGNMENT ##### + +def check_fasta36_executable(fdogPath): + try: + fasta36_cmd = '%s/bin/aligner/bin/ggsearch36' % fdogPath + subprocess.check_output(fasta36_cmd, shell = True, stderr = subprocess.STDOUT) + return('%s/bin/aligner/' % fdogPath) + except subprocess.CalledProcessError as e: + sys.exit('\033[91mERROR: FASTA36 at %s/bin/aligner/bin/ not executable!\033[0m' % fdogPath) + + +def do_align(aligner, fa_file): + """ Do alignment using MUSCLE or MAFFT for a multiple fasta file + Return a dictionary (SeqIO object) containing seq IDs and aligned sequences + Note: if any input seq is longer than 12,000 aa/nt, only MAFFT can be used + """ + if fasta_fn.check_long_seq(fa_file) == 1: + aligner = 'mafft-linsi' + if aligner == 'muscle': + align_cline = MuscleCommandline(input = fa_file) + else: + align_cline = MafftCommandline( + input = fa_file, localpair = True, maxiterate = 1000) + try: + stdout, stderr = align_cline() + aln_io = StringIO(stdout) + aln_seq = SeqIO.to_dict((SeqIO.parse(aln_io,'fasta'))) + return(aln_seq) + except: + sys.exit( + 'ERROR: Error doing alignment with %s for %s\n%s' % (aligner, fa_file, align_cline)) + + +def calc_Kimura_dist(aln_dict, id_1, id_2, debug): + """ Calculate Kimura distance for a pair of sequences + Input is a dictionary of MSA (see do_align function). + The Kimura distance is calculated based on perl module + https://metacpan.org/pod/Bio::Align::ProteinStatistics#D-distance-methods + """ + matches = 0 + total = 0 + if id_1 in aln_dict and id_2 in aln_dict: + for a, b in zip(aln_dict[id_1].seq, aln_dict[id_2].seq): + if a != '-' and b != '-': + if a == b: + matches +=1 + total += 1 + D = 1 - (matches/total) + output_fn.print_debug( + debug, 'Kimura distance', + 'kimura = round(- (math.log( 1 - %s - (0.2 * (%s ** 2)))), 5)' % (D, D)) + try: + kimura = round(- (math.log( 1 - D - (0.2 * (D ** 2)))), 5) + except: + kimura = 999 + return(kimura) + else: + sys.exit('%s or %s not found in %s!' % (id_1, id_2, aln_dict)) + + 
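+# Worked example for calc_Kimura_dist, with assumed values: two aligned +# sequences differing in 20% of their shared (gap-free) columns give D = 0.2, +# so kimura = -ln(1 - 0.2 - 0.2 * 0.2**2) = -ln(0.792) ~ 0.23319; if the log +# argument becomes non-positive (very divergent pairs), the sentinel value 999 +# is returned instead.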
+def calc_aln_score(fa1, fa2, aln_strategy = 'local', debugCore = False): + """ Calculate alignment score for genes in fa2 vs other genes in fa1 + Return dictionary {gene_id:aln_score} + """ + fdog_path = os.path.realpath(__file__).replace('/libs/alignment.py','') + fasta36_options = '%s %s -s BP62 -m 9 -d 0 -z -1 -E 100' % (fa1, fa2) + if aln_strategy == 'global': + fasta36_cmd = '%s/bin/aligner/bin/ggsearch36 %s' \ + % (fdog_path, fasta36_options) + elif aln_strategy == 'glocal': + fasta36_cmd = '%s/bin/aligner/bin/glsearch36 %s' \ + % (fdog_path, fasta36_options) + else: + fasta36_cmd = '%s/bin/aligner/bin/ssearch36 %s' \ + % (fdog_path, fasta36_options) + output_fn.print_debug( + debugCore, 'ALN SCORE', + 'Calculate aln score using FASTA36: %s' % fasta36_cmd) + try: + fasta36_out = subprocess.run( + [fasta36_cmd], shell = True, capture_output = True, check = True) + except: + sys.exit('Error running FASTA36\n%s' % fasta36_cmd) + # returns score for genes in fa2 + aln_score = {} + cand_dict = SeqIO.to_dict((SeqIO.parse(open(fa2), 'fasta'))) + for id in list(cand_dict.keys()): + aln_score[id[0:60]] = 0 + results = fasta36_out.stdout.decode().split('\n') + for l in results: + if len(l) > 1: + gene_id = l.split()[0] + if gene_id in aln_score: + if re.search('\(\s+\d+\)', l): + l = re.sub(r'\(\s+','(', l) + aln_score[gene_id] = aln_score[gene_id] + int(l.split()[2]) + return(aln_score) diff --git a/fdog/libs/blast.py b/fdog/libs/blast.py new file mode 100644 index 0000000..ff41608 --- /dev/null +++ b/fdog/libs/blast.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import os +import sys +from Bio.Blast.Applications import NcbiblastpCommandline +import xml.etree.ElementTree as ET +import subprocess + + +##### FUNCTIONS RELATED TO BLAST ##### + +def do_blastsearch( + query, blast_db, evalBlast = 0.00001, lowComplexityFilter = False): + """ Perform blastp search for a query fasta file + Return an XML string containing the blast result + """ + filter = 'no' + if lowComplexityFilter == True: + filter = 'yes' + try: + blastp_cline = NcbiblastpCommandline( + query = query, db = blast_db, evalue = evalBlast, seg = filter, + max_target_seqs = 10, outfmt = 5) + stdout, stderr = blastp_cline() + return(stdout) + except: + sys.exit( + 'ERROR: Error running blastp search for %s against %s' + % (query, blast_db)) + + +def parse_blast_xml(blast_output): + """ Parse Blast XML output from a string variable + Return a dictionary containing query ID, query length, together with a list + of hits and their bit score, evalue, align len + """ + blast_dict = {} + root = ET.fromstring(blast_output) + blast_dict['query'] = root[8][0][2].text + blast_dict['query_len'] = root[8][0][3].text + blast_dict['hits'] = {} + for type_tag in root.findall( + 'BlastOutput_iterations/Iteration/Iteration_hits/Hit'): + hit_id = 'NA' + for i in type_tag.findall('*'): + if i.tag == 'Hit_def': + if not i.text in blast_dict['hits']: + hit_id = i.text + blast_dict['hits'][hit_id] = {} + if i.tag == 'Hit_hsps': + if hit_id in blast_dict['hits']: + blast_dict['hits'][hit_id]['bit_score'] = i[0][1].text + blast_dict['hits'][hit_id]['evalue'] = i[0][3].text + blast_dict['hits'][hit_id]['align_len'] = i[0][13].text + return(blast_dict) + + +def make_blastdb(args): + """ Create a blast database for a given fasta file """ + (specName, specFile, outPath, silent) = args + blastCmd = 'makeblastdb -dbtype prot -in %s -out %s/coreTaxa_dir/%s/%s' % (specFile, outPath, specName, specName) + if silent == True: + blastCmd = blastCmd + ' > /dev/null 2>&1' + try: + subprocess.call([blastCmd], shell = True) + except: + sys.exit('Problem with running %s' % blastCmd) + fileInGenome = "../../searchTaxa_dir/%s/%s.fa" % (specName, specName) + fileInBlast = "%s/coreTaxa_dir/%s/%s.fa" % (outPath, specName, specName) + if not os.path.exists(fileInBlast): + os.symlink(fileInGenome, fileInBlast)
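+ +# Shape of the dictionary returned by parse_blast_xml above, with assumed +# values: {'query': 'seed_1', 'query_len': '350', 'hits': {'hitA': +# {'bit_score': '700.2', 'evalue': '1e-100', 'align_len': '350'}}}; all values +# stay strings, exactly as they appear in the BLAST XML output.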
diff --git a/fdog/libs/corecompile.py b/fdog/libs/corecompile.py new file mode 100644 index 0000000..130ba6c --- /dev/null +++ b/fdog/libs/corecompile.py @@ -0,0 +1,408 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import shutil +from pathlib import Path +from ete3 import NCBITaxa +from Bio import SeqIO +import time + +import fdog.libs.zzz as general_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.hmm as hmm_fn +import fdog.libs.alignment as align_fn +import fdog.libs.tree as tree_fn +import fdog.libs.fas as fas_fn +import fdog.libs.output as output_fn +import fdog.libs.orthosearch as ortho_fn + + +##### FUNCTIONS RELATED TO CORE COMPILATION ##### + +def get_core_taxa_ids(coreTaxa, corepath): + """ Get taxonomy IDs for core taxa + Either from coreTaxa_dir, or from user input list (--coreTaxa) + Return dictionary {taxID:@@Ver} + """ + tax_ids = {} + if not coreTaxa == '': + ignored_taxa = [] + if os.path.exists(os.path.abspath(coreTaxa)): + core_taxa = general_fn.read_file(coreTaxa) + else: + core_taxa = coreTaxa.split(',') + + for core_taxon in core_taxa: + if not os.path.exists( + os.path.abspath( + '%s/%s/%s.phr' % (corepath,core_taxon,core_taxon))): + ignored_taxa.append(core_taxon) + else: + id = core_taxon.split('@')[1] + if not id in tax_ids: + tax_ids[id] = core_taxon + if len(ignored_taxa) > 0: + print( + 'WARNING: %s taxa cannot be found at %s\n%s' + % (len(ignored_taxa), corepath, ignored_taxa)) + else: + tax_ids = general_fn.get_ids_from_folder(corepath, 'coreTaxa_dir') + return(tax_ids) + + +def initiate_core_files( + seqFile, seqName, refspec, seed_id, hmmpath, annopath, aligner, fasOff): + hmm_dir = '%s/%s/hmm_dir' % (hmmpath, seqName) + Path(hmm_dir).mkdir(parents = True, exist_ok = True) + aln_file = '%s/%s/hmm_dir/%s.aln' % (hmmpath, seqName, seqName) + aln_seed = align_fn.do_align(aligner, seqFile) + fasta_fn.write_fasta(aln_seed, aln_file) + hmm_file = '%s/%s/hmm_dir/%s.hmm' % (hmmpath, seqName, seqName) + hmm_seed = hmm_fn.create_hmm(aln_file, hmm_file) + + fa_file = '%s/%s/%s.fa' % (hmmpath, seqName, seqName) + seed_id_mod = '%s|%s|%s' % (seqName, refspec, seed_id) + input_seed = SeqIO.parse(seqFile,'fasta') + with open(fa_file, 'w') as initial_core_fa: + for fa in input_seed: + initial_core_fa.write('>%s\n%s\n' % (seed_id_mod, str(fa.seq))) + + seed_json = '' + if not fasOff == True: + seed_json = fas_fn.get_anno_fas( + seqName, refspec, seed_id, str(fa.seq), hmmpath, annopath) + return(aln_file, fa_file, hmm_file, seed_json) + + +def store_cand_results(args): + """ Save intermediate results for a candidate ortholog + Including: + 1) Candidate joined score of fas & normalised aln score + in dictionary {taxID:score} + 2) Candidate fasta sequence in dictionary {taxID:fasta_seq} + 3) Update current (best) candidate, normalised aln score and joined score + """ + (cand_taxid, cand_score, cand_seq, + curr_cand, curr_aln_score, aln_score_normalized, + fas_score, fas_dict, ortho_id, ortho_seq) = args + cand_score[cand_taxid] = float(fas_score) + float(aln_score_normalized) + cand_seq[cand_taxid] = {ortho_id:ortho_seq} + curr_aln_score = aln_score_normalized + curr_candi_score = float(fas_score) + float(aln_score_normalized) + curr_cand = cand_taxid + return( + cand_score, cand_seq, curr_cand, curr_aln_score, + curr_candi_score, fas_dict) + + 
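+# The joined candidate score used by validate_candidate below is fas_score + +# aln_score_normalized; with assumed example values fas_score = 0.8 and +# aln_score_normalized = 0.9 it equals 1.7. Both components are at most 1, so +# 2 is the best possible joined score, which is why reaching 2 stops a node or +# round early in compile_core.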
+def validate_candidate(args): + """ Validate candidate based on its normalised aln score and fas score """ + (aln_score_normalized, cand_args, calc_fas_args, variable_args, debugCore, distDeviation) = args + (cand_score, cand_seq, curr_cand, curr_aln_score, + curr_candi_score, fas_dict) = variable_args + (cand_taxid, ortho_id, ortho_seq, next_node, first_cand) = cand_args + (fasOff, seqName, seed_json, spec, seq_id, seq, hmmpath, annopath) = calc_fas_args + + if first_cand == True: + threshold = 0 + else: + if next_node == True: + threshold = curr_candi_score * (1 + distDeviation) + type = ' (STRICT) ' + else: + threshold = curr_candi_score + type = '' + + if aln_score_normalized > threshold - 1: + if not '%s_%s' % (spec, seq_id) in fas_dict: + fas_score = fas_fn.calc_fas_cand(calc_fas_args) + fas_dict['%s_%s' % (spec, seq_id)] = fas_score + output_fn.print_debug( + debugCore, '', '-FAS: %s' % fas_score) + else: + fas_score = fas_dict['%s_%s' % (spec, seq_id)] + if float(fas_score) + float(aln_score_normalized) > threshold: + variable_args = store_cand_results( + [cand_taxid, + cand_score, cand_seq, + curr_cand, curr_aln_score, + aln_score_normalized, fas_score, fas_dict, + ortho_id, ortho_seq]) + else: + output_fn.print_debug( + debugCore, '', + '-Joined score not higher than the previous%s! Skip...' % type) + else: + output_fn.print_debug( + debugCore, '', + '-Aln score %s not higher%s! Skip...' % (aln_score_normalized, type)) + return(variable_args) + + +def compile_core(args): + """ Core compilation """ + (seqFile, seqName, refspec, seed_id, coreArgs, pathArgs, orthoArgs, otherArgs, debug) = args + (minDist, maxDist, coreSize, coreTaxa, distDeviation, alnStrategy, fasOff) = coreArgs + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + (cpus, debugCore, silentOff, noCleanup, force, append) = otherArgs + aligner = orthoArgs[-1] + otherArgs.insert(0, 'NA') + + ncbi = NCBITaxa() + ### get taxonomy lineage of refspec + refspec_id = refspec.split('@')[1] + refspec_lineage = ncbi.get_lineage(refspec_id) + + ### get rank ID and its index in the refspec lineage + (min_rank, max_rank) = tree_fn.get_rank_range(refspec_lineage, minDist, maxDist, ncbi) + output_fn.print_debug(debugCore, 'Min & Max-rank', '%s\t%s' % (min_rank, max_rank)) + + ### create taxonomy tree from list of core tax + tax_ids = get_core_taxa_ids(coreTaxa, corepath) + tree = ncbi.get_topology(tax_ids.keys(), intermediate_nodes = True) + if debugCore: + print(tree) + + ### INITIATE FA, ALN, HMM [and anno FAS] FILE FOR SEED + (aln_file, fa_file, hmm_file, seed_json) = initiate_core_files( + seqFile, seqName, refspec, seed_id, hmmpath, annopath, aligner, fasOff) + + ### get list of taxa within min and max rank of refspec + node_dict = tree_fn.get_leaves_dict( + refspec_lineage, tree, + list(min_rank.values())[0], list(max_rank.values())[0]) + output_fn.print_debug(debugCore, 'Node dictionary', node_dict) + + ### traverse the core taxa tree + added_taxa = {} + ignored_taxa = [] + fas_dict = {} + previous_added_taxon = refspec_id + for round in range(coreSize - 1): + output_fn.print_stdout(silentOff, '---------- ROUND %s ----------' % round) + output_fn.print_debug( + debugCore, 'CORE COMPILATION', + '---------- ROUND %s ----------' % round) + aln_scores = align_fn.calc_aln_score(fa_file, fa_file, alnStrategy, debugCore) + max_aln_score = max(aln_scores.values()) + if max_aln_score == 0: + sys.exit('ERROR: Something went wrong with FASTA36. 
Please run debugCore to investigate!') + flag_round = 0 # use to stop current round if an ortholog was added + output_fn.print_debug(debugCore, '', 'ADDED TAXA: %s' % added_taxa.keys()) + cand_seq = {} + cand_score = {} + curr_cand = '' + curr_aln_score = 0 + curr_candi_score = 0 + next_node = False + for node_id, leaves in node_dict.items(): + if flag_round == 1: + break + output_fn.print_debug( + debugCore, '', + 'NODE %s - %s' % (node_id, ncbi.get_rank([node_id]))) + output_fn.print_debug( + debugCore, '', + '-MAX ALN SCORE: %s' % max_aln_score) + output_fn.print_debug( + debugCore, '', + '-PREVIOUS ADDED: %s' % previous_added_taxon) + leaves.reverse() + flag_node = 0 + for leaf in leaves: + if flag_node == 1: + break + if not leaf == refspec_id and \ + not leaf in added_taxa and \ + not leaf in ignored_taxa: + output_fn.print_debug( + debugCore, '', + 'Leaf %s - %s' % (leaf, tax_ids[leaf])) + if len(curr_cand) > 0 and \ + not curr_cand in node_dict[node_id]: + next_node = True + output_fn.print_debug( + debugCore, '', + '-Current_candidate from different node: %s' \ + % curr_cand) + if curr_candi_score * (1 + distDeviation) > 2: + output_fn.print_debug( + debugCore, '', + '# Current score cannot be defeated! Stop this node!') + break + else: + next_node = False + output_fn.print_debug( + debugCore, '', + '-Current_candidate: %s' % curr_cand) + output_fn.print_debug( + debugCore, '', + '-Current_aln_score: %s' % curr_aln_score) + output_fn.print_debug( + debugCore, '', + '-Current_candi_score: %s' % curr_candi_score) + ### compare taxonomy rank with previous added taxon + ### process only if this leaf is within min and max rank + ### of the previous added taxon + ancestor = tree_fn.get_ancestor(previous_added_taxon, leaf, ncbi) + check_ancestor = tree_fn.check_common_ancestor( + previous_added_taxon, list(ancestor.keys())[0], + minDist, maxDist, ncbi) + if check_ancestor == 1: + output_fn.print_debug( + debugCore, '', + '-Ancestor %s with %s accepted' \ + % (ancestor, previous_added_taxon)) + ### run ortholog search + otherArgs[2] = debug + otherArgs[0] = tax_ids[leaf] + hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, + orthoArgs, otherArgs]) + if len(hamstr_out) > 1: + ### calculate alignment score + ortho = list(hamstr_out.items())[-1] + tmp_fa = '%s/%s/%s_%s.fa' \ + % (hmmpath, seqName, seqName, leaf) + with open(tmp_fa, 'w') as tmp_fa_out: + tmp_fa_out.write('>%s\n%s\n' \ + % (ortho[0][0:len(ortho[0])-2], ortho[1])) + aln_score = align_fn.calc_aln_score(fa_file, tmp_fa, alnStrategy, debugCore) + output_fn.print_debug( + debugCore, '', + '-Max: %s - Aln: %s' % (max_aln_score, aln_score)) + aln_score_normalized = \ + list(aln_score.values())[0] / max_aln_score + output_fn.print_debug( + debugCore, '', + '-Normalized_aln_score: %s' % aln_score_normalized) + os.remove(tmp_fa) + + ### validate candidate + if len(cand_score) == 0 \ + and len(curr_cand) == 0: + first_cand = True + else: + first_cand = False + calc_fas_args = (fasOff, seqName, seed_json, + tax_ids[leaf], ortho[0].split('|')[-2], + ortho[1], hmmpath, annopath) + cand_args = (leaf, ortho[0][0:len(ortho[0])-2], + ortho[1], next_node, first_cand) + variable_args = (cand_score, cand_seq, + curr_cand, curr_aln_score, + curr_candi_score, fas_dict) + (cand_score, cand_seq, curr_cand, + curr_aln_score, curr_candi_score, + fas_dict) = validate_candidate([ + aln_score_normalized, cand_args, + calc_fas_args, variable_args, debugCore, + distDeviation]) + if curr_candi_score == 2: + flag_node = 1 + output_fn.print_debug( + 
debugCore, '', + '-Max score achieved! Stop this node!') + else: + ignored_taxa.append(leaf) + output_fn.print_debug( + debugCore, '', + '-No ortholog found!') + else: + ignored_taxa.append(leaf) + output_fn.print_debug( + debugCore, '', + '-Not considered due to ancestor %s with %s\n' \ + % (ancestor, previous_added_taxon)) + else: + output_fn.print_debug( + debugCore, '', + '%s - %s skipped' % (leaf, tax_ids[leaf])) + output_fn.print_debug( + debugCore, '', 'Node candidates: %s' % cand_score) + if len(cand_score) > 0 \ + and cand_score[curr_cand] == 2: + output_fn.print_debug( + debugCore, '', + '# MAX SCORE ACHIEVED! Stop this round!') + flag_round = 1 + if next_node == True \ + and cand_score[curr_cand] * (1 + distDeviation) > 2: + output_fn.print_debug( + debugCore, '', + '# CURRENT SCORE CANNOT BE DEFEATED! Stop this round!') + flag_round = 1 + + if len(cand_seq) > 0: + output_fn.print_debug( + debugCore, '', + '# ADD THIS TAXON TO CORE GROUP\t%s - %s\n' \ + % (curr_cand, tax_ids[curr_cand])) + previous_added_taxon = curr_cand + added_taxa[curr_cand] = {tax_ids[curr_cand]:cand_score[curr_cand]} + ### update seqName.fa and hmm_dir/seqName.hmm + fasta_fn.append_to_fasta_file(fa_file, cand_seq[curr_cand]) + aln_seed = align_fn.do_align(aligner, fa_file) + fasta_fn.write_fasta(aln_seed, aln_file) + hmm_seed = hmm_fn.create_hmm(aln_file, hmm_file) + os.remove(aln_file) + ### remove temp json files + for file in os.listdir('%s/%s' % (hmmpath, seqName)): + if file.endswith('.json'): + os.remove('%s/%s/%s' % (hmmpath, seqName, file)) + output_fn.print_debug( + debugCore, 'CORE COMPILATION', + 'All added taxa %s' % added_taxa) + if len(added_taxa) < coreSize - 1: + output_fn.print_stdout( + silentOff, + 'WARNING: Only %s/%s orthologs in the core group' \ + % (len(added_taxa), coreSize)) + + +def run_compile_core(args): + (seqFile, seqName, refspec, seed_id, reuseCore, forceCore, coreArgs, + pathArgs, orthoCoreArgs, otherCoreArgs, debug) = args + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + (cpus, debugCore, silentOff, noCleanup, force, append) = otherCoreArgs[-6:] + + begin = time.time() + fdogPath = os.path.realpath(__file__).replace('/libs/corecompile.py','') + align_fn.check_fasta36_executable(fdogPath) + + coreHmmfile = '%s/%s/hmm_dir/%s.hmm' % (hmmpath, seqName, seqName) + coreHmmfile = os.path.abspath(coreHmmfile) + compile_core_check = 1 + ncbi = '' + if reuseCore == True: + general_fn.check_file_exist(coreHmmfile) + compile_core_check = 0 + else: + if os.path.exists(coreHmmfile): + if forceCore == True: + print('WARNING: Existing %s core group will be deleted!' % seqName) + shutil.rmtree('%s/%s' % (hmmpath, seqName)) + else: + sys.exit( + 'WARNING: Core group %s exists in %s! 
' % (seqName, hmmpath) + + 'You can still run with the --forceCore or --reuseCore option') + if compile_core_check == 1: + compile_core([seqFile, seqName, refspec, seed_id, coreArgs, pathArgs, + orthoCoreArgs, otherCoreArgs[-6:], debug]) + end = time.time() + return([seqName, '{:5.3f}s'.format(end - begin)]) diff --git a/fdog/libs/fas.py b/fdog/libs/fas.py new file mode 100644 index 0000000..46f753e --- /dev/null +++ b/fdog/libs/fas.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import subprocess +import shutil +import greedyFAS.annoFAS.annoModules as annoFas + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS RELATED TO FAS ##### + +def check_fas_executable(): + try: + subprocess.check_output(['fas.setup -t ./ --check'], shell = True, stderr = subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print('\033[96m%s\033[0m' % e.output.decode(sys.stdout.encoding).strip()) + print('FAS installed but fas.setup still needs to be run if you want to use it!') + return(0) + return(1) + + +def get_tool_fas_path(): + """ Get path to FAS annotation tools """ + cmd = 'fas.setup -t ~/ -c' + try: + out = subprocess.run( + [cmd], shell = True, capture_output = True, check = True) + tool_path = out.stdout.decode().split('\n')[0].split()[6].replace('.','') + return(tool_path) + except: + sys.exit('ERROR: fas.setup cannot be called!') + + +def get_anno_fas(seqName, spec, seq_id, seq, hmmpath, annopath): + """ Get annotation for a seq_id from existing json file in annopath """ + out_json = '%s/%s/%s_%s.json' % (hmmpath, seqName, spec, seq_id) + if not os.path.exists(out_json): + tmp_seed_fa = '%s/%s/%s_%s.fa' % (hmmpath, seqName, seqName, spec) + with open(tmp_seed_fa, 'w') as tmp_seed_fa_out: + tmp_seed_fa_out.write('>%s\n%s\n' % (seq_id, seq)) + spec_anno = '%s/%s.json' % (annopath, spec) + try: + anno_dict = annoFas.extractAnno(tmp_seed_fa, spec_anno) + anno_dict['clan'] = annoFas.getClans( + get_tool_fas_path(), anno_dict['feature']) + annoFas.save2json( + anno_dict, '%s_%s' % (spec, seq_id), '%s/%s' % (hmmpath, seqName)) + os.remove(tmp_seed_fa) + except: + sys.exit( + 'ERROR: Annotation for %s cannot be found in %s' + % (seq_id, spec_anno)) + return(out_json) + + +def calc_pairwise_fas(seed_json, query_json, seqName, hmmpath): + """ Calculate FAS score for a pair of seed and query proteins + Input are two anno json files for seed and query + Return a value between 0 and 1 + """ + general_fn.check_file_exist(seed_json) + general_fn.check_file_exist(query_json) + + fas_cmd = 'fas.run -s %s -q %s' % (seed_json, query_json) + fas_cmd = '%s -a %s/%s --raw --tsv --domain --cpus 1 -o %s/%s' \ + % (fas_cmd, hmmpath, seqName, hmmpath, seqName) + try: + fas_out = subprocess.run( + [fas_cmd], shell = True, capture_output = True, check = True) + except: + sys.exit('ERROR: Error running FAS\n%s' % fas_cmd) + results = fas_out.stdout.decode().split('\n') + for l in results: + if l.startswith('#') and len(l.split('\t')) > 1: + return(l.split('\t')[-1]) 
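+ # fas.run prints a '#'-prefixed, tab-separated summary line whose last + # column is taken as the FAS score and returned above; if no such line is + # found, an empty string is returned and must be handled by the caller.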
+ return('') + + +def calc_fas_cand(args): + """ Calculate FAS score for an ortholog candidate against seed + Ortholog candidate defined by spec, seq_id and seq + """ + (fasOff, seqName, seed_json, spec, seq_id, seq, hmmpath, annopath) = args + if not fasOff == True: + query_json = get_anno_fas(seqName, spec, seq_id, seq, hmmpath, annopath) + fas_score = calc_pairwise_fas(seed_json, query_json, seqName, hmmpath) + else: + fas_score = 1 + return(fas_score) + + +def calc_fas_multi(input_fa, outpath, annopath, cpus): + """ Calculate pairwise FAS scores for all orthologs vs seed protein + input_fa is the default .extended.fa output file of fDOG + Output will be _forward. + """ + fasCmd = 'fas.runFdogFas -i %s -w %s --cores %s --redo_anno' % (input_fa, annopath, cpus) + try: + subprocess.call([fasCmd], shell = True) + if os.path.exists(outpath + '/tmp'): + shutil.rmtree(outpath + '/tmp') + except: + sys.exit('Problem running\n%s' % (fasCmd)) diff --git a/fdog/libs/fasta.py b/fdog/libs/fasta.py new file mode 100644 index 0000000..bdd33f7 --- /dev/null +++ b/fdog/libs/fasta.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +from pysam import FastaFile +from Bio import SeqIO + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS RELATED TO FASTA SEQ ##### + +def add_seq_to_dict(dict, id, seq): + """ Add fasta sequence to a dictionary """ + if not id in dict: + dict[id] = seq + return(dict) + + +def read_fasta(fa_file): + """ Read LARGE fasta file and return fasta object + Sequence can be retrieved using fasta_object.fetch(seq_id) + """ + fasta_object = FastaFile(fa_file) + return(fasta_object) + + +def write_fasta(fa_dict, out_file): + """ Write sequences in SeqIO dict into output file """ + with open(out_file, 'w') as out: + for seq in fa_dict: + out.write('>%s\n%s\n' % (seq, fa_dict[seq].seq)) + + +def append_to_fasta_file(fa_file, new_fa_dict): + """ Append a dict of fasta seq to an existing fasta file """ + general_fn.check_file_exist(fa_file) + existing_seq = SeqIO.to_dict(SeqIO.parse(open(fa_file),'fasta')) + with open(fa_file, 'a') as fa_out: + for id, seq in new_fa_dict.items(): + if not id in existing_seq: + fa_out.write('>%s\n%s\n' % (id, seq)) + + +def check_long_seq(fa_file): + """ Check if any sequence is longer than 12,000 aa/nt """ + fa_seq = SeqIO.parse(open(fa_file),'fasta') + for fa in fa_seq: + if len(fa.seq) > 12000: + return(1) + return(0) + + +def remove_dup(fa_file): + """ Remove duplicated sequences (filter by headers) """ + tmp = {} + fa_seq = SeqIO.parse(open(fa_file),'fasta') + for fa in fa_seq: + if not fa.id in tmp: + tmp[fa.id] = fa.seq + with open(fa_file, 'w') as out: + for id, seq in tmp.items(): + out.write('>%s\n%s\n' % (id, seq)) diff --git a/fdog/libs/hmm.py b/fdog/libs/hmm.py new file mode 100644 index 0000000..8ab02f3 --- /dev/null +++ b/fdog/libs/hmm.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool 
https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +import subprocess +import pyhmmer + + +##### FUNCTIONS RELATED TO HMM ##### + +def create_hmm(aln_file, out_file): + """ Create hmm file for an alignment file """ + hmmbuild_cmd = 'hmmbuild %s %s' % (out_file, aln_file) + try: + subprocess.run( + [hmmbuild_cmd], shell = True, + stdout = open(os.devnull, 'wb'), check = True) + except: + sys.exit('ERROR: Error running hmmbuild %s' % hmmbuild_cmd) + + +def do_hmmsearch( + hmm_file, search_fa, evalHmmer = 0.00001, scoreCutoff = 10, + hitLimit = 10, cpus = os.cpu_count()): + """ Perform hmmsearch for an hmm file vs a multiple fasta file + Return a dictionary of hits and their e-value and bit-score + Only "top" hits are returned. The cutoff is defined by + max_score / 100 * (100 - scoreCutoff) + By default, only hits that have at least 90% of the best bit score + are considered + """ + hmm_hits = {} + with pyhmmer.easel.SequenceFile(search_fa, digital = True, alphabet = pyhmmer.easel.Alphabet.amino()) as seq_file: + sequences = list(seq_file) + with pyhmmer.plan7.HMMFile(hmm_file) as hmm_file: + try: + for hits in pyhmmer.hmmsearch( + hmm_file, sequences, E = evalHmmer, cpus = cpus): + if len(hits) > 0: + n = 0 + for hit in hits: + if hit.score >= hits[0].score/100*(100-scoreCutoff): + if n < hitLimit: + hmm_hits[hit.name.decode('ASCII')] = ( + hit.evalue,hit.score) + n += 1 + except: + sys.exit( + 'ERROR: Error running hmmsearch for %s against %s' + % (hmm_file, search_fa)) + return(hmm_hits)
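+ +# Worked example for the cutoff above, with assumed numbers: if the best hit +# scores 250 and scoreCutoff = 10, only hits scoring at least +# 250 / 100 * (100 - 10) = 225 are kept, and hitLimit caps how many of them +# are returned.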
 diff --git a/fdog/libs/orthosearch.py b/fdog/libs/orthosearch.py new file mode 100644 index 0000000..564c2c5 --- /dev/null +++ b/fdog/libs/orthosearch.py @@ -0,0 +1,269 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import os +from Bio import SeqIO +import multiprocessing as mp +from tqdm import tqdm +import time + +import fdog.libs.zzz as general_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.blast as blast_fn +import fdog.libs.hmm as hmm_fn +import fdog.libs.alignment as align_fn +import fdog.libs.output as output_fn + + +##### FUNCTION FOR HMM-BASED ORTHOLOG SEARCH (HaMStR) ##### +def hamstr(args): + """ Ortholog search algorithm for an hmm core group against a search taxon + Implemented based on HaMStR https://doi.org/10.1186/1471-2148-9-157 + """ + (seqName, hmmpath, corepath, searchpath, outpath, + refspec, seed_id, search_taxon, + evalHmmer, hitLimit, scoreCutoff, + evalBlast, lowComplexityFilter, + checkCoorthologsRefOff, rbh, rep, + aligner, cpus, debug, silentOff, noCleanup) = args + ### (0) Dict for storing candidate and final orthologs (key=id, value=seq) + ortho_candi = {} + ortho_final = {} + ### (00) Parse input files + hmm_file = '%s/%s/hmm_dir/%s.hmm' % (hmmpath, seqName, seqName) + refspec_db = '%s/%s/%s' % (corepath, refspec, refspec) + refspec_fa = '%s/%s/%s.fa' % (searchpath, refspec, refspec) + search_fa = '%s/%s/%s.fa' % (searchpath, search_taxon, search_taxon) + ### (000) Adapt parameters + if rbh == True: + checkCoorthologsRefOff = True + rep = True + + ### PRINT JOB PARAMETERS + output_fn.print_stdout( + silentOff, + '\n### Ortholog search ###' + + '\nSeed: %s\nRefspec: %s\n' % (seqName, refspec) + + 'Ref_seqID: %s\n' % seed_id + + 'Search taxon: %s' % search_taxon) + output_fn.print_debug( + debug, 'Parameters', + 'HMM evalue cutoff: %s\nHMM hit limit: %s\n' % (evalHmmer, hitLimit) + + 'HMM hit score cutoff: %s\n' % scoreCutoff + + 'BLAST evalue cutoff: %s\n' % evalBlast + + 'Blast low complexity filter: %s\n' % lowComplexityFilter + + 'Turn off check for co-orthologs ref: %s\n' % checkCoorthologsRefOff + + 'Aligner: %s' % aligner) + + ### (1) Do hmmsearch for query hmm against search taxon fasta + hmm_hits = hmm_fn.do_hmmsearch( + hmm_file, search_fa, evalHmmer, scoreCutoff, hitLimit, cpus) + output_fn.print_debug(debug, 'HMM hits', hmm_hits) + ### (2) Read fasta file of refspec and search taxon + refspec_seqs = fasta_fn.read_fasta(refspec_fa) + search_seqs = fasta_fn.read_fasta(search_fa) + ### (3) Do re-blast search for each hmm hit against refspec + for hmm_hit in hmm_hits: + if not hmm_hit == seed_id: # only if search taxon == refspec + hmm_hit_fa = '%s/hmm_%s_%s_%s.fa' % ( + outpath, seqName, search_taxon, hmm_hit) + with open(hmm_hit_fa, 'w') as hmm_fa_out: + hmm_fa_out.write('>%s\n%s' % (hmm_hit, search_seqs.fetch(hmm_hit))) + blast_xml = blast_fn.do_blastsearch( + hmm_hit_fa, refspec_db, evalBlast, lowComplexityFilter) + blast_out = blast_fn.parse_blast_xml(blast_xml) + output_fn.print_debug(debug, 'BLAST hits', blast_out) + if noCleanup == False: + os.remove(hmm_hit_fa) + ### (4) check reciprocity + ### (4a) if refspec_seq_id == best blast hit + if len(blast_out['hits'].keys()) > 0: + best_blast_hit = list(blast_out['hits'].keys())[0] + if best_blast_hit == hmm_hit and len(blast_out['hits'].keys()) > 1: + best_blast_hit = list(blast_out['hits'].keys())[1] + if seed_id == best_blast_hit: + output_fn.print_stdout( + silentOff, + '%s accepted (best blast hit is ref)' % (blast_out['query'])) + ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) + continue
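+ # Step (4a), illustrated: a hit from the search taxon is blasted back + # against the reference proteome; if its best blast hit is the seed + # protein itself, reciprocity holds and the hit is kept as an ortholog + # candidate. Otherwise step (4b) below tests whether the best blast + # hit is at least a co-ortholog of the seed.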
+ else: + ### (4b) else, check for co-ortholog ref + if checkCoorthologsRefOff == False: + aln_fa = '%s/blast_%s_%s_%s_%s_%s.fa' % ( + outpath, seqName, seed_id, search_taxon, + hmm_hit, best_blast_hit) + with open(aln_fa, 'w') as aln_fa_out: + aln_fa_out.write( + '>%s\n%s\n>%s\n%s\n>%s\n%s' % ( + seed_id, refspec_seqs.fetch(seed_id), + hmm_hit, search_seqs.fetch(hmm_hit), + best_blast_hit, refspec_seqs.fetch(best_blast_hit) + ) + ) + fasta_fn.remove_dup(aln_fa) + aln_seq = align_fn.do_align(aligner, aln_fa) + output_fn.print_debug( + debug, 'Alignment for checking co-ortholog ref', aln_seq) + br_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, seed_id, debug) + bh_dist = align_fn.calc_Kimura_dist(aln_seq, best_blast_hit, hmm_hit, debug) + output_fn.print_debug( + debug, 'Check if distance blast_vs_ref < blast_vs_hmm', + 'd_br = %s; d_bh = %s' % (br_dist, bh_dist)) + if noCleanup == False: + os.remove(aln_fa) + if br_dist == bh_dist == 0 or br_dist < bh_dist: + output_fn.print_stdout( + silentOff, + '%s accepted (best blast hit is co-ortholog to ref)' + % (blast_out['query']) + ) + ortho_candi[hmm_hit] = search_seqs.fetch(hmm_hit) + continue + ### (5) check co-ortholog if more than one HMM hit is accepted + if len(ortho_candi) == 0: + output_fn.print_stdout( + silentOff, 'WARNING: Reciprocity not fulfilled! No ortholog found!') + else: + best_ortho = list(ortho_candi.keys())[0] + if not best_ortho == seed_id: + ortho_final = fasta_fn.add_seq_to_dict( + ortho_final, '%s|%s|%s|1' % (seqName, search_taxon, best_ortho), + ortho_candi[best_ortho]) + if rep == False: + if len(ortho_candi) > 1: + aln_co_fa = '%s/coortho_%s_%s.fa' % ( + outpath, seqName, search_taxon) + with open(aln_co_fa, 'w') as aln_co_fa_out: + aln_co_fa_out.write(('>%s\n%s\n') % + (seed_id, refspec_seqs.fetch(seed_id))) + for cand in ortho_candi: + aln_co_fa_out.write(('>%s\n%s\n') % + (cand, ortho_candi[cand])) + aln_co_seq = align_fn.do_align(aligner, aln_co_fa) + output_fn.print_debug( + debug, 'Alignment for checking co-orthologs', aln_co_seq) + if noCleanup == False: + os.remove(aln_co_fa) + best_dist = align_fn.calc_Kimura_dist( + aln_co_seq, seed_id, best_ortho, debug) + for cand in ortho_candi: + if not cand == best_ortho: + candi_dist = align_fn.calc_Kimura_dist( + aln_co_seq, best_ortho, cand, debug) + output_fn.print_debug( + debug, + 'Check if distance bestHmm_vs_ref > ' + + 'other_vs_bestHmm', + 'd_best = %s; d_other = %s' + % (best_dist, candi_dist)) + if candi_dist < best_dist: + if not cand == seed_id: + ortho_final = fasta_fn.add_seq_to_dict( + ortho_final, + '%s|%s|%s|0' \ + % (seqName, search_taxon, cand), + ortho_candi[cand]) + output_fn.print_stdout( + silentOff, + '=> %s orthologs found: %s' + % (len(ortho_final), list(ortho_final.keys()))) + return(ortho_final) + + +def run_hamstr(args): + """ Perform ortholog search based on hamstr approach """ + + (seqName, refspec, pathArgs, orthoArgs, otherArgs) = args + (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs + (checkCoorthologsRefOff, rbh, rep, evalBlast, lowComplexityFilter, + evalHmmer, hitLimit, scoreCutoff, aligner) = orthoArgs + (searchTaxa, cpus, debug, silentOff, noCleanup, force, append) = otherArgs + + hamstr_jobs = [] + ### get ref seqID + core_fa = '%s/%s/%s.fa' % (hmmpath, seqName, seqName) + core_seqs = SeqIO.to_dict((SeqIO.parse(open(core_fa), 'fasta'))) + core_ids = core_seqs.keys() + seed_id = [s for s in core_ids if refspec in s][0].split('|')[-1] + + ### get search taxa from user defined list (as a file or directly a list) + if not searchTaxa == 
'': + ignored_taxa = [] + if os.path.exists(os.path.abspath(searchTaxa)): + search_taxa = general_fn.read_file(searchTaxa) + else: + search_taxa = searchTaxa.split(',') + + for search_taxon in search_taxa: + if os.path.exists( + os.path.abspath( + '%s/%s/%s.fa' % (searchpath,search_taxon,search_taxon))): + hamstr_jobs.append([ + seqName, hmmpath, corepath, searchpath, outpath, + refspec, seed_id, search_taxon, + evalHmmer, hitLimit, scoreCutoff, + evalBlast, lowComplexityFilter, + checkCoorthologsRefOff, rbh, rep, + aligner, cpus, debug, silentOff, noCleanup + ]) + else: + ignored_taxa.append(search_taxon) + if len(ignored_taxa) > 0: + print( + 'WARNING: %s taxa cannot be found at %s\n%s' + % (len(ignored_taxa), searchpath, ignored_taxa)) + ### get search taxa from searchpath (searchTaxa_dir) + else: + for search_taxon in general_fn.read_dir(searchpath): + if os.path.exists( + os.path.abspath( + '%s/%s/%s.fa' % (searchpath,search_taxon,search_taxon))): + hamstr_jobs.append([ + seqName, hmmpath, corepath, searchpath, outpath, + refspec, seed_id, search_taxon, + evalHmmer, hitLimit, scoreCutoff, + evalBlast, lowComplexityFilter, + checkCoorthologsRefOff, rbh, rep, + aligner, cpus, debug, silentOff, noCleanup + ]) + + ### do ortholog search + if len(hamstr_jobs) > 0: + output_fn.print_stdout( + silentOff, 'Ortholog search for %s taxa...' % len(hamstr_jobs)) + hamstr_out = {} + if debug == True or silentOff == True or len(hamstr_jobs) == 1: + for job in hamstr_jobs: + tmp_out = hamstr(job) + hamstr_out = {**hamstr_out, **tmp_out} + else: + pool = mp.Pool(cpus) + for _ in tqdm( + pool.imap_unordered(hamstr, hamstr_jobs), + total=len(hamstr_jobs)): + if len(_) > 0: + hamstr_out = {**hamstr_out, **_} + + ### Get seed seq + refspec_fa = '%s/%s/%s.fa' % (searchpath, refspec, refspec) + refspec_seqs = fasta_fn.read_fasta(refspec_fa) + seed_id_mod = '%s|%s|%s|1' % (seqName, refspec, seed_id) + seed_seq = refspec_seqs.fetch(seed_id) + + ### return + return({**{seed_id_mod:seed_seq}, **hamstr_out}) diff --git a/fdog/libs/output.py b/fdog/libs/output.py new file mode 100644 index 0000000..1dc1eb9 --- /dev/null +++ b/fdog/libs/output.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from Bio import SeqIO + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS FOR OUTPUT ##### + +def print_debug(debug, cat, msg): + """ Print msg of a category in debug mode """ + if debug == True: + if cat == '': + print('#DEBUG#\t%s' % msg) + else: + print('#DEBUG#\t%s\n#DEBUG#\t%s' % (cat, msg)) + + +def print_stdout(silentOff, msg): + """ Print stdout """ + if silentOff == True: + print(msg) + + +def check_output_exist(outfile, force, append): + """ Check if outfile exists + And decide depends on the choice of force or append option + """ + if os.path.exists(outfile): + if force == True: + print('WARNING: %s will be deleted!' % outfile) + os.remove(outfile) + elif append == True: + general_fn.check_file_exist(outfile) + print('Result will be appended to %s!' 
% outfile) + else: + sys.exit( + 'WARNING: %s exists! ' % outfile + + 'You still can run with --force or --append option') + + +def write_hamstr(hamstr_result, outpath, seqName, force, append): + """ Write result of ortholog search into seqName.extended.fa """ + outfile = '%s/%s.extended.fa' % (outpath, seqName) + outfile = os.path.abspath(outfile) + check_output_exist(outfile, force, append) + + ### Write to output.extended.fa + ortho_count = len(hamstr_result) - 1 + if append == True: + if os.path.exists(outfile): + old_result_tmp = SeqIO.to_dict((SeqIO.parse(open(outfile),'fasta'))) + old_result = {} + for old_id in old_result_tmp: + if not old_id in hamstr_result: + old_result[old_id] = str(old_result_tmp[old_id].seq) + hamstr_result = {**hamstr_result, **old_result} + + with open(outfile, 'w') as out_file: + for id, seq in hamstr_result.items(): + out_file.write('>%s\n%s\n' % (id, seq)) + return( + 'Found %s ortholog(s)!\nOutput file: %s' % (ortho_count, outfile)) + + +def hamstr_2_profile(fa_file): + """ Convert extended.fa file into phyloprofile file """ + if os.path.exists(fa_file): + pp_file = fa_file.replace('.extended.fa', '.phyloprofile') + fa = SeqIO.to_dict((SeqIO.parse(open(fa_file),'fasta'))) + with open(pp_file, 'w') as pp: + pp.write('geneID\tncbiID\torthoID\n') + for id in list(fa.keys()): + tmp = id.split('|') + pp.write('%s\t%s\t%s\n' % (tmp[0], tmp[1], id)) diff --git a/fdog/libs/preparation.py b/fdog/libs/preparation.py new file mode 100644 index 0000000..0e599aa --- /dev/null +++ b/fdog/libs/preparation.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from pathlib import Path +from Bio import SeqIO + +import fdog.libs.zzz as general_fn +import fdog.libs.fasta as fasta_fn +import fdog.libs.blast as blast_fn +import fdog.libs.output as output_fn + + +##### FUNCTIONS FOR DATA/INPUT PREPARATION ##### + +def parsing_paths(args): + """ Getting path to hmm core set, coreTaxa_dir, searchTaxa_dir and annotation_dir""" + (pathFile, outpath, hmmpath, corepath, searchpath, annopath) = args + ### get fdog and data path + data_path = '' + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') + pathconfig_file = fdog_path + '/bin/pathconfig.txt' + # pathconfig_file = '/home/vinh/anaconda3/envs/test_fas/lib/python3.9/site-packages/fdog/bin/pathconfig.txt' ###################################### REMOVE THIS + if not os.path.exists(pathconfig_file): + sys.exit( + 'No pathconfig.txt found at %s. Please run fdog.setup ' + + '(https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).' 
+ % pathconfig_file) + if pathFile == '': + with open(pathconfig_file) as f: + data_path = f.readline().strip() + else: + cfg = general_fn.load_config(pathFile) + try: + data_path = cfg['dataPath'] + except: + data_path = 'config' + + if hmmpath == '': + hmmpath = outpath + '/core_orthologs' + Path(hmmpath).mkdir(parents = True, exist_ok = True) + + if corepath == '': + corepath = data_path + '/coreTaxa_dir' + if data_path == 'config': + try: + corepath = cfg['corepath'] + except: + sys.exit('corepath not found in %s' % pathFile) + if searchpath == '': + searchpath = data_path + '/searchTaxa_dir' + if data_path == 'config': + try: + searchpath = cfg['searchpath'] + except: + sys.exit('searchpath not found in %s' % pathFile) + if annopath == '': + annopath = data_path + '/annotation_dir' + if data_path == 'config': + try: + annopath = cfg['annopath'] + except: + sys.exit('annopath not found in %s' % pathFile) + return(hmmpath, corepath, searchpath, annopath) + + +def check_input(args): + (seqFile, refspec, outpath, hmmpath, corepath, + searchpath, annopath, pathFile) = args + fdog_path = os.path.realpath(__file__).replace('/libs/preparation.py','') + # create output directory + Path(outpath).mkdir(parents = True, exist_ok = True) + Path(hmmpath).mkdir(parents = True, exist_ok = True) + # check path existing + hmmpath, corepath, searchpath, annopath = parsing_paths( + [pathFile, outpath, hmmpath, corepath, searchpath, annopath]) + for path in [hmmpath, corepath, searchpath, annopath]: + general_fn.check_file_exist(path) + # check for seqFile + if not os.path.exists(os.path.abspath(seqFile)): + if not os.path.exists(fdog_path + '/data/' + seqFile): + sys.exit( + '%s not found in %s or %s' + % (seqFile, os.getcwd(), fdog_path + '/data/')) + else: + seqFile = fdog_path + '/data/' + seqFile + else: + seqFile = os.path.abspath(seqFile) + # check refspec + if not os.path.exists(os.path.abspath(corepath+'/'+refspec)): + exit('Reference taxon %s not found in %s' % (refspec, corepath)) + return (seqFile, hmmpath, corepath, searchpath, annopath) + + +def identify_seed_id(seqFile, refspec, corepath, debug, silentOff): + refspec_db = '%s/%s/%s' % (corepath, refspec, refspec) + # first check if input seed ID existiert in refspec genome + refspec_fa = fasta_fn.read_fasta('%s.fa' % refspec_db) + seed_fa = SeqIO.parse(open(seqFile),'fasta') + for seed in seed_fa: + try: + if len(refspec_fa.fetch(seed.id)) == len(seed.seq): + return(seed.id) + except: + output_fn.print_debug(debug, 'Identify seed ID', 'Input seed ID not found!') + # otherwise, perform blast search + blast_xml = blast_fn.do_blastsearch(seqFile, refspec_db) + blast_out = blast_fn.parse_blast_xml(blast_xml) + for hit in blast_out['hits']: + if blast_out['hits'][hit]['align_len'] == blast_out['query_len']: + return(hit) + elif abs(int(blast_out['hits'][hit]['align_len']) - int(blast_out['query_len'])) < 10: + output_fn.print_stdout(silentOff, 'WARNING: Found seed sequence shorter than input!') + return(hit) + else: + sys.exit('ERROR: Cannot find seed sequence in genome of reference species for %s!' 
% blast_out['query']) diff --git a/fdog/libs/tree.py b/fdog/libs/tree.py new file mode 100644 index 0000000..6988224 --- /dev/null +++ b/fdog/libs/tree.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import re +from ete3 import NCBITaxa + +import fdog.libs.zzz as general_fn + + +##### FUNCTIONS RELATED TO TAXONOMY TREE ##### + +def get_rank_index(lineage, rank_name, ncbi): + """ Get ID and index in the species lineage for a given rank + Return {rank_id:rank_index} + """ + ranks = ncbi.get_rank(lineage) + rank_id = list(general_fn.matching_elements(ranks, rank_name).keys())[0] + rank_index = len(ranks) - lineage.index(rank_id) - 1 + return({rank_id:rank_index}) + + +def get_rank_range(lineage, minDist, maxDist, ncbi): + """ Get rank ID and its index in a given species lineage + for a pair of min and max rank. See get_rank_index() + Return a list of 2 dictionary for min and max rank as + [{min_rank_id:min_rank_index}, {max_rank_id:max_rank_index}] + """ + return( + get_rank_index(lineage, minDist, ncbi), + get_rank_index(lineage, maxDist, ncbi)) + + +def check_taxon_group(group_id, tax_id, ncbi): + """ Check if a taxon (tax_id) belongs to a taxonomy group (group_id)""" + lineage = ncbi.get_lineage(tax_id) + if group_id in lineage: + return(True) + return(False) + + +def get_ancestor(id1, id2, ncbi): + """ Get common ancestor ID and rank for 2 taxon IDs + Return dictionary {ancestor_id: ancestor_rank} + """ + tree = ncbi.get_topology([id1, id2], intermediate_nodes = False) + ancestor = tree.get_common_ancestor(id1, id2).name + return(ncbi.get_rank([ancestor])) + + +def check_common_ancestor(ref_id, ancestor, minDist, maxDist, ncbi): + """ Check if ancestor ID lies within the range between min and max rank + of reference species + Return 1 if true + """ + ref_lineage = ncbi.get_lineage(ref_id) + (min_ref, max_ref) = get_rank_range(ref_lineage, minDist, maxDist, ncbi) + ancestor_index = len(ref_lineage) - ref_lineage.index(ancestor) - 1 + if list(min_ref.values())[0] <= ancestor_index <= list(max_ref.values())[0]: + return(1) + return(0) + + +def remove_clade(tree, node_id): + """ Remove a clade from a tree """ + removed_clade = tree.search_nodes(name = str(node_id))[0] + removed_node = removed_clade.detach() + return(tree) + + +def get_leaves_dict(spec_lineage, tree, min_index, max_index): + """ Given a tree and a lineage string of a species + Return a dictionary where keys are the internal nodes defined by the + ranks between min rank (e.g. genus, specified by min_index in the species + lineage) and max rank (e.g. phylum). 
Values are all leaves in the tree + that belong to the corresponding internal node (rank) + """ + node_dict = {} + already_added = [] + spec_lineage.reverse() + for i in range(len(spec_lineage)): + if i >= min_index and i <= max_index: + curr_node = spec_lineage[i] + node = tree.search_nodes(name = str(curr_node)) + if len(node) > 0: + for leaf in node: + node_dict[spec_lineage[i]] = [] + for t in leaf.traverse(): + if t.is_leaf(): + if not t.name in already_added: + already_added.append(t.name) + node_dict[spec_lineage[i]].append(t.name) + return(general_fn.remove_dup_in_dict(node_dict)) + + +def check_tax_id(tax_id): + """ Check valid taxon ID + Return taxon name (UNK if ID not found in ncbi db) + """ + ncbi = NCBITaxa() + tmp = ncbi.get_rank([tax_id]) + try: + tmp = ncbi.get_rank([tax_id]) + rank = tmp[int(tax_id)] + if not rank == 'species': + print('\033[92mWARNING: rank of %s is not SPECIES (%s)\033[0m' % (tax_id, rank)) + else: + ncbi_name = ncbi.get_taxid_translator([tax_id])[int(tax_id)] + print('\033[92mNCBI taxon info: %s %s\033[0m' % (tax_id, ncbi_name)) + return(ncbi_name) + except: + print('\033[92mWARNING: %s not found in NCBI taxonomy database!\033[0m' % tax_id) + return('UNK%s' % tax_id) + + +def abbr_ncbi_name(ncbi_name): + """ Parse ncbi taxon name into abbr name + E.g. "Homo sapiens" -> "HOMSA" + """ + if not ncbi_name.startswith('UNK'): + ncbi_name = re.sub('[^a-zA-Z1-9\s]+', '', ncbi_name) + tax_name = ncbi_name.split() + name = tax_name[0][:3].upper()+tax_name[1][:2].upper() + else: + name = ncbi_name + return(name) diff --git a/fdog/libs/zzz.py b/fdog/libs/zzz.py new file mode 100644 index 0000000..219ac76 --- /dev/null +++ b/fdog/libs/zzz.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- + +####################################################################### +# Copyright (C) 2022 Vinh Tran +# +# This file is part of fDOG tool https://github.com/BIONF/fDOG +# +# This script is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for +# more details +# +# Contact: tran@bio.uni-frankfurt.de +# +####################################################################### + +import sys +import os +from pathlib import Path +import ssl +import urllib.request +import yaml +import time + + +##### GENERAL FUNCTIONS FOR FILES, FOLDERS AND GENERAL VARIABLES ##### + +def check_file_exist(file): + """ Exit if a file does not exist""" + if not os.path.exists(os.path.abspath(file)): + sys.exit('%s not found' % file) + + +def read_file(file): + """ Read a file and return list of lines""" + if os.path.exists(file): + with open(file, 'r') as f: + lines = f.read().splitlines() + f.close() + return(lines) + else: + sys.exit('%s not found' % file) + + +def read_dir(dir): + """ Return list of directories from a given path """ + check_file_exist(dir) + out_dirs = [] + p = os.listdir(dir) + for i in p: + if os.path.isdir('%s/%s' % (dir, i)): + out_dirs.append(i) + return(out_dirs) + + +def download_progress(count, block_size, total_size): + global start_time + if count == 0: + start_time = time.time() + return + duration = time.time() - start_time + progress_size = int(count * block_size) + speed = int(progress_size / (1024 * duration)) + percent = int(count * block_size * 100 / total_size) + if percent > 100: + percent = 100 + sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" % + (percent, progress_size / (1024 * 1024), speed, duration)) + sys.stdout.flush() + + +def download_file(url, file): + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + download_file = urllib.request.URLopener(context=ctx) + print('Downloading %s' % (url + '/' + file)) + urllib.request.urlretrieve(url + '/' + file, file, download_progress) + print(' ... 
done!') + + +def count_line(file, pattern, contain): + """ Count lines in file that contain (or not) a pattern """ + nline = 0 + with open(file, 'r') as f: + for line in f: + if contain: + if pattern in line: + nline = nline + 1 + else: + if not pattern in line: + nline = nline + 1 + return(nline) + + +def get_ids_from_folder(folder, type): + """ Get taxonomy IDs for from coreTaxa_dir, searchTaxa_dir or annotation_dir + Return dictionary {taxID:@@Ver} + """ + tax_ids = {} + + for name in read_dir(folder): + if type == 'annotation_dir': + if not name.endswith('.json'): + continue + else: + name = name.replace('.json','') + else: + if not os.path.isdir('%s/%s' % (folder, name)): + continue + id = name.split('@')[1] + if not id in tax_ids: + tax_ids[id] = name + return(tax_ids) + + +def load_config(config_file): + """ Load a YAML file and return as a dictionary """ + with open(config_file, 'r') as stream: + try: + return yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + +def matching_elements(dictionary, search_string): + """ Search for a string in dictionary's values + Return {key:val} where string was found in val + """ + return {key:val for key,val in dictionary.items() if search_string == val} + + +def remove_dup_in_dict(dictionary): + """ Find and remove duplicated or empty values of a dictionary """ + tmp_dict = {'_'.join(val) : key for key, val in dictionary.items()} + res = {val : key.split('_') for key, val in tmp_dict.items()} + res = {key : val for key, val in res.items() if len(val[0]) > 0} + return(res) + + +def join_2lists(first_list, second_list): + """ Join 2 lists """ + in_first = set(first_list) + in_second = set(second_list) + in_second_but_not_in_first = in_second - in_first + out = first_list + list(in_second_but_not_in_first) + return(out) diff --git a/fdog/mergeOutput.py b/fdog/mergeOutput.py index c710ee7..2d5a276 100644 --- a/fdog/mergeOutput.py +++ b/fdog/mergeOutput.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to merge all output files (.extended.fa, .phyloprofile, # _forward.domains, _reverse.domains) in a given directory into one file each. 
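# NOTE (editor): a minimal usage sketch for the helpers in fdog/libs/zzz.py
# above. The input dictionary is hypothetical; remove_dup_in_dict() drops
# entries with empty value lists and collapses entries whose value lists are
# identical (the last key wins):
#
#     import fdog.libs.zzz as general_fn
#     taxa = {'33208': ['HOMSA@9606@1', 'MUSMU@10090@1'],
#             '6072': ['HOMSA@9606@1', 'MUSMU@10090@1'],
#             '2759': ['']}
#     general_fn.remove_dup_in_dict(taxa)
#     # -> {'6072': ['HOMSA@9606@1', 'MUSMU@10090@1']}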
@@ -21,6 +21,7 @@
 from os import listdir as ldir
 import argparse
 import yaml
+from pkg_resources import get_distribution
 
 def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out):
     settings = dict(
@@ -33,9 +34,10 @@ def createConfigPP(phyloprofile, domains_0, ex_fasta, directory, out):
     with open('%s.config.yml' % (out), 'w') as outfile:
         yaml.dump(settings, outfile, default_flow_style = False)
 
+
 def main():
-    version = '0.1.0'
-    parser = argparse.ArgumentParser(description='You are running fdog.mergeOutput version ' + str(version) + '.')
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
     parser.add_argument('-i', '--input',
         help='Input directory, where all single output (.extended.fa, .phyloprofile, _forward.domains, _reverse.domains) can be found',
         action='store', default='', required=True)
diff --git a/fdog/removefDog.py b/fdog/removefDog.py
index 7b705ea..0998458 100644
--- a/fdog/removefDog.py
+++ b/fdog/removefDog.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
 # This script is used to uninstall fdog and its data
 #
@@ -20,6 +20,7 @@
 import argparse
 import subprocess
 import shutil
+from pkg_resources import get_distribution
 
 
 def query_yes_no(question, default='yes'):
@@ -46,8 +47,8 @@ def query_yes_no(question, default='yes'):
 
 
 def main():
-    version = '0.0.1'
-    parser = argparse.ArgumentParser(description='You are running fdog.remove version ' + str(version) + '.')
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
     parser.add_argument('--data', help='Remove fdog together with all files/data within the installed fdog directory',
                         action='store_true', default=False)
     args = parser.parse_args()
     data = args.data
@@ -65,7 +66,7 @@ def main():
         print('fdog will be uninstalled. Some files/data still can be found in %s! Enter to continue' % fdogPath)
     if query_yes_no('Are you sure?'):
         if data:
-            folders = ['bin', 'core_orthologs', 'taxonomy', 'data']
+            folders = ['bin', 'data']
             for f in folders:
                 dirPath = fdogPath+'/'+f
                 if os.path.exists(os.path.abspath(dirPath)):
@@ -75,7 +76,7 @@
             try:
                 subprocess.call([uninstallCmd], shell = True)
             except:
-                print('Error by uninstalling fdog. Please manually uninstall it using pip uninstall fdog')
+                print('Error while uninstalling fdog. Please manually uninstall it using pip uninstall fdog')
         if data:
             if os.path.exists(os.path.abspath(fdogPath)):
                 shutil.rmtree(fdogPath)
diff --git a/fdog/runMulti.py b/fdog/runMulti.py
index eb7b365..0703cf9 100644
--- a/fdog/runMulti.py
+++ b/fdog/runMulti.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
-# This script is used to run fdog with multiple seed sequences.
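# NOTE (editor): the version look-up pattern introduced throughout this diff.
# pkg_resources works but is deprecated in recent setuptools releases; the
# stdlib importlib.metadata (Python >= 3.8) is a drop-in alternative. Sketch:
#
#     from pkg_resources import get_distribution
#     version = get_distribution('fdog').version
#     # stdlib equivalent, no setuptools dependency:
#     # from importlib.metadata import version as pkg_version
#     # version = pkg_version('fdog')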
+# This file is part of fDOG tool https://github.com/BIONF/fDOG
 #
 # This script is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -17,22 +17,29 @@
 
 import sys
 import os
-from os import listdir
+from pathlib import Path
 from os.path import isfile, join
-import time
 import argparse
 import subprocess
-from pathlib import Path
-import multiprocessing as mp
 import re
-from tqdm import tqdm
-import fdog.runSingle as fdogFn
 import shutil
-import yaml
+import multiprocessing as mp
+from tqdm import tqdm
 from ete3 import NCBITaxa
+from pkg_resources import get_distribution
+import time
+
+import fdog.libs.zzz as general_fn
+import fdog.libs.preparation as prepare_fn
+import fdog.libs.orthosearch as ortho_fn
+import fdog.libs.corecompile as core_fn
+import fdog.libs.fas as fas_fn
+import fdog.libs.tree as tree_fn
+import fdog.libs.output as output_fn
 
-def getSortedFiles(directory):
+
+def get_sorted_files(directory):
     list = os.listdir(directory)
     pairs = []
     for file in list:
@@ -43,129 +50,71 @@
     pairs.sort(key=lambda s: s[0], reverse=True)
     return([x[1] for x in pairs])
 
-def prepare(args, step):
-    (seqFile, seqName, fdogPath, refspec, minDist, maxDist, coreOrth,
-    append, force, noCleanup, group, blast, db,
-    outpath, hmmpath, blastpath, searchpath, weightpath,
-    coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
-    fasoff, countercheck, coreFilter, minScore,
-    strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-    cpu, hyperthread, checkOff, debug, silent) = args
-
-    mute = False
-    if step == 'core':
-        coreOnly = True
-        silent = True
-        mute = True
-    else:
-        reuseCore = True
-        fasoff = True
-        if silent == True:
-            mute = True
-    ### check input arguments
-    seqFile, hmmpath, blastpath, searchpath, weightpath = fdogFn.checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath])
-    # group arguments
-    basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth]
-    ioArgs = [append, force, noCleanup, group, blast, db]
-    pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath]
-    coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation]
-    fasArgs = [fasoff, countercheck, coreFilter, minScore]
-    orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa]
-    otherArgs = [cpu, hyperthread, checkOff, debug, True]
-    return(basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute)
-
-def getSeedName(seedFile):
+
+def get_seed_name(seedFile):
     seqName = seedFile.rsplit('.', 1)[0]
     seqName = re.sub('[\|\.]', '_', seqName)
     return(seqName)
 
-def getIndividualRuntime(step, outpath, seeds):
-    logFile = outpath + '/runtime_core.txt'
-    searchTerm = 'Core set compilation finished in'
-    if step == 'ortho':
-        logFile = outpath + '/runtime_ortho.txt'
-        searchTerm = 'Ortholog search completed in'
-    log = open(logFile, "w")
-    for seed in seeds:
-        seqName = getSeedName(seed)
-        logFile = outpath + '/' + seqName + '/fdog.log'
-        if os.path.exists(logFile):
-            with open(logFile, 'r') as f:
-                for line in f:
-                    if searchTerm in line:
-                        runtime = line.split()[-2]
-                        log.write('%s\t%s\n' % (seqName, runtime))
-        else:
-            missing = open(outpath + '/missing.txt', 'a+')
-            missing.write(step + '\t' + seqName + '\n')
-    log.close()
-
-def compileCore(options, seeds, inFol, cpu, outpath):
-    print('Starting compiling core orthologs...')
-    start = time.time()
-    coreCompilationJobs = []
+
+def compile_core(core_options, other_options, seeds, inFol, cpus, outpath, silentOff):
+    core_compilation_jobs = []
+    (coreArgs, orthoCoreArgs, otherCoreArgs) = core_options
+    otherCoreArgs_bkp = otherCoreArgs
+    (refspec, reuseCore, forceCore, pathArgs, debug) = other_options
+    (outpath, hmmpath, corepath, searchpath, annopath) = pathArgs
+    core_runtime = []
     for seed in seeds:
-        seqFile = [inFol + '/' + seed]
-        seqName = getSeedName(seed)
-        if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)):
-            (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'core')
-            coreCompilationJobs.append([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute])
-    if len(coreCompilationJobs) > 0:
-        pool = mp.Pool(cpu)
-        coreOut = []
-        for _ in tqdm(pool.imap_unordered(fdogFn.runSingle, coreCompilationJobs), total=len(coreCompilationJobs)):
-            coreOut.append(_)
+        seqFile = ('%s/%s' % (inFol, seed))
+        seqName = get_seed_name(seed)
+        if not os.path.exists('%s/core_orthologs/%s/hmm_dir/%s.hmm' % (outpath, seqName, seqName)) or forceCore == True:
+            seed_id = prepare_fn.identify_seed_id(seqFile, refspec, corepath, debug, silentOff)
+            core_compilation_jobs.append([seqFile, seqName, refspec, seed_id,
+                reuseCore, forceCore, coreArgs, pathArgs, orthoCoreArgs,
+                otherCoreArgs, debug])
+    if len(core_compilation_jobs) > 0:
+        pool = mp.Pool(cpus)
+        for _ in tqdm(pool.imap_unordered(core_fn.run_compile_core, core_compilation_jobs), total=len(core_compilation_jobs)):
+            core_runtime.append(_)
         pool.close()
         pool.join()
-    # read logs file to get runtime for individual seeds
-    getIndividualRuntime('core', outpath, seeds)
-    end = time.time()
-    multiCoreTime = '{:5.3f}'.format(end-start)
-    print('==> Core compiling finished in %s sec' % multiCoreTime) #'{:5.3f}s'.format(end-start))
-    return(multiCoreTime)
-
-def searchOrtho(options, seeds, inFol, cpu, outpath):
-    print('Searching orthologs for...')
-    start = time.time()
-    coreCompilationJobs = []
+    out = []
+    for r in core_runtime:
+        out.append('\t'.join(r))
+    return(out)
+
+
+def search_ortholog(options, seeds, inFol, cpu, outpath):
+    (orthoArgs, otherArgs, pathArgs, refspec) = options
+    (searchTaxa, cpus, debug, silentOff, noCleanup, force, append) = otherArgs
+    ortho_runtime = []
     for seed in seeds:
+        begin = time.time()
         seqFile = [inFol + '/' + seed]
-        seqName = getSeedName(seed)
-        (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = prepare(seqFile + [seqName] + options, 'ortholog')
-        if mute == True:
-            print(seed)
-        else:
-            print('\n##### ' + seed)
-        fdogFn.runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute])
-        end = time.time()
-    # read logs file to get runtime for individual seeds
-    getIndividualRuntime('ortho', outpath, seeds)
-    multiOrthoTime = '{:5.3f}'.format(end-start)
-    print('==> Ortholog search finished in %s sec' % multiOrthoTime)
-    return(multiOrthoTime)
-
-def joinOutputs(outpath, jobName, seeds, keep, silent):
-    print('Joining single outputs...')
+        seqName = get_seed_name(seed)
+        if not os.path.exists('%s/%s.extended.fa' % (outpath, seqName)) or force == True:
+            hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, orthoArgs, otherArgs])
+            output_fn.write_hamstr(hamstr_out, outpath, seqName, force, append)
+        end = time.time()
+        ortho_runtime.append('%s\t%s' % (seqName, '{:5.3f}s'.format(end - begin)))
+    return(ortho_runtime)
+
+
+def join_outputs(outpath, jobName, seeds, keep, silentOff):
     finalFa = '%s/%s.extended.fa' % (outpath, jobName)
-    finalPP = open('%s/%s.phyloprofile' % (outpath, jobName), 'wb')
-    Path(outpath+'/singleOutput').mkdir(parents=True, exist_ok=True)
+    single_output_fol = '%s/%s_singleOutput' % (outpath, jobName)
+    Path('%s/%s_singleOutput' % (outpath, jobName)).mkdir(parents=True, exist_ok=True)
     with open(finalFa,'wb') as wfd:
         for seed in seeds:
-            seqName = getSeedName(seed)
-            resultFile = '%s/%s/%s.extended.fa' % (outpath, seqName, seqName)
-            resultPP ='%s/%s/%s.phyloprofile' % (outpath, seqName, seqName)
-            if silent == False:
+            seqName = get_seed_name(seed)
+            resultFile = '%s/%s.extended.fa' % (outpath, seqName)
+            if silentOff == True:
                 print(resultFile)
             if os.path.exists(resultFile):
                 with open(resultFile,'rb') as fd:
                     shutil.copyfileobj(fd, wfd)
-                with open(resultPP,'rb') as pp:
-                    shutil.copyfileobj(pp, finalPP)
-                shutil.move(outpath + '/' + seqName, outpath + '/singleOutput')
-            else:
-                Path(outpath+'/missingOutput').mkdir(parents=True, exist_ok=True)
-                if not os.path.exists(outpath + '/missingOutput/' + seqName):
-                    shutil.move(outpath + '/' + seqName, outpath + '/missingOutput')
+                if not os.path.exists('%s/%s.extended.fa' % (single_output_fol, seqName)):
+                    shutil.move(resultFile, single_output_fol)
             if os.path.exists(outpath + '/' + seqName + '.fa'):
                 os.remove(outpath + '/' + seqName + '.fa')
             if os.path.exists(os.getcwd() + '/' + seqName + '.fa'):
@@ -173,59 +122,19 @@
     if keep == True:
         try:
             print('Compressing single outputs...')
-            shutil.make_archive(outpath + '/' + jobName + '_singleOutput', 'gztar', outpath+'/singleOutput')
+            shutil.make_archive(single_output_fol, 'gztar', single_output_fol)
         except:
-            shutil.make_archive(outpath + '/' + jobName + '_singleOutput', 'tar', outpath+'/singleOutput')
-        shutil.rmtree(outpath + '/singleOutput')
-    return(finalFa)
-
-def removeDupLines (infilename, outfilename):
-    lines_seen = set() # holds lines already seen
-    outfile = open(outfilename, "w")
-    for line in open(infilename, "r"):
-        if line not in lines_seen: # not a duplicate
-            outfile.write(line)
-            lines_seen.add(line)
-    outfile.close()
-
-def calcFAS (outpath, extendedFa, weightpath, cpu):
-    print('Starting calculating FAS scores...')
-    start = time.time()
-    fasCmd = 'fas.runFdogFas -i %s -w %s --cores %s --redo_anno' % (extendedFa, weightpath, cpu)
-    try:
-        subprocess.call([fasCmd], shell = True)
-        end = time.time()
-        if os.path.exists(outpath + '/tmp'):
-            shutil.rmtree(outpath + '/tmp')
-        fasTime = '{:5.3f}s'.format(end-start)
-        print('==> FAS calculation finished in %s sec' % fasTime)
-        return(fasTime)
-    except:
-        sys.exit('Problem running\n%s' % (fasCmd))
-
-def createConfigPP(outpath, jobName, refspec):
-    settings = dict(
-        mainInput = '%s/%s.phyloprofile' % (outpath, jobName),
-        fastaInput = '%s/%s.extended.fa' % (outpath, jobName),
-    )
-    domainFile = '%s/%s_forward.domains' % (outpath, jobName)
-    if os.path.exists(os.path.abspath(domainFile)):
-        settings['domainInput'] = domainFile
-    taxId = refspec.split('@')[1]
-    refspec = fdogFn.getTaxName(taxId)
-    if not refspec == 'UNK':
-        settings['rank'] = 'species'
-        settings['refspec'] = refspec
-    settings['clusterProfile'] = 'TRUE'
-    with open('%s/%s.config.yml' % (outpath, jobName), 'w') as configfile:
-        yaml.dump(settings, configfile, default_flow_style = False)
+            shutil.make_archive(single_output_fol, 'tar', single_output_fol)
+        shutil.rmtree(single_output_fol)
+
 
 def main():
-    version = '0.0.53'
-    parser = argparse.ArgumentParser(description='You are running fdogs.run version ' + str(version) + '.')
-    parser.add_argument('--version', action='version', version=str(version))
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.',
+        epilog="For more information on certain options, please refer to the wiki pages "
+        "on github: https://github.com/BIONF/fDOG/wiki")
     required = parser.add_argument_group('Required arguments')
-    required.add_argument('--input', help='Input folder containing the seed sequences (protein only) in fasta format',
+    required.add_argument('--seqFolder', help='Input folder containing the seed sequences (protein only) in fasta format',
         action='store', default='', required=True)
     required.add_argument('--jobName', help='Job name. This will also be file name for the output',
         action='store', default='', required=True)
@@ -235,22 +144,11 @@
     optional_paths = parser.add_argument_group('Non-default directory options')
     optional_paths.add_argument('--outpath', help='Output directory', action='store', default='')
     optional_paths.add_argument('--hmmpath', help='Path for the core ortholog directory', action='store', default='')
-    optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='')
+    optional_paths.add_argument('--corepath', help='Path for the core taxa directory', action='store', default='')
     optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='')
-    optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='')
+    optional_paths.add_argument('--annopath', help='Path for the pre-calculated feature annotation directory', action='store', default='')
     optional_paths.add_argument('--pathFile', help='Config file contains paths to data folder (in yaml format)', action='store', default='')
 
-    addtionalIO = parser.add_argument_group('Other I/O options')
-    addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
-    addtionalIO.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False)
-    addtionalIO.add_argument('--forceComplete', help='Overwrite existing core orthologs and all output files', action='store_true', default=False)
-    addtionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. Default: False', action='store_true', default=False)
-    addtionalIO.add_argument('--keep', help='Keep output of individual seed sequence. Default: False', action='store_true', default=False)
-    addtionalIO.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='')
-    addtionalIO.add_argument('--blast', help='Determine sequence id and refspec automatically. Note, the chosen sequence id and reference species does not necessarily reflect the species the sequence was derived from.',
-        action='store_true', default=False)
-    addtionalIO.add_argument('--db', help='Run fdog in database mode. Requires a mySql database. Only for internal use.', action='store_true', default=False)
-
     core_options = parser.add_argument_group('Core compilation options')
     core_options.add_argument('--coreOnly', help='Compile only the core orthologs', action='store_true', default=False)
     core_options.add_argument('--reuseCore', help='Reuse existing core set of your sequence', action='store_true', default=False)
@@ -260,12 +158,8 @@
     core_options.add_argument('--maxDist', help='Maximum systematic distance of primer taxa for the core set compilation. Default: kingdom',
        choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom'],
        action='store', default='kingdom')
-    core_options.add_argument('--coreOrth', help='Number of orthologs added to the core set. Default: 5', action='store', default=5, type=int)
+    core_options.add_argument('--coreSize', help='Maximum number of orthologs in the core set. Default: 6', action='store', default=6, type=int)
     core_options.add_argument('--coreTaxa', help='List of primer taxa that should exclusively be used for the core set compilation', action='store', default='')
-    core_options.add_argument('--coreStrict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set',
-        action='store_true', default=False)
-    core_options.add_argument('--CorecheckCoorthologsRef', help='During the core compilation, an ortholog also be accepted when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it',
-        action='store_true', default=True)
     core_options.add_argument('--CorecheckCoorthologsOff', help='Turn off checking for co-ortholog of the reverse search during the core compilation', action='store_true', default=False)
     core_options.add_argument('--coreRep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs',
        action='store_true', default=False)
@@ -274,20 +168,14 @@
        action='store', default=3, type=int)
     core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05',
        action='store', default=0.05, type=float)
-    core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score',
-        action='store_true', default=False)
-    core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True',
-        action='store_true', default=True)
-    core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False',
-        action='store_true', default=False)
-
-    ortho_options = parser.add_argument_group('Search strategy options')
-    ortho_options.add_argument('--searchTaxa', help='Specify list of search taxa', action='store', default='')
-    ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set',
-        action='store_true', default=False)
-    ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it',
-        action='store_true', default=True)
-    ortho_options.add_argument('--checkCoorthologsOff', help='Turn off checking for co-ortholog of the reverse search during the final ortholog search',
+    core_options.add_argument('--alnStrategy', help='Specify the alignment strategy during core ortholog compilation. Default: local',
+        choices=['local', 'glocal', 'global'],
+        action='store', default='local')
+
+    ortho_options = parser.add_argument_group('Ortholog search strategy options')
+    ortho_options.add_argument('--searchTaxa', help='Specify a file containing the list of search taxa', action='store', default='')
+    ortho_options.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='')
+    ortho_options.add_argument('--checkCoorthologsRefOff', help='Turn off checking for co-ortholog of the reverse search during the final ortholog search',
        action='store_true', default=False)
     ortho_options.add_argument('--rbh', help='Requires a reciprocal best hit during the ortholog search to accept a new ortholog',
        action='store_true', default=False)
@@ -299,265 +187,217 @@
        action='store', default=0.00005, type=float)
     ortho_options.add_argument('--evalHmmer', help='E-value cut-off for the HMM search. Default: 0.00001',
        action='store', default=0.00005, type=float)
-    ortho_options.add_argument('--evalRelaxfac', help='The factor to relax the e-value cut-off (Blast search and HMM search). Default: 10',
-        action='store', default=10, type=int)
     ortho_options.add_argument('--hitLimit', help='number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10',
        action='store', default=10, type=int)
-    ortho_options.add_argument('--autoLimit', help='Invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically a hit limit for each query. Note, it will be effective for both the core compilation and the final ortholog search',
-        action='store_true', default=False)
-    ortho_options.add_argument('--scoreThreshold', help='Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less than x percent of the hmm score of the best hit are further evaluated. Default: x = 10. You can change this cutoff with the option -scoreCutoff. Note, it will be effective for both the core compilation and the final ortholog search',
-        action='store_true', default=False)
-    ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
+    ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmm score of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10',
        action='store', default=10, type=int)
 
     fas_options = parser.add_argument_group('FAS options')
-    fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False)
-    fas_options.add_argument('--countercheck', help='The FAS score will be computed in two ways', action='store_true', default=True)
     fas_options.add_argument('--coreFilter', help='Specifiy mode for filtering core orthologs by FAS score. In \'relaxed\' mode candidates with insufficient FAS score will be disadvantaged. In \'strict\' mode candidates with insufficient FAS score will be deleted from the candidates list. The option \'--minScore\' specifies the cut-off of the FAS score.',
        choices=['relaxed', 'strict'], action='store', default='')
     fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float)
 
+    addtionalIO = parser.add_argument_group('Other I/O options')
+    addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False)
+    addtionalIO.add_argument('--force', help='Overwrite existing ortholog search output files', action='store_true', default=False)
+    addtionalIO.add_argument('--forceCore', help='Overwrite existing core set of your sequence', action='store_true', default=False)
+    addtionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. Default: False', action='store_true', default=False)
+    addtionalIO.add_argument('--keep', help='Keep output of individual seed sequence. Default: False', action='store_true', default=False)
+    addtionalIO.add_argument('--debug', help='Set this flag to obtain more detailed information about the ortholog search progress', action='store_true', default=False)
+    addtionalIO.add_argument('--debugCore', help='Set this flag to obtain more detailed information about the core compilation actions', action='store_true', default=False)
+    addtionalIO.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
+
     optional = parser.add_argument_group('Other options')
+    optional.add_argument('--fasOff', help='Turn OFF FAS support', action='store_true', default=False)
     optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle',
        choices=['mafft-linsi', 'muscle'],
        action='store', default='muscle')
-    optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int)
-    optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False)
-    optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False)
-    optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False)
-    optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False)
+    optional.add_argument('--cpus', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int)
 
     ### get arguments
     args = parser.parse_args()
     # required arguments
-    inFol = os.path.abspath(args.input)
+    inFol = os.path.abspath(args.seqFolder)
     jobName = args.jobName
     refspec = args.refspec
-    minDist = args.minDist
-    maxDist = args.maxDist
-    coreOrth = args.coreOrth
-
     # path arguments
     outpath = os.path.abspath(args.outpath)
     hmmpath = args.hmmpath
-    blastpath = args.blastpath
+    corepath = args.corepath
     searchpath = args.searchpath
-    weightpath = args.weightpath
+    annopath = args.annopath
     pathFile = args.pathFile
-
-    # other I/O arguments
-    append = args.append
-    force = args.force
-    forceComplete = args.forceComplete
-    noCleanup = args.noCleanup
-    keep = args.keep
-    group = args.group
-    blast = args.blast
-    db = args.db
-
     # core compilation arguments
     coreOnly = args.coreOnly
     reuseCore = args.reuseCore
+    minDist = args.minDist
+    maxDist = args.maxDist
+    coreSize = args.coreSize
     coreTaxa = args.coreTaxa
-    coreStrict = args.coreStrict
-    CorecheckCoorthologsRef = args.CorecheckCoorthologsRef
+    if not coreTaxa == '':
+        if os.path.exists(os.path.abspath(coreTaxa)):
+            coreTaxa = os.path.abspath(coreTaxa)
     CorecheckCoorthologsOff = args.CorecheckCoorthologsOff
-    if CorecheckCoorthologsOff == True:
-        CorecheckCoorthologsRef = False
     coreRep = args.coreRep
     coreHitLimit = args.coreHitLimit
     distDeviation = args.distDeviation
+    alnStrategy = args.alnStrategy
     # ortholog search arguments
-    strict = args.strict
-    checkCoorthologsRef = args.checkCoorthologsRef
-    checkCoorthologsOff = args.checkCoorthologsOff
-    if checkCoorthologsOff == True:
-        checkCoorthologsRef = False
+    searchTaxa = args.searchTaxa
+    if not searchTaxa == '':
+        if os.path.exists(os.path.abspath(searchTaxa)):
+            searchTaxa = os.path.abspath(searchTaxa)
+    group = args.group
+    if not group == '' and not searchTaxa == '':
+        print('WARNING: Both --group and --searchTaxa are specified. Search taxa will be obtained only from %s!' % searchTaxa)
+        group = ''
+    checkCoorthologsRefOff = args.checkCoorthologsRefOff
     rbh = args.rbh
     rep = args.rep
-    ignoreDistance = args.ignoreDistance
     lowComplexityFilter = args.lowComplexityFilter
     evalBlast = args.evalBlast
     evalHmmer = args.evalHmmer
-    evalRelaxfac = args.evalRelaxfac
     hitLimit = args.hitLimit
-    autoLimit = args.autoLimit
-    scoreThreshold = args.scoreThreshold
    scoreCutoff = args.scoreCutoff
-    aligner = args.aligner
-    local = args.local
-    glocal = args.glocal
-    searchTaxa = args.searchTaxa
     # fas arguments
-    fasoff = args.fasoff
-    countercheck = args.countercheck
+    fasOff = args.fasOff
     coreFilter = args.coreFilter
     minScore = args.minScore
-
-    # others
-    cpu = args.cpu
-    hyperthread = args.hyperthread
-    checkOff = args.checkOff
+    # other I/O arguments
+    append = args.append
+    force = args.force
+    forceCore = args.forceCore
+    noCleanup = args.noCleanup
+    keep = args.keep
     debug = args.debug
+    debugCore = args.debugCore
     silentOff = args.silentOff
-    if silentOff == True:
-        silent = False
-    else:
-        silent = True
-
-    ### check fas
-    if not fasoff:
-        try:
-            fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True)
-        except:
-            sys.exit('Problem with FAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!')
-
-    ### delete output folder and files if needed
-    if forceComplete:
-        if os.path.exists(outpath):
-            print("Removing existing output directory %s" % outpath)
-            shutil.rmtree(outpath)
-        Path(outpath).mkdir(parents=True, exist_ok=True)
-    if force:
-        if os.path.exists(outpath):
-            print("Removing existing files %s in %s*" % (jobName, outpath))
-            outfiles = os.listdir(outpath)
-            for item in outfiles:
-                if item.startswith(jobName):
-                    try:
-                        os.remove(os.path.join(outpath, item))
-                    except:
-                        shutil.rmtree(outpath+'/'+item)
-                if item.startswith("runtime"):
-                    os.remove(os.path.join(outpath, item))
-        if os.path.exists(outpath + '/missing.txt'):
-            os.remove(outpath + '/missing.txt')
-
-    ### get fdog and data path
-    dataPath = ''
-    fdogPath = os.path.realpath(__file__).replace('/runMulti.py','')
-    pathconfigFile = fdogPath + '/bin/pathconfig.txt'
-    if not os.path.exists(pathconfigFile):
-        sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
-    if pathFile == '':
-        with open(pathconfigFile) as f:
-            dataPath = f.readline().strip()
-    else:
-        cfg = fdogFn.load_config(pathFile)
-        try:
-            dataPath = cfg['dataPath']
-        except:
-            dataPath = 'config'
-
-    if hmmpath == '':
-        hmmpath = outpath + '/core_orthologs'
-        # hmmpath = dataPath + '/core_orthologs'
-        # if dataPath == 'config':
-        #     try:
-        #         hmmpath = cfg['hmmpath']
-        #     except:
-        #         sys.exit('hmmpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
-    else:
-        hmmpath = os.path.abspath(hmmpath)
-    if blastpath == '':
-        blastpath = dataPath + '/blast_dir'
-        if dataPath == 'config':
-            try:
-                blastpath = cfg['blastpath']
-            except:
-                sys.exit('blastpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
-    if searchpath == '':
-        searchpath = dataPath + '/genome_dir'
-        if dataPath == 'config':
-            try:
-                searchpath = cfg['searchpath']
-            except:
-                sys.exit('searchpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
-    if weightpath == '':
-        weightpath = dataPath + '/weight_dir'
-        if dataPath == 'config':
-            try:
-                weightpath = cfg['weightpath']
-            except:
-                sys.exit('weightpath not found in %s. Please check https://github.com/BIONF/fDOG/wiki/Input-and-Output-Files#data-structure' % pathFile)
+    # others
+    aligner = args.aligner
+    cpus = args.cpus
+    if cpus > os.cpu_count():
+        cpus = os.cpu_count()
+
+    begin = time.time()
+    ##### Check and group parameters
+    (inFol, hmmpath, corepath, searchpath, annopath) = prepare_fn.check_input(
+        [inFol, refspec, outpath, hmmpath,
+         corepath, searchpath, annopath, pathFile])
+    pathArgs = [outpath, hmmpath, corepath, searchpath, annopath]
 
-    ### join options
-    options = [fdogPath, refspec, minDist, maxDist, coreOrth,
-               append, force, noCleanup, group, blast, db,
-               outpath, hmmpath, blastpath, searchpath, weightpath,
-               coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation,
-               fasoff, countercheck, coreFilter, minScore,
-               strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa,
-               cpu, hyperthread, checkOff, debug, silent]
+    if not fasOff:
+        check_fas = fas_fn.check_fas_executable()
+        if check_fas == 0:
+            sys.exit('ERROR: FAS is not executable! You can still use fDOG with --fasOff!')
 
     ### START
     Path(outpath).mkdir(parents=True, exist_ok=True)
     multiLog = open(outpath + '/' + jobName + '_log.txt', "w")
     fdogStart = time.time()
-    seeds = getSortedFiles(inFol)
-    print('PID ' + str(os.getpid()))
-    multiLog.write('PID ' + str(os.getpid()) + '\n')
+    seeds = get_sorted_files(inFol)
+    print('PID %s - Jobname %s'% (str(os.getpid()), jobName))
+    multiLog.write('PID %s - Jobname %s\n'% (str(os.getpid()), jobName))
+
 
-    ### run core compilation
+    ##### DO CORE COMPILATION
     if reuseCore == False:
-        multiCoreTime = compileCore(options, seeds, inFol, cpu, outpath)
-        multiLog.write('==> Core compilation finished in %s sec\n' % multiCoreTime)
+        print('Starting compiling core orthologs...')
+        start = time.time()
+        coreArgs = [minDist, maxDist, coreSize, coreTaxa, distDeviation,
+                    alnStrategy, fasOff]
+        orthoCoreArgs = [CorecheckCoorthologsOff, rbh, True, evalBlast,
+                         lowComplexityFilter, evalHmmer, coreHitLimit,
+                         scoreCutoff, aligner]  # rep = True
+        otherCoreArgs = [cpus, debugCore, silentOff, noCleanup, force, append]
+        core_options = [coreArgs, orthoCoreArgs, otherCoreArgs]
+        other_options = [refspec, reuseCore, forceCore, pathArgs, debug]
+        core_runtime = compile_core(core_options, other_options, seeds, inFol, cpus, outpath, silentOff)
+        end = time.time()
+        multi_core_time = '{:5.3f}'.format(end-start)
+        print('==> Core compilation finished in %ss\n' % multi_core_time)
+        if len(core_runtime) > 1:
+            multiLog.write('==> Core compilation finished in %ss\n%s\n' % (multi_core_time, '\n'.join(core_runtime)))
+        else:
+            multiLog.write('==> Core compilation finished in %ss\n' % multi_core_time)
     else:
         if not os.path.exists(hmmpath):
             sys.exit('--reuseCore was set, but no core orthologs found in %s! You could use --hmmpath to manually specify the core ortholog directory.'
                % outpath)
 
-    ### do ortholog search
-    if coreOnly == False:
-        if not os.path.exists('%s/%s.extended.fa' % (outpath, jobName)):
-            ### create list of search taxa
-            searchTaxa = ''
-            searchGroup = 'all'
+
+    ##### DO ORTHOLOG SEARCH USING HMM (HAMSTR)
+    finalFa = '%s/%s.extended.fa' % (outpath, jobName)
+    if not coreOnly:
+        print('Searching orthologs...')
+        start = time.time()
+        if not os.path.exists(finalFa) or force == True:
+            ### get list of search taxa
             if not group == '':
-                print('Creating list for search taxa...')
-                searchTaxa = '%s/searchTaxa.txt' % (outpath)
-                searchGroup = group
-                cmd = 'perl %s/bin/getSearchTaxa.pl -i %s -b %s -h %s -r %s -n %s -t %s/taxonomy -o %s' % (fdogPath, searchpath, evalBlast, evalHmmer, evalRelaxfac, searchGroup, fdogPath, searchTaxa)
-                try:
-                    subprocess.call([cmd], shell = True)
-                except:
-                    sys.exit('Problem running\n%s' % (cmd))
-            ### run ortholog search
-            multiOrthoTime = searchOrtho(options, seeds, inFol, cpu, outpath)
-            multiLog.write('==> Ortholog search finished in %s sec\n' % multiOrthoTime)
-            ### join output
-            finalFa = joinOutputs(outpath, jobName, seeds, keep, silent)
-        else:
-            if append == True:
-                sys.exit("Currently the append option is not available. Please use fdog.run if you need this option!")
-            else:
-                sys.exit("%s.extended.fa found in %s! If you want to re-run the ortholog search, please use --force or --append option." % (jobName, outpath))
-    ### calculate FAS scores
-    if fasoff == False:
-        if os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)):
-            os.remove('%s/%s.phyloprofile' % (outpath, jobName))
-        if not os.path.exists('%s/%s.phyloprofile' % (outpath, jobName)):
-            if os.path.exists(finalFa) and os.path.getsize(finalFa) > 0:
-                fasTime = calcFAS(outpath, finalFa, weightpath, cpu)
-                multiLog.write('==> FAS calculation finished in %s sec\n' % fasTime)
+                ### Check valid taxonomy group
+                ncbi = NCBITaxa()
+                group_id = ncbi.get_name_translator([group])
+                if len(group_id) == 0:
+                    exit('ERROR: Taxon group "%s" invalid!' % group)
+                ### create taxonomy tree from list of search taxa
+                searchTaxa = []
+                tax_ids = core_fn.get_core_taxa_ids(coreTaxa, corepath)
+
+                for tax_id in tax_ids.keys():
+                    check = tree_fn.check_taxon_group(group_id[group][0], tax_id, ncbi)
+                    if check == True:
+                        searchTaxa.append(tax_ids[tax_id])
+                if debugCore:
+                    print(searchTaxa)
+                if len(searchTaxa) == 0:
+                    exit('ERROR: No taxon found within %s taxonomy group!' % group)
+                else:
-                    print("Final fasta file %s not exists or empty!" % finalFa)
+                    searchTaxa = ','.join(searchTaxa)
+
+            if searchTaxa == '':
+                searchTaxa = general_fn.read_dir(searchpath)
+                searchTaxa = ','.join(searchTaxa)
+
+            ### do ortholog search
+            orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast,
+                         lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner]
+            otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append]
+            ortho_options = [orthoArgs, otherArgs, pathArgs, refspec]
+            ortho_runtime = search_ortholog(ortho_options, seeds, inFol, cpus, outpath)
+            end = time.time()
+            multi_ortho_time = '{:5.3f}'.format(end-start)
+            print('==> Ortholog search finished in %ss\n' % multi_ortho_time)
+            multiLog.write('==> Ortholog search finished in %ss\n%s\n' % (multi_ortho_time, '\n'.join(ortho_runtime)))
+        ### join output
+        print('Joining single outputs...')
+        start = time.time()
+        join_outputs(outpath, jobName, seeds, keep, silentOff)
+        end = time.time()
+        print('==> Joining outputs finished in %ss\n' % '{:5.3f}'.format(end-start))
+
+    ##### DO FINAL FAS CALCULATION
+    if not fasOff:
+        try:
+            fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True)
+        except:
+            sys.exit('Problem with FAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!')
+        if os.path.exists(finalFa):
+            start = time.time()
+            fas_fn.calc_fas_multi(finalFa, outpath, annopath, cpus)
+            end = time.time()
+            print('==> FAS calculation finished in ' + '{:5.3f}s'.format(end - start))
+            multiLog.write('==> FAS calculation finished in ' + '{:5.3f}s'.format(end - start))
     else:
-        shutil.move('%s/%s.phyloprofile' % (outpath, jobName), '%s/%s.phyloprofile.tmp' % (outpath, jobName))
-        removeDupLines ('%s/%s.phyloprofile.tmp' % (outpath, jobName), '%s/%s.phyloprofile' % (outpath, jobName))
-        os.remove('%s/%s.phyloprofile.tmp' % (outpath, jobName))
+        output_fn.hamstr_2_profile(finalFa)
 
-    ### create PhyloProfile config file
-    createConfigPP(outpath, jobName, refspec)
-
-    fdogEnd = time.time()
-    print('==> fdogs.run finished in ' + '{:5.3f}s'.format(fdogEnd-fdogStart))
-    multiLog.write('==> fdogs.run finished in ' + '{:5.3f}s'.format(fdogEnd-fdogStart))
-    multiLog.close()
+    end = time.time()
+    print('==> fdogs.run finished in ' + '{:5.3f}s'.format(end - begin))
+    multiLog.close()
 
 if __name__ == '__main__':
     main()
diff --git a/fdog/runSingle.py b/fdog/runSingle.py
index 2fbd9d4..cce779d 100644
--- a/fdog/runSingle.py
+++ b/fdog/runSingle.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
-# This script is used to run fdog for one seed sequence.
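# NOTE (editor): a sketch of the two --searchTaxa forms accepted by
# run_hamstr() in fdog/libs/orthosearch.py (see above): an inline
# comma-separated string or a text file with one taxon per line. Taxon names
# follow the [abbrName]@[taxID]@[version] scheme; the values here are
# placeholders only.
#
#     searchTaxa = 'HOMSA@9606@1,MUSMU@10090@1'   # inline form
#     search_taxa = searchTaxa.split(',')
#     # file form (one taxon per line):
#     # search_taxa = general_fn.read_file('my_taxa.txt')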
+# This file is part of fDOG tool https://github.com/BIONF/fDOG # # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -19,193 +19,27 @@ import os import argparse import subprocess -from pathlib import Path -import yaml from ete3 import NCBITaxa +from pkg_resources import get_distribution +import time +import fdog.libs.preparation as prepare_fn +import fdog.libs.orthosearch as ortho_fn +import fdog.libs.corecompile as core_fn +import fdog.libs.fas as fas_fn +import fdog.libs.tree as tree_fn +import fdog.libs.output as output_fn -def checkFileExist(file): - if not os.path.exists(os.path.abspath(file)): - sys.exit('%s not found' % file) - -def load_config(config_file): - with open(config_file, 'r') as stream: - try: - return yaml.safe_load(stream) - except yaml.YAMLError as exc: - print(exc) - -def checkInput(args): - (fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath) = args - # create output directory - Path(outpath).mkdir(parents=True, exist_ok=True) - Path(hmmpath).mkdir(parents=True, exist_ok=True) - # check path existing - for path in [hmmpath, blastpath, searchpath, weightpath]: - checkFileExist(path) - # check for seqFile - if not os.path.exists(os.path.abspath(seqFile)): - if not os.path.exists(fdogPath + '/data/' + seqFile): - sys.exit('%s not found in %s or %s' % (seqFile, os.getcwd(), fdogPath + '/data/')) - else: - seqFile = fdogPath + '/data/' + seqFile - else: - seqFile = os.path.abspath(seqFile) - # check refspec - if not os.path.exists(os.path.abspath(blastpath+'/'+refspec)): - exit('Reference taxon %s not found in %s' % (refspec, blastpath)) - return (seqFile, hmmpath, blastpath, searchpath, weightpath) - -def getfdogInfo(fdogPath, infoType): - if os.path.exists(fdogPath + '/bin/oneSeq.pl'): - cmd = subprocess.Popen([fdogPath + '/bin/oneSeq.pl', infoType], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - msg, err = cmd.communicate() - print(msg.decode('UTF-8').strip()) - print(err.decode('UTF-8').strip()) - exit() - else: - exit('%s not found' % (fdogPath + '/bin/oneSeq.pl')) - -def runSingle(args): - (basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, mute) = args - # basic command - (fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth) = basicArgs - cmd = 'perl %s/bin/oneSeq.pl -seqFile=%s -seqName=%s -refspec=%s' % (fdogPath, seqFile, seqName, refspec) - # add paths - (outpath, hmmpath, blastpath, searchpath, weightpath) = pathArgs - cmd = cmd + ' -outpath=%s -hmmpath=%s -blastpath=%s -searchpath=%s -weightpath=%s' % (outpath, hmmpath, blastpath, searchpath, weightpath) - # add other I/O options - (append, force, noCleanup, group, blast, db) = ioArgs - if append == True: - cmd = cmd + ' -append' - if force == True: - cmd = cmd + ' -force' - if noCleanup == False: - cmd = cmd + ' -cleanup' - if blast == True: - cmd = cmd + ' -blast' - if db == True: - cmd = cmd + ' -db' - if not group == '': - cmd = cmd + ' -group=%s' % group - # add core compilation options - (coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation) = coreArgs - if coreOnly == True: - cmd = cmd + ' -coreOnly' - if reuseCore == True: - cmd = cmd + ' -reuseCore' - else: - cmd = cmd + ' -minDist=%s -maxDist=%s -coreOrth=%s' % (minDist, maxDist, coreOrth) - if not coreTaxa == '': - cmd = cmd + ' -coreTaxa=%s' % coreTaxa - if coreStrict == True: - cmd = cmd + ' -coreStrict' - if 
CorecheckCoorthologsRef == True: - cmd = cmd + ' -CorecheckCoorthologsRef' - if coreRep == True: - cmd = cmd + ' -coreRep' - if not coreHitLimit == 3: - cmd = cmd + ' -coreHitLimit=%s' % coreHitLimit - if not distDeviation == 0.05: - cmd = cmd + ' -distDeviation=%s' % distDeviation - # add ortholo search options - (strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa) = orthoArgs - if strict == True: - cmd = cmd + ' -strict' - if checkCoorthologsRef == True: - cmd = cmd + ' -checkCoorthologsRef' - if rbh == True: - cmd = cmd + ' -rbh' - if rep == True: - cmd = cmd + ' -rep' - if ignoreDistance == True: - cmd = cmd + ' -ignoreDistance' - if lowComplexityFilter == True: - cmd = cmd + ' -filter=T' - if not evalBlast == 0.00005: - cmd = cmd + ' -evalBlast=%s' % evalBlast - if not evalHmmer == 0.00005: - cmd = cmd + ' -evalHmmer=%s' % evalHmmer - if not evalRelaxfac == 10: - cmd = cmd + ' -evalRelaxfac=%s' % evalRelaxfac - if not hitLimit == 10: - cmd = cmd + ' -hitLimit=%s' % hitLimit - if autoLimit == True: - cmd = cmd + ' -autoLimit' - if scoreThreshold: - cmd = cmd + ' -scoreThreshold' - if not scoreCutoff == 10: - cmd = cmd + ' -scoreCutoff=%s' % scoreCutoff - if not aligner == 'muscle': - cmd = cmd + ' -aligner=%s' % aligner - if glocal == True: - cmd = cmd + ' -glocal' - if not searchTaxa == '': - checkFileExist(searchTaxa) - searchTaxa = os.path.abspath(searchTaxa) - cmd = cmd + ' -searchTaxa=%s' % searchTaxa - # add fas options - (fasoff, countercheck, coreFilter, minScore) = fasArgs - if fasoff == True: - cmd = cmd + ' -fasoff' - else: - if countercheck == True: - cmd = cmd + ' -countercheck' - if not coreFilter == '': - if minScore > 0: - cmd = cmd + ' -coreFilter=%s -minScore=%s' % (coreFilter, minScore) - # add other options - (cpu, hyperthread, checkOff, debug, silent) = otherArgs - cmd = cmd + ' -cpu=%s' % cpu - if hyperthread == True: - cmd = cmd + ' -hyperthread' - if checkOff == True: - cmd = cmd + ' -checkOff' - if debug == True: - cmd = cmd + ' -debug' - if silent == True: - cmd = cmd + ' -silent' - # print(cmd) - if mute == True: - cmd = cmd + ' > /dev/null 2>&1' - try: - subprocess.call([cmd], shell = True) - except: - sys.exit('Problem running\n%s' % (cmd)) - -def createConfigPP(outpath, seqName, refspec): - settings = dict( - mainInput = '%s/%s/%s.phyloprofile' % (outpath, seqName, seqName), - fastaInput = '%s/%s/%s.extended.fa' % (outpath, seqName, seqName), - ) - domainFile = '%s/%s/%s_forward.domains' % (outpath, seqName, seqName) - if os.path.exists(os.path.abspath(domainFile)): - settings['domainInput'] = domainFile - taxId = refspec.split('@')[1] - refspec = getTaxName(taxId) - if not refspec == 'UNK': - settings['rank'] = 'species' - settings['refspec'] = refspec - settings['clusterProfile'] = 'FALSE' - with open('%s/%s/%s.config.yml' % (outpath, seqName, seqName), 'w') as outfile: - yaml.dump(settings, outfile, default_flow_style = False) - -def getTaxName(taxId): - ncbi = NCBITaxa() - try: - name = ncbi.get_taxid_translator([taxId])[int(taxId)] - except: - name = 'UNK' - return(name) def main(): - version = '0.0.53' - parser = argparse.ArgumentParser(description='You are running fdog.run version ' + str(version) + '.') - parser.add_argument('--version', action='version', version=str(version)) + version = get_distribution('fdog').version + parser = argparse.ArgumentParser(description='You are running fDOG version ' + 
str(version) + '.', + epilog="For more information on certain options, please refer to the wiki pages " + "on github: https://github.com/BIONF/fDOG/wiki") required = parser.add_argument_group('Required arguments') required.add_argument('--seqFile', help='Input file containing the seed sequence (protein only) in fasta format', action='store', default='', required=True) - required.add_argument('--seqName', help='Job name. This will also be file name for the output', + required.add_argument('--jobName', help='Job name. This will also be the file name for the output', action='store', default='', required=True) required.add_argument('--refspec', help='Reference taxon. It should be the species the seed sequence was derived from', action='store', default='', required=True) @@ -213,20 +47,11 @@ def main(): optional_paths = parser.add_argument_group('Non-default directory options') optional_paths.add_argument('--outpath', help='Output directory', action='store', default='') optional_paths.add_argument('--hmmpath', help='Path for the core ortholog directory', action='store', default='') - optional_paths.add_argument('--blastpath', help='Path for the blastDB directory', action='store', default='') + optional_paths.add_argument('--corepath', help='Path for the core taxa directory', action='store', default='') optional_paths.add_argument('--searchpath', help='Path for the search taxa directory', action='store', default='') - optional_paths.add_argument('--weightpath', help='Path for the pre-calculated feature annotion directory', action='store', default='') + optional_paths.add_argument('--annopath', help='Path for the pre-calculated feature annotation directory', action='store', default='') optional_paths.add_argument('--pathFile', help='Config file containing paths to the data folders (in yaml format)', action='store', default='') - addtionalIO = parser.add_argument_group('Other I/O options') - addtionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) - addtionalIO.add_argument('--force', help='Overwrite existing output files', action='store_true', default=False) - addtionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. Default: False', action='store_true', default=False) - addtionalIO.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') - addtionalIO.add_argument('--blast', help='Determine sequence id and refspec automatically. Note, the chosen sequence id and reference species does not necessarily reflect the species the sequence was derived from.', - action='store_true', default=False) - addtionalIO.add_argument('--db', help='Run fdog in database mode. Requires a mySql database. Only for internal use.', action='store_true', default=False) - core_options = parser.add_argument_group('Core compilation options') core_options.add_argument('--coreOnly', help='Compile only the core orthologs', action='store_true', default=False) core_options.add_argument('--reuseCore', help='Reuse existing core set of your sequence', action='store_true', default=False) @@ -236,12 +61,8 @@ def main(): core_options.add_argument('--maxDist', help='Maximum systematic distance of primer taxa for the core set compilation. Default: kingdom', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom'], action='store', default='kingdom') - core_options.add_argument('--coreOrth', help='Number of orthologs added to the core set. 
Default: 5', action='store', default=5, type=int) + core_options.add_argument('--coreSize', help='Maximum number of orthologs in the core set. Default: 6', action='store', default=6, type=int) core_options.add_argument('--coreTaxa', help='List of primer taxa that should exclusively be used for the core set compilation', action='store', default='') - core_options.add_argument('--coreStrict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', - action='store_true', default=False) - core_options.add_argument('--CorecheckCoorthologsRef', help='During the core compilation, an ortholog also be accepted when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', - action='store_true', default=True) core_options.add_argument('--CorecheckCoorthologsOff', help='Turn off checking for co-orthologs of the reverse search during the core compilation', action='store_true', default=False) core_options.add_argument('--coreRep', help='Obtain only the sequence being most similar to the corresponding sequence in the core set rather than all putative co-orthologs', action='store_true', default=False) @@ -250,20 +71,14 @@ def main(): action='store', default=3, type=int) core_options.add_argument('--distDeviation', help='The deviation in score in percent (0 = 0 percent, 1 = 100 percent) allowed for two taxa to be considered similar. Default: 0.05', action='store', default=0.05, type=float) - core_options.add_argument('--ignoreDistance', help='Ignore the distance between Taxa and to choose orthologs only based on score', - action='store_true', default=False) - core_options.add_argument('--local', help='Specify the alignment strategy during core ortholog compilation. Default: True', - action='store_true', default=True) - core_options.add_argument('--glocal', help='Specify the alignment strategy during core ortholog compilation. Default: False', - action='store_true', default=False) + core_options.add_argument('--alnStrategy', help='Specify the alignment strategy during core ortholog compilation. 
Default: local', + choices=['local', 'glocal', 'global'], + action='store', default='local') ortho_options = parser.add_argument_group('Ortholog search strategy options') ortho_options.add_argument('--searchTaxa', help='Specify a file containing the list of search taxa', action='store', default='') - ortho_options.add_argument('--strict', help='An ortholog is only then accepted when the reciprocity is fulfilled for each sequence in the core set', - action='store_true', default=False) - ortho_options.add_argument('--checkCoorthologsRef', help='During the final ortholog search, accept an ortholog also when its best hit in the reverse search is not the core ortholog itself, but a co-ortholog of it', - action='store_true', default=True) - ortho_options.add_argument('--checkCoorthologsOff', help='Turn off checking for co-ortholog of the reverse search during the final ortholog search', + ortho_options.add_argument('--group', help='Allows to limit the search to a certain systematic group', action='store', default='') + ortho_options.add_argument('--checkCoorthologsRefOff', help='Turn off checking for co-orthologs of the reverse search during the final ortholog search', action='store_true', default=False) ortho_options.add_argument('--rbh', help='Requires a reciprocal best hit during the ortholog search to accept a new ortholog', action='store_true', default=False) @@ -275,177 +90,192 @@ def main(): action='store', default=0.00005, type=float) ortho_options.add_argument('--evalHmmer', help='E-value cut-off for the HMM search. Default: 0.00005', action='store', default=0.00005, type=float) - ortho_options.add_argument('--evalRelaxfac', help='The factor to relax the e-value cut-off (Blast search and HMM search). Default: 10', - action='store', default=10, type=int) ortho_options.add_argument('--hitLimit', help='Number of hits of the initial pHMM based search that should be evaluated via a reverse search. Default: 10', action='store', default=10, type=int) - ortho_options.add_argument('--autoLimit', help='Invoke a lagPhase analysis on the score distribution from the hmmer search. This will determine automatically a hit limit for each query. Note, it will be effective for both the core compilation and the final ortholog search', - action='store_true', default=False) - ortho_options.add_argument('--scoreThreshold', help='Instead of setting an automatic hit limit, you can specify with this flag that only candidates with an hmm score no less than x percent of the hmm score of the best hit are further evaluated. Default: x = 10. You can change this cutoff with the option -scoreCutoff. Note, it will be effective for both the core compilation and the final ortholog search', - action='store_true', default=False) - ortho_options.add_argument('--scoreCutoff', help='In combination with -scoreThreshold you can define the percent range of the hmms core of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. Default: 10', + ortho_options.add_argument('--scoreCutoff', help='Define the percent range of the hmm score of the best hit up to which a candidate of the hmmsearch will be subjected for further evaluation. 
Default: 10', action='store', default=10, type=int) fas_options = parser.add_argument_group('FAS options') - fas_options.add_argument('--fasoff', help='Turn OFF FAS support', action='store_true', default=False) - fas_options.add_argument('--countercheck', help='The FAS score will be computed in two ways', action='store_true', default=True) fas_options.add_argument('--coreFilter', help='Specify mode for filtering core orthologs by FAS score. In \'relaxed\' mode candidates with insufficient FAS score will be disadvantaged. In \'strict\' mode candidates with insufficient FAS score will be deleted from the candidates list. The option \'--minScore\' specifies the cut-off of the FAS score.', choices=['relaxed', 'strict'], action='store', default='') fas_options.add_argument('--minScore', help='Specify the threshold for coreFilter. Default: 0.75', action='store', default=0.75, type=float) + additionalIO = parser.add_argument_group('Other I/O options') + additionalIO.add_argument('--append', help='Append the output to existing output files', action='store_true', default=False) + additionalIO.add_argument('--force', help='Overwrite existing ortholog search output files', action='store_true', default=False) + additionalIO.add_argument('--forceCore', help='Overwrite existing core set of your sequence', action='store_true', default=False) + additionalIO.add_argument('--noCleanup', help='Temporary output will NOT be deleted. Default: False', action='store_true', default=False) + additionalIO.add_argument('--debug', help='Set this flag to obtain more detailed information about the ortholog search progress', action='store_true', default=False) + additionalIO.add_argument('--debugCore', help='Set this flag to obtain more detailed information about the core compilation actions', action='store_true', default=False) + additionalIO.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + optional = parser.add_argument_group('Other options') + optional.add_argument('--fasOff', help='Turn OFF FAS support', action='store_true', default=False) optional.add_argument('--aligner', help='Choose between mafft-linsi or muscle for the multiple sequence alignment. DEFAULT: muscle', choices=['mafft-linsi', 'muscle'], action='store', default='muscle') - optional.add_argument('--cpu', help='Determine the number of threads to be run in parallel. Default: 4', action='store', default=4, type=int) - optional.add_argument('--hyperthread', help='Set this flag to use hyper threading. Default: False', action='store_true', default=False) - optional.add_argument('--checkOff', help='Set this flag to turn of the initial checks. Default: False', action='store_true', default=False) - optional.add_argument('--debug', help='Set this flag to obtain more detailed information about the programs actions', action='store_true', default=False) - optional.add_argument('--silentOff', help='Show more output to terminal', action='store_true', default=False) + optional.add_argument('--cpus', help='Determine the number of threads to be run in parallel. 
Default: 4', action='store', default=4, type=int) ### get arguments args = parser.parse_args() # required arguments seqFile = args.seqFile - seqName = args.seqName + seqName = args.jobName refspec = args.refspec - minDist = args.minDist - maxDist = args.maxDist - coreOrth = args.coreOrth - # path arguments outpath = os.path.abspath(args.outpath) hmmpath = args.hmmpath - blastpath = args.blastpath + corepath = args.corepath searchpath = args.searchpath - weightpath = args.weightpath + annopath = args.annopath pathFile = args.pathFile - # other I/O arguments - append = args.append - force = args.force - noCleanup = args.noCleanup - group = args.group - blast = args.blast - db = args.db - # core compilation arguments coreOnly = args.coreOnly reuseCore = args.reuseCore + minDist = args.minDist + maxDist = args.maxDist + coreSize = args.coreSize coreTaxa = args.coreTaxa - coreStrict = args.coreStrict - CorecheckCoorthologsRef = True #args.CorecheckCoorthologsRef + if not coreTaxa == '': + if os.path.exists(os.path.abspath(coreTaxa)): + coreTaxa = os.path.abspath(coreTaxa) CorecheckCoorthologsOff = args.CorecheckCoorthologsOff - if CorecheckCoorthologsOff == True: - CorecheckCoorthologsRef = False coreRep = args.coreRep coreHitLimit = args.coreHitLimit distDeviation = args.distDeviation + alnStrategy = args.alnStrategy # ortholog search arguments - strict = args.strict - checkCoorthologsRef = args.checkCoorthologsRef - checkCoorthologsOff = args.checkCoorthologsOff - if checkCoorthologsOff == True: - checkCoorthologsRef = False + searchTaxa = args.searchTaxa + if not searchTaxa == '': + if os.path.exists(os.path.abspath(searchTaxa)): + searchTaxa = os.path.abspath(searchTaxa) + group = args.group + if not group == '' and not searchTaxa == '': + print('WARNING: Both --group and --searchTaxa are specified. Search taxa will be obtained only from %s!' % searchTaxa) + group = '' + checkCoorthologsRefOff = args.checkCoorthologsRefOff rbh = args.rbh rep = args.rep - ignoreDistance = args.ignoreDistance lowComplexityFilter = args.lowComplexityFilter evalBlast = args.evalBlast evalHmmer = args.evalHmmer - evalRelaxfac = args.evalRelaxfac hitLimit = args.hitLimit - autoLimit = args.autoLimit - scoreThreshold = args.scoreThreshold scoreCutoff = args.scoreCutoff - aligner = args.aligner - local = args.local - glocal = args.glocal - searchTaxa = args.searchTaxa # fas arguments - fasoff = args.fasoff - countercheck = args.countercheck + fasOff = args.fasOff coreFilter = args.coreFilter minScore = args.minScore - # others - cpu = args.cpu - hyperthread = args.hyperthread - checkOff = args.checkOff + # other I/O arguments + append = args.append + force = args.force + forceCore = args.forceCore + noCleanup = args.noCleanup debug = args.debug + debugCore = args.debugCore silentOff = args.silentOff - if silentOff == True: - silent = False - else: - silent = True - - ### get fdog and data path - dataPath = '' - fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') - pathconfigFile = fdogPath + '/bin/pathconfig.txt' - if not os.path.exists(pathconfigFile): - sys.exit('No pathconfig.txt found. 
Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') - if pathFile == '': - with open(pathconfigFile) as f: - dataPath = f.readline().strip() - else: - cfg = load_config(pathFile) - try: - dataPath = cfg['dataPath'] - except: - dataPath = 'config' - - if hmmpath == '': - hmmpath = outpath + '/core_orthologs' - # hmmpath = dataPath + '/core_orthologs' - # if dataPath == 'config': - # try: - # hmmpath = cfg['hmmpath'] - # except: - # sys.exit('hmmpath not found in %s' % pathFile) - - if blastpath == '': - blastpath = dataPath + '/blast_dir' - if dataPath == 'config': - try: - blastpath = cfg['blastpath'] - except: - sys.exit('blastpath not found in %s' % pathFile) - if searchpath == '': - searchpath = dataPath + '/genome_dir' - if dataPath == 'config': - try: - searchpath = cfg['searchpath'] - except: - sys.exit('searchpath not found in %s' % pathFile) - if weightpath == '': - weightpath = dataPath + '/weight_dir' - if dataPath == 'config': + + # others + aligner = args.aligner + cpus = args.cpus + if cpus > os.cpu_count(): + cpus = os.cpu_count() + + + begin = time.time() + ##### Check and group parameters + if seqFile == 'infile.fa': + fdogPath = os.path.realpath(__file__).replace('/runSingle.py','') + seqFile = '%s/data/infile.fa' % fdogPath + + (seqFile, hmmpath, corepath, searchpath, annopath) = prepare_fn.check_input( + [seqFile, refspec, outpath, hmmpath, + corepath, searchpath, annopath, pathFile]) + pathArgs = [outpath, hmmpath, corepath, searchpath, annopath] + + if not fasOff: + check_fas = fas_fn.check_fas_executable() + if check_fas == 0: + sys.exit('ERROR: FAS is not executable! You can still use fDOG with --fasOff!') + + ##### Identify seed ID from refspec genome + seed_id = prepare_fn.identify_seed_id(seqFile, refspec, corepath, debug, silentOff) + print('Identified seed ID: %s' % seed_id) + + + ##### DO CORE COMPILATION + # start = time.time() + coreArgs = [minDist, maxDist, coreSize, coreTaxa, distDeviation, + alnStrategy, fasOff] + orthoCoreArgs = [CorecheckCoorthologsOff, rbh, True, evalBlast, + lowComplexityFilter, evalHmmer, coreHitLimit, + scoreCutoff, aligner] # rep = True + otherCoreArgs = [cpus, debugCore, silentOff, noCleanup, force, append] + print('Compiling core set for %s' % seqName) + core_runtime = core_fn.run_compile_core([seqFile, seqName, refspec, seed_id, reuseCore, + forceCore, coreArgs, pathArgs, orthoCoreArgs, otherCoreArgs, debug]) + # end = time.time() + print('==> Core compilation finished in %s' % core_runtime[1]) + + + ##### DO ORTHOLOG SEARCH USING CORE HMM (HAMSTR) + if not coreOnly: + start = time.time() + # check existing output + finalOutfile = '%s/%s.extended.fa' % (outpath, seqName) + finalOutfile = os.path.abspath(finalOutfile) + output_fn.check_output_exist(finalOutfile, force, append) + # get list of search taxa + if not group == '': + ### Check valid taxonomy group + ncbi = NCBITaxa() + group_id = ncbi.get_name_translator([group]) + if len(group_id) == 0: + exit('ERROR: Taxon group "%s" invalid!' % group) + ### create taxonomy tree from list of search taxa + searchTaxa = [] + tax_ids = core_fn.get_core_taxa_ids(coreTaxa, corepath) + + for tax_id in tax_ids.keys(): + check = tree_fn.check_taxon_group(group_id[group][0], tax_id, ncbi) + if check == True: + searchTaxa.append(tax_ids[tax_id]) + if debugCore: + print(searchTaxa) + if len(searchTaxa) == 0: + exit('ERROR: No taxon found within %s taxonomy group!' 
% group) + else: + searchTaxa = ','.join(searchTaxa) + # do ortholog search + orthoArgs = [checkCoorthologsRefOff, rbh, rep, evalBlast, + lowComplexityFilter, evalHmmer, hitLimit, scoreCutoff, aligner] + otherArgs = [searchTaxa, cpus, debug, silentOff, noCleanup, force, append] + hamstr_out = ortho_fn.run_hamstr([seqName, refspec, pathArgs, orthoArgs, otherArgs]) + output_fn.write_hamstr(hamstr_out, outpath, seqName, force, append) + end = time.time() + print('==> Ortholog search finished in ' + '{:5.3f}s'.format(end - start)) + + ##### DO FINAL FAS CALCULATION + if not fasOff: try: - weightpath = cfg['weightpath'] + fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) except: - sys.exit('weightpath not found in %s' % pathFile) - - ### check input arguments - seqFile, hmmpath, blastpath, searchpath, weightpath = checkInput([fdogPath, seqFile, refspec, outpath, hmmpath, blastpath, searchpath, weightpath]) - # group arguments - basicArgs = [fdogPath, seqFile, seqName, refspec, minDist, maxDist, coreOrth] - ioArgs = [append, force, noCleanup, group, blast, db] - pathArgs = [outpath, hmmpath, blastpath, searchpath, weightpath] - coreArgs = [coreOnly, reuseCore, coreTaxa, coreStrict, CorecheckCoorthologsRef, coreRep, coreHitLimit, distDeviation] - fasArgs = [fasoff, countercheck, coreFilter, minScore] - orthoArgs = [strict, checkCoorthologsRef, rbh, rep, ignoreDistance, lowComplexityFilter, evalBlast, evalHmmer, evalRelaxfac, hitLimit, autoLimit, scoreThreshold, scoreCutoff, aligner, local, glocal, searchTaxa] - otherArgs = [cpu, hyperthread, checkOff, debug, silent] - - ### run fdog - runSingle([basicArgs, ioArgs, pathArgs, coreArgs, orthoArgs, fasArgs, otherArgs, False]) - - ### create PhyloProfile config file - createConfigPP(outpath, seqName, refspec) + sys.exit('Problem with FAS! Please check https://github.com/BIONF/FAS or turn it off if not needed!') + if os.path.exists(finalOutfile): + start = time.time() + fas_fn.calc_fas_multi(finalOutfile, outpath, annopath, cpus) + end = time.time() + print('==> FAS calculation finished in ' + '{:5.3f}s'.format(end - start)) + else: + output_fn.hamstr_2_profile(finalOutfile) + + end = time.time() + print('==> fdog.run finished in ' + '{:5.3f}s'.format(end - begin)) if __name__ == '__main__': main() diff --git a/fdog/setup/__init__.py b/fdog/setup/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/fdog/setup/indexTaxonomy.pl b/fdog/setup/indexTaxonomy.pl deleted file mode 100644 index 967f3ef..0000000 --- a/fdog/setup/indexTaxonomy.pl +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/perl -use Bio::DB::Taxonomy; - -my $idx_dir = $ARGV[0]; -# taxon files can be downloaded from: ftp://ftp.ncbi.nih.gov/pub/taxonomy/ -my $db = Bio::DB::Taxonomy->new(-source => 'flatfile', - -nodesfile => $idx_dir . '/nodes.dmp', - -namesfile => $idx_dir . '/names.dmp', - -directory => $idx_dir); -# test -my $taxonid = 9606; -my $taxon = $db->get_taxon(-taxonid => $taxonid); -my $name = $taxon->scientific_name; - -if ($name eq "Homo sapiens") { - print "Index files for taxonomy database were successfully generated!\n"; -} else { - print "Something wrong happened while indexing taxonomy. 
Please try again!\n"; -} - -exit; diff --git a/fdog/setup/install_lib.sh b/fdog/setup/install_lib.sh deleted file mode 100755 index e5ca4a9..0000000 --- a/fdog/setup/install_lib.sh +++ /dev/null @@ -1,182 +0,0 @@ -#!/bin/bash - -sys="$(uname)" # Linux for Linux or Darwin for MacOS - -flag=0 - -### update GPG key (Google signature key for signing and authenticating packages) -if ! [ "$sys" == "Darwin" ]; then - wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | sudo apt-key add - -fi - -### check grep, sed and wget availability -grepprog='grep' -sedprog='sed' -readlinkprog='readlink' -wgetprog='wget' -bashFile='.bashrc' -if [ "$sys" == "Darwin" ]; then - if [ -z "$(which brew)" ]; then - echo "Please install homebrew to install dependencies tools and libraries!" - echo "Check https://brew.sh" - exit - fi - sedprog='gsed' - grepprog='ggrep' - readlinkprog='greadlink' - shell=$(echo $SHELL) - if [ $shell == "/bin/zsh" ]; then - bashFile='.zshrc' - else - bashFile='.bash_profile' - fi -else - if [ "$EUID" -ne 0 ]; then - echo "You must run this setup as a root user!" - exit - fi -fi - -if [ -z "$(which $readlinkprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install coreutils - fi -fi - -if [ -z "$(which $sedprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install gnu-sed - fi -fi - -if [ -z "$(which $grepprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install grep - fi -fi - -if [ -z "$(which $wgetprog)" ]; then - if [ "$sys" == "Darwin" ]; then - brew install wget - fi -fi - -if ! [ -f ~/$bashFile ]; then - touch ~/$bashFile -fi - -### check dependencies -echo "-------------------------------------" -echo "Installing dependencies..." - -dependenciesUbuntu=( - build-essential # for make - curl - r-base # for Statistics::R - wise - hmmer # hmmer (for both hmmsearch and hmmbuild) - clustalw - mafft - muscle - blast2 # blast - ncbi-blast+ - libdbi-perl - libipc-run-perl - perl-doc - locales - lib32z1 -) - -dependenciesMac=( - brewsci/bio/genewise - hmmer # hmmer (for both hmmsearch and hmmbuild) - brewsci/bio/clustal-w - mafft - brewsci/bio/muscle - blast -) - -if [ "$sys" == "Darwin" ]; then - for i in "${dependenciesMac[@]}"; do - echo $i - brew install $i - done - if [ -z "$(grep clustalw ~/$bashFile)" ]; then - echo "alias clustalw='clustalw2'" >> ~/$bashFile - fi -else - sudo apt-get update -y - for i in "${dependenciesUbuntu[@]}"; do - echo $i - sudo apt-get install -y -qq $i > /dev/null - done -fi - -dependencies=( - genewise - hmmsearch - hmmbuild - mafft - muscle - blastn -) - -for i in "${dependencies[@]}"; do - if [ -z "$(which $i)" ]; then - echo "$i not found / cannot be automatically installed. Please install it manually and run this setup again!" - flag=1 - fi -done -if [ "$flag" == 1 ]; then exit 1; fi - -wisePath=$(which "genewise") -if [ -z "$(grep WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile -fi - -echo "Installing Perl modules..." 
-perlModules=( - Array::Utils - Capture::Tiny - DBI - DB_File - File::Copy - File::Path - File::Basename - File::Which - List::Util - Parallel::ForkManager - POSIX - Getopt::Long - IO::Handle - IPC::Run - Statistics::R - Term::Cap - Time::HiRes - Bio::AlignIO - Bio::Align::ProteinStatistics - Bio::DB::Taxonomy - Bio::SearchIO - Bio::SearchIO::blastxml - Bio::Search::Hit::BlastHit - Bio::Seq - Bio::SeqIO - Bio::SeqUtils - Bio::Tree::Tree - Bio::Tools::Run::StandAloneBlast -) - -if [ -z "$(which cpanm)" ]; then - curl -L http://cpanmin.us | perl - --sudo App::cpanminus -fi -for i in "${perlModules[@]}"; do - msg=$((perldoc -l $i) 2>&1) - if [[ "$(echo $msg)" == *"No documentation"* ]]; then - sudo cpanm ${i} --quiet --force - fi -done -echo "done!" - -echo "-------------------------------------" -CURRENT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -echo "Please run fdog.setup without --lib option to continue setup fdog!" diff --git a/fdog/setup/setup.sh b/fdog/setup/setup.sh deleted file mode 100755 index 7515ed2..0000000 --- a/fdog/setup/setup.sh +++ /dev/null @@ -1,417 +0,0 @@ -#!/bin/bash - -sys="$(uname)" # Linux for Linux or Darwin for MacOS -echo "Current OS system: $sys" - -CURRENT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -CURRENT="${CURRENT/\/setup/}" -BIN="$CURRENT/bin" - -flag=0 -fas=1 -installLib=0 -homedir="$(echo $HOME)" -outDir=$CURRENT - -while getopts lfo: opt; do - case ${opt} in - o ) - echo "Data output path: $OPTARG" - outDir=$OPTARG - ;; - l ) - echo "INSTALL LIB" - installLib=1 - ;; - f ) - echo "NO FAS!" - fas=0 - ;; - \? ) - echo "Usage: setup.sh [-l] [-f]" - exit 1 - ;; - esac -done - -### install dependencies -if [ $installLib == 1 ]; then - if [ "$sys" == "Darwin" ]; then - $CURRENT/setup/install_lib.sh - else - echo "Enter sudo password to install required libraries..." - sudo $CURRENT/setup/install_lib.sh - fi - exit -fi - -### check grep, sed, readlink and wget availability -echo "-------------------------------------" -echo "Checking .bash_profile/.bashrc, grep, sed/gsed and wget availability..." -grepprog='grep' -sedprog='sed' -readlinkprog='readlink' -wgetprog='wget' -bashFile='.bashrc' -rprofile='.Rprofile' - -if [ "$sys" == "Darwin" ]; then - sedprog='gsed' - grepprog='ggrep' - readlinkprog='greadlink' - shell=$(echo $SHELL) - if [ $shell == "/bin/zsh" ]; then - bashFile='.zshrc' - else - bashFile='.bash_profile' - fi -fi - -if [ -z "$(which $sedprog)" ]; then - echo -e "\e[31m$sedprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if [ -z "$(which $grepprog)" ]; then - echo -e "\e[31m$grepprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if [ -z "$(which $wgetprog)" ]; then - echo -e "\e[31m$wgetprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if [ -z "$(which $readlinkprog)" ]; then - echo -e "\e[31m$readlinkprog not found!\e[0m" - echo "Please run fdog.setup with --lib first!" - exit -fi - -if ! [ -f ~/$bashFile ]; then - touch ~/$bashFile -fi -if ! [ -f ~/$rprofile ]; then - touch ~/$rprofile -fi -echo "done!" - -### prepare folders -echo "-------------------------------------" -echo "Preparing folders..." -if [ ! -d "$CURRENT/taxonomy" ]; then mkdir "$CURRENT/taxonomy"; fi -if [ ! -d "$CURRENT/bin" ]; then mkdir "$CURRENT/bin"; fi -if [ ! -d "$CURRENT/bin/aligner" ]; then mkdir "$CURRENT/bin/aligner"; fi -echo "done!" 
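The dependency bookkeeping from these deleted shell installers moves into setupfDog.py later in this diff, where check_dependencies() reads fdog/data/dependencies.txt and probes each tool on the PATH. A sketch of that pattern, assuming the same package-to-binary mapping but using Python's shutil.which instead of shelling out to `which`:

    import shutil

    def missing_dependencies(dep_file):
        # Map package names to a representative binary, as check_dependencies does.
        binary_of = {'hmmer': 'hmmsearch', 'ncbi-blast+': 'blastp'}
        missing = []
        with open(dep_file) as fh:
            for line in fh:
                tool = line.strip()
                if tool and shutil.which(binary_of.get(tool, tool)) is None:
                    missing.append(tool)
        return missing

    print(missing_dependencies('fdog/data/dependencies.txt'))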
- -### download tools -echo "-------------------------------------" -echo "Downloading and installing annotation tools/databases:" - -fasta36="yes" -if [ -z "$(which fasta36)" ]; then - fasta36="no" - # fasta36v="fasta-36.3.8h" - fasta36v="36.3.8h_04-May-2020" - if ! [ -f "bin/aligner/bin/fasta36" ]; then - echo "fasta36" - # wget "http://faculty.virginia.edu/wrpearson/fasta/fasta36/${fasta36v}.tar.gz" - # tar xf $fasta36v.tar.gz - # rm "${fasta36v}.tar.gz" - # mv $fasta36v/* $CURRENT/bin/aligner/ - # rm -rf $fasta36v - wget "https://github.com/wrpearson/fasta36/archive/refs/tags/v${fasta36v}.tar.gz" - tar xf "v${fasta36v}.tar.gz" - rm "v${fasta36v}.tar.gz" - mv fasta36-${fasta36v}/* $CURRENT/bin/aligner/ - rm -rf "fasta36-${fasta36v}" - cd "$CURRENT/bin/aligner/src" - if [ $sys=="Linux" ]; then - make -f ../make/Makefile.linux64_sse2 all - elif [ $sys=="Darwin" ]; then - make -f ../make/Makefile.os_x86_64 all - fi - fi - if [ -z "$($grepprog PATH=$CURRENT/bin/aligner/bin ~/$bashFile)" ]; then - echo "export PATH=$CURRENT/bin/aligner/bin:\$PATH" >> ~/$bashFile - fi -fi -cd $CURRENT -if [ -z "$(which fasta36)" ]; then - if ! [ -f "$CURRENT/bin/aligner/bin/fasta36" ]; then - echo -e "\e[31mfasta36 tool could not be found in $CURRENT/bin/aligner/. Please check again!\e[0m" - exit - fi -fi - -cd "$CURRENT/taxonomy" -if ! [ -f "nodes" ]; then - wget "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" - tar xf taxdump.tar.gz - rm taxdump.tar.gz - echo "Taxonomy database indexing. It can take a while, please wait..." - perl $CURRENT/setup/indexTaxonomy.pl $CURRENT/taxonomy - rm citations.dmp - rm delnodes.dmp - rm division.dmp - rm gencode.dmp - rm merged.dmp - rm gc.prt - rm readme.txt -fi -cd $CURRENT -if ! [ -f "$CURRENT/taxonomy/nodes" ]; then - echo -e "\e[31mError while indexing NCBI taxonomy database! Please check $CURRENT/taxonomy/ folder and run this setup again!\e[0m" - exit -fi - -setupFAS=0 -if [ $fas == 1 ]; then - cd "$CURRENT/bin" - if [ -z "$(which fas.doAnno)" ]; then - echo "FAS" - pip install --user greedyFAS - if [ -z "$($grepprog \$HOME/.local/bin:\$PATH ~/$bashFile)" ]; then - echo "export PATH=\$HOME/.local/bin:\$PATH" >> ~/$bashFile - fi - if [ -z "$($grepprog $homedir/.local/bin ~/$rprofile)" ]; then - echo "Sys.setenv(PATH = paste(\"$homedir/.local/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile - fi - setupFAS=1 - else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi - fi - - cd $CURRENT - source ~/$bashFile - if [ -z "$(which fas.doAnno)" ]; then - echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mpip install greedyFAS\e[0m!" - echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" - exit - else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi - fi - echo "done!" -fi - -### download data -data_fdog_file="data_HaMStR-2019c.tar.gz" -checkSumData="1748371655 621731824 $data_fdog_file" -cd $outDir -if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi -if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi - -if ! [ "$(ls -A $outDir/genome_dir)" ]; then - echo "-------------------------------------" - echo "Getting pre-calculated data" - - echo "Processing $outDir ..." - if [ ! 
-f $outDir/$data_fdog_file ]; then - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - else - CHECKSUM=$(cksum $data_fdog_file) - echo "Checksum: $CHECKSUM" - if ! [ "$CHECKSUM" == "$checkSumData" ]; then - rm $outDir/$data_fdog_file - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - fi - fi - - if [ ! -f $outDir/$data_fdog_file ]; then - echo "File $data_fdog_file not found! Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - exit - fi - - CHECKSUM=$(cksum $data_fdog_file) - if [ "$CHECKSUM" == "$checkSumData" ]; then - echo "Extracting archive $data_fdog_file..." - tar xf $outDir/$data_fdog_file - rm $outDir/$data_fdog_file - if [ -d "$outDir/genome_dir" ]; then - for i in $(ls "$outDir/genome_dir"); do rm -f "$outDir/genome_dir/$i/$i.fa.mod"; done - fi - - if [ "$(ls -A $outDir/blast_dir)" ]; then - echo "Data should be in place to run fdog." - else - echo -e "\e[31mSomething went wrong with the download. Data folders are empty.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Or contact us if you think this is our issue!" - exit - fi - else - echo -e "\e[31mSomething went wrong with the download. Checksum does not match.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Please put it into $outDir folder and run this setup again!" - exit - fi -fi -# write data path to pathConfig file -if [ -f $BIN/pathconfig.txt ]; then - rm $BIN/pathconfig.txt -fi -touch $BIN/pathconfig.txt -echo $outDir >> $BIN/pathconfig.txt - -### add paths to bash profile file -echo "-------------------------------------" -echo "Adding WISECONFIGDIR to ~/$bashFile" - -wisePath=$(which "genewise") -if [ -z "$($grepprog WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile -fi - -# echo "Adding paths to ~/$rprofile" -# if [ -z "$($grepprog $CURRENT/bin ~/$rprofile)" ]; then -# echo "Sys.setenv(PATH = paste(\"$CURRENT/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile -# fi -echo "done!" - -### adapt paths in fdog scripts -echo "-------------------------------------" -echo "Adapting paths in fdog scripts" -# update the sed and grep commands -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/oneSeq.pl - -# localize the perl installation -path2perl=`which perl` -echo "path to perl: $path2perl" -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/translate.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/oneSeq.pl - -echo "done!" 
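The data download above is guarded by comparing the POSIX `cksum` output against a pinned string before extracting. The same verify-then-extract flow, sketched in Python with the checksum value pinned in this script (the helper name checksum_ok is illustrative):

    import os
    import shutil
    import subprocess

    DATA_FILE = 'data_HaMStR-2019c.tar.gz'
    EXPECTED = '1748371655 621731824 %s' % DATA_FILE  # `cksum` prints: <crc> <size> <name>

    def checksum_ok(path):
        out = subprocess.run(['cksum', path], capture_output=True,
                             text=True, check=True)
        return out.stdout.strip() == EXPECTED

    if not checksum_ok(DATA_FILE):
        os.remove(DATA_FILE)
        raise SystemExit('Checksum mismatch; please re-download %s' % DATA_FILE)
    shutil.unpack_archive(DATA_FILE, '.', 'gztar')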
- -### final check -echo "-------------------------------------" -echo "Final check..." -flag=0 - -echo "Tools" -dependencies=( -genewise -hmmsearch -hmmscan -hmmbuild -mafft -muscle -clustalw -blastp -) - -for i in "${dependencies[@]}"; do - tool=$i - if [ $tool == "clustalw" ]; then - if [ "$sys" == "Darwin" ]; then - tool="clustalw2" - fi - fi - if [ -z "$(which $tool)" ]; then - echo -e "\t\e[31mWARNING $tool not found!\e[0m" - flag=1 - fi -done - -perlModules=( - Array::Utils - Capture::Tiny - DBI - DB_File - File::Copy - File::Path - File::Basename - File::Which - List::Util - Parallel::ForkManager - POSIX - Getopt::Long - IO::Handle - IPC::Run - Statistics::R - Term::Cap - Time::HiRes - Bio::AlignIO - Bio::Align::ProteinStatistics - Bio::DB::Taxonomy - Bio::SearchIO - Bio::SearchIO::blastxml - Bio::Search::Hit::BlastHit - Bio::Seq - Bio::SeqIO - Bio::SeqUtils - Bio::Tree::Tree - Bio::Tools::Run::StandAloneBlast -) - -echo "Perl modules" -for i in "${perlModules[@]}"; do - msg=$((perl -e "use $i") 2>&1) - if ! [[ -z ${msg} ]]; then - echo -e "\t\e[31mWARNING $i could not be installed\e[0m" - flag=1 - fi -done - -echo "Environment paths" -envPaths=( -WISECONFIGDIR -) -for i in "${envPaths[@]}"; do - if [ -z "$($grepprog $i ~/$bashFile)" ]; then - echo -e "\t\e[31mWARNING $i was not added into ~/$bashFile\e[0m" - flag=1 - fi -done -if [ "$fasta36" == "no" ]; then - if [ -z "$($grepprog PATH=$CURRENT/bin/aligner/bin ~/$bashFile)" ]; then - echo -e "\t\e[31mWARNING $CURRENT/bin/aligner/bin was not added into ~/$bashFile\e[0m" - flag=1 - fi -fi -echo "done!" - -if [ "$flag" == 1 ]; then - echo "Some tools/libraries counld not installed correctly or paths were not added into ~/$bashFile." - echo "Please manually install the missing dependencies using fdog.setup with --lib option (ask your admin if you don't have root privileges)." - echo "Then run this setup again to try one more time!" - exit -else - echo "Generating symbolic links" - ln -s -f $CURRENT/bin/hamstr.pl $CURRENT/bin/hamstr - ln -s -f $CURRENT/bin/oneSeq.pl $CURRENT/bin/oneSeq - echo "Sourcing bash profile file" - source ~/$bashFile - echo "-------------------------------------" - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$setupFAS" == 1 ]; then - echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" - echo "Then you can test fdog with:" - else - echo "All tests succeeded, fdog should be ready to run. You can test it with:" - fi - echo -e "\e[96mfdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3\e[0m" - echo "Output files with prefix \"test\" will be found at your current working directory!" - echo -e "For more details, use \e[96mfdog.run -h\e[0m or visit https://github.com/BIONF/fDOG/wiki" - echo "Happy running fdog! ;-)" -fi -exit 1 diff --git a/fdog/setup/setup_conda.sh b/fdog/setup/setup_conda.sh deleted file mode 100755 index cf1bc6d..0000000 --- a/fdog/setup/setup_conda.sh +++ /dev/null @@ -1,435 +0,0 @@ -#!/bin/bash - -sys="$(uname)" # Linux for Linux or Darwin for MacOS -echo "Current OS system: $sys" - -CURRENT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -CURRENT="${CURRENT/\/setup/}" -BIN="$CURRENT/bin" - -flag=0 -outDir=$CURRENT - -while getopts o: opt; do - case ${opt} in - o ) - echo "Data output path: $OPTARG" - outDir=$OPTARG - ;; - \? 
) - echo "Usage: setup.sh [-l] [-f]" - exit 1 - ;; - esac -done -if [ ! -d "$outDir" ]; then mkdir "$outDir"; fi - -### check grep, sed and wget availability -echo "-------------------------------------" -echo "Checking .bash_profile/.bashrc, grep, sed/gsed and wget availability..." -grepprog='grep' -sedprog='sed' -readlinkprog='readlink' -wgetprog='wget' -bashFile='.bashrc' -rprofile='.Rprofile' - -if [ "$sys" == "Darwin" ]; then - sedprog='gsed' - grepprog='ggrep' - readlinkprog='greadlink' - shell=$(echo $SHELL) - if [ $shell == "/bin/zsh" ]; then - bashFile='.zshrc' - else - bashFile='.bash_profile' - fi -fi - -# NOTE: install only available for Linux! -if [ -z "$(which $sedprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$sedprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi - conda install -c conda-forge sed -fi - -if [ -z "$(which $grepprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$grepprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi - conda install -c bioconda grep -fi - -if [ -z "$(which $wgetprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$wgetprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi - conda install -c anaconda wget -fi - -if [ -z "$(which $readlinkprog)" ]; then - if [ "$sys" == "Darwin" ]; then - echo -e "\e[31m$readlinkprog not found. Please install it first (e.g. using brew)!\e[0m" - flag=1 - fi -fi - -if ! [ -f ~/$bashFile ]; then - touch ~/$bashFile -fi -if ! [ -f ~/$rprofile ]; then - touch ~/$rprofile -fi -if [ "$flag" == 1 ]; then exit 1; fi -echo "done!" - -### check dependencies -echo "-------------------------------------" -echo "Installing dependencies..." - -if [ -z "$(which R)" ]; then - echo "R" - conda install -y r -fi - -if [[ -z $(conda list | $grepprog "pkg-config ") ]]; then - echo "pkg-config" - conda install -y pkg-config -fi - -if [[ -z $(conda list | $grepprog "perl-bioperl ") ]]; then - echo "perl-bioperl" - conda install -y -c bioconda perl-bioperl - conda install -y -c bioconda perl-bioperl-core - conda install -y -c bioconda perl-bioperl-run -fi - -dependencies=( - blastp # blast - genewise # wise2 - hmmsearch # hmmer (for both hmmsearch and hmmbuild) - clustalw - mafft # for linsi - muscle - fasta36 -) - -for i in "${dependencies[@]}"; do - if [ -z "$(which $i)" ]; then - echo $i - tool=$i - if [ "$tool" = "blastp" ]; then - conda install -y -c bioconda blast - elif [ "$tool" = "hmmsearch" ]; then - conda install -y -c bioconda hmmer - elif [ "$tool" = "genewise" ]; then - conda install -y -c bioconda wise2 - wisePath=$(which "genewise") - if [ -z "$($grepprog WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile - fi - elif [ "$tool" = "fasta36" ]; then - conda install -y -c bioconda fasta3 - else - conda install -y -c bioconda $i - fi - fi -done - -for i in "${dependencies[@]}"; do - if [ -z "$(which $i)" ]; then - echo -e "\e[31m$i not found. 
Please install it to use fdog!\e[0m" - flag=1 - fi -done -if [ "$flag" == 1 ]; then exit 1; fi - -perlModules=( - Array::Utils - Capture::Tiny - DBI - DB_File - File::Copy - File::Path - File::Basename - File::Which - List::Util - Parallel::ForkManager - POSIX - Getopt::Long - IO::Handle - IPC::Run - Statistics::R - Term::Cap - Time::HiRes - Bio::AlignIO - Bio::Align::ProteinStatistics - Bio::DB::Taxonomy - Bio::SearchIO - Bio::SearchIO::blastxml - Bio::Search::Hit::BlastHit - Bio::Seq - Bio::SeqIO - Bio::SeqUtils - Bio::Tree::Tree - Bio::Tools::Run::StandAloneBlast -) - -for i in "${perlModules[@]}"; do - msg=$((perldoc -l $i) 2>&1) - if [[ "$(echo $msg)" == *"No documentation"* ]]; then - cpanm ${i} --quiet --force - fi -done - -echo "done!" - -### prepare folders -echo "-------------------------------------" -echo "Preparing folders..." - -# create required folders -if [ ! -d "$CURRENT/taxonomy" ]; then mkdir "$CURRENT/taxonomy"; fi -if [ ! -d "$CURRENT/bin" ]; then mkdir "$CURRENT/bin"; fi -if [ ! -d "$CURRENT/bin/aligner" ]; then mkdir "$CURRENT/bin/aligner"; fi -echo "done!" - -### download tools -echo "-------------------------------------" -echo "Downloading and installing annotation tools/databases:" - -cd "$CURRENT/taxonomy" -if ! [ -f "nodes" ]; then - wget "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" - tar xfv taxdump.tar.gz - rm taxdump.tar.gz - echo "Taxonomy database indexing. It can take a while, please wait..." - perl $CURRENT/setup/indexTaxonomy.pl $CURRENT/taxonomy - rm citations.dmp - rm delnodes.dmp - rm division.dmp - rm gencode.dmp - rm merged.dmp - rm gc.prt - rm readme.txt -fi -cd $CURRENT -if ! [ -f "$CURRENT/taxonomy/nodes" ]; then - echo -e "\e[31mError while indexing NCBI taxonomy database! Please check $CURRENT/taxonomy/ folder and run this setup again!\e[0m" - exit -fi - -cd "$CURRENT/bin" -setupFAS=0 -if [ -z "$(which fas.doAnno)" ]; then - echo "FAS" - conda install -y -c BIONF fas - if [ -z "$(which fas.doAnno)" ]; then - echo -e "\e[31mInstallation of FAS failed! Please try again!\e[0m" - exit - fi - setupFAS=1 -else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi -fi - -if [ -z "$(which fas.doAnno)" ]; then - echo -e "Installation of FAS failed! Please try again or install FAS by yourself using \e[91mconda install -c BIONF fas\e[0m or \e[91mpip install greedyFAS\e[0m" - echo -e "For more info, please check FAS website at \e[91mhttps://github.com/BIONF/FAS\e[0m" - exit -else - if ! [ -z "$(fas.setup -t ./ --check 2>&1 | grep ERROR)" ]; then - setupFAS=1 - fi -fi -cd $CURRENT -echo "done!" - -### download data -data_fdog_file="data_HaMStR-2019c.tar.gz" -checkSumData="1748371655 621731824 $data_fdog_file" -cd $outDir -if [ ! -d "$outDir/genome_dir" ]; then mkdir "$outDir/genome_dir"; fi -if [ ! -d "$outDir/assembly_dir" ]; then mkdir "$outDir/assembly_dir"; fi - -if ! [ "$(ls -A $outDir/genome_dir)" ]; then - echo "-------------------------------------" - echo "Getting pre-calculated data" - - echo "Processing $outDir ..." - if [ ! -f $outDir/$data_fdog_file ]; then - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - else - CHECKSUM=$(cksum $data_fdog_file) - echo "Checksum: $CHECKSUM" - if ! 
[ "$CHECKSUM" == "$checkSumData" ]; then - rm $outDir/$data_fdog_file - echo "Downloading data from https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - wget --no-check-certificate https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file - fi - fi - - if [ ! -f $outDir/$data_fdog_file ]; then - echo "File $data_fdog_file not found! Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - exit - fi - - CHECKSUM=$(cksum $data_fdog_file) - if [ "$CHECKSUM" == "$checkSumData" ]; then - echo "Extracting archive $data_fdog_file..." - tar xf $outDir/$data_fdog_file - rm $outDir/$data_fdog_file - for i in $(ls "$outDir/genome_dir"); do rm -f "$outDir/genome_dir/$i/$i.fa.mod"; done - - if [ "$(ls -A $outDir/blast_dir)" ]; then - echo "Data should be in place to run fdog.\n" - else - echo -e "\e[31mSomething went wrong with the download. Data folders are empty.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Or contact us if you think this is our issue!" - exit - fi - else - echo -e "\e[31mSomething went wrong with the download. Checksum does not match.\e[0m" - echo "Please try to download again from" - echo "https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo/$data_fdog_file" - echo "Please put it into $outDir folder and run this setup again!" - exit - fi -fi -# write data path to pathConfig file -if [ -f $BIN/pathconfig.txt ]; then - rm $BIN/pathconfig.txt -fi -touch $BIN/pathconfig.txt -echo $outDir >> $BIN/pathconfig.txt - -### add paths to bash profile file -echo "-------------------------------------" -echo "Adding WISECONFIGDIR to ~/$bashFile" - -wisePath=$(which "genewise") -if [ -z "$($grepprog WISECONFIGDIR=$wisePath ~/$bashFile)" ]; then - echo "export WISECONFIGDIR=${wisePath}" >> ~/$bashFile -fi - -# echo "Adding paths to ~/$rprofile" -# if [ -z "$($grepprog $CURRENT/bin ~/$rprofile)" ]; then -# echo "Sys.setenv(PATH = paste(\"$CURRENT/bin\", Sys.getenv(\"PATH\"), sep=\":\"))" >> ~/$rprofile -# fi - -echo "done!" - -### adapt paths in fdog scripts -echo "-------------------------------------" -echo "Adapting paths in fdog scripts" -# update the sed and grep commands -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s/\(my \$sedprog = '\).*/\1$sedprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$grepprog = '\).*/\1$grepprog';/" $CURRENT/bin/oneSeq.pl -$sedprog -i -e "s/\(my \$readlinkprog = '\).*/\1$readlinkprog';/" $CURRENT/bin/oneSeq.pl - -# localize the perl installation -path2perl=`which perl` -echo "path to perl: $path2perl" -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/hamstr.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/translate.pl -$sedprog -i -e "s|\#\!.*|\#\!$path2perl|g" $CURRENT/bin/oneSeq.pl - -### final check -echo "-------------------------------------" -echo "Final check..." 
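With this conda-specific installer removed, the rewritten setupfDog.py (further down in this diff) only needs to know whether it is running inside an activated conda environment, which it reads from the CONDA_DEFAULT_ENV variable. A sketch of that check, mirroring the check_conda_env() helper added below:

    import os

    def in_conda_env():
        # An activated env other than 'base' counts as a real conda environment.
        env = os.environ.get('CONDA_DEFAULT_ENV')
        return env is not None and env != 'base'

    print('conda env active:', in_conda_env())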
-flag=0 - -echo "Conda packages" -condaPkgs=( -perl-bioperl -perl-bioperl-core -blast -hmmer -wise2 -clustalw -mafft -muscle -fasta3 -) -for i in "${condaPkgs[@]}"; do - if [[ -z $(conda list | $grepprog "$i ") ]]; then - progname=$i - if [ "$i" == "blast" ]; then - progname="blastp" - elif [ "$i" == "wise2" ]; then - progname="genewise" - elif [ "$i" == "hmmer" ]; then - progname="hmmsearch" - elif [ "$i" == "fasta3" ]; then - progname="fasta36" - fi - if [ -z "$(which $progname)" ]; then - echo -e "\t\e[31m$i could not be installed\e[0m" - flag=1 - fi - fi -done -echo "done!" - -echo "Perl modules" -for i in "${perlModules[@]}"; do - msg=$((perl -e "use $i") 2>&1) - if ! [[ -z ${msg} ]]; then - echo -e "\t\e[31m$i could not be installed\e[0m" - flag=1 - fi -done -echo "done!" - -echo "Environment paths" -envPaths=( -WISECONFIGDIR -) -for i in "${envPaths[@]}"; do - if [ -z "$($grepprog $i ~/$bashFile)" ]; then - echo -e "\t\e[31m$i was not added into ~/$bashFile\e[0m" - flag=1 - fi -done -echo "done!" - -if [ "$flag" == 1 ]; then - echo "Some tools/libraries counld not installed correctly or paths were not added into ~/$bashFile." - echo "Please manually install the missing dependencies using using fdog.setup with --lib option (ask your admin if you don't have root privileges)." - echo "Then run this setup again to try one more time!" - exit -else - echo "Generating symbolic links" - ln -s -f $CURRENT/bin/hamstr.pl $CURRENT/bin/hamstr - ln -s -f $CURRENT/bin/oneSeq.pl $CURRENT/bin/oneSeq - echo "Sourcing bash profile file" - source ~/$bashFile - echo "-------------------------------------" - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/hamstr.pl - $sedprog -i -e 's/my $configure = .*/my $configure = 1;/' $CURRENT/bin/oneSeq.pl - if [ "$setupFAS" == 1 ]; then - echo "All tests succeeded." - echo -e "\e[91mPLEASE RUN\e[0m \e[96mfas.setup\e[0m \e[91mTO CONFIGURE FAS BEFORE USING fdog!\e[0m" - echo "Then you can test fdog with:" - else - echo "All tests succeeded, fdog should be ready to run. You can test it with:" - fi - echo -e "\e[96mfdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3\e[0m" - echo "Output files with prefix \"test\" will be found at your current working directory!" - echo -e "For more details, use \e[96mfdog.run -h\e[0m or visit https://github.com/BIONF/fDOG/wiki" - echo "Happy using fdog! 
;-)" -fi -exit 1 diff --git a/fdog/setupfDog.py b/fdog/setupfDog.py index b6a67d6..a112a20 100644 --- a/fdog/setupfDog.py +++ b/fdog/setupfDog.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- ####################################################################### -# Copyright (C) 2020 Vinh Tran +# Copyright (C) 2022 Vinh Tran # # This script is used to set up fdog: install dependencies and # download pre-computed data @@ -18,62 +18,241 @@ import sys import os +import platform import argparse import subprocess +import shutil from ete3 import NCBITaxa from pathlib import Path +from pkg_resources import get_distribution + +import fdog.libs.zzz as general_fn +import fdog.libs.fas as fas_fn +import fdog.libs.alignment as align_fn + + +def check_conda_env(): + """ Return True if a conda env other than base is currently in use """ + if 'CONDA_DEFAULT_ENV' in os.environ: + if not os.environ['CONDA_DEFAULT_ENV'] == 'base': + return(True) + return(False) + + +def get_source_path(): + """ Get path of installed fDOG library """ + fdogPath = os.path.realpath(__file__).replace('/setupfDog.py','') + return(fdogPath) + + +def get_data_path(fdogPath): + """ Print the fDOG data path and exit """ + pathconfigFile = fdogPath + '/bin/pathconfig.txt' + if not os.path.exists(pathconfigFile): + sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).') + else: + with open(pathconfigFile) as f: + dataPath = f.readline().strip() + print(dataPath) + sys.exit() + + +def install_fas(woFAS): + """ Install greedyFAS """ + if not woFAS: + ### check if fas already installed + try: + fasVersion = subprocess.run(['fas.run --version'], shell = True, capture_output = True, check = True) + except: + print('=> greedyFAS (https://github.com/BIONF/FAS)') + install_fas_cmd = 'pip install greedyFAS' + try: + subprocess.check_output([install_fas_cmd], shell = True, stderr = subprocess.STDOUT) + except subprocess.CalledProcessError as e: + sys.exit('\033[91mERROR: Problem with installing FAS! Please do it manually. See: https://github.com/BIONF/FAS!\033[0m') + ### check if fas installed but not yet configured + check_fas = fas_fn.check_fas_executable() + + +def install_fasta36(fdogPath, cwd): + """ Install FASTA36 from source """ + try: + subprocess.check_output(['which fasta36'], shell = True, stderr = subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print('=> FASTA36 (https://github.com/wrpearson/fasta36)') + fasta36v = '36.3.8h_04-May-2020' + fasta36url = 'https://github.com/wrpearson/fasta36/archive/refs/tags' + fasta36file = 'v%s.tar.gz' % fasta36v + if not os.path.exists('%s/bin/aligner/bin/fasta36' % fdogPath): + if os.path.exists('%s/bin/aligner' % fdogPath): + shutil.rmtree('%s/bin/aligner' % fdogPath) + general_fn.download_file(fasta36url, fasta36file) + shutil.unpack_archive(fasta36file, '%s/bin/' % fdogPath, 'gztar') + os.remove(fasta36file) + shutil.move('%s/bin/fasta36-%s' % (fdogPath, fasta36v), '%s/bin/aligner' % fdogPath) + if 'Darwin' in platform.uname(): + make_cmd = 'make -f %s/bin/aligner/make/Makefile.os_x86_64 all' % fdogPath + elif 'Linux' in platform.uname(): + make_cmd = 'make -f %s/bin/aligner/make/Makefile.linux64_sse2 all' % fdogPath + else: + sys.exit('\033[91mERROR: Cannot identify type of system (neither Linux nor Darwin/MacOS)\033[0m') + try: + print('Compiling fasta36. 
+
+
+def check_dependencies(fdogPath):
+    """ Check for missing dependencies
+    Dependencies are specified in fdog/data/dependencies.txt file
+    """
+    missing = []
+    dependencies = '%s/data/dependencies.txt' % fdogPath
+    for tool in general_fn.read_file(dependencies):
+        function = tool
+        if tool == 'hmmer':
+            function = 'hmmsearch'
+        if tool == 'ncbi-blast+':
+            function = 'blastp'
+        try:
+            subprocess.check_output(['which %s' % function], shell = True, stderr = subprocess.STDOUT)
+        except subprocess.CalledProcessError as e:
+            missing.append(tool)
+    return(missing)
+
+
+def download_data(dataPath, force):
+    """ Download pre-calculated fDOG data """
+    data_fdog_file = "data_HaMStR-2019c.tar.gz"
+    checksum_data = "1748371655 621731824 $data_fdog_file"
+
+    genome_path = '%s/searchTaxa_dir' % dataPath
+    Path(genome_path).mkdir(parents = True, exist_ok = True)
+
+    if len(general_fn.read_dir(genome_path)) < 1 or force:
+        data_url = 'https://applbio.biologie.uni-frankfurt.de/download/hamstr_qfo'
+        if os.path.exists(data_fdog_file) and force:
+            os.remove(data_fdog_file)
+        general_fn.download_file(data_url, data_fdog_file)
+        try:
+            print('Extracting %s...' % data_fdog_file)
+            shutil.unpack_archive(data_fdog_file, dataPath, 'gztar')
+        except:
+            sys.exit('\033[91mERROR: Cannot extract %s to %s!\033[0m' % (data_fdog_file, dataPath))
+        if 'genome_dir' in general_fn.read_dir(dataPath):
+            os.rename('%s/genome_dir' % dataPath, '%s/searchTaxa_dir' % dataPath)
+            os.rename('%s/blast_dir' % dataPath, '%s/coreTaxa_dir' % dataPath)
+            os.rename('%s/weight_dir' % dataPath, '%s/annotation_dir' % dataPath)
+        check_cmd = 'fdog.checkData -s %s/searchTaxa_dir -c %s/coreTaxa_dir -a %s/annotation_dir --reblast' % (dataPath, dataPath, dataPath)
+        try:
+            print('Checking downloaded data...')
+            subprocess.run([check_cmd], stdout = subprocess.DEVNULL, check = True, shell = True)
+        except:
+            print('\033[96mWARNING: Problem with validating downloaded data. Please run fdog.checkData manually!\033[0m')
+        os.remove(data_fdog_file)
+        print('fDOG data downloaded and saved at %s' % dataPath)
+    else:
+        print('fDOG data found at %s' % dataPath)
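    # Aside: the directory migration inside download_data() above, as a
    # self-contained sketch. Archives built for older fDOG/HaMStR releases
    # still ship genome_dir/blast_dir/weight_dir; the mapping below is taken
    # from the diff, the helper itself is illustrative.
    import os

    LEGACY_TO_CURRENT = {
        'genome_dir': 'searchTaxa_dir',
        'blast_dir': 'coreTaxa_dir',
        'weight_dir': 'annotation_dir',
    }

    def migrate_legacy_dirs(data_path):
        """Rename legacy folders in data_path to the current naming scheme."""
        for old, new in LEGACY_TO_CURRENT.items():
            old_path = os.path.join(data_path, old)
            if os.path.isdir(old_path):
                os.rename(old_path, os.path.join(data_path, new))

    # Usage: migrate_legacy_dirs('/share/fdog_data')  # hypothetical data path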
 
 
-def checkOptConflict(lib, conda):
-    if lib:
-        if (conda):
-            sys.exit('*** ERROR: --lib and --conda cannot be used at the same time!')
 
 def main():
-    version = '0.0.3'
-    parser = argparse.ArgumentParser(description='You are running fdog.setup version ' + str(version) + '.')
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
-    required.add_argument('-o', '--outPath', help='Output path for fdog data', action='store', default='', required=True)
-    optional.add_argument('--conda', help='Setup fdog within a conda env', action='store_true', default=False)
-    optional.add_argument('--lib', help='Install fdog libraries only', action='store_true', default=False)
-    optional.add_argument('--getSourcepath', help='Get path to installed fdog', action='store_true', default=False)
-    optional.add_argument('--getDatapath', help='Get fdog default data path', action='store_true', default=False)
+    required.add_argument('-d', '--dataPath', help='Output path for fDOG data', action='store', default='', required=True)
+    optional.add_argument('--getSourcepath', help='Get path to installed fdog package', action='store_true', default=False)
+    optional.add_argument('--getDatapath', help='Get fDOG default data path', action='store_true', default=False)
+    optional.add_argument('--woFAS', help='Do not install FAS (https://github.com/BIONF/FAS)', action='store_true', default=False)
+    optional.add_argument('--force', help='Force overwrite fDOG data', action='store_true', default=False)
 
-    ### get arguments
+    ### parse arguments
     args = parser.parse_args()
-    conda = args.conda
-    lib = args.lib
-    checkOptConflict(lib, conda)
-    outPath = args.outPath
-    Path(outPath).mkdir(parents = True, exist_ok = True)
-    fdogPath = os.path.realpath(__file__).replace('/setupfDog.py','')
+    dataPath = args.dataPath
+    woFAS = args.woFAS
+    force = args.force
+
+    ### get install path
+    fdogPath = get_source_path()
     if args.getSourcepath:
         print(fdogPath)
         sys.exit()
+    ### get data path
     if args.getDatapath:
-        pathconfigFile = fdogPath + '/bin/pathconfig.txt'
-        if not os.path.exists(pathconfigFile):
-            sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
+        get_data_path(fdogPath)
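    # Aside: how the reworked CLI resolves, sketched with a trimmed parser
    # that mirrors the argument definitions above (the full fdog.setup parser
    # also has --getSourcepath/--getDatapath; the sample path is hypothetical).
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataPath', required=True)
    parser.add_argument('--woFAS', action='store_true', default=False)
    parser.add_argument('--force', action='store_true', default=False)

    args = parser.parse_args(['-d', '/share/fdog_data', '--woFAS'])
    print(args.dataPath, args.woFAS, args.force)   # /share/fdog_data True False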
+
+    ### check if pathconfig file exists
+    pathconfig_file = '%s/bin/pathconfig.txt' % fdogPath
+    demo_cmd = 'fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3'
+    if os.path.exists(pathconfig_file) and not force:
+        check_fas = 1
+        if not woFAS:
+            check_fas = fas_fn.check_fas_executable()
+        if check_fas == 1:
+            print('fDOG seems to be ready to use!')
+            print('You can test fDOG using the following command:\n%s' % demo_cmd)
         else:
-            with open(pathconfigFile) as f:
-                dataPath = f.readline().strip()
-            print(dataPath)
+            print('fDOG seems to be ready to use without FAS!')
+            print('You can test fDOG using the following command:\n%s --fasOff' % demo_cmd)
         sys.exit()
+
     ### get ncbi taxonomy database for ete3
-    print('Creating local NCBI taxonomy database...')
+    print('*** Creating local NCBI taxonomy database...')
     ncbi = NCBITaxa()
-    ### run setup
-    if conda:
-        setupFile = '%s/setup/setup_conda.sh -o %s' % (fdogPath, outPath)
-        subprocess.call([setupFile], shell = True)
-    else:
-        if lib:
-            setupFile = '%s/setup/setup.sh -l' % (fdogPath)
+
+    ### install dependencies
+    print('*** Installing dependencies...')
+    ## FAS
+    if not woFAS:
+        install_fas(woFAS)
+    ## hmmer, blast+, clustalw, mafft, muscle
+    missing_tools = check_dependencies(fdogPath)
+    if len(missing_tools) > 0:
+        if check_conda_env() == True:
+            req_file = '%s/data/conda_requirements.yml' % fdogPath
+            print('=> Dependencies in %s' % req_file)
+            conda_install_cmd = 'conda install -c bioconda --file %s -y' % (req_file)
+            try:
+                subprocess.call([conda_install_cmd], shell = True)
+            except:
+                sys.exit('\033[91mERROR: Cannot install conda packages in %s!\033[0m' % req_file)
         else:
-            setupFile = '%s/setup/setup.sh -o %s' % (fdogPath, outPath)
-        subprocess.call([setupFile], shell = True)
+            install_cmd = 'sudo apt-get install -y -qq '
+            sys.exit('\033[91mERROR: Please install these tools manually:\n%s\nusing the command: %s!\033[0m' % (', '.join(missing_tools), install_cmd))
+    else:
+        print('=> Dependencies in %s/data/dependencies.txt already installed!' % fdogPath)
+    ## fasta36
+    install_fasta36(fdogPath, os.getcwd())
+
+    ### download pre-calculated data
+    print('*** Downloading precalculated data...')
+    if force:
+        if os.path.exists(dataPath):
+            print('WARNING: %s will be deleted!' % dataPath)
+            shutil.rmtree(dataPath)
+    Path(dataPath).mkdir(parents = True, exist_ok = True)
+    download_data(dataPath, force)
+
+    ### create pathconfig file
+    if os.path.exists(pathconfig_file):
+        os.remove(pathconfig_file)
+    with open(pathconfig_file, 'w') as cf:
+        cf.write(dataPath)
+
+    print('\033[96m==> FINISHED! fDOG data can be found at %s\033[0m' % dataPath)
+    print('You can test fDOG using the following command:\n%s' % demo_cmd)
 
 if __name__ == '__main__':
     main()
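The pathconfig.txt written at the end of main() is the single point of coupling between fdog.setup and the other entry points: setup writes the data path as one line, and readers take the first line back (showTaxa.py below does exactly this). A minimal sketch of that round trip, with placeholder paths (the real file lives under fdogPath/bin/):

    from pathlib import Path

    pathconfig = Path('/tmp/pathconfig.txt')       # stand-in for fdogPath/bin/pathconfig.txt
    pathconfig.write_text('/share/fdog_data')      # what fdog.setup writes

    # what readers do, mirroring f.readline().strip() in the sources
    data_path = pathconfig.read_text().splitlines()[0].strip()
    print(data_path)                               # /share/fdog_data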
diff --git a/fdog/showTaxa.py b/fdog/showTaxa.py
index a29bf57..093df32 100644
--- a/fdog/showTaxa.py
+++ b/fdog/showTaxa.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
 # This script is used to list all available taxa of the installed fdog
 #
@@ -19,9 +19,6 @@
 import os
 from ete3 import NCBITaxa
 
-def checkFileExist(file):
-    if not os.path.exists(os.path.abspath(file)):
-        sys.exit('%s not found' % file)
 
 def getNcbiName(taxonName):
     ncbi = NCBITaxa()
@@ -32,6 +29,7 @@
         name = taxonName
     return(name)
 
+
 def getTaxa():
     # get data path
     fdogPath = os.path.realpath(__file__).replace('/showTaxa.py','')
@@ -41,19 +39,20 @@
     with open(pathconfigFile) as f:
         dataPath = f.readline().strip()
 
-    # print taxa in blast_dir
+    # print taxa in coreTaxa_dir
     print('##### Data found at %s' % dataPath)
     print('\n##### Taxa in the core sets, which can be used as reference species #####\n')
-    for taxon in sorted(os.listdir(dataPath + '/blast_dir/')):
-        if os.path.isdir(dataPath + '/blast_dir/' + taxon):
+    for taxon in sorted(os.listdir(dataPath + '/coreTaxa_dir/')):
+        if os.path.isdir(dataPath + '/coreTaxa_dir/' + taxon):
             print('%s\t%s' % (taxon, getNcbiName(taxon)))
 
-    # print taxa in genome_dir
+    # print taxa in searchTaxa_dir
     print('\n##### Search taxa, in which you can search orthologs #####\n')
-    for taxon in sorted(os.listdir(dataPath + '/genome_dir/')):
-        if os.path.isdir(dataPath + '/genome_dir/' + taxon):
+    for taxon in sorted(os.listdir(dataPath + '/searchTaxa_dir/')):
+        if os.path.isdir(dataPath + '/searchTaxa_dir/' + taxon):
             print('%s\t%s' % (taxon, getNcbiName(taxon)))
 
+
 def main():
     getTaxa()
diff --git a/setup.py b/setup.py
index fedc507..53fdd84 100644
--- a/setup.py
+++ b/setup.py
@@ -26,8 +26,7 @@
 setup(
     name="fdog",
-    version="0.0.53",
-
+    version="0.1.0",
     python_requires='>=3.7.0',
     description="Feature-aware Directed OrtholoG search tool",
     long_description=long_description,
@@ -43,6 +42,8 @@
         'ete3',
         'six',
         'PyYAML',
+        'pyhmmer',
+        'pysam',
         'greedyFAS>=1.11.2'
     ],
     entry_points={
@@ -54,7 +55,7 @@
         "fdog.addTaxa = fdog.addTaxa:main",
         "fdog.showTaxa = fdog.showTaxa:main",
         "fdog.mergeOutput = fdog.mergeOutput:main",
-        "fdog.remove = fdog.removefDog:main",
+        "fdog.uninstall = fdog.removefDog:main",
         "fdog.assembly = fdog.fDOGassembly:main",
         "fdog.mergeAssembly = fdog.mergeAssemblyOutput:main"],
     },
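showTaxa.py resolves each taxon folder name of the form ACRONYM@taxid@version to a scientific name through ete3. A sketch of that lookup, mirroring the intent of getNcbiName (it assumes the local NCBI taxonomy database that fdog.setup builds is present; the helper name is illustrative):

    from ete3 import NCBITaxa

    def ncbi_name(taxon_folder):
        """Translate e.g. 'HUMAN@9606@3' into an NCBI scientific name."""
        ncbi = NCBITaxa()
        tax_id = int(taxon_folder.split('@')[1])
        # fall back to the folder name if the tax id is unknown
        return ncbi.get_taxid_translator([tax_id]).get(tax_id, taxon_folder)

    print(ncbi_name('HUMAN@9606@3'))   # Homo sapiens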