Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

0.1.0 #26

Merged
merged 27 commits into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
daec73c
create v0.1.0
trvinh Dec 6, 2022
8872cee
simplifying hamstr.pl
trvinh Dec 6, 2022
71c8636
first python conversion
trvinh Jan 11, 2023
08d7e64
modified addTaxon and addTaxa
trvinh Jan 12, 2023
43797e3
first final version
trvinh Jan 13, 2023
e0d35d3
fixed undefined var
trvinh Jan 13, 2023
139f4a1
fixed github_build
trvinh Jan 13, 2023
a202294
added install/check dependencies
trvinh Jan 13, 2023
ba0efba
added install/check dependencies
trvinh Jan 13, 2023
b62dce6
added install/check dependencies
trvinh Jan 13, 2023
770b1ce
added install/check dependencies
trvinh Jan 13, 2023
45702d4
added install/check dependencies
trvinh Jan 13, 2023
94fd00d
added install/check dependencies
trvinh Jan 13, 2023
b6b8974
added install/check dependencies
trvinh Jan 13, 2023
1efcb43
added install/check dependencies
trvinh Jan 13, 2023
1ed85d5
added install/check dependencies
trvinh Jan 13, 2023
6723e8a
added install/check dependencies
trvinh Jan 13, 2023
2e3d924
added install/check dependencies
trvinh Jan 13, 2023
1986b40
added install/check dependencies
trvinh Jan 13, 2023
ea20fa7
added install/check dependencies
trvinh Jan 13, 2023
e2124b7
fixed parameters of create_genome
trvinh Jan 13, 2023
8eb9a2e
modified github_build
trvinh Jan 13, 2023
dcc3e84
rename data folders
trvinh Jan 19, 2023
9681324
fixed github_build
trvinh Jan 19, 2023
5323541
fixed github_build
trvinh Jan 19, 2023
bf8eace
fixed bug when searching in refspec
trvinh Jan 19, 2023
49fa477
added runtime for core compilation
trvinh Jan 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
19 changes: 14 additions & 5 deletions .github/workflows/github_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@ name: build

on:
push:
branches: [ master ]
branches:
- master
- 0.1.0
tags:
- '*'
- '*'
pull_request:
branches: [ master ]

Expand Down Expand Up @@ -38,10 +40,17 @@ jobs:
run: |
pwd
pip install .
fdog.setup -o /home/runner/work/fDOG/fDOG/dt --lib
fdog.setup -o /home/runner/work/fDOG/fDOG/dt
path=$(fdog.setup -d ./ --getSourcepath); for i in $(less $path/data/dependencies.txt); do sudo apt-get install -y -qq $i; done
fdog.setup -d /home/runner/work/fDOG/fDOG/dt --woFAS
fdog.checkData -s /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir -c /home/runner/work/fDOG/fDOG/dt/coreTaxa_dir -a /home/runner/work/fDOG/fDOG/dt/annotation_dir --reblast
fdog.showTaxa
fdog.run --seqFile infile.fa --seqName test --refspec HUMAN@9606@3 --fasoff
fdog.run --seqFile infile.fa --jobName test --refspec HUMAN@9606@3 --fasOff
mkdir seeds
path=$(fdog.setup -d ./ --getSourcepath); a="1 2 3"; for i in ${a[@]}; do cp $path/data/infile.fa seeds/$i.fa; done
fdogs.run --seqFolder seeds --jobName test_multi --refspec HUMAN@9606@3 --fasOff --searchTaxa PARTE@5888@3,THAPS@35128@3
head /home/runner/work/fDOG/fDOG/dt/searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa > hm.fa
fdog.addTaxon -f hm.fa -i 9606 -o ./ -c -a
ls
- name: Deploy
if: startsWith(github.event.ref, 'refs/tags')
uses: casperdcl/deploy-pypi@v2
Expand Down
Binary file removed fdog/.DS_Store
Binary file not shown.
204 changes: 89 additions & 115 deletions fdog/addTaxa.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
# -*- coding: utf-8 -*-

#######################################################################
# Copyright (C) 2020 Vinh Tran
# Copyright (C) 2022 Vinh Tran
#
# This script is used to prepare data for fdog.
# For each given genome FASTA file, it will create a folder within genome_dir
# For each given genome FASTA file, it will create a folder within searchTaxa_dir
# with the naming scheme of fdog ([Species acronym]@[NCBI ID]@[Proteome version]
# e.g. HUMAN@9606@3), an annotation file in JSON format in weight_dir and
# a blast DB in blast_dir folder (optional).
# For a long header of original FASTA sequence, only the first word
# will be taken as the ID of new fasta file, everything after the
# first whitespace will be removed. If this first word is not unique,
# an automatically increasing index will be added.
# e.g. HUMAN@9606@3), an annotation file in JSON format in annotation_dir and
# a blast DB in coreTaxa_dir folder (optional).
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
Expand All @@ -26,98 +22,74 @@
import sys
import os
import argparse
from os import listdir
from os.path import isfile, join
from pathlib import Path
import subprocess
from Bio import SeqIO
import multiprocessing as mp
from tqdm import tqdm
from ete3 import NCBITaxa
import csv
from io import StringIO
import re
import shutil
from tqdm import tqdm
from datetime import datetime
import time
from pkg_resources import get_distribution
from collections import OrderedDict

def checkFileExist(file):
if not os.path.exists(os.path.abspath(file)):
sys.exit('%s not found' % file)
import fdog.libs.zzz as general_fn
import fdog.libs.tree as tree_fn
import fdog.libs.addtaxon as add_taxon_fn

def getTaxName(taxId):
ncbi = NCBITaxa()
try:
ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)]
ncbiName = re.sub('[^a-zA-Z1-9\s]+', '', ncbiName)
taxName = ncbiName.split()
name = taxName[0][:3].upper()+taxName[1][:2].upper()
except:
name = "UNK" + taxId
return(name)

def parseMapFile(mappingFile):
nameDict = {}
with open(mappingFile) as f:
def parse_map_file(mapping_file, folIn):
""" Create spec name from mapping file
And also check if given input files in mapping file exist
"""
name_dict = {}
with open(mapping_file) as f:
for line in f:
if not '#' in line:
tmp = line.split('\t')
fileName = tmp[0]
taxId = tmp[1].strip()
file_name = tmp[0]
file_in = '%s/%s' % (folIn, file_name)
general_fn.check_file_exist(file_in)
tax_id = tmp[1].strip()
try:
taxName = tmp[2].strip()
tax_name = tmp[2].strip()
except:
taxName = getTaxName(taxId)
tax_name = ''
try:
ver = tmp[3].strip()
except:
ver = datetime.today().strftime('%y%m%d') #1
# print(taxName+"@"+str(taxId)+"@"+str(ver))
nameDict[fileName] = (taxName, str(taxId), str(ver))
return(nameDict)
ver = datetime.today().strftime('%y%m%d')
spec_name = add_taxon_fn.generate_spec_name(tax_id, tax_name, ver)
name_dict[file_in] = spec_name
return(name_dict)

def runAddTaxon(args):
(f,n,i,o,c,v,a,cpus,replace,delete) = args
cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus)
if c == True:
cmd = cmd + ' -c'
if a == True:
cmd = cmd + ' -a'
if replace == True:
cmd = cmd + ' --replace'
if delete == True:
cmd = cmd + ' --delete'
# print(cmd)
logFile = o + '/addTaxa2fDog.log'
cmd = cmd + ' >> ' + logFile
try:
subprocess.call([cmd], shell = True)
except:
sys.exit('Problem running\n%s' % (cmd))

def main():
version = '0.0.9'
parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.')
version = get_distribution('fdog').version
parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')
required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True)
required.add_argument('-m', '--mapping',
help='Tab-delimited text file containing <fasta_filename>tab<taxonID>tab<taxonName>tab<genome_version>. The last 2 columns are optional.',
help='Tab-delimited text file containing <fasta_file_name>tab<taxonID>tab<taxonName>tab<genome_version>. The last 2 columns are optional.',
action='store', default='', required=True)
optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False)
optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in coreTaxa_dir folder)', action='store_true', default=False)
optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False)
optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
optional.add_argument('-f', '--force', help='Force overwrite existing data', action='store_true', default=False)
optional.add_argument('--force', help='Force overwrite existing data', action='store_true', default=False)

### get arguments
args = parser.parse_args()
folIn = args.input
folIn = os.path.abspath(folIn)
mapping = args.mapping
checkFileExist(mapping)
general_fn.check_file_exist(mapping)
outPath = args.outPath
if outPath == '':
fdogPath = os.path.realpath(__file__).replace('/addTaxa.py','')
fdogPath = os.path.realpath(__file__).replace('/addTaxon.py','')
pathconfigFile = fdogPath + '/bin/pathconfig.txt'
if not os.path.exists(pathconfigFile):
sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
Expand All @@ -131,61 +103,63 @@ def main():
cpus = mp.cpu_count()-2
replace = args.replace
delete = args.delete
add_taxon_fn.check_conflict_opts(replace, delete)
force = args.force

start = time.time()
### parse mapping file
name_dict = parse_map_file(mapping, folIn)

### get existing genomes
Path(outPath + "/genome_dir").mkdir(parents = True, exist_ok = True)
Path(outPath + "/weight_dir").mkdir(parents = True, exist_ok = True)
genomeFiles = listdir(outPath + "/genome_dir")

### generate taxon names from mapping file
nameDict = parseMapFile(mapping)

### read all input fasta files and create addTaxon jobs
jobs = []
dupList = {}
faFiles = [f for f in listdir(folIn) if isfile(join(folIn, f))]
for f in faFiles:
# tmp = f.split('.')
if f in nameDict:
# check duplicated taxon name in existing data
taxName = '@'.join(nameDict[f])
flag = 1
if taxName in genomeFiles:
if force:
shutil.rmtree(outPath + "/genome_dir/" + taxName)
if not noAnno:
shutil.rmtree(outPath + "/weight_dir/" + taxName)
else:
flag = 0
dupList[f] = taxName
### initiate paths
Path(outPath + '/searchTaxa_dir').mkdir(parents = True, exist_ok = True)

if flag == 1:
fasta = folIn + '/' + f
name = nameDict[f][0]
taxid = nameDict[f][1]
verProt = nameDict[f][2]
jobs.append([
folIn + '/' + f, nameDict[f][0], nameDict[f][1],
outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete
])

if len(dupList) > 0:
print("These taxa are probably already present in %s:" % (outPath + "/genome_dir"))
for f in dupList:
print('\t'+f+'\t'+dupList[f])
### create file in searchTaxa_dir [and coreTaxa_dir]
genome_jobs = []
blast_jobs = []
for f in name_dict:
spec_name = name_dict[f]
## remove old folder if force is set
if force:
print('They will be deleted and re-compiled!')
else:
sys.exit("Please remove them from the mapping file or use different Name/ID/Version!")

print('Parsing...')
for job in tqdm(jobs):
# print('@'.join([job[1],job[2],job[5]]) + '\t' + job[0])
runAddTaxon(job)

print('Output can be found in %s' % outPath)
if os.path.exists(outPath + '/searchTaxa_dir/' + spec_name):
shutil.rmtree(outPath + '/searchTaxa_dir/' + spec_name)
if os.path.exists(outPath + '/coreTaxa_dir/' + spec_name):
shutil.rmtree(outPath + '/coreTaxa_dir/' + spec_name)
## create jobs
genome_path = '%s/searchTaxa_dir/%s' % (outPath, spec_name)
Path(genome_path).mkdir(parents = True, exist_ok = True)
genome_jobs.append([f, genome_path, spec_name, force, replace, delete])
if coreTaxa:
genome_file = '%s/%s.fa' % (genome_path, spec_name)
blast_jobs.append([outPath, spec_name, genome_file, force, True])
pool = mp.Pool(cpus)

print('Parsing genome for %s species...' % len(genome_jobs))
genome_out = []
for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_genome, genome_jobs),
total=len(genome_jobs)):
genome_out.append(_)
out_msg = 'Output for %s can be found in %s within searchTaxa_dir' % (spec_name, outPath)
if len(blast_jobs) > 0:
print('\nCreating Blast DB for %s species...' % len(blast_jobs))
blast_out = []
for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_blastdb, blast_jobs),
total=len(blast_jobs)):
blast_out.append(_)
out_msg = '%s, coreTaxa_dir' % out_msg

### create annotation
if not noAnno:
Path(outPath + '/annotation_dir').mkdir(parents = True, exist_ok = True)
for f in name_dict:
genome_file = '%s/searchTaxa_dir/%s/%s.fa' % (outPath, name_dict[f], name_dict[f])
add_taxon_fn.create_annoFile(outPath, genome_file, cpus, force)
if os.path.exists('%s/annotation_dir/tmp' % outPath):
shutil.rmtree('%s/annotation_dir/tmp' % outPath)
out_msg = '%s, annotation_dir' % out_msg

end = time.time()
print('==> Adding %s taxa finished in %s' % (len(name_dict), '{:5.3f}s'.format(end - start)))
print('==> %s' % out_msg)

if __name__ == '__main__':
main()
Loading