# Pipeline to assemble bacterial genomes

This notebook combines two bacterial genome assembly pipelines, Tychus and MyPro.  
It re-implements Tychus with Python scripts, adds SOAPdenovo, and improves the genus database for Prokka with data from MyPro: https://sourceforge.net/projects/sb2nhri/files/MyPro/ 

## Set up the files and parameters you are going to work with

### two reads files

In [1]:
# Paired-end input: the two .fq.gz files of sample MNH08365 (mate 1 and mate 2).
_raw_dir = '/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365'
read1 = _raw_dir + '/MNH08365_1.fq.gz'
read2 = _raw_dir + '/MNH08365_2.fq.gz'

### parameters

In [54]:
# Per-run settings; edit these for every new sample.
bacteriaStrain = 'MNH08365'  # strain label used in output names; change as needed
outfolder = '/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus'
thread = 24  # number of threads/processes passed to the external tools
cleanOrNot = True  # True: remove intermediate files at the end of the run

### usually fixed parameters

In [3]:
# Adapter FASTA shipped with Trimmomatic; used for the ILLUMINACLIP trimming step.
_trimmomatic_home = '/home/p/Trimmomatic-0.38'
adapters = _trimmomatic_home + '/adapters/TruSeq3-PE.fa'


## import required packages

In [53]:
import subprocess
import os
import shutil
from Bio import SeqIO
import time

## set up some constants
Below are some programs that will be used

In [5]:
# Installation prefixes shared by several of the tools below.
_CONDA_BIN = '/home/p/anaconda/anaconda3_5.2.0/bin'
_CISA_HOME = '/home/p/CISA/CISA1.3'
_QUAST_HOME = '/home/p/quast/quast-4.6.3'
_BLAST_BIN = '/home/p/blast/bin'

# --- read quality control and trimming ---
FASTQC = '/home/p/FastQC/fastqc'                                # quality report
TRIMMOMATIC = '/home/p/Trimmomatic-0.38/trimmomatic-0.38.jar'   # read trimming
FASTP = '/home/p/fastp/fastp'                                   # quality report and trimming

# --- k-mer size selection for the assemblers ---
KMERGENIE = '/home/p/KmerGenie/kmergenie-1.7048/kmergenie'

# --- genome assemblers ---
ABYSS = _CONDA_BIN + '/abyss-pe'
VELVETH = _CONDA_BIN + '/velveth'
VELVETG = _CONDA_BIN + '/velvetg'
SPADES = _CONDA_BIN + '/spades.py'
FQ2FA = _CONDA_BIN + '/fq2fa'        # FASTQ -> FASTA conversion run before IDBA-UD
IDBA_UD = _CONDA_BIN + '/idba_ud'
SOAPDENOVO = _CONDA_BIN + '/SOAPdenovo-127mer'

# --- merging of the assembled results (CISA) ---
MERGE = _CISA_HOME + '/Merge.py'
CISAPATH = _CISA_HOME + '/'
CISA = _CISA_HOME + '/CISA.py'

# --- merging of the assembled results (Novo_Stitch) ---
NOVO_STITCH = '/home/p/Novo_Stitch/scripts/main.py'
REFALIGNER = '/home/p/RefalignerAssembler/latest/RefAligner'
GLPK = '/usr/local/bin/glpsol'

# --- annotation of the merged genome ---
PROKKA = '/home/p/prokka/prokka-master/bin/prokka'

# --- evaluation of the results ---
QUAST = _QUAST_HOME + '/quast.py'

# --- helper programs written into the CISA configuration ---
NCUMER = _QUAST_HOME + '/quast_libs/MUMmer/nucmer'
MAKEBLASTDB = _BLAST_BIN + '/makeblastdb'
BLASTN = _BLAST_BIN + '/blastn'


## check quality of input reads with fastqc

Create the folder to store the FastQC results.

In [6]:
# Create the output folder (including any missing parent directories) and make
# it the working directory, so the relative paths used by the later steps
# resolve inside it. exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(outfolder, exist_ok=True)
os.chdir(outfolder)
# Sub-folder that receives the FastQC reports.
os.makedirs('fastqc', exist_ok=True)

run the command

In [7]:
# Run FastQC on both read files, writing the reports to <outfolder>/fastqc.
command_line = '{FASTQC} {read1} {read2} -t {thread} -o {outfolder}/fastqc'.format(
    FASTQC=FASTQC, read1=read1, read2=read2, thread=thread, outfolder=outfolder)
print(command_line)
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# FastQC run stops the pipeline here (same policy as the Trimmomatic step).
subprocess.run(command_line, shell=True, check=True)

/home/p/FastQC/fastqc /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_1.fq.gz /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_2.fq.gz -w 24 -o /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/fastqc


CompletedProcess(args='/home/p/FastQC/fastqc /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_1.fq.gz /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_2.fq.gz -w 24 -o /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/fastqc', returncode=0)

## trim reads with TRIMMOMATIC

Create the folder to store the Trimmomatic results.

In [8]:
# Folder for the trimmed reads; exist_ok=True makes re-running the cell safe
# without a separate existence check.
os.makedirs('Trimmomatic', exist_ok=True)

In [9]:
# Paired-end trimming with Trimmomatic. For each mate it writes a ".pair.fq"
# and a ".single.fq" file into ./Trimmomatic.
trim_template = '''java -jar {TRIMMOMATIC} PE -threads {thread} -phred33 {read1} {read2} \
                ./Trimmomatic/read1.pair.fq ./Trimmomatic/read1.single.fq ./Trimmomatic/read2.pair.fq ./Trimmomatic/read2.single.fq \
                ILLUMINACLIP:{adapters}:2:30:10 SLIDINGWINDOW:4:20 MINLEN:50 '''
trim_values = {
    'TRIMMOMATIC': TRIMMOMATIC,
    'read1': read1,
    'read2': read2,
    'thread': thread,
    'adapters': adapters,
}
command_line = trim_template.format(**trim_values)
print(command_line)
# check=True: a non-zero exit status raises CalledProcessError.
result = subprocess.run(command_line, shell=True, check=True)
print(result)

java -jar /home/p/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads 24 -phred33 /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_1.fq.gz /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_2.fq.gz                 ./Trimmomatic/read1.pair.fq ./Trimmomatic/read1.single.fq ./Trimmomatic/read2.pair.fq ./Trimmomatic/read2.single.fq                 ILLUMINACLIP:/home/p/Trimmomatic-0.38/adapters/TruSeq3-PE.fa:2:30:10 SLIDINGWINDOW:4:20 MINLEN:50 
CompletedProcess(args='java -jar /home/p/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads 24 -phred33 /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_1.fq.gz /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/MNH08365_2.fq.gz                 ./Trimmomatic/read1.pair.fq ./Trimmomatic/read1.single.fq ./Trimmomatic/read2.pair.fq ./Trimmomatic/read2.single.fq                 ILLUMINACLIP:/

## Identify best kmer with kmergenie

In [10]:
# Output folder for KmerGenie.
os.makedirs('kmergenie', exist_ok=True)
# KmerGenie takes a text file listing one read file per line; list the two
# paired (trimmed) read files by absolute path. The with-block guarantees the
# file is flushed and closed even if a write fails.
with open('readlist.txt', 'w') as fout:
    for _mate in ('read1.pair.fq', 'read2.pair.fq'):
        fout.write(os.path.join(outfolder, 'Trimmomatic', _mate) + '\n')

In [11]:
# Run KmerGenie on the read list and keep its stdout (bytes) in `result`
# for the parsing step that follows.
kmergenie_values = {'KMERGENIE': KMERGENIE, 'outfolder': outfolder, 'thread': thread}
command_line = '''{KMERGENIE} {outfolder}/readlist.txt -t {thread} -o {outfolder}/kmergenie/kmergenie.histogram '''.format(
    **kmergenie_values)
print(command_line)
result = subprocess.run(command_line, shell=True, check=True, stdout=subprocess.PIPE).stdout

/home/p/KmerGenie/kmergenie-1.7048/kmergenie /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/readlist.txt -t 24 -o /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/kmergenie/kmergenie.histogram 


In [12]:
# The captured KmerGenie stdout is bytes; convert it to text and show it.
result = result.decode(encoding='utf-8')
print(result)

running histogram estimation
list of reads:
/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq
/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq
Setting maximum kmer length to: 150 bp
computing histograms (from k=21 to k=121): 121 111 101 91 81 71 51 61 41 31 21 estimation of the best k so far: 61

ntCard wall-clock time over all k values: 34 seconds 
fitting model to histograms to estimate best k
refining estimation around [55; 67], with a step of 2
running histogram estimation
list of reads:
/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq
/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq
Setting maximum kmer length to: 150 bp
computing histograms (from k=57 to k=67): 63 65 61 59 67 57 table of predicted num. of geno

In [13]:
# Take the last whitespace-separated token of the fifth line from the end of
# the KmerGenie output as the best k-mer size. It is kept as a string because
# it is only interpolated into command lines afterwards.
# NOTE(review): this relies on the exact layout of the tool's output — confirm
# if the KmerGenie version changes.
_kmergenie_lines = result.split('\n')
bestkmer = _kmergenie_lines[-5].split()[-1]
print('best kmer is ', bestkmer)

best kmer is  61


## run abyss

In [14]:
# Work inside ./abyss, since the ABySS command below is given no output
# directory; makedirs(exist_ok=True) replaces the check-then-create pair and
# fails early if 'abyss' exists but is not a directory.
os.makedirs('abyss', exist_ok=True)
os.chdir('abyss')

In [15]:
# Assemble with ABySS using the k-mer size chosen above and the trimmed,
# paired reads; output files are named with the "abyss" prefix.
_trimmed_dir = os.path.join(outfolder, 'Trimmomatic')
abyss_values = {
    'ABYSS': ABYSS,
    'bestkmer': bestkmer,
    'thread': thread,
    'read1': os.path.join(_trimmed_dir, 'read1.pair.fq'),
    'read2': os.path.join(_trimmed_dir, 'read2.pair.fq'),
}
command_line = '''{ABYSS} k={bestkmer} np={thread}  in="{read1} {read2}" name=abyss'''.format(**abyss_values)
print(command_line)
result = subprocess.check_output(command_line, shell=True)

/home/p/anaconda/anaconda3_5.2.0/bin/abyss-pe k=61 np=24  in="/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq" name=abyss


In [16]:
# Show the ABySS log as text, then return to the pipeline's output folder.
result = result.decode(encoding='utf-8')
print(result)
os.chdir(os.pardir)

/home/p/anaconda/anaconda3_5.2.0/bin/mpirun -np 24 ABYSS-P -k61 -q3    --coverage-hist=coverage.hist -s abyss-bubbles.fa  -o abyss-1.fa /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq 
ABySS 2.1.0
ABYSS-P -k61 -q3 --coverage-hist=coverage.hist -s abyss-bubbles.fa -o abyss-1.fa /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq
Running on 24 processors
1: Running on host localhost.localdomain
2: Running on host localhost.localdomain
4: Running on host localhost.localdomain
6: Running on host localhost.localdomain
9: Running on host localhost.localdomain
10: Running on host localhost.localdomain
11: Running on host localh

## run velvet

In [17]:
# Velvet is run with "." as its working directory, so create ./velvet and
# enter it. makedirs(exist_ok=True) is safe on re-runs.
os.makedirs('velvet', exist_ok=True)
os.chdir('velvet')

In [18]:
# Display the current working directory (sanity check after the preceding chdir).
os.getcwd()

'/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/velvet'

In [19]:
# velveth: hash the trimmed paired reads (given as two separate FASTQ files)
# into the current directory with the selected k-mer size.
_trimmed_dir = os.path.join(outfolder, 'Trimmomatic')
velveth_values = {
    'VELVETH': VELVETH,
    'bestkmer': bestkmer,
    'read1': os.path.join(_trimmed_dir, 'read1.pair.fq'),
    'read2': os.path.join(_trimmed_dir, 'read2.pair.fq'),
}
command_line = '''{VELVETH} . {bestkmer} -separate -fastq -shortPaired {read1} {read2} '''.format(**velveth_values)
print(command_line)
result = subprocess.check_output(command_line, shell=True)

/home/p/anaconda/anaconda3_5.2.0/bin/velveth . 61 -separate -fastq -shortPaired /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq 


In [20]:
# Convert the captured velveth stdout from bytes to text and display it.
result = result.decode(encoding='utf-8')
print(result)

[0.000000] Reading FastQ file /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq;
[0.000053] Reading FastQ file /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq;
[73.855966] 19695418 sequences found in total in the paired sequence files
[73.855972] Done
[73.874804] Reading read set file ./Sequences;
[78.061765] 19695418 sequences found
[96.199030] Done
[96.199033] 19695418 sequences in total.
[96.199078] Writing into roadmap file ./Roadmaps...
[125.001654] Inputting sequences...
[125.012275] Inputting sequence 0 / 19695418
[127.018804] Inputting sequence 14000000 / 19695418
[128.189542] Inputting sequence 5000000 / 19695418
[130.099874] Inputting sequence 19000000 / 19695418
[131.392979] Inputting sequence 10000000 / 19695418
[132.537936] Inputting sequence 1000000 / 19695418
[135.368511] Inputting sequence 15000000 / 19695418
[136.250908] Inputting se

In [21]:
# velvetg: build the assembly in the current directory with automatic
# expected-coverage and coverage-cutoff settings.
command_line = VELVETG + ' .  -exp_cov auto -cov_cutoff auto '
print(command_line)
result = subprocess.check_output(command_line, shell=True)

/home/p/anaconda/anaconda3_5.2.0/bin/velvetg .  -exp_cov auto -cov_cutoff auto 


In [22]:
# Show the velvetg log as text, then go back up to the output folder.
result = result.decode(encoding='utf-8')
print(result)
os.chdir(os.pardir)

[0.000000] Reading roadmap file ./Roadmaps
[27.580867] 19695418 roadmaps read
[27.596294] Creating insertion markers
[34.005227] Ordering insertion markers
[35.643671] Counting preNodes
[39.000879] 5973749 preNodes counted, creating them now
[43.344941] Sequence 1000000 / 19695418
[47.595644] Sequence 2000000 / 19695418
[51.771762] Sequence 3000000 / 19695418
[55.746266] Sequence 4000000 / 19695418
[60.079233] Sequence 5000000 / 19695418
[63.994490] Sequence 6000000 / 19695418
[67.916661] Sequence 7000000 / 19695418
[71.940916] Sequence 8000000 / 19695418
[75.886416] Sequence 9000000 / 19695418
[79.999447] Sequence 10000000 / 19695418
[83.882210] Sequence 11000000 / 19695418
[87.755178] Sequence 12000000 / 19695418
[91.761211] Sequence 13000000 / 19695418
[96.171603] Sequence 14000000 / 19695418
[100.460566] Sequence 15000000 / 19695418
[104.717268] Sequence 16000000 / 19695418
[108.804571] Sequence 17000000 / 19695418
[112.947955] Sequence 18000000 / 19695418
[117.227555] Sequence 190

## run SPAdes

In [23]:
# SPAdes is told to write to "." below, so create ./spades and enter it.
# makedirs(exist_ok=True) is idempotent across re-runs.
os.makedirs('spades', exist_ok=True)
os.chdir('spades')

In [24]:
# Display the current working directory (sanity check after the preceding chdir).
os.getcwd()

'/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/spades'

In [25]:
# Assemble the trimmed paired reads with SPAdes into the current directory.
# No k-mer size is passed, so SPAdes uses its own choice.
_trimmed_dir = os.path.join(outfolder, 'Trimmomatic')
spades_values = {
    'SPADES': SPADES,
    'thread': thread,
    'read1': os.path.join(_trimmed_dir, 'read1.pair.fq'),
    'read2': os.path.join(_trimmed_dir, 'read2.pair.fq'),
}
command_line = '''{SPADES} --pe1-1 {read1} --pe1-2  {read2} -t {thread} -o .  '''.format(**spades_values)
print(command_line)
result = subprocess.check_output(command_line, shell=True)

/home/p/anaconda/anaconda3_5.2.0/bin/spades.py --pe1-1 /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq --pe1-2  /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq -t 24 -o .  


In [26]:
# Show the SPAdes log as text, then return to the output folder.
result = result.decode(encoding='utf-8')
print(result)
os.chdir(os.pardir)

Command line: /home/p/anaconda/anaconda3_5.2.0/bin/spades.py	--pe1-1	/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq	--pe1-2	/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq	-t	24	-o	/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/spades	

System information:
  SPAdes version: 3.12.0
  Python version: 3.6.4
  OS: Linux-3.10.0-862.6.3.el7.x86_64-x86_64-with-centos-7.5.1804-Core

Output dir: /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/spades
Mode: read error correction and assembling
Debug mode is turned OFF

Dataset parameters:
  Multi-cell mode (you should set '--sc' flag if input data was obtained with MDA (single-cell) technology or --meta flag if processing metagenomic dataset)
  Reads:
    Library number: 1, library type: paired-end
      orientation: fr


## run IDBA-UD

In [27]:
# IDBA-UD writes to "." below, so create ./IDBA-UD and enter it.
# makedirs(exist_ok=True) is idempotent across re-runs.
os.makedirs('IDBA-UD', exist_ok=True)
os.chdir('IDBA-UD')

In [28]:
# Merge the two trimmed FASTQ files into a single FASTA file with fq2fa;
# that file is the read input (-r) of the IDBA-UD step.
_trimmed_dir = os.path.join(outfolder, 'Trimmomatic')
fq2fa_values = {
    'FQ2FA': FQ2FA,
    'read1': os.path.join(_trimmed_dir, 'read1.pair.fq'),
    'read2': os.path.join(_trimmed_dir, 'read2.pair.fq'),
}
command_line = '''{FQ2FA} --merge --filter {read1} {read2} idba-paired-contigs.fa '''.format(**fq2fa_values)
print(command_line)
result = subprocess.check_output(command_line, shell=True).decode('utf-8')
print(result)

/home/p/anaconda/anaconda3_5.2.0/bin/fq2fa --merge --filter /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read1.pair.fq /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/Trimmomatic/read2.pair.fq idba-paired-contigs.fa 



In [29]:
# Assemble the merged FASTA reads with IDBA-UD, writing into the current directory.
idba_values = {'IDBA_UD': IDBA_UD, 'thread': thread}
command_line = '''{IDBA_UD} -r idba-paired-contigs.fa --num_threads {thread} -o . '''.format(**idba_values)
print(command_line)
result = subprocess.check_output(command_line, shell=True).decode('utf-8')
print(result)

/home/p/anaconda/anaconda3_5.2.0/bin/idba_ud -r idba-paired-contigs.fa --num_threads 24 -o . 
number of threads 24
reads 19664504
long reads 0
extra reads 0
read_length 150
kmer 20
kmers 19062498 19607055
merge bubble 1157
contigs: 42009 n50: 484 max: 5656 mean: 102 total length: 4296420 n80: 39
aligned 12876007 reads
confirmed bases: 2292413 correct reads: 10615469 bases: 2384891
distance mean 321.22 sd 60.0233
seed contigs 4003 local contigs 84018
kmer 40
kmers 15609138 15859200
merge bubble 3070
contigs: 6916 n50: 17629 max: 118926 mean: 504 total length: 3490265 n80: 3298
aligned 18304412 reads
confirmed bases: 2944690 correct reads: 17798623 bases: 1672150
distance mean 327.924 sd 60.7581
seed contigs 710 local contigs 13832
kmer 60
kmers 10003157 9872370
merge bubble 309
contigs: 2512 n50: 46939 max: 157761 mean: 1311 total length: 3295257 n80: 12857
aligned 18877725 reads
confirmed bases: 2994720 correct reads: 18409104 bases: 155576
distance mean 328.138 sd 60.8018
seed contigs

In [30]:
# Back to the pipeline's output folder.
os.chdir(os.pardir)

## run soapdenovo

In [31]:
# SOAPdenovo reads its config from and writes its output to the current
# directory, so create ./soapdenovo and enter it (idempotent on re-runs).
os.makedirs('soapdenovo', exist_ok=True)
os.chdir('soapdenovo')

In [32]:
# Write the SOAPdenovo library configuration: one paired-end library (q1/q2)
# built from the trimmed reads. The with-block closes the file even if the
# write raises.
_trimmed_dir = os.path.join(outfolder, 'Trimmomatic')
with open('myconfig', 'w') as fw:
    fw.write('max_rd_len=150\n[LIB]\navg_ins=350\nreverse_seq=0\nasm_flags=3\nq1={read1}\nq2={read2}\n'.format(
        read1=os.path.join(_trimmed_dir, 'read1.pair.fq'),
        read2=os.path.join(_trimmed_dir, 'read2.pair.fq')))

In [33]:
# Assemble with SOAPdenovo using the selected k-mer size. The thread count now
# comes from the `thread` setting; it used to be hard-coded to 24 and silently
# ignored that parameter.
# stdout/stderr are redirected to ass.log/ass.err by the shell, so the
# captured `result` is expected to be empty.
command_line = '''{SOAPDENOVO} all -s myconfig -K {bestkmer} -R -o soapdenovo -d 10 -p {thread}  1>ass.log 2>ass.err '''.format(
    SOAPDENOVO=SOAPDENOVO, bestkmer=bestkmer, thread=thread)
print(command_line)
result = subprocess.check_output(command_line, shell=True)
result = result.decode('utf-8')
print(result)

/home/p/anaconda/anaconda3_5.2.0/bin/SOAPdenovo-127mer all -s myconfig -K 61 -R -o soapdenovo -d 10 -p 24  1>ass.log 2>ass.err 



In [34]:
# Back to the pipeline's output folder.
os.chdir(os.pardir)

## run CISA

In [35]:
# The CISA steps use relative file names, so create ./CISA and enter it
# (idempotent on re-runs).
os.makedirs('CISA', exist_ok=True)
os.chdir('CISA')

In [36]:
# Gather each assembler's contig file into the CISA folder under a uniform
# "<assembler>.fa" name.
_contig_sources = (
    ('../abyss/abyss-contigs.fa', 'abyss.fa'),
    ('../velvet/contigs.fa', 'velvet.fa'),
    ('../spades/contigs.fasta', 'spades.fa'),
    ('../IDBA-UD/contig.fa', 'IDBA-UD.fa'),
    ('../soapdenovo/soapdenovo.contig', 'soapdenovo.fa'),
)
for _src, _dst in _contig_sources:
    shutil.copy(_src, _dst)

'soapdenovo.fa'

In [37]:
# Contig files to merge, in the order they are written to the Merge config.
ctgs = [_assembler + '.fa' for _assembler in ('soapdenovo', 'velvet', 'abyss', 'spades', 'IDBA-UD')]

In [38]:
# Write the configuration for CISA's Merge.py: the number of input assemblies,
# one "data=...,title=..." line per contig file, then the merge settings.
# `count` is derived from ctgs so it cannot disagree with the listed files
# (it was hard-coded to 5). The with-block closes the file on error too.
with open('ToMerge.config', 'w') as fw:
    fw.write('count={count}\n'.format(count=len(ctgs)))
    for i in ctgs:
        # The title is the file name up to its first dot.
        fw.write('data={fastafile},title={fastaname}\n'.format(fastafile=i, fastaname=i.split('.')[0]))
    fw.write('min_length=400\nMaster_file=Merged.fa\nGap=1\n')

In [39]:
# Run CISA's Merge.py on the config written above and display its report.
command_line = MERGE + ' ToMerge.config '
print(command_line)
result = subprocess.check_output(command_line, shell=True).decode('utf-8')
print(result)

/home/p/CISA/CISA1.3/Merge.py ToMerge.config 
soapdenovo.fa.p.fa
Number of contigs: 166
Length of the longest contig: 157765
whole:2986723
N50: 49127
velvet.fa.p.fa
Number of contigs: 133
Length of the longest contig: 291605
whole:3014292
N50: 75219
abyss.fa.p.fa
Number of contigs: 93
Length of the longest contig: 278946
whole:3108220
N50: 93105
spades.fa.p.fa
Number of contigs: 316
Length of the longest contig: 434654
whole:3135457
N50: 102757
IDBA-UD.fa.p.fa
Number of contigs: 200
Length of the longest contig: 341997
whole:3067591
N50: 94757



In [40]:
# Read the 'Merge_info' file from the CISA working directory and take the
# largest value reported on its 'Whole Genome' lines; it is written to
# CISA.config as the genome size.
with open('Merge_info') as _merge_info:  # closed deterministically
    lis = _merge_info.readlines()
# The last whitespace-separated token of each 'Whole Genome' line is the length.
genomelens = [int(e.split()[-1]) for e in lis if 'Whole Genome' in e]
if not genomelens:
    # max() on an empty list would raise the same exception type with a less helpful message.
    raise ValueError("no 'Whole Genome' line found in Merge_info")
genomelen = max(genomelens)
print('max genome length is', genomelen)

max genome length is 3135457


In [41]:
# Write the CISA configuration: estimated genome size, input/output FASTA
# names, and the paths of the helper programs. The with-block closes the file
# even if formatting or writing fails.
with open('CISA.config', 'w') as fw:
    fw.write('''
genome={genomelen}
infile=Merged.fa
outfile=Final.fa
nucmer={NCUMER}
R2_Gap=0.95
CISA={CISAPATH}
makeblastdb={MAKEBLASTDB}
blastn={BLASTN}
'''.format(genomelen=genomelen, NCUMER=NCUMER, CISAPATH=CISAPATH, MAKEBLASTDB=MAKEBLASTDB, BLASTN=BLASTN))

In [42]:
# Run CISA with the config written above. The output is captured and decoded
# but deliberately not printed.
command_line = 'python ' + CISA + ' CISA.config '
print(command_line)
result = subprocess.check_output(command_line, shell=True).decode('utf-8')

python /home/p/CISA/CISA1.3/CISA.py CISA.config 


In [43]:
# Back to the pipeline's output folder.
os.chdir(os.pardir)

## run prokka

In [44]:
# Display the current working directory (sanity check after the preceding chdir).
os.getcwd()

'/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus'

In [45]:
# Prokka is run with "--outdir ." below, so create ./prokka and enter it
# (idempotent on re-runs).
os.makedirs('prokka', exist_ok=True)
os.chdir('prokka')

In [46]:
# Load the merged CISA contigs and rewrite them as "<strain>.genome.fa" with
# sequential headers "<strain>.g1", "<strain>.g2", ... and one sequence per line.
# Both files are handled in with-blocks; the input handle used to be left open.
with open('../CISA/Final.fa') as _fin:
    ls_contigs = list(SeqIO.parse(_fin, 'fasta'))
outfile = '{bacteriaStrain}.genome.fa'.format(bacteriaStrain=bacteriaStrain)
with open(outfile, 'w') as fout:
    for _n, _e in enumerate(ls_contigs):
        fout.write('>' + bacteriaStrain + '.g' + str(_n + 1) + '\n' + str(_e.seq) + '\n')

In [47]:
# Annotate the renamed genome with Prokka, writing into the current directory.
# The strain name is used as both the file prefix and the locus tag.
prokka_template = '''{PROKKA} --kingdom Bacteria --cpus {thread}  {bacteriaStrain}.genome.fa --outdir . \
                --prefix {bacteriaStrain} --force --metagenome --locustag {bacteriaStrain}'''
prokka_values = {'PROKKA': PROKKA, 'thread': thread, 'bacteriaStrain': bacteriaStrain}
command_line = prokka_template.format(**prokka_values)
print(command_line)
result = subprocess.check_output(command_line, shell=True).decode('utf-8')
print(result)

/home/p/prokka/prokka-master/bin/prokka --kingdom Bacteria --cpus 24  MNH08365.genome.fa --outdir .                 --prefix MNH08365 --force --metagenome --locustag MNH08365



## run quast

In [48]:
# Back to the pipeline's output folder.
os.chdir(os.pardir)

In [50]:
# Display the current working directory (sanity check after the preceding chdir).
os.getcwd()

'/mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus'

In [51]:
# Evaluate the final merged genome together with the five per-assembler
# ".p.fa" contig sets from the CISA folder; the QUAST report goes to ./quast.
quast_template = '''python {QUAST} -t {thread} -o quast \
                ./prokka/{bacteriaStrain}.genome.fa ./CISA/abyss.fa.p.fa ./CISA/velvet.fa.p.fa ./CISA/spades.fa.p.fa \
                ./CISA/IDBA-UD.fa.p.fa ./CISA/soapdenovo.fa.p.fa'''
quast_values = {'QUAST': QUAST, 'thread': thread, 'bacteriaStrain': bacteriaStrain}
command_line = quast_template.format(**quast_values)
print(command_line)
result = subprocess.check_output(command_line, shell=True).decode('utf-8')
print(result)

python /home/p/quast/quast-4.6.3/quast.py -t 24 -o quast                 ./prokka/MNH08365.genome.fa ./CISA/abyss.fa.p.fa ./CISA/velvet.fa.p.fa ./CISA/spades.fa.p.fa                 ./CISA/IDBA-UD.fa.p.fa ./CISA/soapdenovo.fa.p.fa
/home/p/quast/quast-4.6.3/quast.py -t 24 -o quast ./prokka/MNH08365.genome.fa ./CISA/abyss.fa.p.fa ./CISA/velvet.fa.p.fa ./CISA/spades.fa.p.fa ./CISA/IDBA-UD.fa.p.fa ./CISA/soapdenovo.fa.p.fa

Version: 4.6.3

System information:
  OS: Linux-3.10.0-862.6.3.el7.x86_64-x86_64-with-centos-7.5.1804-Core (linux_64)
  Python version: 3.6.4
  CPUs number: 24

Started: 2018-08-20 23:07:36

Logging to /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus/quast/quast.log

CWD: /mnt/d/linux/W/MoonNGS/raw/20180813Novogene16species_Human_Methyl/1.rawdata/MNH08365/Tychus
Main parameters: 
  Threads: 24, minimum contig length: 500, ambiguity: one, threshold for extensive misassembly size: 1000

Contigs:
  Pre-processing...
  1  ./prokka/

## clean files

In [56]:
# Remove the intermediate folders and files when cleanup is requested.
# shutil/os calls replace the nine `rm -rf` shell commands: no shell is
# spawned and an outfolder containing spaces or shell metacharacters is
# handled correctly.
if cleanOrNot:
    _intermediate_dirs = ('abyss', 'IDBA-UD', 'kmergenie', 'soapdenovo',
                          'spades', 'Trimmomatic', 'velvet', 'CISA')
    for _name in _intermediate_dirs:
        # ignore_errors=True mirrors `rm -rf`: a missing folder is not an error.
        shutil.rmtree(os.path.join(outfolder, _name), ignore_errors=True)
    try:
        os.remove(os.path.join(outfolder, 'readlist.txt'))
    except FileNotFoundError:
        # Already gone; `rm -f` would not complain either.
        pass
    