In [1]:
from cogent import LoadSeqs, LoadTree, DNA, PROTEIN
from cogent.evolve.models import JTT92

In [2]:
# Read the raw DNA sequences from FASTA as an unaligned (aligned=False) DNA collection.
unaligned_dna = LoadSeqs('../data/unaligned_dna.fasta', moltype = DNA, aligned = False)

In [3]:
def trim_stop_codons(aln, stop_codons=('TAA', 'TAG', 'TGA')):
    """Return a new unaligned DNA collection with terminal stop codons removed.

    Written because the PyCogent built-in for this did not work for the author.

    At most one trailing codon is removed per sequence; sequences that do not
    end in one of ``stop_codons`` are passed through unchanged. The comparison
    is case-sensitive and no reading-frame check is made.

    Args:
        aln: object whose ``todict()`` returns a mapping of sequence name to
            sequence string (the strings must support ``endswith`` and slicing).
        stop_codons: iterable of three-letter codons treated as stops.
            Defaults to the three stop codons of the standard genetic code;
            pass ``('TGA',)`` to trim only TGA.

    Returns:
        The result of ``LoadSeqs(data=..., moltype=DNA, aligned=False)`` built
        from the trimmed sequences.
    """
    stops = tuple(stop_codons)
    trimmed = {}
    for name, seq in aln.todict().items():
        # str.endswith accepts a tuple of suffixes, so one check covers every
        # stop codon; an empty tuple simply never matches.
        trimmed[name] = seq[:-3] if seq.endswith(stops) else seq
    return LoadSeqs(data=trimmed, moltype=DNA, aligned=False)

In [4]:
# Pass the DNA through trim_stop_codons(), translate it to protein, and save
# the protein sequences as FASTA (this file is the input to einsi below).
# NOTE(review): unaligned_dna is rebound to its own trimmed version here, so
# the untrimmed collection is no longer reachable after this cell runs.
unaligned_dna = trim_stop_codons(unaligned_dna)
unaligned_protein = unaligned_dna.getTranslation()
unaligned_protein.writeToFile('../results/unaligned_protein.fasta')

In [5]:
# Record which version of the einsi aligner produced the results below.
! einsi --version

v7.310 (2017/Mar/17)


In [6]:
# First alignment pass: run einsi on the translated sequences with no guide
# tree supplied; its stdout is redirected into the aligned-protein FASTA file.
! einsi ../results/unaligned_protein.fasta > ../results/aligned_protein.fasta


All-to-all alignment.
tbfast-pair (aa) Version 7.310 alg=N, model=BLOSUM62, 2.00, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
Gap Penalty = -1.53, +0.00, +0.00
treein = 0
compacttree = 0
Constructing a UPGMA tree ... 
   10 / 20
done.

Progressive alignment ... 
STEP    16 /19 c
Reallocating..done. *alloclen = 1906
STEP    19 /19 c
done.
tbfast (aa) Version 7.310 alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
Loading 'hat3' ... done.

   10 / 20
Segment   1/  1    1- 529
STEP 006-006-1  identical.    rejected. rejected. identical.    rejected. rejected. identical.    rejected. accepted. accepted. identical.    identical.    identical.    identical.    rejected. rejected. identical.   
Oscillating.

done
dvtditr (aa) Version 7.310 alg=A, model

In [7]:
# Load the protein alignment that einsi just wrote.
aligned_protein = LoadSeqs('../results/aligned_protein.fasta', moltype=PROTEIN)

In [8]:
# Fit branch lengths for the tree in ../data/tree.nwk to the protein alignment
# under the JTT92 substitution model with gamma-distributed rate variation.
tree = LoadTree('../data/tree.nwk')
sm = JTT92(with_rate=True, distribution='gamma')
lf = sm.makeLikelihoodFunction(tree)
lf.setAlignment(aligned_protein)
# NOTE(review): local=True presumably selects PyCogent's local optimiser only
# (no global search) — confirm against the PyCogent documentation.
lf.optimise(local=True)

In [9]:
# Show the fitted likelihood function (a table of edge, parent and length).
print(lf)

Likelihood Function Table
       edge         parent    length
------------------------------------
     AmelF2      twoBeesF2    0.0494
     AdorF2      twoBeesF2    0.0321
  twoBeesF2    threeBeesF2    0.1473
     BterF2    threeBeesF2    0.2415
threeBeesF2         rootF2    0.3916
     OsmaF2      twoAntsF2    0.2122
     MforF2      twoAntsF2    0.2466
  twoAntsF2         rootF2    0.2874
     rootF2           root    0.2239
     AmelF3      twoBeesF3    0.0728
     AdorF3      twoBeesF3    0.0331
  twoBeesF3    threeBeesF3    0.2170
     BterF3    threeBeesF3    0.1808
threeBeesF3         rootF3    0.3515
     OsmaF3      twoAntsF3    0.1657
     MforF3      twoAntsF3    0.2641
  twoAntsF3         rootF3    0.2790
     rootF3           root    0.2647
     AmelF1      twoBeesF1    0.0321
     AdorF1      twoBeesF1    0.0296
  twoBeesF1    threeBeesF1    0.2726
     BterF1    threeBeesF1    0.1358
threeBeesF1         rootF1    0.4551
     OsmaF1      twoAntsF1    0.2041
     MforF1 

In [10]:
# Map the unaligned DNA onto the protein alignment with replaceSeqs() and save
# the result as FASTA.
# NOTE(review): presumably this yields a codon-level DNA alignment that carries
# the protein alignment's gaps — confirm against PyCogent's replaceSeqs docs.
aligned_dna = aligned_protein.replaceSeqs(unaligned_dna)
aligned_dna.writeToFile('../results/aligned_dna.fasta')

In [11]:
# Copy the optimised branch lengths onto lf.tree and relabel its edges:
# edges named after a sequence get that sequence's 1-based position in
# aligned_dna.Names as their new name; all other edges get NameLoaded = False.
# NOTE(review): this mutates lf.tree in place, so the cell is not safe to run
# twice (the second run would look up lengths under the numeric names).
for node in lf.tree.getEdgeVector(include_root=False):
    # Look the length up under the edge's current name, before any renaming.
    node.Length = lf.getParamValue('length', edge=node.Name)
    if node.Name not in aligned_dna.Names:
        node.NameLoaded = False
        continue
    node.Name = str(1 + aligned_dna.Names.index(node.Name))

In [12]:
# Save the relabelled tree, with branch lengths (with_distances=True), to the results directory.
lf.tree.writeToFile('../results/tree.nwk',with_distances=True)

In [13]:
# Convert the saved Newick tree with the local newick2mafft.rb script; its
# stdout becomes tree.mafft, which is passed to einsi --treein below.
! ./newick2mafft.rb ../results/tree.nwk > ../results/tree.mafft

scale = 1.0
Initial tree = ((((4:0.0494378153641,11:0.0321185160735):0.147262331447,12:0.241455813):0.391612912096,(8:0.212161437674,13:0.246610990448):0.287393781555):0.223945507167,(((17:0.0728370267921,14:0.033143648224):0.216972707096,1:0.180790586977):0.351476431641,(9:0.165680317818,16:0.264076842065):0.278985685337):0.26466398134,((((15:0.0320698577921,2:0.0296235962633):0.272552048504,3:0.135806041444):0.455149672861,(7:0.204113813024,18:0.22664305655):0.354180709069):0.202454291919,(((19:0.050535386944,5:0.0188949468014):0.259409219998,6:0.232715998376):0.3333609692,(10:0.307433156271,20:0.186654813577):0.318376065761):0.0641042310778):0.101836605722);


In [14]:
# Second alignment pass: re-run einsi with the converted tree supplied through --treein.
# NOTE: this overwrites ../results/aligned_protein.fasta from the first pass.
! einsi --treein ../results/tree.mafft ../results/unaligned_protein.fasta > ../results/aligned_protein.fasta


All-to-all alignment.
tbfast-pair (aa) Version 7.310 alg=N, model=BLOSUM62, 2.00, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
Gap Penalty = -1.53, +0.00, +0.00
loadtree.
treein = 108
compacttree = 0
Loading a tree

   10 / 20
done.

Progressive alignment ... 
STEP    16 /19 c
Reallocating..done. *alloclen = 1906
STEP    19 /19 c
done.
tbfast (aa) Version 7.310 alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
Loading 'hat3' ... done.

10 / 20

Segment   1/  1    1- 522
STEP 006-016-1  rejected..    identical.    accepted. rejected. identical.    identical.    identical.    identical.    rejected. rejected. rejected. rejected. rejected. rejected.
Oscillating.

done
dvtditr (aa) Version 7.310 alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, ama

In [15]:
# Reload the protein alignment; the file now holds the result of the --treein run.
aligned_protein = LoadSeqs('../results/aligned_protein.fasta', moltype=PROTEIN)

In [16]:
# Rebuild the DNA alignment from the reloaded protein alignment and overwrite
# ../results/aligned_dna.fasta with it.
aligned_dna = aligned_protein.replaceSeqs(unaligned_dna)
aligned_dna.writeToFile('../results/aligned_dna.fasta')