# Alignment of Fibroin Sequences for Ancestral State Reconstruction
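We use MAFFT G-INS-i (the `ginsi` command run below), which MAFFT describes as suitable for sequences of similar lengths. Note that this is a different strategy from E-INS-i (`einsi`), which is "suitable for sequences containing large unalignable regions"; switch the command below to `einsi` if that strategy is the one intended.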
### Setup

In [1]:
from cogent import LoadSeqs, LoadTree, DNA, PROTEIN

### Load unaligned DNA sequences

In [2]:
# Read the raw DNA sequences from FASTA as unaligned sequences (aligned=False).
unaligned_dna = LoadSeqs(
    '../data/unaligned_dna.fasta',
    moltype=DNA,
    aligned=False,
)

### Translate to protein sequences
(after trimming stop codons)

In [3]:
# Drop terminal stop codons before translating. The trimmed DNA is rebound to the
# same name on purpose: a later cell passes `unaligned_dna` to replaceSeqs on the
# protein alignment.
unaligned_dna = unaligned_dna.withoutTerminalStopCodons()
# Translate the trimmed DNA to protein.
unaligned_protein = unaligned_dna.getTranslation()
# Persist the proteins as FASTA; this file is the input to the MAFFT run below.
unaligned_protein.writeToFile('../results/unaligned_protein.fasta')

### Align the protein sequences

In [4]:
# Print the installed MAFFT version so the alignment can be reproduced with the same release.
! ginsi --version

v7.310 (2017/Mar/17)


In [5]:
# Align the translated proteins with MAFFT's `ginsi` command. The alignment is written to
# stdout and redirected into ../results/aligned_protein.fasta; progress messages still
# appear in the cell output.
#   --maxiterate 1000 : upper limit on iterative refinement cycles (per the MAFFT manual).
#   --treeout         : also write the guide tree; a later cell loads it from
#                       ../results/unaligned_protein.fasta.tree.
! ginsi --maxiterate 1000 --treeout ../results/unaligned_protein.fasta > ../results/aligned_protein.fasta


All-to-all alignment.
tbfast-pair (aa) Version 7.310 alg=A, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
Gap Penalty = -1.53, +0.00, +0.00
treein = 0
compacttree = 0
Constructing a UPGMA tree ... 
   10 / 20
done.

Progressive alignment ... 
STEP    16 /19 c
Reallocating..done. *alloclen = 1903
STEP    19 /19 c
done.
tbfast (aa) Version 7.310 alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
Loading 'hat3' ... done.

   10 / 20
Segment   1/  1    1- 518
STEP 003-016-0  rejected..    identical.    rejected. accepted. accepted. identical.    identical.   
Converged.

done
dvtditr (aa) Version 7.310 alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 G-INS-i (Suitable for sequences of similar l

### Check the MAFFT guide tree

In [6]:
# Load the guide tree that the MAFFT run wrote next to its input file (via --treeout)
# and render it as ASCII art as a quick sanity check of how the sequences group.
mafft_guide_tree = LoadTree('../results/unaligned_protein.fasta.tree')
# A parenthesised single-argument print produces the same output on Python 2 and is
# also valid Python 3, unlike the bare `print x` statement.
print(mafft_guide_tree.asciiArt())

                                                            /-1_AdorF1
                                                  /edge.0--|
                                        /edge.1--|          \-2_AmelF1
                                       |         |
                              /edge.3--|          \-3_BterF1
                             |         |
                             |         |          /-4_MforF1
                             |          \edge.2--|
                             |                    \-5_OsmaF1
                    /edge.8--|
                   |         |                              /-6_AdorF4
                   |         |                    /edge.4--|
                   |         |          /edge.5--|          \-7_AmelF4
                   |         |         |         |
                   |          \edge.7--|          \-8_BterF4
                   |                   |
          /edge.13-|                   |          /-9_MforF4
         |         |   

### Load the aligned protein sequences

In [7]:
# Read the MAFFT output back in as a protein alignment (LoadSeqs is called without
# aligned=False here, unlike the unaligned DNA load earlier in the notebook).
aligned_protein = LoadSeqs('../results/aligned_protein.fasta', moltype=PROTEIN)

### Map the original DNA sequences onto the protein alignment to obtain the final aligned DNA sequences

In [8]:
# Build the DNA alignment from the protein alignment, taking the sequence data from
# the stop-codon-trimmed `unaligned_dna`.
# NOTE(review): per the PyCogent docs, replaceSeqs maps each amino acid to its codon
# by default, so the DNA must translate exactly to the aligned proteins — confirm.
aligned_dna = aligned_protein.replaceSeqs(unaligned_dna)
# Save the final DNA alignment as FASTA.
aligned_dna.writeToFile('../results/aligned_dna.fasta')