# <img src="https://github.com/JuliaLang/julia-logo-graphics/raw/master/images/julia-logo-color.png" height="100" /> _Colab Notebook Template_

## Instructions
1. Work on a copy of this notebook: _File_ > _Save a copy in Drive_ (you will need a Google account). Alternatively, you can download the notebook using _File_ > _Download .ipynb_, then upload it to [Colab](https://colab.research.google.com/).
2. If you need a GPU: _Runtime_ > _Change runtime type_ > _Harware accelerator_ = _GPU_.
3. Execute the following cell (click on it and press Ctrl+Enter) to install Julia, IJulia and other packages (if needed, update `JULIA_VERSION` and the other parameters). This takes a couple of minutes.
4. Reload this page (press Ctrl+R, or ⌘+R, or the F5 key) and continue to the next section.

_Notes_:
* If your Colab Runtime gets reset (e.g., due to inactivity), repeat steps 2, 3 and 4.
* After installation, if you want to change the Julia version or activate/deactivate the GPU, you will need to reset the Runtime: _Runtime_ > _Factory reset runtime_ and repeat steps 3 and 4.

# Checking the Installation
The `versioninfo()` function should print your Julia version and some other info about the system:

In [1]:
using Random
using Printf
using Dates

In [2]:
# Generates t sequences all of length l, and returns an array of strings
function generate_sequences(t::Int64, l::Int64)
   # t is the number of sequences to create
    # l is the length of the sequences
    DNA = Array{String,1}(undef,0)
    base_arr = ["A", "T", "G", "C"]

    for t_index in 1:t
        push!(DNA, "")
        for l_value in 1:l
            r = convert(Int64, floor(Random.rand() * 4) + 1)
            DNA[t_index] = string(DNA[t_index], base_arr[r])
        end
    end
    return DNA    
end

generate_sequences (generic function with 1 method)

In [None]:
# test generate_sequences here
data = generate_sequences(10,1000)

10-element Array{String,1}:
 "GGCTGGAGGGAAAAACGATGATACACTTAGGGATTCAGTGTTGGTCATTACTCTCGATGTGTGTCGAGGAGGTATGCCGGCGACCTGGTGCCTCTCGCCAATAGGCCGTCCCCGATTAGTTAATGATGACGGTTAACGACTTCGCGGGGGTTTTCACAGAATATTGGTGGTTCAGGGCGAATTCCGATTGACTGTAGGTTGTTCTCCGAATCCGCAGGAATGGAGTCAGCTGTCTAGTTTCACAACCCAGCGGTTGCACTTGTTCAAAAGGGCTTTATTGTCAGGTGGTTACCATCGTCACCGTGCCGCTGGGATCATAATCACCCGGCCGCTAAATTACTAAGCGGAATAATTTTGTCACACGCACCTCGATCCACAATCCTCGTTTGCTTTGCAGTCCACTGGCGAAAGGCTTGGGCGGTTAGCAGGGCTTATAGTCTTTGATTCGCTTGCTAGACAAAGTGCACCGTATTGTACGTGTAGGAGAAGCGACATTTTTGTCTCTGATACGTGAATCACTAGCTATTTTGCGCTAAGTCGCAGTGTATATATGCAGACCGAGGCAAAGTTATTCCCTATGTGCATTTATAGCCATATGCGTGCACTATTTCGGCATCTCCTCTTGGCGCAAATGAGCAAGGGCTGTGTTACGATAGCCCCACACCGTTGGCTAGCGAACTACCGTGATTTACCCCCTAATAACACTGTGATCAGTTACGCAGCTTTTTCGATGGATTGACGTTTGTCGTGCCCTCTCATTGATTGTTCTAAATTCAAGTTTGCAACGGCAGCCATACTTACAGGCTTGGGTGACATTAGTGCGAGTTTGGGTTCCAACAGACGGATTTCTGCGTAGATAACACTCCGAATCCTGATCATAGATGAGGCTGAACGCGTAGTCAGCGTTTGAGACGCGCGCATTTTGTGCATGTCGGCGCTTTTACGGTCTTCGTGTGCCACTCCAGAAATA

In [217]:
function MSA_to_TSP(sequences)
  node = []
  for i in 1:length(sequences)
    push!(node,sequences[i])
  end

  graph = Array{Float64, 2}(undef, length(node), length(node))
  max_score = 0
  for i in 1:length(node)
    graph[i,i] = 0
    for j in i+1:length(node)
      score, align1, align2 = get_alignment_score(node[i],node[j]) 
      graph[i,j] = score
      graph[j,i] = score
      if max_score < score 
                max_score = score
      end
    end
  end
  for i in 1:length(node)
    for j in i+1:length(node)
      graph[i,j] = max_score - graph[i,j] + 1
      graph[j,i] = max_score - graph[j,i] + 1
    end
  end    
  return graph
end

MSA_to_TSP (generic function with 1 method)

In [218]:
# test MSA_to_TSP here
MSA_to_TSP(data)

LoadError: UndefVarError: data not defined

In [219]:
function get_alignment_score(v, w, match_penalty=1, mismatch_penalty=-1, deletion_penalty=-1)
    n1 = length(v)
    n2 = length(w)
    #if !use_preallocated_matrices
    s = zeros(Float64, n1+1, n2+1)
    b = zeros(Float64, n1+1, n2+1)
    #end

    for i in 1:(n1+1)
        s[i,1] = (i-1) * deletion_penalty
        b[i,1] = 2
    end
    for j in 1:(n2+1)
        s[1,j] = (j-1) * deletion_penalty
        b[1,j] = 3
    end

    for i in 2:(n1+1)
        for j in 2:(n2+1)
            if v[i-1] == w[j-1]
                ms = s[i-1,j-1] + match_penalty
            else
                # ignore cases where a letter is paired with a gap
                # do not consider this a mismatch
                # if v[i-1] != '-' && w[j-1] != '-'
                #     ms = s[i-1,j-1] + mismatch_penalty
                # else
                #     # if a letter is paired with a gap, add no penalty
                #     ms = s[i-1,j-1] #+ match_penalty # + 0.5 * mismatch_penalty
                # end
                ms = s[i-1,j-1] + mismatch_penalty
            end
            test = [ms, s[i-1,j] + deletion_penalty, s[i,j-1] + deletion_penalty]
            p = argmax(test)
            s[i,j] = test[p]
            b[i,j] = p
        end
    end

    i = n1+1
    j = n2+1
    sv = []
    sw = []
    while(i > 1 || j > 1)
        p = b[i,j]
        if (p == 1)
            i = i-1
            j = j-1
            push!(sv, v[i])
            push!(sw, w[j])
        elseif p == 2
            i=i-1
            push!(sv, v[i])
            push!(sw, "-")
        elseif p == 3
            j = j-1
            push!(sv, "-")
            push!(sw, w[j])
        else
            break
        end
    end

    return (s[n1+1,n2+1], join(reverse(sv)), join(reverse(sw)))
end

get_alignment_score (generic function with 4 methods)

In [220]:
# test get_alignment_score here
sequences = generate_sequences(10,1000)
score, align1, align2 = get_alignment_score(sequences[1],sequences[2])
println("Score = ",score)
println("Alignment of fir sequence = ",align1)
println("Alignment of sec sequence = ",align2)

Score = 96.0
Alignment of fir sequence = AAACATCA-GGAA-CACTA-GAGCTCAAGCTCCTACTCAGCGCGGGT-GA-TCGCGAACT-TTCAAGATCTAC-CAGGCAACCACGCCGTCGAAATGATC-TTC---ACTGATCGAAATGTGTCGTGGGCAGT-GT-TGTAGCAC--TCCTTACAGTTAAGGC-GAAAGAATGATATTGTGATC-AGA-GGACTACAGG--CTCTCGGAGAAA--CACCGA--ATAGTAACATGTCATA-ACTCATGAGCTGGTAGGGCTCT-CCCTATGCGTGCTCCAAATCAGGGCTACTGAAGTCCGACTTAC-AGGAAAAG-A-AGGAATAACG-GC-T--C-GC-GT-CC-T--CTC--AGAG---G------ACG--AAG-A-T-GCTA--TC-GACA-ACT-GGTGTCATCTACTTACACGATGGAGAATTACTGGGGTAGTTG-GGGCTT--G-GCGCAGAGCTAT-CAT-TCCATGAGAAAGTCGGATC-CGCCG-TG-TG-CCCT-A-A---GT--GG-TAAGACCACCTTGAAGGCTTTGAG-TGACGGATGGTGGGTGCTATACAGCGCGCT-CGTGCTGT-CAGTGATCCCCA-GGC-TCGGC-TAAGAAGCCAGGGTGGCACGCCGAGCAGGTATT-GTGTGC-GCGTCGCTCGCTTGGTCGCAGTCCGTGTTCAGACT--CTCCGTGCA-ACGACAACAGCGAA--TAATGTACCATATCTTTACCACAATGCGAGAT-AAGGGTACC-AACGACC-GAGCTATGTTA-CGCTTTTCTCCCAGGGTATGGTGA-C-CGCGGA-CCTTACTTACCTGTT-GCGCCTAATAAAGCG-T-G-CTCCT--G-TCAGTCAAAAGACCC--CACTCTG-TCTC-CGTCGTGTAGTCTATATCTAGGGCCT-CCGATACA-TCAG-CGTGC--AC-AATA-GAACG--T-TTGCCA-AT--

In [221]:
# Return the ant colony with cities initialized to each ant

function create_colony(num_ants, num_nodes)
    colony = []
    for i in 1 : num_ants
       push!(colony, Dict("path"=>[rand(1:num_nodes)], "distance" => 0))
    end
    return colony
end

create_colony (generic function with 1 method)

In [222]:
create_colony(10, 4)

10-element Array{Any,1}:
 Dict{String,Any}("distance" => 0,"path" => [1])
 Dict{String,Any}("distance" => 0,"path" => [4])
 Dict{String,Any}("distance" => 0,"path" => [3])
 Dict{String,Any}("distance" => 0,"path" => [4])
 Dict{String,Any}("distance" => 0,"path" => [2])
 Dict{String,Any}("distance" => 0,"path" => [3])
 Dict{String,Any}("distance" => 0,"path" => [1])
 Dict{String,Any}("distance" => 0,"path" => [2])
 Dict{String,Any}("distance" => 0,"path" => [2])
 Dict{String,Any}("distance" => 0,"path" => [3])

In [223]:
function create_pheror_matrix(num_nodes)
    pheromone = zeros(Float64, num_nodes, num_nodes)
    for i in 1: num_nodes
        for j in 1: num_nodes
            pheromone[i,j] = 1/num_nodes
        end
    end
    return pheromone
end

create_pheror_matrix (generic function with 1 method)

In [224]:
create_pheror_matrix(4)

4×4 Array{Float64,2}:
 0.25  0.25  0.25  0.25
 0.25  0.25  0.25  0.25
 0.25  0.25  0.25  0.25
 0.25  0.25  0.25  0.25

In [225]:
function calculate_proba(num_nodes, pheromone, distance_matrix, alpha, beta)
    probability = zeros(Float64, num_nodes, num_nodes)
    for i in 1: num_nodes
        for j in 1: num_nodes
            probability[i,j] = (pheromone[i,j]^alpha) * (distance_matrix[i,j]^-beta)
            probability[j,i] = probability[i,j]
        end
    end
    return probability
end

calculate_proba (generic function with 1 method)

In [226]:
function calculate_proba_ant(pheromone, distance_matrix, unvisited_nodes, current_node, proba, alpha, beta)
  sigma = 0.0
  for unvisited_node in unvisited_nodes
    sigma += (pheromone[current_node,unvisited_node]^alpha) * (distance_matrix[current_node,unvisited_node]^-beta)
  end
  proba_ant = proba[current_node,:]/sigma
  return proba_ant
end

calculate_proba_ant (generic function with 1 method)

In [227]:
function find_best_path(n_ants, colony)
  bpath = []
  best_distance = Inf32
  typeof(best_distance)
  idx_best = 0
  for i=1: n_ants
    if colony[i]["distance"] < best_distance
      best_distance = colony[i]["distance"]
      bpath = colony[i]["path"]
      idx_best = i
    end
  end
  best_path = Dict("path"=> bpath, "distance"=> best_distance, "ant"=> idx_best)
  return best_path
end

find_best_path (generic function with 1 method)

In [228]:
function update_pheror_matrix(num_nodes, n_ants, pheromone, distance_matrix, colony, Q, decay)
  depositpher = 0.0
  for i=1: n_ants
    ant = i
    for j= 1:(length(colony[ant]["path"])-1)
      src = colony[ant]["path"][j]
      dest = colony[ant]["path"][j+1]
      pheromone[src,dest] += Q/colony[i]["distance"]
    end
    depositpher += Q/colony[i]["distance"]
    for i= 1:num_nodes
      for j= 1:num_nodes
        pheromone[i,j] = (1-decay)*pheromone[i,j]*depositpher
        pheromone[j,i] = pheromone[i,j]
      end
    end
  end
  return pheromone
end

update_pheror_matrix (generic function with 1 method)

In [229]:
function calculateDist_ant(ant, colony, distmatrix)
  dist = 0
  path = colony[ant]["path"]
  for i= 1:length(path)-1
    dist += distmatrix[path[i],path[i+1]]
  end
  return dist
end

calculateDist_ant (generic function with 1 method)

In [230]:
function traverse(ant, num_nodes, colony, pheromone, distance_matrix, proba, alpha, beta)
    unvisited = collect(1:num_nodes)
    current = colony[ant]["path"][1]
    deleteat!(unvisited, findfirst(isequal(current), unvisited))
    for j in 1: num_nodes-1
        if length(unvisited) > 1
            ant_probability = calculate_proba_ant(pheromone, distance_matrix, unvisited, current, proba, alpha, beta)
            prob = map((x) -> ant_probability[x] , unvisited)
            current = unvisited[findmax(prob)[2]]
            deleteat!(unvisited, findfirst(isequal(current), unvisited))
            push!(colony[ant]["path"], current)       
        else
            push!(colony[ant]["path"], unvisited[1])
        end
    end
    colony[ant]["distance"] = calculateDist_ant(ant, colony, distance_matrix)
end

traverse (generic function with 1 method)

In [231]:
function run1(num_ants, num_nodes, distance_matrix, iterations, Q, decay, alpha, beta)
    pheromone = create_pheror_matrix(num_nodes)
    gbpath = Dict()
    for i= 1: iterations
        colony = create_colony(num_ants, num_nodes)
        probability = calculate_proba(num_nodes, pheromone, distance_matrix, alpha, beta)
        for ant in 1: num_ants
            traverse(ant, num_nodes, colony, pheromone, distance_matrix, probability, alpha, beta)
        end
        # complete update_pheromone_matrix
        pheromone = update_pheror_matrix(num_nodes, num_ants, pheromone, distance_matrix, colony, Q, decay)
        #complete find best path fucntion
        best_path = find_best_path(num_ants, colony)
        
        bpath = best_path
        if i == 1
            gbpath = bpath
        else
            if bpath["distance"] < gbpath["distance"]
                gbpath = bpath
            end
        end
        println("current best path = ",bpath["path"])
        println("current distance = ",bpath["distance"])
        println("global best path = ",gbpath["path"])
        println("global best path distance =",gbpath["distance"])
        println("iteration over")
    end 
    # return the best path
    return gbpath["path"]
end

run1 (generic function with 1 method)

In [232]:
#distance_matrix = MSA_to_TSP(generate_sequences(10,1000))
#distance_matrix = [0 5 7.07 5 10.44; 5 0 5 7.07 10.19; 7.07 5 0 5 5.38; 5 7.07 5 0 5.83; 10.44 10.19 5.38 5.83 0]
distance_matrix = [0 556.149 1160.79 786.014 556.149; 556.149 0 648.358 556.149 786.014; 1160.79 648.358 0 597.706 1131.61; 786.014 556.149 597.706 0 554.032; 556.149 786.014 1131.61 554.032 0]
typeof(distance_matrix)
run1(50,5,distance_matrix,10,0.6,0.6,1,1)
#phero = run(20,10,distance_matrix,10,0.6,0.6,1,1)

current best path = [3, 4, 5, 1, 2]
current distance = 2264.036
global best path = [3, 4, 5, 1, 2]
global best path distance =2264.036
iteration over
current best path = [4, 5, 1, 2, 3]
current distance = 2314.688
global best path = [3, 4, 5, 1, 2]
global best path distance =2264.036
iteration over
current best path = [3, 4, 5, 1, 2]
current distance = 2264.036
global best path = [3, 4, 5, 1, 2]
global best path distance =2264.036
iteration over
current best path = [4, 5, 1, 2, 3]
current distance = 2314.688
global best path = [3, 4, 5, 1, 2]
global best path distance =2264.036
iteration over
current best path = [2, 1, 5, 4, 3]
current distance = 2264.036
global best path = [3, 4, 5, 1, 2]
global best path distance =2264.036
iteration over
current best path = [2, 1, 5, 4, 3]
current distance = 2264.036
global best path = [3, 4, 5, 1, 2]
global best path distance =2264.036
iteration over
current best path = [2, 1, 5, 4, 3]
current distance = 2264.036
global best path = [3, 4, 5, 1, 2]
g

5-element Array{Int64,1}:
 3
 4
 5
 1
 2

In [233]:
findmax(phero[10,:])

LoadError: UndefVarError: phero not defined

In [234]:
function ACO_on_TSP()
end

ACO_on_TSP (generic function with 1 method)

In [235]:
# test ACO_on_TSP here

In [236]:
function TSP_to_MSA()
end

TSP_to_MSA (generic function with 1 method)

In [237]:
# test TSP_to_MSA here

In [238]:
# test createPherorMatrix here
createPherorMatrix()

LoadError: UndefVarError: createPherorMatrix not defined

In [239]:
# This cell is for testing by Amit

#distance_matrix = [0 10.0 15.0 20.0;10.0 0 35.0 25.0;15.0 35.0 0 30.0;20.0 25.0 30.0 0]
#distance_matrix = [0 12.0 10.0 19.0 8.0; 12.0 0 3.0 7.0 2.0; 10.0 3.0 0 6.0 20.0; 19.0 7.0 6.0 0 4.0; 8.0 2.0 20.0 4.0 0]
distance_matrix = [0 12  29 22 13 24; 12 0 19 3 25 6; 29 19 0 21 23 28; 22 3 21 0 4 5; 13 25 23 4 0 16; 24 6 28 5 16 0]
distance_matrix =Float64.(distance_matrix)
run1(50,6,distance_matrix,10,0.6,0.6,1,1)
# [4, 5, 2, 3, 1, 1] D E B C A
# 6, 2, 4, 5, 1, 3 F B D E A C

current best path = [6, 4, 2, 1, 5, 3]
current distance = 56.0
global best path = [6, 4, 2, 1, 5, 3]
global best path distance =56.0
iteration over
current best path = [6, 2, 4, 5, 1, 3]
current distance = 55.0
global best path = [6, 2, 4, 5, 1, 3]
global best path distance =55.0
iteration over
current best path = [6, 2, 4, 5, 3, 1]
current distance = 65.0
global best path = [6, 2, 4, 5, 1, 3]
global best path distance =55.0
iteration over
current best path = [1, 6, 2, 4, 5, 3]
current distance = 60.0
global best path = [6, 2, 4, 5, 1, 3]
global best path distance =55.0
iteration over
current best path = [3, 4, 2, 6, 5, 1]
current distance = 59.0
global best path = [6, 2, 4, 5, 1, 3]
global best path distance =55.0
iteration over
current best path = [3, 4, 2, 6, 5, 1]
current distance = 59.0
global best path = [6, 2, 4, 5, 1, 3]
global best path distance =55.0
iteration over
current best path = [3, 4, 2, 6, 5, 1]
current distance = 59.0
global best path = [6, 2, 4, 5, 1, 3]
global best

6-element Array{Int64,1}:
 6
 2
 4
 5
 1
 3

In [240]:
# test get_alignment_score here This cell is for testing by Amit
sequences = generate_sequences(2,1000)
sequences1 = generate_sequences(8,1900)
score, align1, align2 = get_alignment_score(sequences[1],sequences1[2])
println("Score = ",score)
println("Alignment of fir sequence = ",align1)
println("Alignment of sec sequence = ",align2)

Score = -219.0
Alignment of fir sequence = A-----T-T---AGTT------TG--GTTC-----CGT-TACTA-T-TG-A-TG--CTGTG---AGC-G-GG---T-G--TATAAAAG----T--C--T-GCGCTA--CGT-G---CC---C-T----G-AT-GCAGT-C--CTC-----T-ACC-TC----G-AAT-A-C---TATC-T---GC-----CG--CC--GT--TC-A-G----A-G-----G---C-G-A----A-AT-CTC-TCA-G--TGTAG-TT-G-----CA-T---T--C-TT--C-CAT--C-----GGC-----TC--G--A---TGC-G-C-CA---------------TCG-TAGG-C---CA---C-----G-A----C-G-G-----G-GTACT--GCC---TGG-------T-G-T-C--ACAA-T--C-A-G-TG-AAA--CAG---T---AG-------TGA-G---GT-ACAT---TC---T-AT-----GA--CC---C---A------G--C--C--CT-A-------AT--TGCCGTAAG---A-TGT-G---GAGG-GG--T-A----GG-TA-G-ACC-CTCTAA---ACACT-G----T-A--T----G--TC-----TGAGG--CG---AGT-AT-A-GGGG-C--------CTG-C--A---ACCAC-CTG---C-AC---T-C-----G-CGC-C--A-----G----CTGG-C-TC-A---TT----CC-A-G----G-TTTT------CCT-AAG--CA---C-TA---A-G-AGTGTAC-A---C-T-CA--T-T--GTC--GACT--T-C--C-G-A---------A--C-GA----ATC-TTCTTGTGGAG-T-GG-------G-----G----G-GTG-T-GC-CT-G--TC----T-C-A--TA-CTGTCCA---CC----CT----CAC-AC-C---G--TC--C

In [241]:
function find_gap_indices(A, alignedA)
    i = 1
    j = 1
    pointer = []
    while j <= length(alignedA)
        if alignedA[j] == '-' && (i > length(A) || A[i] != '-')
            push!(pointer, j)
            j += 1
        else
            j += 1
            i += 1
        end
    end
    return pointer
end

find_gap_indices (generic function with 1 method)

In [242]:
# test find_gap_indices here
A = "AAA--GGTT--"
alignedA = "AAA--G-GT-T--"
find_gap_indices(A,alignedA)

2-element Array{Any,1}:
  7
 10

In [243]:
function insert_gaps(S,gap_indices_for_A)
    copy_of_S = S
    if length(gap_indices_for_A) > 0 && length(gap_indices_for_A) > 0
        gap_indices_for_A = sort(gap_indices_for_A)
        for i in gap_indices_for_A
            copy_of_S = string(string(copy_of_S[1:i-1],'-'),copy_of_S[i:end])
        end
    end
    return copy_of_S
end

insert_gaps (generic function with 1 method)

In [244]:
# test insert_gaps here
insert_gaps("AAAAA",[1,3,5])

"-A-A-AAA"

In [245]:
# this function takes two params: sequences is the original array of input sequences, order is the array output of tsp algorithm of the order of sequences
# notice that the index in both the params are relative to each other
function align_output_sequences(sequences, order) 
    ordered_sequences = Array{String,1}(undef,0)
    for i=1:length(order)
        push!(ordered_sequences,sequences[order[i]])
    end
    aligned_sequences = Array{String,1}(undef,0)
    for i=1:length(ordered_sequences)
        push!(aligned_sequences,ordered_sequences[i])
    end
    for i=1:length(aligned_sequences)-1
        A = aligned_sequences[i]
        B = aligned_sequences[i+1]
        score, alignedA, alignedB = get_alignment_score(A,B)
        gap_indices_for_A = find_gap_indices(A, alignedA)
        # go to all predecessors of A and insert the gaps at same place
        for j = 1:i-1
            S = aligned_sequences[j]
            newly_alinged_S = insert_gaps(S,gap_indices_for_A)
            aligned_sequences[j] = newly_alinged_S
        end
        aligned_sequences[i] = alignedA
        aligned_sequences[i+1] = alignedB
    end  
    return aligned_sequences
end

align_output_sequences (generic function with 1 method)

In [246]:
function Make_Profile(k::Int, t::Int, A::Array{String,1})::Dict{String,Array{Int64,1}}
    idxA = zeros(Int64, k)
    idxC = zeros(Int64, k)
    idxG = zeros(Int64, k)
    idxT = zeros(Int64, k)
    idxGap = zeros(Int64, k)

    for index in range(1, length=k)
        for seq in A
            if seq[index] == 'A'
                idxA[index] += 1
            elseif seq[index] == 'C'
                idxC[index] += 1
            elseif seq[index] == 'G'
                idxG[index] += 1
            elseif seq[index] == 'T'
                idxT[index] += 1
            elseif seq[index] == '-'
                idxGap[index] += 1
            else
                msg = @sprintf("character [%s] not allowed in DNA sequence", string(seq[index]))
                throw(ErrorException(msg))
            end
        end
    end

    return Dict(
        "A" => idxA,
        "C" => idxC,
        "G" => idxG,
        "T" => idxT,
        "-" => idxGap
    )
end

Make_Profile (generic function with 1 method)

In [247]:
# Score the sequences in A by obtaining the consensus score using Make_Profile
function score_sequences(A::Array{String,1})::Int
    k = length(A[1])    # how long are the sequences
    t = length(A)       # how many sequences
    profile = Make_Profile(k, t, A)
    # score is the sum of the maximum occurring letter (not including gaps) in each position
    score = 0
    for i in 1:k
        a = profile["A"][i]
        t = profile["T"][i]
        g = profile["G"][i]
        c = profile["C"][i]
        gap = profile["-"][i]
#         if max(a, t, g, c) == 4
#             println("inx = ",i)
#             score += 1
#         end
        score += max(a, t, g, c)
#         score += 2(max(a, t, g, c)) - (a + t + c + g) - 2(gap)
    end
    return score
end

score_sequences (generic function with 1 method)

In [248]:
sequences = ["AATGGTCATAGCGAGATGAAGCCACGTGATGGATAATATTGTGCAAACGACCTTATTAGCTATTGACCGTCGATGTCCAACGAGACAATT",
    "GAATCTGTATTCTTCAAGCTTCAACTCCATGCACTACGAACGGTAGTGGTTCACATTGACCGTG",
    "TTGGGCGCATTGACCGTCCTTCCTAGCGTATCATCAAACTTGTGATTCTCTATCTAGAGCAAAATGCGGTGTCCGCTATATGGAGATCTATTTCAAAATA",
    "GTTACATATCAGAATCATTAGAAACGCTCTTAATGGGGTTAAGCAGAGACTTAGTAAGGATTAACTCCCAAGATGATTGACCGTGCTC"]
# sequences = ["ATGTCTCTGACCAGGACTGAGAGGACCATCATCCTGTCCCTGTGGAGCAAGATCT",
# "ATGTCTCTGACCAGGATTGAGAGGAGCATCACCCTATCGCTGACGAGCAAGATCT",
# "CTGACTCTCACCGGGAATGACAGGACCATCACACTATCGCTGACGAGCAAGATCT",
# "CTCACTCTCTCCGGTAATGACATGACCATCACACTATCTCTGACGTGCAAGATCT",
# "CTCACTCTCTCCGGTAATAACATTACAATCACACTCTCTCTGACGTGCAAGATCT",
# "GTCACTCTCTCCGGGAATAACATTACAATCACACGCTCTCGGACGTGCAAGATCG"]
graph = MSA_to_TSP(sequences)
print(graph)
run1(50,length(sequences),graph,20,0.6,0.6,1,1)
aligned_sequences = align_output_sequences(sequences,[1,4,2,3])
#println(aligned_sequences)
for i=1:length(aligned_sequences)
    println(aligned_sequences[i])
end
score_sequences(aligned_sequences)

[0.0 18.0 7.0 1.0; 18.0 0.0 18.0 7.0; 7.0 18.0 0.0 16.0; 1.0 7.0 16.0 0.0]current best path = [2, 4, 1, 3]
current distance = 15.0
global best path = [2, 4, 1, 3]
global best path distance =15.0
iteration over
current best path = [2, 4, 1, 3]
current distance = 15.0
global best path = [2, 4, 1, 3]
global best path distance =15.0
iteration over
current best path = [1, 4, 2, 3]
current distance = 26.0
global best path = [2, 4, 1, 3]
global best path distance =15.0
iteration over
current best path = [3, 1, 4, 2]
current distance = 15.0
global best path = [2, 4, 1, 3]
global best path distance =15.0
iteration over
current best path = [3, 1, 4, 2]
current distance = 15.0
global best path = [2, 4, 1, 3]
global best path distance =15.0
iteration over
current best path = [2, 3, 1, 4]
current distance = 26.0
global best path = [2, 4, 1, 3]
global best path distance =15.0
iteration over
current best path = [3, 1, 4, 2]
current distance = 15.0
global best path = [2, 4, 1, 3]
global best path dist

280

In [249]:
align1 = [
"--------TTGGGCGCATTGACCGTCCTTCCTAGCGTATC-ATCAAACTTGTGATTCTCTATCTAGAGCAAAATGCGGTGTCCGCTATATGGAGATCTATTTCAAAATA------",
"GTTACATATCAGAATCATTAGAAACGCTCTTAATGGGGTTAAGCAGAGACTTAGT----AAGGATTAACTCCCAAGA-------TGATTGACCGTGCTC----------------",
"----------------------AATGGTCATAGCGAGATGAAGCCACGTGATGGAT---AATATTGTGCAAACGACCTTATTAGCTATTGACCGTCGATGTCCAACGAGACAATT",
"------------------------------------------------GAATCTGT---ATTCTTCAAGCTTCAACTCCATGCACTACGAACGGTAGTGGTTCACATTGACCGTG"]

println(score_sequences(align1))

230


In [250]:
nir = [
"AATGGTCATAGC------------G-----AGATGA--AG-CCACGTGATGGATAATATTGTGCAAACGACCTTATTAGCT---ATTGACCGTCGATGTCCAACGAGACAAT---T",
"GAATC-------------------T-----GTATTCTTC--AA-GCTTCAACTC-----CAT--------------GCACT---ACGAACGGTAGTGGTTCACATTGACCGT---G",
"TTGGG-CGCATTGACCGTCCTTCCTAGCGTATCATC--AAACT-TGTGATTCTCTATCTAGAGCAAAATGCGGTGTCCGCT---AT-----AT-GGAGATCTATTTCAAAAT---A",
"GTTA--CATATC------------A---GAATCATT--AG-AAACGCTCTTAATGGGGTTAAGCAGAG-AC-TTAGTAAGGATTAACTCCCAA-GATGATT-----GACCGTGCTC"
]
println(score_sequences(nir))
println(length(nir[1]))

230
116


In [251]:
humara = [
"AATGGT-CATAGC-G-AG-ATGA--AGCCACG----T--GATGGA-T--AA-T----A-TTG-TGCAAACGAC-C-TTA-TTAGCTATTGAC-CGTC--GATG---TCCAACGAGACAATT",
"---GTTACATATCAG-A--ATCATTAGAAACGC---T---CT----T--AA-TGGG-G-TTA-AGCAGA-GA--C-TTAGTAAG-GATTAACTC-CCAAGATG-A-TTGACCGTG-C--TC",
"---G----A-ATCTGTA--TTC-TT--CAA-GC---T--TC-AAC-T-CCA-T-GC-A-CT--A-C-GA--A--CGGTAGT--G-G--T---T---C---A-C-A-TTGACCGTG------",
"-TTG-GGCGCAT-TG-ACCGTCCTT-CCTA-GCGTATCATCAAACTTGTGATTCTCTATCTAGAGC--AAAATGCGGT-GT--CCG-CT--AT-ATGGAGATCTATTTCAAAAT------A"
]
println(score_sequences(humara))
println(length(humara[1]))

280
121


In [252]:
tcoffee = [
    "AATGGTCATAGC-GAGATGA-----AG------------CCACGTGATGGATAATATTGTGCAAACGACCTTATTAGCTATTGACCGT--CGATGTCCAACGAGACAATT",
    "GAATC-T-------GTATTCTTC-AAG------------CT------TCAA---CTCCATGCA--------------CTACGAACGGT--AGTGGTTCACATTGACCGTG",
    "TTGGG-CGCATTGACCGTCCTTCCTAGCGTATCATCAAACT-TGTGATTCTCTATCTAGAGCAAAATGCGGTGTCCGCTATA--------TGGAGATCTATTTCAAAATA",
    "GTTA--CATATC-AGAATCATTAGAAACG---------------CTCTTAATGGGGTTAAGCAGA-GA-CTTAGTAAGGATTAACTCCCAAGATGATTGAC--CGTGCTC"
]
println(score_sequences(tcoffee))

237


In [253]:
function calc_sum_pair(sequences) 
    t = length(sequences)
    k = length(sequences[1])
    score = 0
    for i=1:t
        A = sequences[i]
        for j=i+1:t
            B = sequences[j]
            for idx = 1:k
                if A[idx] == B[idx] && A[idx] != '-'
                    score += 1
                end
            end
        end
    end
    return score
end

calc_sum_pair (generic function with 1 method)

In [254]:
println(calc_sum_pair(nir))
println(calc_sum_pair(humara))
println(calc_sum_pair(tcoffee))

169
257
183


In [255]:
println(calc_sum_pair(nir))

169


In [256]:
function get_sequences_from_file(file_name)
    sequences = []
    flag_first_arrow = true
    open(file_name) do f
        sequence = ""
        for line in eachline(f)
            if startswith(line, '>') 
                if flag_first_arrow
                    flag_first_arrow = false
                    continue
                end
                push!(sequences, sequence)
                sequence = ""
            else
                sequence *= line
            end
        end
        if length(sequence) > 0
            push!(sequences, sequence)
        end
    end
    return sequences
end

get_sequences_from_file (generic function with 1 method)

In [267]:
# this is the driver function which take the fasta file as input and writes the output to the output.txt file.
# the default input to this function is input.fasta which should be present in the same folder as this code.
function driver(filename="input1.txt")
    # get the sequences from input file
    input_sequences = get_sequences_from_file(filename)
    # convert the get input_sequences to TSP problem and get the distance_matrix for TSP
    input_distance_matrix = MSA_to_TSP(input_sequences)
    # configuration for ACO
    num_ants = 50
    num_sequences = length(input_sequences)
    iterations = 1
    Q = 0.6
    decay = 0.6
    alpha = 1
    beta = 1
    # run the ant colony optimization, get the order of node traversal
    order_of_alignment = run1(num_ants,num_sequences,input_distance_matrix,iterations,Q,decay,alpha,beta)
    # align output sequences in the order identified by above algorithm
    aligned_sequences = align_output_sequences(sequences,order_of_alignment)
    # get the score of the aligned_sequences
    final_score = calc_sum_pair(aligned_sequences)
    # save the output in fasta format in output.txt
    
    # print the aligned sequences
    println("The aligned sequences are saved in output file. Printing them here as well the aligned sequences:")
    for i=1:length(aligned_sequences)
        println(aligned_sequences[i])
    end  
    return final_score, aligned_sequences
end
driver()

current best path = [2, 4, 1, 3]
current distance = 15.0
global best path = [2, 4, 1, 3]
global best path distance =15.0
iteration over
The aligned sequences are saved in output file. Printing them here as well the aligned sequences:
----G----A-AT-C-TGTATTC-TT--C-AA--GCT------T----CAACT-CCAT-G-CA-CTACGA--ACGGTAGT--G-G---T----T----C---A-CATTGACCGTG------
----GTTACATAT-C-AG-AATCATTAGA-AA-CGCT---C--T----TAA-TGGGGT-T-AAGC-A-GA-GAC-TTAGTAAG-GAT-T-AACTC--CCAAGATGATTGACCGTG-C--TC
AAT-GGT-CATAG-CGAG--ATGA--AGC-CA-CG-T--GA--TG-GATAA-T---AT-T-GTGC-A-AACGACCTTA-TTAGCTAT-T-GAC-CG-TC--GATG-TCCAACGAGACAATT
-TTGGGCGCATTGAC-CGTCCTTCCTAGCGTATC-ATCAAACTTGTGAT-TCT-CTATCTAGAGC-AAAATG-CGGT-GTCCGCTATATGGA---GATC--TAT--T---TC-A-A-AATA


(264, ["----G----A-AT-C-TGTATTC-TT--C-AA--GCT------T----CAACT-CCAT-G-CA-CTACGA--ACGGTAGT--G-G---T----T----C---A-CATTGACCGTG------", "----GTTACATAT-C-AG-AATCATTAGA-AA-CGCT---C--T----TAA-TGGGGT-T-AAGC-A-GA-GAC-TTAGTAAG-GAT-T-AACTC--CCAAGATGATTGACCGTG-C--TC", "AAT-GGT-CATAG-CGAG--ATGA--AGC-CA-CG-T--GA--TG-GATAA-T---AT-T-GTGC-A-AACGACCTTA-TTAGCTAT-T-GAC-CG-TC--GATG-TCCAACGAGACAATT", "-TTGGGCGCATTGAC-CGTCCTTCCTAGCGTATC-ATCAAACTTGTGAT-TCT-CTATCTAGAGC-AAAATG-CGGT-GTCCGCTATATGGA---GATC--TAT--T---TC-A-A-AATA"])