In [1]:
# Install muscle3.8 for biopython
# wget https://drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz
# tar xvf muscle3.8.31_i86linux64.tar.gz 
# sudo cp muscle3.8.31_i86linux64 /usr/bin/muscle
# sudo chmod +x /usr/bin/muscle

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MuscleCommandline
from pathlib import Path
import pandas as pd
from collections import namedtuple

In [3]:
REF_SEQ = "PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGKISKIGPENPYNTPVFAIKKKDSTKWRKLVDFRELNKRTQDFWEVQLGIPHPAGLKKKKSVTVLDVGDAYFSVPLDEDFRKYTAFTIPSINNETPGIRYQYNVLPQGWKGSPAIFQSSMTKILEPFRKQNPDIVIYQYMDDLYVGSDLEIGQHRTKIEELRQHLLRWGLTTPDKKHQKEPPFLWMGYEL"
REF_SEQ_LEN = len(REF_SEQ)
GENE = "pol"
data_path = Path("/data/hiv/data")

In [4]:
# M41L
# E44D
# D67N
# T69D/N/S
# L74V/I
# L210W
# T215A/C/D/E/G/H/I/L/N/S/V/Y/F


Mutation = namedtuple("Mutation", "original index mutation")

mutations = [
    Mutation("M", 40, ["L"]),
    Mutation("E", 41, ["D"]),
    Mutation("D", 66, ["N"]),
    Mutation("T", 68, ["D", "N", "S"]),
    Mutation("L", 73, ["V", "I"]),
    Mutation("L", 209, ["W"]),
    Mutation("T", 214, ["A", "C", "D", "E", "G", "H", "I", "L", "N", "S", "V", "Y", "F"])
]

In [5]:
df = pd.read_csv("/data/hiv/data/pol/2-pol-20000-aligned.csv")
df

Unnamed: 0,accession,gene
0,FJ199594,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKAL--TAICEEMEKE...
1,AB874173,PISPIETVPVKLKPGMDGPKVRQWPLTEEKIKAL--VEICTEMEKE...
2,GQ272384,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKAL--TEICAEMEKE...
3,FJ199763,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKAL--TAICEEMEKE...
4,FJ199639,PISSIETVPVKLKPGMDGPKVKQWPLTEEKIKAL--TAICEEMEKE...
...,...,...
19994,GQ288367,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALV--EICTEMEKE...
19995,GQ288349,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALV--EICTEMEKE...
19996,GQ288353,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALV--EICTEMEKE...
19997,GQ290758,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALV--EICTEMEKE...


In [6]:
def align_detect(reference_seq, target_seq, working_dir="tmp"):
           
    # Muscle uses files, need directory for temporary files
    working_dir = Path(working_dir)
    working_dir.mkdir(parents=True, exist_ok=True)
    
    # Input file contains two sequenes: first reference, second "new" sequence to compare with
    input_file = working_dir.joinpath("test.fasta")
    aligned_file = working_dir.joinpath("test_aligned.fasta")

    sequences = [SeqRecord(Seq(reference_seq), id="reference"), SeqRecord(Seq(target_seq), id="target")]
    SeqIO.write(sequences, input_file, "fasta")
    
    # Alignment
    muscle_cline = MuscleCommandline(input=input_file, out=aligned_file)
    muscle_cline()

    ref_aln = None
    new_aln = None
    
    # Read aligned sequences
    aligned = SeqIO.parse(aligned_file, "fasta")
    for seq in aligned:
        if seq.id == "reference":
            reference_aln = seq
        elif seq.id == "target":
            target_aln = seq

    # Reference sequence map of {original:aligned} indices
    index_map = {}
    index_o = 0
    for index_aln, c in enumerate(reference_aln.seq):
        if c == "-":
            continue
        index_map[index_o] = index_aln
        index_o += 1

#     print(index_map)

    for mutation in mutations:
        reference_letter = reference_aln[index_map[mutation.index]]
        target_letter = target_aln[index_map[mutation.index]]
        
        # Make sure that reference sequence is correct
        if reference_letter != mutation.original:
            print(f"Mutation ({mutation}) original letter does not match")
            continue
        
        # Check if target letter at mutation index is one of expected mutations
        if target_letter in mutation.mutation:
            print(f"Index: ({mutation.index}) : Found mutation")
        elif target_letter != reference_letter:
            print(f"Index: ({mutation.index}) : Different from reference, but not a mutation. Reference: [{reference_letter}] Target: [{target_letter}]")
        

for i in range(100):
    print(i)
    r = align_detect(REF_SEQ, df.iloc[i].gene)

0
1
2
Index: (68) : Different from reference, but not a mutation. Reference: [T] Target: [-]
Index: (73) : Different from reference, but not a mutation. Reference: [L] Target: [-]
3
4
5
6
7
Index: (73) : Different from reference, but not a mutation. Reference: [L] Target: [-]
8
9
10
11
12
13
14
15
16
17
18
19
20
21
Index: (68) : Found mutation
Index: (209) : Different from reference, but not a mutation. Reference: [L] Target: [N]
Index: (214) : Found mutation
22
23
24
25
26
27
Index: (40) : Different from reference, but not a mutation. Reference: [M] Target: [X]
Index: (41) : Different from reference, but not a mutation. Reference: [E] Target: [X]
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
Index: (68) : Different from reference, but not a mutation. Reference: [T] Target: [P]
43
44
45
46
47
48
Index: (66) : Found mutation
Index: (68) : Found mutation
Index: (209) : Found mutation
Index: (214) : Found mutation
49
50
51
52
53
54
55
56
Index: (66) : Found mutation
Index: (68) : Found mut