In [1]:
# Install MUSCLE 3.8, the external aligner invoked through Biopython's MuscleCommandline (run once in a shell):
# wget https://drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz
# tar xvf muscle3.8.31_i86linux64.tar.gz 
# sudo cp muscle3.8.31_i86linux64 /usr/bin/muscle
# sudo chmod +x /usr/bin/muscle

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import MuscleCommandline
from pathlib import Path
import pandas as pd
from collections import namedtuple

In [3]:
# Transcriptase reference protein sequence (passed to align_sequences as the reference)
REF_SEQ_TRANS = "PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGKISKIGPENPYNTPVFAIKKKDSTKWRKLVDFRELNKRTQDFWEVQLGIPHPAGLKKKKSVTVLDVGDAYFSVPLDEDFRKYTAFTIPSINNETPGIRYQYNVLPQGWKGSPAIFQSSMTKILEPFRKQNPDIVIYQYMDDLYVGSDLEIGQHRTKIEELRQHLLRWGLTTPDKKHQKEPPFLWMGYEL"
# Integrase reference protein sequence (defined here but not used by the visible cells)
REF_SEQ_INTEG = "FLDGIDKAQDEHEKYHSNWRAMASDFNLPPVVAKEIVASCDKCQLKGEAMHGQVDCSPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLLKLAGRWPVKTIHTDNGSNFTGATVRAACWWAGIKQEFGIPYNPQSQGVVESMNKELKKIIGQVRDQAEHLKTAVQMAVFIHNFKRKGGIGGYSAGERIVDIIATDIQTKELQKQITKIQNFRVYYRDSRNPLWKGPAKLLWKGEGAVVIQDNSDIKVVPRRKAKIIRDYGKQMAGDDCVASRQDED"

In [4]:
# One point mutation to look for:
#   original - residue expected in the reference at that position
#   index    - 0-based position in the ungapped reference sequence
#              (find_mutations translates it to an alignment column via index_map)
#   mutation - list of target residues that count as a hit
Mutation = namedtuple("Mutation", "original index mutation")

# At least 4 mutations among
# (the screening loop reports resistance at >= 4 hits and possible resistance at 3)
mutations1 = [
    Mutation("M", 40, ["L"]),
    # NOTE(review): if indices are codon - 1 (as the insertion comment below suggests),
    # index 41 is codon 42. REF_SEQ_TRANS has "E" at both index 41 and index 43, so the
    # original-letter check cannot catch a wrong position here - confirm the intended codon.
    Mutation("E", 41, ["D"]),
    Mutation("D", 66, ["N"]),
    Mutation("T", 68, ["D", "N", "S"]),
    Mutation("L", 73, ["V", "I"]),
    Mutation("L", 209, ["W"]),
    Mutation("T", 214, ["A", "C", "D", "E", "G", "H", "I", "L", "N", "S", "V", "Y", "F"])
]

# Single substitution; one hit is enough for the screening loop to report resistance
mutations2 = [
    Mutation("K", 64, ["R", "E", "N"])
]

# Insertion sites (plain 0-based indices, consumed by find_insertions rather than find_mutations)
mutations3 = [
    68    # insertion at codon 69    
]

# Single substitution; one hit is enough for the screening loop to report resistance
mutations4 = [
    Mutation("K", 69, ["E"])    
]

In [5]:
# Load the pol sequence table. The screening loop further down reads the `accession`
# and `pol_cut` columns from it.
# Alternative input file, kept for reference:
# df = pd.read_csv("/data/hiv/data/pol/2-pol-20000-aligned.csv")
df = pd.read_csv("/data/hiv/data/pol/2-pol.csv")
# Bare expression so the notebook renders the frame
df

Unnamed: 0,accession,pol,query acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score,pol_cut
0,AB098330,FFRENLAFQQGEARKFSTEQTGANSPTSGALWDGGRDILPSEAGAE...,Query_1,92.735,234,17,0,1,234,156,389,5.920000e-155,458.0,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALTEICIDMEKEGK...
1,AB098331,FFRENLAFQQGEARKFSTEQTGANSPTSGALWDGGRDILPSEAGTK...,Query_1,91.880,234,19,0,1,234,156,389,2.630000e-154,456.0,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKVKALTEICIDMEKEGK...
2,AB098332,FFRENLAFQQGEARKFPSEQTGANSPTSRDLWNGGRDSLPSEAGAE...,Query_1,94.872,234,12,0,1,234,156,389,1.760000e-157,464.0,PISPIETVPVTLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGK...
3,AB098333,FFRENLAFQQGEARKFPSEQTGANSPTSRDLWNGGRDSLPSEAGAE...,Query_1,94.872,234,12,0,1,234,156,389,1.760000e-157,464.0,PISPIETVPVTLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGK...
4,AB220944,FFRENLAFQQRKAGEFSSEQTRANSPTSRKLGDGGRDNLLTEAGAE...,Query_1,93.590,234,15,0,1,234,154,387,3.240000e-156,461.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240388,MZ468890,PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Query_1,94.017,234,14,0,1,234,100,333,1.500000e-163,461.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALVEICTEMEKEGK...
240389,MZ468891,PQITLWQRPIVTVRIEGQLKDALLDTGADDTVLEDMTLPGRWKPKM...,Query_1,97.863,234,5,0,1,234,100,333,1.160000e-168,474.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...
240390,MZ468892,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Query_1,96.581,234,8,0,1,234,100,333,3.970000e-165,465.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...
240391,MZ468893,PQITLWQRXIVTVXVGGQLKEALLDTGADDTVLEDMNLQGKWKPKM...,Query_1,93.590,234,15,0,1,234,100,333,5.640000e-161,454.0,PISPIETVPVKLKPGMDGPKVKQWPLTXXKIKALTEICTEMEKEGK...


In [6]:
def align_sequences(reference_seq, target_seq, working_dir="tmp"):
    """Pairwise-align a target sequence against a reference sequence with MUSCLE.

    The two sequences are written to a FASTA file inside ``working_dir``, MUSCLE is
    run on that file, and the aligned records are read back. The temporary input
    and output files are removed before returning, whether or not the alignment
    succeeded.

    Args:
        reference_seq: Reference sequence as a string.
        target_seq: Sequence to compare with the reference, as a string.
        working_dir: Directory for the temporary FASTA files; created if missing.

    Returns:
        A tuple ``(reference_aln, target_aln, index_map)`` where the first two are
        the aligned ``SeqRecord`` objects and ``index_map`` maps each 0-based
        position of the ungapped reference to its column in the alignment.

    Raises:
        ValueError: If the MUSCLE output lacks the "reference" or "target" record.
    """
    # MUSCLE works on files, so a directory for the temporary files is needed
    working_dir = Path(working_dir)
    working_dir.mkdir(parents=True, exist_ok=True)

    # Input file contains two sequences: first the reference, then the target to compare with
    file_tag = hash(target_seq)
    input_file = working_dir.joinpath(f"test_{file_tag}.fasta")
    aligned_file = working_dir.joinpath(f"test_aligned_{file_tag}.fasta")

    sequences = [
        SeqRecord(Seq(reference_seq), id="reference"),
        SeqRecord(Seq(target_seq), id="target"),
    ]

    try:
        SeqIO.write(sequences, input_file, "fasta")

        # Alignment
        muscle_cline = MuscleCommandline(input=input_file, out=aligned_file)
        muscle_cline()

        # Read the aligned records fully before the files are deleted; they are
        # looked up by id rather than by position in the output file.
        aligned_by_id = {record.id: record for record in SeqIO.parse(aligned_file, "fasta")}
    finally:
        # Do not let temporary files pile up: two files per call adds up quickly
        # when this function is called once per row of a large table.
        for temp_file in (input_file, aligned_file):
            if temp_file.exists():
                temp_file.unlink()

    reference_aln = aligned_by_id.get("reference")
    target_aln = aligned_by_id.get("target")
    if reference_aln is None or target_aln is None:
        raise ValueError("MUSCLE output does not contain both the 'reference' and 'target' records")

    # Reference sequence map of {original: aligned} indices; gap columns are skipped
    residue_columns = (column for column, letter in enumerate(reference_aln.seq) if letter != "-")
    index_map = dict(enumerate(residue_columns))

    return reference_aln, target_aln, index_map

def find_mutations(mutations, reference_aln, target_aln, index_map, verbose=False):
    """Count how many of the given point mutations are present in the aligned target.

    Args:
        mutations: Iterable of objects with ``original``, ``index`` and ``mutation``
            attributes (see the ``Mutation`` namedtuple).
        reference_aln: Aligned reference, indexable by alignment column.
        target_aln: Aligned target, indexable by alignment column.
        index_map: Mapping from ungapped reference position to alignment column.
        verbose: When true, print a line for every notable position.

    Returns:
        Number of entries whose target residue is one of the listed mutations.
        Entries whose ``original`` residue does not match the reference are skipped.
    """
    hits = 0

    for candidate in mutations:
        column = index_map[candidate.index]
        ref_residue = reference_aln[column]
        tgt_residue = target_aln[column]

        # Skip entries that do not line up with the reference sequence
        if ref_residue != candidate.original:
            if verbose:
                print(f"Mutation ({candidate}) original letter does not match")
            continue

        # A hit is a target residue listed among the expected substitutions
        if tgt_residue in candidate.mutation:
            if verbose:
                print(f"Index: ({candidate.index}) : Found mutation")
            hits += 1
            continue

        # Anything else that differs from the reference is only reported
        if tgt_residue != ref_residue and verbose:
            print(f"Index: ({candidate.index}) : Different from reference, but not a mutation. Reference: [{ref_residue}] Target: [{tgt_residue}]")

    return hits

def find_insertions(insertions, reference_aln, target_aln, index_map, verbose=False):
    """Count insertion sites at which the aligned reference has a gap.

    An insertion at reference position ``k`` is detected as a gap ("-") in the
    aligned reference in the column immediately after residue ``k - 1``. For
    ``k == 0`` the first alignment column is checked. If that column lies past
    the end of the alignment, the site is counted as "no insertion" instead of
    raising ``IndexError``.

    Args:
        insertions: Iterable of 0-based reference positions to check.
        reference_aln: Aligned reference, indexable by alignment column.
        target_aln: Aligned target; accepted for parity with ``find_mutations``
            but not read.
        index_map: Mapping from ungapped reference position to alignment column.
        verbose: When true, print progress for each site.

    Returns:
        Number of sites in ``insertions`` where a gap was found in the reference.
    """
    n_found = 0
    alignment_length = len(reference_aln)

    for insertion_index in insertions:
        if verbose:
            print(f"Looking for insertion at {insertion_index}")

        # Column right after the residue preceding the insertion site. Position 0
        # has no preceding residue, so the very first column is inspected.
        if insertion_index == 0:
            next_column = 0
        else:
            next_column = index_map[insertion_index - 1] + 1

        # The preceding residue may be the last column of the alignment
        if next_column >= alignment_length:
            continue

        if verbose:
            print(reference_aln[next_column])

        if reference_aln[next_column] == "-":
            n_found += 1

            if verbose:
                print("Found insertion")

    return n_found


In [7]:
# Number of leading rows of `df` to screen; every row triggers a separate MUSCLE alignment
N_SEQUENCES = 1000

# Cap at the table size so a shorter input file does not raise IndexError
for i in range(min(N_SEQUENCES, len(df))):
    row = df.iloc[i]

    reference_aln, target_aln, index_map = align_sequences(REF_SEQ_TRANS, row.pol_cut)
    m1 = find_mutations(mutations1, reference_aln, target_aln, index_map)
    m2 = find_mutations(mutations2, reference_aln, target_aln, index_map)
    m3 = find_insertions(mutations3, reference_aln, target_aln, index_map)
    m4 = find_mutations(mutations4, reference_aln, target_aln, index_map)

    # Resistance: at least 4 hits from the first group, or a hit in any single-site group
    if m1 >= 4 or m2 == 1 or m3 == 1 or m4 == 1:
        print(f"{i}) {row.accession} has resistance. m1:{m1}, m2:{m2}, m3:{m3}, m4:{m4}")
    # One hit short of the first-group threshold is reported as a possible case
    elif m1 >= 3:
        print(f"{i}) {row.accession} possible resistance. m1:{m1}, m2:{m2}, m3:{m3}, m4:{m4}")

print("Done")

19) AB253423 has resistance. m1:4, m2:0, m3:0, m4:0
29) AB253681 has resistance. m1:4, m2:0, m3:0, m4:0
30) AB253682 has resistance. m1:4, m2:0, m3:0, m4:0
31) AB253683 has resistance. m1:4, m2:0, m3:0, m4:0
32) AB253684 has resistance. m1:4, m2:0, m3:0, m4:0
33) AB253685 has resistance. m1:4, m2:0, m3:0, m4:0
34) AB253686 has resistance. m1:4, m2:0, m3:0, m4:0
35) AB253687 has resistance. m1:4, m2:0, m3:0, m4:0
36) AB253688 has resistance. m1:4, m2:0, m3:0, m4:0
37) AB253689 has resistance. m1:4, m2:0, m3:0, m4:0
38) AB253690 has resistance. m1:4, m2:0, m3:0, m4:0
39) AB253691 has resistance. m1:4, m2:0, m3:0, m4:0
40) AB253692 has resistance. m1:4, m2:0, m3:0, m4:0
41) AB253693 has resistance. m1:4, m2:0, m3:0, m4:0
42) AB253694 has resistance. m1:4, m2:0, m3:0, m4:0
43) AB253695 has resistance. m1:4, m2:0, m3:0, m4:0
44) AB253696 has resistance. m1:4, m2:0, m3:0, m4:0
45) AB253697 has resistance. m1:4, m2:0, m3:0, m4:0
46) AB253698 has resistance. m1:4, m2:0, m3:0, m4:0
47) AB253699