In [1]:
# Install muscle3.8 for biopython
# wget https://drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz
# tar xvf muscle3.8.31_i86linux64.tar.gz 
# sudo cp muscle3.8.31_i86linux64 /usr/bin/muscle
# sudo chmod +x /usr/bin/muscle

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import MuscleCommandline
from pathlib import Path
import pandas as pd
from collections import namedtuple

In [3]:
# Transcriptase: reference sequence passed to align_sequences() by the TDF and 3TC/FTC screening loop.
# The `index` values in the tdf_* / ftc_* rule tables are 0-based positions in this string.
REF_SEQ_TRANS = "PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGKISKIGPENPYNTPVFAIKKKDSTKWRKLVDFRELNKRTQDFWEVQLGIPHPAGLKKKKSVTVLDVGDAYFSVPLDEDFRKYTAFTIPSINNETPGIRYQYNVLPQGWKGSPAIFQSSMTKILEPFRKQNPDIVIYQYMDDLYVGSDLEIGQHRTKIEELRQHLLRWGLTTPDKKHQKEPPFLWMGYEL"
# Integrase: reference sequence passed to align_sequences() by the DTG screening loop.
# The `index` values in the dtg_* rule tables are 0-based positions in this string.
REF_SEQ_INTEG = "FLDGIDKAQDEHEKYHSNWRAMASDFNLPPVVAKEIVASCDKCQLKGEAMHGQVDCSPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLLKLAGRWPVKTIHTDNGSNFTGATVRAACWWAGIKQEFGIPYNPQSQGVVESMNKELKKIIGQVRDQAEHLKTAVQMAVFIHNFKRKGGIGGYSAGERIVDIIATDIQTKELQKQITKIQNFRVYYRDSRNPLWKGPAKLLWKGEGAVVIQDNSDIKVVPRRKAKIIRDYGKQMAGDDCVASRQDED"

In [4]:
# Input tables. Later cells read the `accession` column of both frames, plus
# `transcriptase_cut` (trans_df) and `integrase_cut` (integ_df).
TRANSCRIPTASE_CSV = "/data/hiv/data/transcriptase/2-transcriptase.csv"
INTEGRASE_CSV = "/data/hiv/data/integrase/2-integrase.csv"

trans_df = pd.read_csv(TRANSCRIPTASE_CSV)
integ_df = pd.read_csv(INTEGRASE_CSV)

In [5]:
#
# To get a correct alignment it is ESSENTIAL to use MUSCLE or a similar bioinformatics tool
# that takes the composition of the sequence into account.
#
# A simple pairwise alignment would run faster, but the consistency of the sequence would be lost.
#
def align_sequences(reference_seq, target_seq, working_dir="tmp"):
    """Align a target sequence against a reference sequence with MUSCLE.

    Both sequences are written to a two-record FASTA file, MUSCLE is run on
    it, and the aligned records are read back. MUSCLE only works on files, so
    a private scratch directory is created inside ``working_dir`` for each
    call and removed again before returning (also when MUSCLE fails). This
    avoids leaving two files behind per call and avoids name clashes between
    calls that align the same target sequence.

    Args:
        reference_seq: Un-gapped reference sequence (string).
        target_seq: Un-gapped sequence to compare with the reference (string).
        working_dir: Directory that hosts the scratch files; created if missing.

    Returns:
        Tuple ``(reference_aln, target_aln, index_map)``. The first two are
        the aligned ``SeqRecord`` objects (gap columns hold ``"-"``);
        ``index_map`` maps every 0-based position of the un-gapped reference
        to its column index in the alignment.

    Raises:
        ValueError: If the MUSCLE output does not contain both the
            ``reference`` and the ``target`` record.
    """
    import tempfile

    working_dir = Path(working_dir)
    working_dir.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(dir=str(working_dir)) as scratch_dir:
        scratch_dir = Path(scratch_dir)
        input_file = scratch_dir / "input.fasta"
        aligned_file = scratch_dir / "aligned.fasta"

        # Input file contains two sequences: first the reference, then the target to compare with.
        sequences = [
            SeqRecord(Seq(reference_seq), id="reference"),
            SeqRecord(Seq(target_seq), id="target"),
        ]
        SeqIO.write(sequences, input_file, "fasta")

        # Alignment. Plain strings let the command-line wrapper quote paths that contain spaces.
        muscle_cline = MuscleCommandline(input=str(input_file), out=str(aligned_file))
        muscle_cline()

        # Read every aligned record while the scratch directory still exists;
        # MUSCLE may emit the records in a different order than they were written.
        aligned_by_id = {record.id: record for record in SeqIO.parse(aligned_file, "fasta")}

    reference_aln = aligned_by_id.get("reference")
    target_aln = aligned_by_id.get("target")
    if reference_aln is None or target_aln is None:
        raise ValueError(
            "MUSCLE output is missing the 'reference' and/or 'target' record; "
            f"found ids: {sorted(aligned_by_id)}"
        )

    # Reference sequence map of {original index: aligned column}; gap columns are skipped.
    non_gap_columns = (
        column for column, residue in enumerate(reference_aln.seq) if residue != "-"
    )
    index_map = dict(enumerate(non_gap_columns))

    return reference_aln, target_aln, index_map

def find_mutations(mutations, reference_aln, target_aln, index_map, verbose=False):
    """Count how many of the given substitution rules are present in the target.

    Args:
        mutations: Iterable of rules exposing ``original``, ``index`` and
            ``mutation`` attributes (see the ``Mutation`` namedtuple).
        reference_aln: Aligned reference, indexable by alignment column.
        target_aln: Aligned target, indexable by alignment column.
        index_map: Mapping from un-gapped reference index to alignment column.
        verbose: When true, print a line for every hit, every rule whose
            expected reference residue does not match, and every position
            that differs from the reference without being a listed mutation.

    Returns:
        Number of rules whose target residue is one of the listed mutations.
        Rules whose ``original`` residue does not match the reference are
        skipped and never counted.
    """
    hits = 0

    for rule in mutations:
        column = index_map[rule.index]
        ref_residue = reference_aln[column]
        tgt_residue = target_aln[column]

        # Sanity check: the rule must describe the residue the reference really has here.
        if ref_residue != rule.original:
            if verbose:
                print(f"Mutation ({rule}) original letter does not match")
            continue

        if tgt_residue in rule.mutation:
            # Target carries one of the residues listed for this rule.
            if verbose:
                print(f"Index: ({rule.index}) : Found mutation")
            hits += 1
        elif verbose and tgt_residue != ref_residue:
            print(f"Index: ({rule.index}) : Different from reference, but not a mutation. Reference: [{ref_residue}] Target: [{tgt_residue}]")

    return hits

def find_insertions(insertions, reference_aln, target_aln, index_map, verbose=False):
    """Count the listed positions that are preceded by an insertion.

    For each un-gapped reference index in ``insertions`` the alignment column
    directly after the previous reference residue is inspected. A gap
    (``"-"``) in the reference at that column means the target has extra
    residues there, which is counted as an insertion.

    Args:
        insertions: Iterable of 0-based un-gapped reference indices.
        reference_aln: Aligned reference, indexable by alignment column.
        target_aln: Aligned target; accepted for a uniform signature but not
            read by this function.
        index_map: Mapping from un-gapped reference index to alignment column.
        verbose: When true, print the position looked at, the reference
            character found there, and a note for every insertion found.

    Returns:
        Number of positions in ``insertions`` at which an insertion was found.
    """
    found = 0

    for position in insertions:
        if verbose:
            print(f"Looking for insertion at {position}")

        # Column immediately after the residue that precedes `position` in the reference.
        column_after_previous = index_map[position - 1] + 1
        reference_char = reference_aln[column_after_previous]

        if verbose:
            print(reference_char)

        if reference_char != "-":
            continue

        found += 1
        if verbose:
            print("Found insertion")

    return found


In [6]:
# One substitution rule, as consumed by find_mutations():
#   original - residue letter the reference is expected to have at `index`
#   index    - 0-based position in the un-gapped reference sequence
#   mutation - list of residue letters that count as the mutation
Mutation = namedtuple("Mutation", ["original", "index", "mutation"])

# TDF (Transcriptase)

In [7]:
# TDF rule tables. `index` is the 0-based position in the un-gapped
# transcriptase reference sequence.

# Group 1: get_resistance_TDF counts how many of these substitutions are present.
tdf_m1 = [
    Mutation(original="M", index=40, mutation=["L"]),
    Mutation(original="E", index=41, mutation=["D"]),
    Mutation(original="D", index=66, mutation=["N"]),
    Mutation(original="T", index=68, mutation=["D", "N", "S"]),
    Mutation(original="L", index=73, mutation=["V", "I"]),
    Mutation(original="L", index=209, mutation=["W"]),
    Mutation(
        original="T",
        index=214,
        mutation=["A", "C", "D", "E", "G", "H", "I", "L", "N", "S", "V", "Y", "F"],
    ),
]

# Group 2: a single substitution.
tdf_m2 = [Mutation(original="K", index=64, mutation=["R", "E", "N"])]

# Group 3: insertion positions (checked with find_insertions).
tdf_m3 = [68]  # insertion at codon 69

# Group 4: a single substitution.
tdf_m4 = [Mutation(original="K", index=69, mutation=["E"])]

def get_resistance_TDF(reference_aln, target_aln, index_map, verbose=False, label=None):
    """Classify the TDF resistance of an aligned transcriptase sequence.

    Four rule groups are evaluated against the alignment:
      m1 - number of ``tdf_m1`` substitutions found
      m2 - number of ``tdf_m2`` substitutions found
      m3 - number of ``tdf_m3`` insertions found
      m4 - number of ``tdf_m4`` substitutions found

    Args:
        reference_aln: Aligned reference sequence (from ``align_sequences``).
        target_aln: Aligned target sequence (from ``align_sequences``).
        index_map: Un-gapped reference index -> alignment column mapping.
        verbose: When true, print a line for every non-zero verdict.
        label: Text identifying the sequence in the verbose line. When
            omitted, the label is built from the notebook globals ``i`` and
            ``trans_df`` exactly as before, so existing driver loops print
            the same output; if those globals do not exist a placeholder is
            used instead of failing with ``NameError``.

    Return:
        2 - resistance (``m1 >= 4``, or m2, m3 or m4 equals 1)
        1 - possible resistance (``m1 >= 3`` otherwise)
        0 - no resistance
    """

    m1 = find_mutations(tdf_m1, reference_aln, target_aln, index_map)
    m2 = find_mutations(tdf_m2, reference_aln, target_aln, index_map)
    m3 = find_insertions(tdf_m3, reference_aln, target_aln, index_map)
    m4 = find_mutations(tdf_m4, reference_aln, target_aln, index_map)

    if m1 >= 4 or m2 == 1 or m3 == 1 or m4 == 1:
        level, verdict = 2, "has resistance"
    elif m1 >= 3:
        level, verdict = 1, "possible resistance"
    else:
        return 0

    if verbose:
        if label is None:
            # Legacy behaviour: the driver loop leaves the row number in the global `i`.
            try:
                label = f"{i}) {trans_df.iloc[i].accession}"
            except NameError:
                label = "(unlabelled sequence)"
        print(f"{label} {verdict}. m1:{m1}, m2:{m2}, m3:{m3}, m4:{m4}")

    return level

# 3TC/FTC (Transcriptase)

In [8]:
# 3TC/FTC rule tables. `index` is the 0-based position in the un-gapped
# transcriptase reference sequence.

# Substitutions: get_resistance_FTC reports resistance if any one is present.
ftc_m1 = [
    Mutation(original="K", index=64, mutation=["R"]),
    Mutation(original="M", index=183, mutation=["V", "I"]),
]

# Insertion positions (checked with find_insertions).
ftc_i2 = [68]  # insertion at codon 69


def get_resistance_FTC(reference_aln, target_aln, index_map, verbose=False, label=None):
    """Classify the 3TC/FTC resistance of an aligned transcriptase sequence.

    Two rule groups are evaluated against the alignment:
      m1 - number of ``ftc_m1`` substitutions found
      m2 - number of ``ftc_i2`` insertions found

    Args:
        reference_aln: Aligned reference sequence (from ``align_sequences``).
        target_aln: Aligned target sequence (from ``align_sequences``).
        index_map: Un-gapped reference index -> alignment column mapping.
        verbose: When true, print a line when resistance is found.
        label: Text identifying the sequence in the verbose line. When
            omitted, the label is built from the notebook globals ``i`` and
            ``trans_df`` exactly as before, so existing driver loops print
            the same output; if those globals do not exist a placeholder is
            used instead of failing with ``NameError``.

    Return:
        2 - resistance (``m1 > 0`` or ``m2 == 1``)
        0 - no resistance
        (1, "possible resistance", is never returned: no rule for it is
        defined for this drug.)
    """

    m1 = find_mutations(ftc_m1, reference_aln, target_aln, index_map)
    m2 = find_insertions(ftc_i2, reference_aln, target_aln, index_map)

    if not (m1 > 0 or m2 == 1):
        return 0

    if verbose:
        if label is None:
            # Legacy behaviour: the driver loop leaves the row number in the global `i`.
            try:
                label = f"{i}) {trans_df.iloc[i].accession}"
            except NameError:
                label = "(unlabelled sequence)"
        print(f"{label} has resistance. m1:{m1}, m2:{m2}")

    return 2

# DTG (Integrase)

In [9]:
# DTG rule tables. `index` is the 0-based position in the un-gapped
# integrase reference sequence.

# Group 1: any single substitution from this list is reported as resistance.
dtg_m1 = [
    Mutation(original="G", index=117, mutation=["R"]),
    Mutation(original="F", index=121, mutation=["Y"]),
    Mutation(original="E", index=137, mutation=["A", "K", "T"]),
    Mutation(original="G", index=139, mutation=["A", "C", "S"]),
    Mutation(original="N", index=143, mutation=["D"]),
    Mutation(original="Q", index=147, mutation=["H", "K", "R"]),
    Mutation(original="V", index=150, mutation=["L"]),
    Mutation(original="S", index=152, mutation=["F", "Y"]),
    Mutation(original="N", index=154, mutation=["H"]),
    Mutation(original="S", index=229, mutation=["R"]),
    Mutation(original="R", index=262, mutation=["K"]),
]

# Group 2: both substitutions must be present together.
dtg_m2 = [
    Mutation(original="T", index=65, mutation=["K"]),
    Mutation(original="L", index=73, mutation=["M"]),
]

# Group 3: both substitutions must be present together.
dtg_m3 = [
    Mutation(original="L", index=73, mutation=["I"]),
    Mutation(original="E", index=91, mutation=["Q"]),
]

# Possible resistance
dtg_m4 = [Mutation(original="T", index=65, mutation=["K"])]

def get_resistance_DTG(reference_aln, target_aln, index_map, verbose=False, label=None):
    """Classify the DTG resistance of an aligned integrase sequence.

    Four rule groups are evaluated against the alignment:
      m1 - number of ``dtg_m1`` substitutions found
      m2 - number of ``dtg_m2`` substitutions found
      m3 - number of ``dtg_m3`` substitutions found
      m4 - number of ``dtg_m4`` substitutions found

    Args:
        reference_aln: Aligned reference sequence (from ``align_sequences``).
        target_aln: Aligned target sequence (from ``align_sequences``).
        index_map: Un-gapped reference index -> alignment column mapping.
        verbose: When true, print a line for every non-zero verdict.
        label: Text identifying the sequence in the verbose line. When
            omitted, the label is built from the notebook globals ``i`` and
            ``integ_df`` exactly as before, so existing driver loops print
            the same output; if those globals do not exist a placeholder is
            used instead of failing with ``NameError``.

    Return:
        2 - resistance (``m1 > 0``, or both ``dtg_m2`` rules, or both
            ``dtg_m3`` rules matched)
        1 - possible resistance (``m4 == 1`` otherwise)
        0 - no resistance
    """

    m1 = find_mutations(dtg_m1, reference_aln, target_aln, index_map)
    m2 = find_mutations(dtg_m2, reference_aln, target_aln, index_map)
    m3 = find_mutations(dtg_m3, reference_aln, target_aln, index_map)
    m4 = find_mutations(dtg_m4, reference_aln, target_aln, index_map)

    if m1 > 0 or m2 == 2 or m3 == 2:
        level, verdict = 2, "has resistance"
    elif m4 == 1:
        level, verdict = 1, "possible resistance"
    else:
        return 0

    if verbose:
        if label is None:
            # Legacy behaviour: the driver loop leaves the row number in the global `i`.
            try:
                label = f"{i}) {integ_df.iloc[i].accession}"
            except NameError:
                label = "(unlabelled sequence)"
        print(f"{label} {verdict}. m1:{m1}, m2:{m2}, m3:{m3}, m4:{m4}")

    return level

In [10]:
# Screen rows 0..999 of trans_df: align each `transcriptase_cut` sequence to the
# transcriptase reference and print the TDF and 3TC/FTC verdicts for it.
# NOTE: the loop variable must stay named `i` -- when called without a label, the verbose
# branches of get_resistance_TDF / get_resistance_FTC read the global `i` to label their output.
# NOTE(review): range(1000) is hard-coded; presumably trans_df has at least 1000 rows -- confirm.
for i in range(1000):
    reference_aln, target_aln, index_map = align_sequences(REF_SEQ_TRANS, trans_df.iloc[i].transcriptase_cut)
    get_resistance_TDF(reference_aln, target_aln, index_map, verbose=True)
    get_resistance_FTC(reference_aln, target_aln, index_map, verbose=True)

331) AB868733 has resistance. m1:1, m2:0
335) AF011754 has resistance. m1:5, m2:0, m3:0, m4:0
336) AF011755 has resistance. m1:5, m2:0, m3:0, m4:0
345) AF047281 has resistance. m1:4, m2:0, m3:0, m4:0
346) AF047282 has resistance. m1:4, m2:0, m3:0, m4:0
346) AF047282 has resistance. m1:1, m2:0
347) AF047287 has resistance. m1:5, m2:0, m3:0, m4:0
347) AF047287 has resistance. m1:1, m2:0
348) AF047288 has resistance. m1:5, m2:0, m3:0, m4:0
348) AF047288 has resistance. m1:1, m2:0
349) AF047295 possible resistance. m1:3, m2:0, m3:0, m4:0
350) AF047297 has resistance. m1:4, m2:0, m3:0, m4:0
350) AF047297 has resistance. m1:1, m2:0
351) AF047298 has resistance. m1:4, m2:0, m3:0, m4:0
351) AF047298 has resistance. m1:1, m2:0
352) AF047299 has resistance. m1:5, m2:0, m3:0, m4:0
352) AF047299 has resistance. m1:1, m2:0
356) AF088078 has resistance. m1:4, m2:0, m3:0, m4:0
356) AF088078 has resistance. m1:1, m2:0
358) AF088080 has resistance. m1:4, m2:0, m3:0, m4:0
358) AF088080 has resistance. m

929) FJ525692 has resistance. m1:1, m2:0
942) FJ525756 has resistance. m1:1, m2:0
947) FJ525786 has resistance. m1:1, m2:0
948) FJ525788 has resistance. m1:1, m2:0
949) FJ525789 has resistance. m1:1, m2:0
950) FJ525790 has resistance. m1:1, m2:0
992) FJ530614 has resistance. m1:1, m2:0


In [11]:
# Screen rows 0..999 of integ_df: align each `integrase_cut` sequence to the
# integrase reference and print the DTG verdict for it.
# NOTE: the loop variable must stay named `i` -- when called without a label, the verbose
# branch of get_resistance_DTG reads the global `i` to label its output.
# NOTE(review): range(1000) is hard-coded; presumably integ_df has at least 1000 rows -- confirm.
for i in range(1000):
    reference_aln, target_aln, index_map = align_sequences(REF_SEQ_INTEG, integ_df.iloc[i].integrase_cut)
    get_resistance_DTG(reference_aln, target_aln, index_map, verbose=True) 

477) AB869463 has resistance. m1:1, m2:0, m3:0, m4:0
751) AB869762 has resistance. m1:1, m2:0, m3:0, m4:0
923) AB869961 has resistance. m1:1, m2:0, m3:0, m4:0
