In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MuscleCommandline
from pathlib import Path

In [2]:
REF_SEQ = "PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGKISKIGPENPYNTPVFAIKKKDSTKWRKLVDFRELNKRTQDFWEVQLGIPHPAGLKKKKSVTVLDVGDAYFSVPLDEDFRKYTAFTIPSINNETPGIRYQYNVLPQGWKGSPAIFQSSMTKILEPFRKQNPDIVIYQYMDDLYVGSDLEIGQHRTKIEELRQHLLRWGLTTPDKKHQKEPPFLWMGYEL"
REF_SEQ_LEN = len(REF_SEQ)
GENE = "pol"
data_path = Path("/data/hiv/data")

In [3]:
# M41L
# E44D
# D67N
# T69D/N/S
# L74V/I
# L210W
# T215A/C/D/E/G/H/I/L/N/S/V/Y/F

In [4]:
def align_detect(ref_seq, new_seq, working_dir="tmp"):
        
    # Muscle uses files, need directory for temporary files
    working_dir = Path(working_dir)
    working_dir.mkdir(parents=True, exist_ok=True)
    
    # Input file contains two sequenes: first reference, second "new" sequence to compare with
    input_file = working_dir.joinpath("test.fasta")
    aligned_file = working_dir.joinpath("test_aligned.fasta")

    sequences = [SeqRecord(Seq(ref_seq), id="ref"), SeqRecord(Seq(new_seq), id="new")]
    SeqIO.write(sequences, input_file, "fasta")
    
    # Alignment
    muscle_cline = MuscleCommandline(input=input_file, out=aligned_file)
    muscle_cline()

    ref_aln = None
    new_aln = None
    
    aligned = SeqIO.parse(aligned_file, "fasta")
    for seq in aligned:
        if seq.id == "ref":
            ref_aln = seq
        elif seq.id == "new":
            new_aln = seq

    # Reference sequence map of {original:aligned} indices
    index_map = {}
    index_o = 0
    for index_aln, c in enumerate(ref_aln.seq):
        if c == "-":
            continue
        index_map[index_o] = index_aln
        index_o += 1

    print(index_map)
    
    print(ref_aln[index_map[40]])
align_detect(REF_SEQ,"AAPISPIETAIPTQKLIS")

{0: 2, 1: 3, 2: 4, 3: 5, 4: 6, 5: 7, 6: 8, 7: 9, 8: 10, 9: 11, 10: 12, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 26, 25: 27, 26: 28, 27: 29, 28: 30, 29: 31, 30: 32, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 48, 47: 49, 48: 50, 49: 51, 50: 52, 51: 53, 52: 54, 53: 55, 54: 56, 55: 57, 56: 58, 57: 59, 58: 60, 59: 61, 60: 62, 61: 63, 62: 64, 63: 65, 64: 66, 65: 67, 66: 68, 67: 69, 68: 70, 69: 71, 70: 72, 71: 73, 72: 74, 73: 75, 74: 76, 75: 77, 76: 78, 77: 79, 78: 80, 79: 81, 80: 82, 81: 83, 82: 84, 83: 85, 84: 86, 85: 87, 86: 88, 87: 89, 88: 90, 89: 91, 90: 92, 91: 93, 92: 94, 93: 95, 94: 96, 95: 97, 96: 98, 97: 99, 98: 100, 99: 101, 100: 102, 101: 103, 102: 104, 103: 105, 104: 106, 105: 107, 106: 108, 107: 109, 108: 110, 109: 111, 110: 112, 111: 113, 112: 114, 113: 115, 114: 116, 115: 117, 116: 118, 117: 119, 118: 120, 119: 121, 120: 122, 121: 

In [5]:
import pandas as pd

In [6]:
genes = pd.read_csv(data_path.joinpath("1-pol.csv"))
blast = pd.read_csv(data_path.joinpath("blast/blast_all.csv"), sep="\t")

In [7]:
blast = blast[(blast["alignment length"]==REF_SEQ_LEN)]

In [8]:
# Sujungia BLAST outputa su visomis sekomis, sekos kurios nepapuole i blasto outputa (panasumas < ~60%) ismetamos
df = genes.join(blast.set_index("subject acc.ver"), on='accession', how='inner')

In [9]:
# Iskerpa seka, kad butu panasi i query seka
# Specialiai padaryta 1 simboliu perdaug, nes mutacija-insercija 69 pozicijoj paslenka seka per viena i desine
df[f"{GENE}_cut"] = df.apply(lambda r: r[GENE][r["s. start"]-1:r["s. start"]+REF_SEQ_LEN], axis=1)

In [10]:
df[(df["pol_cut"].str[40] == "L") & (df["pol_cut"].str[43] == "D") & (df["pol_cut"].str[66] == "N")& (df["pol_cut"].str[66].isin(["D", "N", "S"]))]

Unnamed: 0,accession,pol,query acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score,pol_cut
367,AB287369,FFREGLAFPQGEAREFSSEQTRANSPTSPTRRELQVWGSGSSSPSE...,Query_1,92.735,234,17,0,1,234,159,392,7.090000e-155,458.0,PISSIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELEKDGK...
368,AB287370,FFREGLAFPQGEAREFSSEQTRANSPTSPTRRELQVWGSGSSSPSE...,Query_1,92.735,234,17,0,1,234,159,392,7.090000e-155,458.0,PISSIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELEKDGK...
369,AB287371,FFREGLAFPQGEAREFFSEQTRTNSPTSPTRRELQVWGSGSSSPSE...,Query_1,93.162,234,16,0,1,234,159,392,1.160000e-156,462.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELEKDGK...
370,AB287372,FFREGLAFPQGEAREFFSEQTRTNSPTSPTRRELQVWGSGSSSPSE...,Query_1,92.735,234,17,0,1,234,159,392,5.700000e-156,461.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELENDGK...
384,AB356103,PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Query_1,94.017,234,14,0,1,234,100,333,9.200000e-164,458.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTELEKDGK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461197,EU553210,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...,Query_1,89.744,234,24,0,1,234,1,234,1.120000e-157,442.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...
461220,EU553233,PISSIETVPVKLKPGMDGPKVKQWPLTKEKIEALIEICSELEKDGK...,Query_1,87.607,234,29,0,1,234,1,234,1.010000e-152,430.0,PISSIETVPVKLKPGMDGPKVKQWPLTKEKIEALIEICSELEKDGK...
461227,EU553240,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...,Query_1,86.752,234,31,0,1,234,1,234,1.430000e-153,432.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...
461233,EU553246,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALMEICAELEEDGK...,Query_1,90.598,234,22,0,1,234,1,234,7.470000e-158,443.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALMEICAELEEDGK...


In [11]:
df

Unnamed: 0,accession,pol,query acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score,pol_cut
40,AB098330,FFRENLAFQQGEARKFSTEQTGANSPTSGALWDGGRDILPSEAGAE...,Query_1,92.735,234,17,0,1,234,156,389,5.920000e-155,458.0,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALTEICIDMEKEGK...
41,AB098331,FFRENLAFQQGEARKFSTEQTGANSPTSGALWDGGRDILPSEAGTK...,Query_1,91.880,234,19,0,1,234,156,389,2.630000e-154,456.0,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKVKALTEICIDMEKEGK...
42,AB098332,FFRENLAFQQGEARKFPSEQTGANSPTSRDLWNGGRDSLPSEAGAE...,Query_1,94.872,234,12,0,1,234,156,389,1.760000e-157,464.0,PISPIETVPVTLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGK...
43,AB098333,FFRENLAFQQGEARKFPSEQTGANSPTSRDLWNGGRDSLPSEAGAE...,Query_1,94.872,234,12,0,1,234,156,389,1.760000e-157,464.0,PISPIETVPVTLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGK...
83,AB220944,FFRENLAFQQRKAGEFSSEQTRANSPTSRKLGDGGRDNLLTEAGAE...,Query_1,93.590,234,15,0,1,234,154,387,3.240000e-156,461.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462010,MZ468890,PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Query_1,94.017,234,14,0,1,234,100,333,1.500000e-163,461.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALVEICTEMEKEGK...
462011,MZ468891,PQITLWQRPIVTVRIEGQLKDALLDTGADDTVLEDMTLPGRWKPKM...,Query_1,97.863,234,5,0,1,234,100,333,1.160000e-168,474.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...
462012,MZ468892,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Query_1,96.581,234,8,0,1,234,100,333,3.970000e-165,465.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...
462013,MZ468893,PQITLWQRXIVTVXVGGQLKEALLDTGADDTVLEDMNLQGKWKPKM...,Query_1,93.590,234,15,0,1,234,100,333,5.640000e-161,454.0,PISPIETVPVKLKPGMDGPKVKQWPLTXXKIKALTEICTEMEKEGK...


In [12]:
df.to_csv(Path(data_path).joinpath(f"2-{GENE}.csv"), index=False) 

with open(Path(data_path).joinpath(f"2-{GENE}.fasta"), "w") as fasta_file:
    for _, row in df.iterrows():
        fasta_file.write(f">{row['accession']}\n")
        fasta_file.write(f"{row['pol_cut']}\n")

In [14]:
df[df["accession"] == "AB098330"].values[0][1]

'FFRENLAFQQGEARKFSTEQTGANSPTSGALWDGGRDILPSEAGAERQGPGLTFSFPQITLWQRPLVTVKIGGQLKEALLDTGADDTVFEDINLPGKWKPRMIGGIGGFIKVKQHDQILIEICGKKAIGTVLVGPTPVNIIGRNMLTQIGCTLNFPISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALTEICIDMEKEGKISRIGPENPYNTPIFAIKKKDSTKWRKLVDFRELNKRTQDFWEVQLGIPHPAGLKKKKSVTVLDVGDAYFSVPLDESFRKYTAFTIPSTNNETPGIRYQYNVLPQGWKGSPAIFQSSMTKILEPFRSKNPEIIIYQYMDDLYVGSDLEIGKHRAKIEELRAHLLSWGFTTPDKKHQKEPPFLWMGYELHPDKWTVQPIELPVKESWTVNDIQKLVGKLNWASQIYAGIQVKQLCKLLRGAKALTDIVTLTEEAELELAENREILKDPVHGVYYDPSKDLIAEIQKQGQDQWTYQIYQEPFKNLKTGKYARKRSAHTNDVKQLAEVVQKVTMESIVIWGKTPKFKLPIQKETWETWWRDYWQATWIPEWEFVNTPPLVKLWYQLEKDPIVGAETFYVDGAANRETKLGMAGYVTDRGRQKVVSLNETTNQKTELHAIYLALQDSGSEVNIVTDSQYALGIIQAQPDRSESGLVSQIIEKLIEKDKVYLSWVPAHKGIGGNEQVDKLVSSGIRKVLFLDGIDKAQEEHERYHSNWRTMASDFNLPPVVAKEIVASCDKCQLKGEAMHGQVDCSPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYFLLKLAGRWPVKVVHTDNGPNFISAAVKAACWWANIKQEFGIPYNPQSQGVVESMNKELKKIIGQVRDQAEHLKTAVQMAVFIHNFKRKGGIGGYSAGERIIDIIATDIQTKELQKQITKIQKFRVYYRDSRDPIWKGPAKLLWKGEGAVVIQDNSDIKVVPRRKAKIIRDYGKQMAGDDCVAGR