In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import MuscleCommandline
from pathlib import Path

In [2]:
REF_SEQ = "PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGKISKIGPENPYNTPVFAIKKKDSTKWRKLVDFRELNKRTQDFWEVQLGIPHPAGLKKKKSVTVLDVGDAYFSVPLDEDFRKYTAFTIPSINNETPGIRYQYNVLPQGWKGSPAIFQSSMTKILEPFRKQNPDIVIYQYMDDLYVGSDLEIGQHRTKIEELRQHLLRWGLTTPDKKHQKEPPFLWMGYEL"
REF_SEQ_LEN = len(REF_SEQ)
GENE = "pol"

In [3]:
# M41L
# E44D
# D67N
# T69D/N/S
# L74V/I
# L210W
# T215A/C/D/E/G/H/I/L/N/S/V/Y/F

In [4]:
def align_detect(ref_seq, new_seq, working_dir="tmp"):
        
    # Muscle uses files, need directory for temporary files
    working_dir = Path(working_dir)
    working_dir.mkdir(parents=True, exist_ok=True)
    
    # Input file contains two sequenes: first reference, second "new" sequence to compare with
    input_file = working_dir.joinpath("test.fasta")
    aligned_file = working_dir.joinpath("test_aligned.fasta")

    sequences = [SeqRecord(Seq(ref_seq), id="ref"), SeqRecord(Seq(new_seq), id="new")]
    SeqIO.write(sequences, input_file, "fasta")
    
    # Alignment
    muscle_cline = MuscleCommandline(input=input_file, out=aligned_file)
    muscle_cline()

    ref_aln = None
    new_aln = None
    
    aligned = SeqIO.parse(aligned_file, "fasta")
    for seq in aligned:
        if seq.id == "ref":
            ref_aln = seq
        elif seq.id == "new":
            new_aln = seq

    # Reference sequence map of {original:aligned} indices
    index_map = {}
    index_o = 0
    for index_aln, c in enumerate(ref_aln.seq):
        if c == "-":
            continue
        index_map[index_o] = index_aln
        index_o += 1

    print(index_map)
    
    print(ref_aln[index_map[43]])
# align_detect(REF_SEQ,"AAPISPIETAIPTQKLIS")

In [5]:
import pandas as pd

In [6]:
genes = pd.read_csv("../data/1-pol.csv")
blast = pd.read_csv("../data/blast/blast_all.csv", sep="\t")

In [7]:
blast = blast[(blast["alignment length"]==REF_SEQ_LEN)]

In [8]:
# Sujungia BLAST outputa su visomis sekomis, sekos kurios nepapuole i blasto outputa (panasumas < ~60%) ismetamos
df = genes.join(blast.set_index("subject acc.ver"), on='accession', how='inner')

In [9]:
# Iskerpa seka, kad butu panasi i query seka
df[f"{GENE}_cut"] = df.apply(lambda r: r[GENE][r["s. start"]-1:REF_SEQ_LEN], axis=1)

In [10]:
df[(df["pol_cut"].str[40] == "L") & (df["pol_cut"].str[43] == "D") & (df["pol_cut"].str[66] == "N")& (df["pol_cut"].str[66].isin(["D", "N", "S"]))]

Unnamed: 0,accession,pol,query acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score,pol_cut
367,AB287369,FFREGLAFPQGEAREFSSEQTRANSPTSPTRRELQVWGSGSSSPSE...,Query_1,92.735,234,17,0,1,234,159,392,7.090000e-155,458.0,PISSIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELEKDGK...
368,AB287370,FFREGLAFPQGEAREFSSEQTRANSPTSPTRRELQVWGSGSSSPSE...,Query_1,92.735,234,17,0,1,234,159,392,7.090000e-155,458.0,PISSIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELEKDGK...
369,AB287371,FFREGLAFPQGEAREFFSEQTRTNSPTSPTRRELQVWGSGSSSPSE...,Query_1,93.162,234,16,0,1,234,159,392,1.160000e-156,462.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELEKDGK...
370,AB287372,FFREGLAFPQGEAREFFSEQTRTNSPTSPTRRELQVWGSGSSSPSE...,Query_1,92.735,234,17,0,1,234,159,392,5.700000e-156,461.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTELENDGK...
384,AB356103,PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,Query_1,94.017,234,14,0,1,234,100,333,9.200000e-164,458.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTELEKDGK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461197,EU553210,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...,Query_1,89.744,234,24,0,1,234,1,234,1.120000e-157,442.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...
461220,EU553233,PISSIETVPVKLKPGMDGPKVKQWPLTKEKIEALIEICSELEKDGK...,Query_1,87.607,234,29,0,1,234,1,234,1.010000e-152,430.0,PISSIETVPVKLKPGMDGPKVKQWPLTKEKIEALIEICSELEKDGK...
461227,EU553240,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...,Query_1,86.752,234,31,0,1,234,1,234,1.430000e-153,432.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALIEICADLEKDGK...
461233,EU553246,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALMEICAELEEDGK...,Query_1,90.598,234,22,0,1,234,1,234,7.470000e-158,443.0,PISPIETVPVKLKPGMDGPRVKQWPLTEEKIKALMEICAELEEDGK...


In [11]:
# df[df["accession"] == "MH072766"].values[0][1]