In [1]:
#!python3 -m pip install biopython

In [2]:
from Bio import GenBank
from pathlib import Path
import pandas as pd
import shutil
import random

In [3]:
# CHANGE ME
gene_name = "transcriptase" # integrase | transcriptase

In [4]:
gene_fasta = f"1-{gene_name}.fasta"
gene_csv = f"1-{gene_name}.csv"

raw_data_path = Path("/data/hiv/data")

data_path = raw_data_path.joinpath(gene_name)
data_path.mkdir(parents=True, exist_ok=True)

In [5]:
def parse_record(record):
    obj = {}
    
    if 'HIV' in record.source:
        
        obj = None

        for feature in record.features:

            if feature.key == "CDS":
                sequence = None
                found = False
                
                for qualifier in feature.qualifiers:

                    if qualifier.key == "/translation=":
                        sequence = qualifier.value.replace('"','').replace("'", "")
                    
                    if qualifier.key == "/product=":
                        if gene_name in qualifier.value:
                            found = True
                
                if sequence and found:
                    obj = {
                        "accession": record.accession[0],
                        f"{gene_name}": sequence,    
                        f"{gene_name}_length": len(sequence)
                    }
    return obj
    
    
def parse_file(filename):
    data = []
    with open(filename, "r") as handle:
        records = GenBank.parse(handle)
        while True:
            try:
                obj = parse_record(records.__next__())
                if obj:
                    data.append(obj)
            
            except StopIteration as e:
                break
                    
            except Exception as exc:
                print(exc)
                
    return data                

In [6]:
data = []
for filename in sorted(list(Path(raw_data_path).glob("*.seq"))):
    print(f"Parsing {filename}")
    sub_data = parse_file(filename)
    if len(sub_data) > 0:
        data.extend(sub_data)
        print(f"Found {len(sub_data)}")
    else:
        print("No useful data")

Parsing /data/hiv/data/gbvrl1.seq
Found 2532
Parsing /data/hiv/data/gbvrl10.seq
Found 2389
Parsing /data/hiv/data/gbvrl100.seq
No useful data
Parsing /data/hiv/data/gbvrl101.seq
No useful data
Parsing /data/hiv/data/gbvrl102.seq
No useful data
Parsing /data/hiv/data/gbvrl103.seq
No useful data
Parsing /data/hiv/data/gbvrl104.seq
No useful data
Parsing /data/hiv/data/gbvrl105.seq
No useful data
Parsing /data/hiv/data/gbvrl106.seq
No useful data
Parsing /data/hiv/data/gbvrl107.seq
No useful data
Parsing /data/hiv/data/gbvrl108.seq
No useful data
Parsing /data/hiv/data/gbvrl109.seq
No useful data
Parsing /data/hiv/data/gbvrl11.seq
Found 4092
Parsing /data/hiv/data/gbvrl110.seq
No useful data
Parsing /data/hiv/data/gbvrl111.seq
No useful data
Parsing /data/hiv/data/gbvrl112.seq
No useful data
Parsing /data/hiv/data/gbvrl113.seq
No useful data
Parsing /data/hiv/data/gbvrl114.seq
No useful data
Parsing /data/hiv/data/gbvrl115.seq
No useful data
Parsing /data/hiv/data/gbvrl116.seq
No useful d

No useful data
Parsing /data/hiv/data/gbvrl242.seq
No useful data
Parsing /data/hiv/data/gbvrl243.seq
No useful data
Parsing /data/hiv/data/gbvrl244.seq
No useful data
Parsing /data/hiv/data/gbvrl245.seq
No useful data
Parsing /data/hiv/data/gbvrl246.seq
No useful data
Parsing /data/hiv/data/gbvrl247.seq
No useful data
Parsing /data/hiv/data/gbvrl248.seq
No useful data
Parsing /data/hiv/data/gbvrl249.seq
No useful data
Parsing /data/hiv/data/gbvrl25.seq
Found 1782
Parsing /data/hiv/data/gbvrl250.seq
No useful data
Parsing /data/hiv/data/gbvrl251.seq
No useful data
Parsing /data/hiv/data/gbvrl252.seq
No useful data
Parsing /data/hiv/data/gbvrl253.seq
No useful data
Parsing /data/hiv/data/gbvrl254.seq
No useful data
Parsing /data/hiv/data/gbvrl255.seq
No useful data
Parsing /data/hiv/data/gbvrl256.seq
No useful data
Parsing /data/hiv/data/gbvrl257.seq
No useful data
Parsing /data/hiv/data/gbvrl258.seq
No useful data
Parsing /data/hiv/data/gbvrl259.seq
No useful data
Parsing /data/hiv/dat

'Record' object has no attribute 'name'
Found 311
Parsing /data/hiv/data/gbvrl41.seq
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
Found 40
Parsing /data/hiv/data/gbvrl42.seq
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
No useful data
Parsing /data/hiv/data/gbvrl43.seq
No useful data
Parsing /data/hiv/data/gbvrl44.seq
No useful data
Parsing /data/hiv/data/gbvrl45.seq
No useful data
Parsing /data/hiv/data/gbvrl46.seq
No useful data
Parsing /data/hiv/data/gbvrl47.seq
Found 29
Parsing /data/hiv/data/gbvrl48.seq
Found 261
Parsing /data/hiv/data/gbvrl49.seq
No useful data
Parsing /data/hiv/data/gbvrl5.seq
No useful data
Parsing /data/hiv/data/gbvrl50.seq
No useful data
Parsing /data/hiv/data/gbvrl51.seq
No useful data
Parsing 

In [7]:
df = pd.DataFrame(data)
df

Unnamed: 0,accession,transcriptase,transcriptase_length
0,AB001169,MEKEGKISKIGPENPYNTPVFGIKKKDRTKWRKLV,35
1,AB001170,MEKEGKISQIGSENPYNTPVFGIKKKDSTKSRKLV,35
2,AB001171,MEKEGKISQIGPENPHNTPVLGIKKKDRTKWRKLG,35
3,AB001172,MEKEGKIAHLGPDNPYNTPALGIKKKDRTKWRKLG,35
4,AB001173,MEKEGKISQIGPDNPYNTLVFGIKKMDRTKWRKLV,35
...,...,...,...
50765,EU597193,PKVKQWPLTEEKIKALTAICDEMEKEGKITKIGPENPYNTPIFAIK...,215
50766,EU597194,PKVKQWPLTEEKIKALTAICDEMEKEGKITKIGPENPYNTPIFAIK...,213
50767,EU597195,PKVKQWPLTEEKIKALMAICDEMEKEGKITKIGPENPYNTPIFAIK...,214
50768,EU597196,EKEGKITKIGPENPYNTPIFAIKKKDSTKWRKLVDFRELNKRTQDF...,176


In [8]:
df = df[~df.duplicated([gene_name], keep='first')]

df.to_csv(Path(data_path).joinpath(gene_csv), index=False)   

with open(Path(data_path).joinpath(gene_fasta), "w") as fasta_file:
    for _, row in df.iterrows():
        fasta_file.write(f">{row['accession']}\n")
        fasta_file.write(f"{row[gene_name]}\n")

In [9]:
df

Unnamed: 0,accession,transcriptase,transcriptase_length
0,AB001169,MEKEGKISKIGPENPYNTPVFGIKKKDRTKWRKLV,35
1,AB001170,MEKEGKISQIGSENPYNTPVFGIKKKDSTKSRKLV,35
2,AB001171,MEKEGKISQIGPENPHNTPVLGIKKKDRTKWRKLG,35
3,AB001172,MEKEGKIAHLGPDNPYNTPALGIKKKDRTKWRKLG,35
4,AB001173,MEKEGKISQIGPDNPYNTLVFGIKKMDRTKWRKLV,35
...,...,...,...
50764,EU597192,KQWPLTEEKIKALTAICDEMEKEGKITKIGPENPYNTPIFAIKKKD...,210
50765,EU597193,PKVKQWPLTEEKIKALTAICDEMEKEGKITKIGPENPYNTPIFAIK...,215
50766,EU597194,PKVKQWPLTEEKIKALTAICDEMEKEGKITKIGPENPYNTPIFAIK...,213
50767,EU597195,PKVKQWPLTEEKIKALMAICDEMEKEGKITKIGPENPYNTPIFAIK...,214


# BLAST

In [10]:
# Make blast db directory
blast_path = data_path.joinpath("blast")
blast_path.mkdir(parents=True, exist_ok=True)

In [11]:
# Copy fasta file to blast dir
shutil.copyfile(data_path.joinpath(gene_fasta), blast_path.joinpath(gene_fasta))
# Copy reference seq to blast dir
shutil.copyfile(data_path.joinpath("reference.seq"), blast_path.joinpath("reference.seq"))

PosixPath('/data/hiv/data/transcriptase/blast/reference.seq')

In [19]:
!cd $blast_path && makeblastdb -in $gene_fasta -dbtype prot
!cd $blast_path && blastp -query reference.seq -db $gene_fasta -out "blast_output.csv" -outfmt 7 -max_target_seqs 1000000



Building a new DB, current time: 07/02/2022 13:48:26
New DB name:   /data/hiv/data/transcriptase/blast/1-transcriptase.fasta
New DB title:  1-transcriptase.fasta
Sequence type: Protein
Deleted existing Protein BLAST database named /data/hiv/data/transcriptase/blast/1-transcriptase.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 43762 sequences in 0.731395 seconds.




In [20]:
with open(data_path.joinpath("reference.seq"), "r") as fh:
    reference = fh.read()
        
genes_df = pd.read_csv(data_path.joinpath(gene_csv))

# Read blast output
columns = ["query acc.ver", "subject acc.ver", "% identity", "alignment length", "mismatches", "gap opens", "q. start", "q. end", "s. start", "s. end", "evalue", "bit score"]
blast_df = pd.read_csv(blast_path.joinpath("blast_output.csv"), names=columns, sep="\t", skiprows=5, skipfooter=1, engine='python')

In [21]:
blast_df

Unnamed: 0,query acc.ver,subject acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score
0,Query_1,GQ208076,99.573,234,1,0,1,234,1,234,4.400000e-174,480.0
1,Query_1,FJ530784,100.000,234,0,0,1,234,1,234,5.510000e-174,478.0
2,Query_1,JQ395781,99.573,234,1,0,1,234,1,234,9.830000e-174,476.0
3,Query_1,JQ395698,99.145,234,2,0,1,234,1,234,1.250000e-173,476.0
4,Query_1,AY801780,99.145,234,2,0,1,234,1,234,1.910000e-173,478.0
...,...,...,...,...,...,...,...,...,...,...,...,...
43315,Query_1,KF747712,92.593,27,2,0,208,234,1,27,2.620000e-12,60.8
43316,Query_1,KC681855,92.593,27,2,0,208,234,1,27,1.190000e-11,58.9
43317,Query_1,KC681848,92.593,27,2,0,208,234,1,27,1.580000e-11,58.9
43318,Query_1,KC681856,88.889,27,3,0,208,234,1,27,4.390000e-11,57.4


In [22]:
# Palieka tik tas sekas, kurios blasto alignment length panasus su reference sekos ilgiu
blast_df = blast_df[(blast_df["alignment length"]>=len(reference)-10)]
blast_df

Unnamed: 0,query acc.ver,subject acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score
0,Query_1,GQ208076,99.573,234,1,0,1,234,1,234,4.400000e-174,480.0
1,Query_1,FJ530784,100.000,234,0,0,1,234,1,234,5.510000e-174,478.0
2,Query_1,JQ395781,99.573,234,1,0,1,234,1,234,9.830000e-174,476.0
3,Query_1,JQ395698,99.145,234,2,0,1,234,1,234,1.250000e-173,476.0
4,Query_1,AY801780,99.145,234,2,0,1,234,1,234,1.910000e-173,478.0
...,...,...,...,...,...,...,...,...,...,...,...,...
41856,Query_1,KP226443,61.111,234,91,0,1,234,1,234,6.630000e-104,303.0
41864,Query_1,KJ131132,62.821,234,87,0,1,234,1,234,1.410000e-103,303.0
41918,Query_1,KP226480,63.248,234,86,0,1,234,1,234,5.290000e-100,292.0
42230,Query_1,AY029935,62.393,234,33,4,1,234,1,179,1.120000e-89,262.0


In [23]:
# Sujungia BLAST outputa su visomis sekomis
df = genes_df.join(blast_df.set_index("subject acc.ver"), on='accession', how='inner')
df

Unnamed: 0,accession,transcriptase,transcriptase_length,query acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score
160,AB038655,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.200000e-163,451.0
161,AB038656,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.930000e-163,451.0
162,AB038658,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.070000e-163,451.0
163,AB038659,PLSPIETIPVTLKPGMDGPKVKQWPLTEEKTKALTEICKEMEEEGK...,246,Query_1,92.308,234,18,0,1,234,1,234,1.520000e-162,448.0
164,AB038660,PISPIETIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,91.880,234,19,0,1,234,1,234,1.470000e-161,446.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43595,EU518009,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,560,Query_1,97.863,234,5,0,1,234,1,234,1.660000e-168,475.0
43596,EU518010,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,560,Query_1,98.291,234,4,0,1,234,1,234,1.220000e-168,476.0
43597,EU518011,PISPIKTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTDMXKEGK...,560,Query_1,95.726,234,10,0,1,234,1,234,1.790000e-165,468.0
43598,EU518012,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,560,Query_1,96.581,234,8,0,1,234,1,234,3.370000e-165,467.0


In [24]:
# Iskerpa seka, kad butu panasi i query seka
# Specialiai padaryta 1 simboliu perdaug, nes mutacija-insercija 69 pozicijoj paslenka seka per viena i desine
df[f"{gene_name}_cut"] = df.apply(lambda r: r[gene_name][r["s. start"]-1:r["s. start"]+len(reference)], axis=1)
df

Unnamed: 0,accession,transcriptase,transcriptase_length,query acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score,transcriptase_cut
160,AB038655,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.200000e-163,451.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...
161,AB038656,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.930000e-163,451.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...
162,AB038658,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.070000e-163,451.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...
163,AB038659,PLSPIETIPVTLKPGMDGPKVKQWPLTEEKTKALTEICKEMEEEGK...,246,Query_1,92.308,234,18,0,1,234,1,234,1.520000e-162,448.0,PLSPIETIPVTLKPGMDGPKVKQWPLTEEKTKALTEICKEMEEEGK...
164,AB038660,PISPIETIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,91.880,234,19,0,1,234,1,234,1.470000e-161,446.0,PISPIETIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43595,EU518009,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,560,Query_1,97.863,234,5,0,1,234,1,234,1.660000e-168,475.0,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...
43596,EU518010,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,560,Query_1,98.291,234,4,0,1,234,1,234,1.220000e-168,476.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...
43597,EU518011,PISPIKTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTDMXKEGK...,560,Query_1,95.726,234,10,0,1,234,1,234,1.790000e-165,468.0,PISPIKTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTDMXKEGK...
43598,EU518012,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,560,Query_1,96.581,234,8,0,1,234,1,234,3.370000e-165,467.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...


In [6]:
# df = pd.read_csv(Path(data_path).joinpath(f"2-{gene_name}.csv")) 

In [7]:
# Taisom raides kurios turi daugiau negu viena prasme

# Keiciam sekose J i I

i = df[df[f"{gene_name}_cut"].str.contains('J')].reset_index()
i[f"{gene_name}_cut"] = i[f"{gene_name}_cut"].str.replace("J", "I")
i["accession"] = i["accession"] + "__I"

# Keiciam sekose J i L
l = df[df[f"{gene_name}_cut"].str.contains('J')].reset_index()
l[f"{gene_name}_cut"] = l[f"{gene_name}_cut"].str.replace("J", "L")
l["accession"] = l["accession"] + "__L"

# Ismetam sekas J
df = df[~df[f"{gene_name}_cut"].str.contains('J')]

# Sudedam df, I, L 
df = pd.concat([df, i, l], ignore_index=True)
df

Unnamed: 0,accession,transcriptase,transcriptase_length,query acc.ver,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,evalue,bit score,transcriptase_cut,index
0,AB038655,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.200000e-163,451.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,
1,AB038656,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.930000e-163,451.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,
2,AB038658,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,93.162,234,16,0,1,234,1,234,1.070000e-163,451.0,PISPIDTIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,
3,AB038659,PLSPIETIPVTLKPGMDGPKVKQWPLTEEKTKALTEICKEMEEEGK...,246,Query_1,92.308,234,18,0,1,234,1,234,1.520000e-162,448.0,PLSPIETIPVTLKPGMDGPKVKQWPLTEEKTKALTEICKEMEEEGK...,
4,AB038660,PISPIETIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,246,Query_1,91.880,234,19,0,1,234,1,234,1.470000e-161,446.0,PISPIETIPVTLKPGMDGPKVKQWPLTEEKIKALTEICKEMEEEGK...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17207,EF449418__L,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,249,Query_1,96.581,234,8,0,1,234,1,234,1.070000e-168,464.0,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,16576.0
17208,EU258039__L,ISPIETVPVXLKPGMDGPXVKQWPLTEEKIKALIEICTELEKEGKI...,246,Query_1,89.270,233,25,0,2,234,1,233,8.990000e-156,431.0,ISPIETVPVXLKPGMDGPXVKQWPLTEEKIKALIEICTELEKEGKI...,16786.0
17209,EU258054__L,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEXEKXGK...,248,Query_1,92.735,234,17,0,1,234,1,234,8.620000e-163,449.0,PISPIDTVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEXEKXGK...,16799.0
17210,EU258130__L,VKLKPGMDGPRVKQWPLTEEKIKALTEICMELEKEGKISKIGPENP...,226,Query_1,91.111,225,20,0,10,234,1,225,8.440000e-155,428.0,VKLKPGMDGPRVKQWPLTEEKIKALTEICMELEKEGKISKIGPENP...,16850.0


In [8]:
df.to_csv(Path(data_path).joinpath(f"2-{gene_name}.csv"), index=False) 

with open(Path(data_path).joinpath(f"2-{gene_name}.fasta"), "w") as fasta_file:
    for _, row in df.iterrows():
        fasta_file.write(f">{row['accession']}\n")
        fasta_file.write(f"{row[f'{gene_name}_cut']}\n")