In [1]:
import csv
import os
import subprocess
import tempfile

import gffutils
import pandas as pd
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline
from tqdm import tqdm


Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [2]:
# Directories holding the two references, relative to the notebook's working directory.
old_ref_dir = '../REF/old_reference_marple_genes'
new_ref_dir = '../REF/new_reference_all_genes'

# Old reference: FASTA and GFF3 paths.
old_ref_fna = old_ref_dir + '/pst-130_388_genes.fasta'
old_ref_gff = old_ref_dir + '/pst-130_388_genes_as_positive_strand_landmarks.gff3'

# New reference: FASTA and GFF3 paths.
new_ref_fna = new_ref_dir + '/pst104e.fasta'
new_ref_gff = new_ref_dir + '/pst104e.gff3'

In [3]:
# Map each FASTA record ID in the old reference to its sequence.
# If an ID occurs more than once, the last record with that ID is kept.
old_genes = {
    fasta_record.id: fasta_record.seq
    for fasta_record in SeqIO.parse(old_ref_fna, 'fasta')
}

In [4]:
def blast_match_genes(query_dict, ref_fna, ref_gff,
                      out_csv='../part3/old-to-new-genes.csv',
                      min_pident=97, min_gene_len=1000,
                      len_ratio_range=(0.9, 1.1)):
    """Match query genes to annotated genes in a reference via BLASTN.

    Each query sequence is aligned against ``ref_fna`` with ``blastn``
    (tabular output, ``-outfmt 6``). For every hit at or above
    ``min_pident``, the ``gene`` features in ``ref_gff`` that overlap the
    hit's subject interval are looked up and kept when they pass the length
    filters. The matches are written to ``out_csv`` and returned.

    Parameters
    ----------
    query_dict : dict
        Maps query ID -> sequence. The sequence must support ``len()`` and
        ``str()`` formatting (it is written into a FASTA file).
    ref_fna : str
        Path to the reference FASTA used as the BLAST subject.
    ref_gff : str
        Path to the reference GFF3. A gffutils database is created next to
        it (same path with a ``.db`` extension) on first use and reused
        afterwards.
    out_csv : str or None, optional
        Path of the CSV summary to write. ``None`` skips writing.
    min_pident : float, optional
        Hits with a percent identity below this value are ignored.
    min_gene_len : int, optional
        Reference genes shorter than this many bases are ignored.
    len_ratio_range : tuple of (float, float), optional
        Inclusive ``(low, high)`` bounds on
        ``query length / reference gene length``.

    Returns
    -------
    dict
        Maps query ID -> list of
        ``(gene_id, pident, query_length, gene_length, length_ratio)``
        tuples. A query ID is present only if at least one hit passed
        ``min_pident``; its list may still be empty. One tuple is added per
        (hit, overlapping gene) pair, so the same gene can appear more than
        once for a query when several hits overlap it.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    FileNotFoundError
        If the ``blastn`` executable is not on ``PATH``.
    """
    db_path = os.path.splitext(ref_gff)[0] + '.db'
    if not os.path.exists(db_path):
        gffutils.create_db(ref_gff, db_path, merge_strategy='create_unique')
    db = gffutils.FeatureDB(db_path)

    min_ratio, max_ratio = len_ratio_range
    sseqid_dict = {}

    # A private temporary directory keeps the query file name independent of
    # the query ID (which may contain path separators) and guarantees cleanup
    # even if BLAST or the parsing below raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        query_temp = os.path.join(temp_dir, 'query.fasta')

        for query_id, query_seq in tqdm(query_dict.items()):
            with open(query_temp, 'w') as f:
                f.write(f'>{query_id}\n{query_seq}\n')

            # Invoke blastn directly; the Biopython command-line wrappers
            # are deprecated in favour of subprocess.
            blast_run = subprocess.run(
                ['blastn', '-query', query_temp, '-subject', str(ref_fna),
                 '-outfmt', '6'],
                capture_output=True, text=True, check=True,
            )

            old_gene_length = len(query_seq)

            # Parse the blast output (outfmt 6 columns: qseqid, sseqid,
            # pident, length, mismatch, gapopen, qstart, qend, sstart, send,
            # evalue, bitscore).
            for line in blast_run.stdout.splitlines():
                if not line.strip():
                    continue
                fields = line.split('\t')
                pident = float(fields[2])
                if pident < min_pident:
                    continue
                sseqid = fields[1]
                # For minus-strand hits BLAST reports sstart > send; order
                # the coordinates so the region query is a valid interval.
                sstart, send = sorted((int(fields[8]), int(fields[9])))
                matches = sseqid_dict.setdefault(query_id, [])

                # Get the new gene ID from the gff db
                for gene in db.region(region=(sseqid, sstart, send), featuretype='gene'):
                    gene_id = gene.attributes['ID'][0]
                    # GFF coordinates are 1-based and inclusive.
                    gene_length = gene.end - gene.start + 1
                    if gene_length < min_gene_len:
                        continue
                    pct_gene_len = old_gene_length / gene_length
                    if pct_gene_len < min_ratio or pct_gene_len > max_ratio:
                        continue
                    matches.append(
                        (gene_id, pident, old_gene_length, gene_length, pct_gene_len)
                    )

    if out_csv is not None:
        with open(out_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Old_GeneID', 'New_GeneID', 'PctIdentity', 'Old_GeneLen', 'New_GeneLen', 'PctGeneLen'])
            for query_id, gene_info in sseqid_dict.items():
                for gene_id, pident, old_gene_length, gene_length, pct_gene_len in gene_info:
                    writer.writerow([query_id, gene_id, pident, old_gene_length, gene_length, pct_gene_len])

    return sseqid_dict

In [5]:
# Match every old gene against the new reference. This is slow: the recorded
# run below took about 20 minutes for 388 queries.
sseqid_dict = blast_match_genes(
    query_dict=old_genes,
    ref_fna=new_ref_fna,
    ref_gff=new_ref_gff,
)

100%|██████████| 388/388 [20:10<00:00,  3.12s/it]


In [6]:
# Check which genes we have
new_list_of_genes = pd.read_csv('../part1/updated_primers_new-code.csv')['GeneID'].unique()
# Membership tests against a set are constant-time; testing against the
# array directly would scan every element for each matched gene.
genes_with_primers = set(new_list_of_genes)

# For every old -> new gene match, record whether the new gene ID appears in
# the primer table loaded above.
with open('../part3/primers-for-old-to-new-genes.csv', 'w', newline='') as f:
    # csv.writer quotes fields when needed; '\n' is set explicitly because
    # the writer's default terminator is '\r\n'.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['Old_GeneID', 'New_GeneID', 'PrimersDesigned'])
    for query_id, gene_info in sseqid_dict.items():
        for match in gene_info:
            gene_id = match[0]
            designed = 'YES' if gene_id in genes_with_primers else 'NO'
            writer.writerow([query_id, gene_id, designed])
