In [1]:
import pandas as pd
import os
import re
import csv
import gffutils
import tempfile
from tqdm import tqdm
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline

In [21]:
# Input locations. All reference files sit under one shared directory, so the
# common prefixes are spelled out once and the individual paths derived from them.
ref_root = '/Volumes/Diane-Saunders/loizos/PST/REF'
pst134_dir = f'{ref_root}/PST134/GCA_021901695.1'
pst130_dir = f'{ref_root}/PST130_ENSEMBL'

# "Old" reference: sequence (FASTA) and annotation (GFF).
old_ref_fna = f'{pst134_dir}/GCA_021901695.1_Pst134E36_v1_pri_genomic.fna'
old_ref_gff = f'{pst134_dir}/genomic.gff'

# "New" reference: annotation (GFF) and sequence (FASTA).
new_ref_gff = f'{pst130_dir}/Puccinia_striiformis.PST-130_1.0.60.gff'
new_ref_fna = f'{pst130_dir}/Puccinia_striiformis.PST-130_1.0.dna.toplevel.fa'

# Plain-text file of gene IDs to look up, one ID per line.
gene_list = 'gene_list.txt'

In [16]:
def extract_gene_sequences(gff_file, fasta_file, gene_list, padding=0):
    """Extract the genomic sequence of selected genes from a reference.

    Parameters
    ----------
    gff_file : str
        Path to a GFF annotation. Only rows whose type column is ``gene``
        (case-insensitive) and that carry a ``Name=`` attribute are used.
    fasta_file : str
        Path to the FASTA file holding the sequences the GFF refers to.
    gene_list : str
        Path to a text file with one gene name per line. Blank lines are
        ignored.
    padding : int, optional
        Number of flanking bases to add on each side of the gene. The padded
        interval is clipped to the bounds of the sequence.

    Returns
    -------
    dict
        Maps gene name -> sequence string. The sequence is the forward-strand
        slice of the FASTA record; the feature's strand is not applied. Genes
        whose sequence ID is missing from the FASTA are left out. If a name
        occurs on several ``gene`` rows, the last row wins.
    """
    # A set gives constant-time membership tests for every GFF row.
    with open(gene_list, 'r') as handle:
        wanted = {entry.strip() for entry in handle if entry.strip()}

    genome = {record.id: str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")}

    # Anchor on the start of the column or a ';' so that only the real `Name`
    # attribute matches, not another key that merely ends in "Name".
    name_pattern = re.compile(r'(?:^|;)\s*Name=([^;]+)')

    old_genes = {}
    with open(gff_file, 'r') as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9 or fields[2].lower() != 'gene':
                continue

            match = name_pattern.search(fields[8])
            if not match:
                continue
            gene_id = match.group(1)

            chrom = fields[0]
            if gene_id not in wanted or chrom not in genome:
                continue

            # GFF coordinates are 1-based and inclusive; convert to a Python slice.
            start = int(fields[3])
            end = int(fields[4])
            padded_start = max(1, start - padding)
            padded_end = min(len(genome[chrom]), end + padding)
            old_genes[gene_id] = genome[chrom][padded_start - 1:padded_end]

    return old_genes

In [17]:
# Pull the listed genes out of the old reference, with no flanking padding.
old_genes = extract_gene_sequences(
    gff_file=old_ref_gff,
    fasta_file=old_ref_fna,
    gene_list=gene_list,
    padding=0,
)

In [18]:
# Quick look at what was extracted (gene name -> sequence string).
# NOTE(review): this prints every full sequence; for longer gene lists consider
# showing only the names and sequence lengths to keep the cell output readable.
print(old_genes)

{'Pst134EA_000001': 'CCTTGAGGAGGACACACGATCTCGTGGCCGGAGATTTGTCCGCCCCGACTTGGTTACCACTTGTCTTCACCCCATCAAGGCAGACCGGTTGGAAGCAAATGGAGAAATGGAAGGGGACTTATCAAGTTTAGTCCATCTGAGTTCGTCGTTCAGGTAGGTTGCTTGGTCAACGAATTTCGGGCAGAAGGTCTCGCAAAAGATAACCTCCTTAAGTACGACTGTCCTCGTGCACTCCACAATACCACCGCTTGgtatcttcttattcttcctTTCAATAATCATCCGACGTTTgcggtttcttttctttacaaTTTTTTGATCTTTGACAGATCTTTGTTTACTTTTGTTATTTTTGGTGTTCAAGTATTTCAATAAACCAAACAGCAAACCCGAATCCATTAACTCGGAAATCGGCCATCAAACTATCATTTCACCAGACTCATCTGCAAGTATTGGATTCAGACCCGAGCAATGACTTCCCAATCTTCTCCGGATCAAACGATCGTGACGGCCATCCTCGCATCACAACTGATGTCTGTGCCATTCACCTGCTCTTTAACGCTACTGGTTTCAATTTTCGCCTGGCGGTACATCTCCAAAACCTCGGCACGCGCAATCCCGAAATCAGGATATTTTGAATATCCCTGTGTGAGCCGGACACCGGCTAGGGCCAGATTGGCGAGAGCAGGTGCTGGCCTAGTCGCGGTCATCTGTGTCATCTTAACAAGTATAGAGTTGGTGGTAAGTTACAACATATTTTTAAGTTCAATCTCAAGTGGCCAAGATCCAGATACTTATACTTCTGCATTTCTTTTCCCTAAGCATGTATATCAAGCATCCGTCGTACATCCTCGGGACCTTACATATTTCCAAAGAACTCCCTGGACTTTTGCATTGATCCCTGTCTTGACCGCGTTCGTAACTATCCACCTCTTCACTTGCGGCTTCATTTTTGTAAACTTCATCTATTAACCAACCG

In [None]:
def _resolve_gene_id(gene):
    """Return a clean gene identifier for a gene feature from the GFF database.

    Prefers the explicit ``gene_id`` attribute when the feature has one.
    Otherwise falls back to ``ID`` with a leading ``gene:`` or ``gene-``
    prefix removed. The rest of the ID is kept intact, because IDs may
    themselves contain dashes.
    """
    attributes = gene.attributes
    if 'gene_id' in attributes and attributes['gene_id']:
        return attributes['gene_id'][0]
    raw_id = attributes['ID'][0]
    for prefix in ('gene:', 'gene-'):
        if raw_id.startswith(prefix):
            return raw_id[len(prefix):]
    return raw_id


def blast_match_genes(query_dict, ref_fna, ref_gff, out_csv='old-to-new-genes.csv', min_pident=0.0):
    """BLAST each query sequence against a reference and report overlapped genes.

    For every query, ``blastn`` is run with the reference FASTA as subject.
    Each hit interval is then looked up in the reference annotation to find
    the ``gene`` features it overlaps.

    Parameters
    ----------
    query_dict : dict
        Maps query (old gene) ID -> nucleotide sequence string.
    ref_fna : str
        Path to the reference FASTA, passed to ``blastn`` as ``subject``.
    ref_gff : str
        Path to the reference GFF. A gffutils database is created next to it
        (same path with a ``.db`` extension) if that file does not exist yet.
    out_csv : str, optional
        Path of the CSV summary written at the end.
    min_pident : float, optional
        Hits with a percent identity below this value are ignored. The
        default of 0.0 keeps every hit.

    Returns
    -------
    dict
        Maps query ID -> list of
        ``(new_gene_id, pident, old_gene_len, new_gene_len, old_len / new_len)``
        tuples, one per (hit, overlapped gene) pair. A query that had at least
        one retained hit but no overlapping gene maps to an empty list;
        queries without retained hits are absent.

    Side effects
    ------------
    May create the ``.db`` file; writes ``out_csv``.
    """
    db_path = os.path.splitext(ref_gff)[0] + '.db'
    if not os.path.exists(db_path):
        gffutils.create_db(ref_gff, db_path, merge_strategy='create_unique')
    db = gffutils.FeatureDB(db_path)

    sseqid_dict = {}

    # NOTE(review): one blastn run per query re-reads the whole subject FASTA
    # each time; batching all queries into a single run would be faster.
    for query_id, query_seq in tqdm(query_dict.items()):
        old_gene_length = len(query_seq)

        # Let tempfile pick a unique, filesystem-safe name instead of building
        # one from the query ID, and always clean it up, even if BLAST fails.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as handle:
            handle.write(f'>{query_id}\n{query_seq}\n')
            query_temp = handle.name
        try:
            blastn_cline = NcbiblastnCommandline(query=query_temp, subject=ref_fna, outfmt=6)
            stdout, _stderr = blastn_cline()
        finally:
            os.remove(query_temp)

        # Tabular (outfmt 6) columns used here: 1 = sseqid, 2 = pident,
        # 8 = sstart, 9 = send.
        for line in stdout.splitlines():
            columns = line.split('\t')
            if len(columns) < 10:
                continue
            pident = float(columns[2])
            if pident < min_pident:
                continue
            sseqid = columns[1]
            # On minus-strand hits BLAST reports sstart > send; the region
            # lookup needs start <= end.
            region_start, region_end = sorted((int(columns[8]), int(columns[9])))

            hits = sseqid_dict.setdefault(query_id, [])
            for gene in db.region(region=(sseqid, region_start, region_end), featuretype='gene'):
                # GFF coordinates are inclusive, hence the +1.
                gene_length = gene.end - gene.start + 1
                pct_gene_len = old_gene_length / gene_length
                hits.append((_resolve_gene_id(gene), pident, old_gene_length, gene_length, pct_gene_len))

    with open(out_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Old_GeneID', 'New_GeneID', 'PctIdentity', 'Old_GeneLen', 'New_GeneLen', 'PctGeneLen'])
        for query_id, gene_info in sseqid_dict.items():
            for gene_id, pident, old_gene_length, gene_length, pct_gene_len in gene_info:
                writer.writerow([query_id, gene_id, pident, old_gene_length, gene_length, pct_gene_len])

    return sseqid_dict

In [33]:
# Map the old genes onto the new reference, then save a slim two-column
# table holding only the (old ID, new ID) pairs.
sseqid_dict = blast_match_genes(old_genes, new_ref_fna, new_ref_gff)
with open('old-to-new-genes_justids.csv', 'w', newline='') as ids_out:
    ids_out.write('Old_GeneID,New_GeneID\n')
    for old_id, matches in sseqid_dict.items():
        for new_id, _pident, _old_len, _new_len, _len_ratio in matches:
            ids_out.write(f'{old_id},{new_id}\n')

100%|██████████| 1/1 [00:07<00:00,  7.14s/it]

ID: ['gene:maker-PST130_7723-snap-gene-0.10-mRNA-1']
biotype: ['protein_coding']
gene_id: ['maker-PST130_7723-snap-gene-0.10-mRNA-1']
logic_name: ['genemodel_jgi']



