In [9]:
import argparse
import logging
from os import path
import sys
import json
import os
import subprocess
from Bio.Blast import NCBIXML

In [10]:
print('Get current working directory : ', os.getcwd())

Get current working directory :  /Users/alina/Desktop/Internship-work/Internship


In [11]:
directory = "/Users/alina/Desktop/Internship-work/Internship/BLAST"

In [12]:
# Initial function 'process_hsp_regions'

def process_hsp_regions(hsp, doc_query, doc_subject, th_id=0.8, th_pos=0.9, th_len=10, th_gap=0.2):
    """
    Assign an alignment for each set of region (different mobidb features, keys)
    """
    valid_regions = []
    for key in doc_query:
        # Skip proteins already manually curated with this feature (key)
        if key not in doc_subject:
            for region in doc_query[key]["regions"]:

                q_r_len = region[1] - region[0] + 1  # query region length without gaps

                # Values relative to the alignment fragment that regards the query region
                m = 0  # matches in the alignment
                ident = 0  # identical in the alignment
                g = 0  # gaps in the alignment

                # Gaps till the end of the considered region. Usefult to map positions to the ungapped sequences
                g_q = 0
                g_s = 0

                # The aligned fagments, including gaps (can start after the actual region start of the query sequence and finish before)
                q_seq = []
                s_seq = []

                # The actual position of the aligned region in the subject ungapped sequence
                s_start = None
                s_end = None

                for i, (l_q, l_s) in enumerate(zip(hsp.query, hsp.sbjct)):

                    # Gaps
                    if l_q == "-":
                        g_q += 1
                    if l_s == "-":
                        g_s += 1

                    if region[0] <= (i + hsp.query_start - g_q) <= region[1]:
                        q_seq.append(l_q)
                        s_seq.append(l_s)

                        # Matches and identities
                        if l_q != "-" and l_s != "-":
                            m += 1
                            if l_q == l_s:
                                ident += 1
                        else:
                            g += 1

                        # Start end positions
                        if l_s != "-":
                            s_end = i + hsp.sbjct_start - g_s
                            if not s_start:
                                s_start = s_end

                if s_start and s_end:
                    s_r_len = s_end - s_start + 1
                    ident_perc = 2 * ident / (s_r_len + q_r_len)
                    m_perc = 2 * m / (s_r_len + q_r_len)
                    g_perc = 2 * g / (s_r_len + q_r_len)

                    # TODO check gaps in the query
                    if q_r_len >= th_len and s_r_len >= th_len and ident_perc >= th_id and m_perc >= th_pos and g_perc <= th_gap:
                        n_key = "homology-" + "-".join(key.split("-")[1:])
                        valid_regions.append((key, n_key, q_r_len, s_r_len, s_start, s_end, m_perc, ident_perc, g_perc))

    return valid_regions

In [13]:
# Initial function 'process_blast'

def process_blast(curated_file, input_dir, out_file_regions, th_evalue=0.01):
    curated = {}  # {uniprot_id: {key: {source_id: "", regions: [], content_fraction: 0, content_count: 0 }}}
    with open(curated_file) as f:
        for line in f:
            doc = json.loads(line)
            key = list(doc.keys())
            key.remove("acc")
            key = key[0]
            curated.setdefault(doc["acc"], {})[key] = doc[key]
    logging.info("Curated accessions {}".format(len(curated)))

    with open(out_file_regions, "w") as fout:
        for file_name in os.listdir(input_dir):
            logging.info("Processing {}".format(file_name))
            with open("{}/{}".format(input_dir, file_name)) as f:

                # Iterate queries
                blast_records = NCBIXML.parse(f)
                for blast_record in blast_records:

                    # Iterate query alignments
                    query_id = blast_record.query.split("|")[1]
                    if curated.get(query_id):
                        for alignment in blast_record.alignments:
                            subject_id = alignment.title.split("|")[1]
                            # print(query_id, subject_id, blast_record.query_length, alignment.length)
                            # if query_id == "P09651" and subject_id == "A0A3Q7R181":
                            for hsp in alignment.hsps:
                                if hsp.expect < th_evalue:

                                    # Calculate region overlap
                                    for key, n_key, q_r_len, s_r_len, s_start, s_end, m_perc, ident_perc, g_perc in process_hsp_regions(hsp, curated[
                                        query_id], curated.get(subject_id, {})):
                                        fout.write("{} {} {} {} {} {} {} {} {:.3f} {:.3f} {:.3f}\n".format(subject_id,
                                                                                                    query_id,
                                                                                                    n_key,
                                                                                                    curated[query_id][
                                                                                                        key].get(
                                                                                                        "source_id"),
                                                                                                    alignment.length, # The subject length, full sequence
                                                                                                    blast_record.query_length,
                                                                                                    s_start,
                                                                                                    s_end,
                                                                                                    m_perc,
                                                                                                    ident_perc,
                                                                                                    g_perc))
                    else:
                        logging.warning("Missing query ID in the curated file {}".format(query_id))
    return

In [14]:
# CORRECTED process_blast
# the key has been changed

def process_blast(curated_file, input_dir, out_file_regions, th_evalue=0.01):
    curated = {}
    with open('homology.mjson') as f:
        for line in f:
            doc = json.loads(line)
            key = list(doc.keys())
            key.remove("acc")
            key = key[0]
            source_id = doc[key]['regions_ids'][0].split('(')[0]
            curated.setdefault(source_id, {doc['acc']: {'regions': doc[key]['regions'], 
                                            'content_fraction': doc[key]['content_fraction'], 
                                            'content_count': doc[key]['content_count']}})
            logging.info("Curated accessions {}".format(len(curated)))
        logging.info("Finished processing file.")

        with open(out_file_regions, "w") as fout:
            for file_name in os.listdir(input_dir):
                logging.info("Processing {}".format(file_name))
                with open("{}/{}".format(input_dir, file_name)) as f:

                    # Iterate queries
                    blast_records = NCBIXML.parse(f)
                    for blast_record in blast_records:

                        # Iterate query alignments
                        query_id = blast_record.query.split("|")[1]
                        if curated.get(query_id):
                            for alignment in blast_record.alignments:
                                subject_id = alignment.title.split("|")[1]
                                for hsp in alignment.hsps:
                                    if hsp.expect < th_evalue:

                                        # Calculate region overlap
                                        for key, n_key, q_r_len, s_r_len, s_start, s_end, m_perc, ident_perc, g_perc in process_hsp_regions(hsp, curated[
                                            query_id], curated.get(subject_id, {})):
                                            fout.write("{} {} {} {} {} {} {} {} {:.3f} {:.3f} {:.3f}\n".format(subject_id,
                                                                                                        query_id,
                                                                                                        n_key,
                                                                                                        curated[query_id][
                                                                                                            key].get(
                                                                                                            "source_id"),
                                                                                                        alignment.length, # The subject length, full sequence
                                                                                                        blast_record.query_length,
                                                                                                        s_start,
                                                                                                        s_end,
                                                                                                        m_perc,
                                                                                                        ident_perc,
                                                                                                        g_perc))
                        else:
                            logging.warning("Missing query ID in the curated file {}".format(query_id))
    return

In [15]:
curated # {uniprot_id: {key: {source_id: "", regions: [], 
        # content_fraction: 0, 
        # content_count: 0 }}}

NameError: name 'curated' is not defined

In [16]:
process_blast('homology.mjson', directory, 'output.fasta')

