In [4]:
import pandas as pd
import os
from dotenv import load_dotenv
import sys
sys.path.append("../../bin")
from pfamenv import PFAM_USER, PFAM_HOST, PFAM_PASSWORD, PFAM_PORT, PFAM_VERSION
from mysql import connector

def is_in_pfam(uniprot_accs:list) -> pd.DataFrame:
    """
    Look up which UniProt accessions are present in the Pfam ``uniprot`` table.

    Parameters
    ----------
    uniprot_accs : list
        Accessions to check. Any iterable is accepted (the notebook passes the
        array returned by ``Series.unique()``); every item is converted with ``str``.

    Returns
    -------
    pd.DataFrame
        One column, ``uniprot_acc``, holding the accessions returned by the
        query. Empty (same column) when no accession was supplied.
    """
    accs = [str(acc) for acc in uniprot_accs]
    if not accs:
        # "IN ()" is not valid SQL, and there is nothing to look up anyway.
        return pd.DataFrame(columns=["uniprot_acc"])

    # Values are bound by the driver instead of being pasted into the SQL text.
    # The schema name cannot be bound as a parameter, so it is still interpolated;
    # it comes from the local pfamenv configuration, not from the caller.
    placeholders = ", ".join(["%s"] * len(accs))
    query = (f"SELECT uniprot_acc FROM {PFAM_VERSION}.uniprot "
             f"WHERE uniprot_acc IN ({placeholders})")

    cnx = connector.connect(user=PFAM_USER,
                            password=PFAM_PASSWORD,
                            port=PFAM_PORT,
                            host=PFAM_HOST)
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute(query, tuple(accs))
            output = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        cnx.close()

    return pd.DataFrame(output, columns=["uniprot_acc"])

In [52]:
# Load the table stored at tmp/pfam_mapped.csv; later cells treat it as the
# already-existing Pfam domain annotations (columns used downstream:
# uniprot_acc, pfamA_acc, seq_start, seq_end).
pfam_mapped_df = pd.read_csv("tmp/pfam_mapped.csv")

In [53]:
# Load the table stored at tmp/hmm_df.csv; later cells treat it as the
# HMM-derived domain hits (presumably written by the hmmsearch post-processing
# cell further down — verify, the write itself is not visible here).
hmm_mapped_df = pd.read_csv("tmp/hmm_df.csv")

In [54]:
# Inner-join the HMM hits with the accessions found in the Pfam uniprot table,
# which drops every hit whose protein is not known to Pfam.
hmm_mapped_df = hmm_mapped_df.merge(
    is_in_pfam(hmm_mapped_df["uniprot_acc"].unique())
)

In [63]:
# Give priority to existing domains over hmm obtained ones
# Exclude hmm domains overlapping with other pfam domains

# Pair every HMM hit with every Pfam region annotated on the same protein.
# The Pfam-side columns are renamed with a "pfam_" prefix so both coordinate
# sets can sit in one row. The join key is spelled out so that a column added
# to either frame later cannot silently become part of the key.
# how="left" keeps hits on proteins without any Pfam region (NaN on the Pfam side).
over_df = pd.merge(
    hmm_mapped_df,
    pfam_mapped_df[["uniprot_acc", "pfamA_acc", "seq_start", "seq_end"]]
    .drop_duplicates()
    .rename(columns={"pfamA_acc": "pfam_pfamA_acc",
                     "seq_start": "pfam_seq_start",
                     "seq_end": "pfam_seq_end"}),
    on="uniprot_acc",
    how="left",
)

def overlap(row):
    """
    Fraction of a Pfam region that is covered by an HMM hit.

    Parameters
    ----------
    row : mapping
        Must provide ``seq_start`` / ``seq_end`` (HMM hit) and
        ``pfam_seq_start`` / ``pfam_seq_end`` (Pfam region).

    Returns
    -------
    float
        Length of the intersection of the two intervals divided by the Pfam
        region length (``pfam_seq_end - pfam_seq_start``). NaN when the row has
        no Pfam region (unmatched row of a left join); 0.0 when the intervals do
        not intersect or the Pfam region has no length.

    Notes
    -----
    The intersection is computed as ``min(ends) - max(starts)``. Unlike a
    start-ordering case split, this also handles intervals that start on the
    same residue and intervals where one fully contains the other (the result
    never exceeds 1 for well-formed intervals).
    Lengths use ``end - start`` as in the rest of the notebook.
    NOTE(review): if the coordinates are inclusive, lengths would be
    ``end - start + 1`` — confirm before changing.
    """
    seq_start = row["seq_start"]
    seq_end = row["seq_end"]
    pfam_seq_start = row["pfam_seq_start"]
    pfam_seq_end = row["pfam_seq_end"]

    # No Pfam partner for this hit: keep returning NaN so callers can fillna(0).
    if pd.isna(pfam_seq_start) or pd.isna(pfam_seq_end):
        return float("nan")

    # Normalize on target length; guard against a zero/negative-length target.
    pfam_length = pfam_seq_end - pfam_seq_start
    if pfam_length <= 0:
        return 0.0

    shared = min(seq_end, pfam_seq_end) - max(seq_start, pfam_seq_start)
    return max(shared, 0) / pfam_length

# Score every (HMM hit, Pfam region) pair; unmatched pairs yield NaN -> 0
over_df["overlap"] = over_df.apply(overlap, axis=1).fillna(0)

# Keep the HMM hits that cover more than 30% of some Pfam region and tag them
over_df = (
    over_df.loc[
        over_df["overlap"] > .3,
        ["uniprot_acc", "pfamA_acc", "seq_start", "seq_end", "pfamA_id"],
    ]
    .drop_duplicates()
    .assign(overlapping=True)
)

# Anti-join: retain only the HMM hits that did not get the "overlapping" tag
hmm_mapped_df = (
    hmm_mapped_df.merge(over_df, how="left")
    .loc[
        lambda merged: merged["overlapping"].isna(),
        ["uniprot_acc", "pfamA_acc", "seq_start", "seq_end", "pfamA_id"],
    ]
)

In [64]:
# Final table: existing Pfam annotations stacked on top of the retained HMM
# hits, with exact duplicate rows removed, written without the index column.
OUTPUT = "output/afdb_isopep_domains.csv"
(
    pd.concat([pfam_mapped_df, hmm_mapped_df])
    .drop_duplicates()
    .to_csv(OUTPUT, index=False)
)

In [51]:
# Spot check of one protein in the overlap table.
# NOTE(review): the output below still shows the Pfam-side columns and the
# overlap score, so it appears to come from a run before over_df was reduced
# to the HMM columns — re-run in order to refresh it.
over_df.loc[over_df["uniprot_acc"].eq("A0A073K724")]

Unnamed: 0,uniprot_acc,pfamA_acc,seq_start,seq_end,pfamA_id,pfam_pfamA_acc,pfam_seq_start,pfam_seq_end,overlap
9955,A0A073K724,PF24346,465,573,DUF7507,PF01345,335.0,446.0,0.0
9956,A0A073K724,PF24346,465,573,DUF7507,PF01345,467.0,578.0,0.954955


In [49]:
# Spot check: HMM-derived hits recorded for the same protein
hmm_mapped_df.loc[hmm_mapped_df["uniprot_acc"].eq("A0A073K724")]

Unnamed: 0,uniprot_acc,pfamA_acc,seq_start,seq_end,pfamA_id
9955,A0A073K724,PF24346,465,573,DUF7507


In [50]:
# Spot check: existing Pfam regions recorded for the same protein
pfam_mapped_df.loc[pfam_mapped_df["uniprot_acc"].eq("A0A073K724")]

Unnamed: 0,uniprot_acc,pfamA_acc,pfamA_id,seq_start,seq_end
6050871,A0A073K724,PF01345,DUF11,335,446
6050872,A0A073K724,PF01345,DUF11,467,578


In [11]:
from mysql import connector
from pathlib import Path
import re
import numpy as np
import sys
sys.path.append("../../bin")
from pfamenv import PFAM_USER, PFAM_HOST, PFAM_PASSWORD, PFAM_PORT, PFAM_VERSION
# Working directory: one sub-directory per checked-out family plus the
# concatenated "HMM" file built by download_hmm().
TMP_HMM_DIR = "tmp/HMM"
# hmmsearch result table that is read back with pandas further down.
# (Defined once; the second, identical assignment was redundant.)
HMMSEARCH_OUTPUT = "tmp/hmmsearch.out"
# NOTE(review): absolute, user-specific install paths — the notebook only runs
# on a machine with this exact layout; consider reading them from configuration.
HMMSEARCH = "/hps/software/users/agb/research/francesco/software/hmmer-3.3.2/bin/hmmsearch"
HMMPRESS = "/hps/software/users/agb/research/francesco/software/hmmer-3.3.2/bin/hmmpress"
# Add all additional domains
DOMAINS = [
    "PF24346", "PF24514", "PF24517", "PF24547", "PF24558", "PF24593",
    "PF24595", "PF25546", "PF25548", "PF25549", "PF25551", "PF25564",
]

def download_hmm() -> dict:
    """
    Fetch the HMMs listed in ``DOMAINS`` and collect per-family metadata.

    For every accession in ``DOMAINS``:

    * the family is checked out with ``pfco`` inside ``TMP_HMM_DIR`` unless its
      directory already exists there;
    * its ``DESC`` file is parsed for the ID, GA and (optional) CL lines;
    * its ``HMM`` file is added to the combined ``TMP_HMM_DIR/HMM`` file, with
      the ``NAME  SEED`` line rewritten to carry the accession.

    The combined file is rebuilt from scratch on every call (and only after all
    families were processed), so calling this function repeatedly does not
    duplicate models in it.

    Returns
    -------
    dict
        ``{accession: {"pfamA_id", "domain_threshold", "sequence_threshold",
        "clan_acc"}}``; ``clan_acc`` is NaN when DESC has no CL line.

    Raises
    ------
    SystemExit
        When ``pfco`` returns a non-zero status (stderr is printed first).
    ValueError
        When a DESC file lacks a parsable ID or GA line.
    """
    # Imported here because subprocess is not among the notebook's imports.
    import subprocess

    os.makedirs(TMP_HMM_DIR, exist_ok=True)

    domain_data = {}
    hmm_chunks = []
    for domain in DOMAINS:
        domain_dir = os.path.join(TMP_HMM_DIR, domain)

        # Download if not already existing
        if not os.path.exists(domain_dir):
            r = subprocess.run(["pfco", domain], cwd=TMP_HMM_DIR,
                               text=True, capture_output=True)
            if r.returncode != 0:
                print("Error encountered")
                print(r.stderr)
                sys.exit(1)

        domain_data[domain] = _parse_desc(os.path.join(domain_dir, "DESC"))
        hmm_chunks.append(_read_renamed_hmm(os.path.join(domain_dir, "HMM"), domain))

    # Write the concatenated HMM file in one go; each model is preceded by a
    # blank line, matching the layout a single fresh run used to produce.
    with open(os.path.join(TMP_HMM_DIR, "HMM"), "w") as outfile:
        for chunk in hmm_chunks:
            outfile.write("\n")
            outfile.write(chunk)
    return domain_data


# "GA   <number> <number>;" — any number of digits, optional decimals.
_GA_LINE = re.compile(r"GA\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?);")
_ID_LINE = re.compile(r"ID   (.+)")
_CL_LINE = re.compile(r"CL   (CL\d+)")


def _parse_desc(desc_path: str) -> dict:
    """Extract identifier, gathering thresholds and clan (if any) from one DESC file."""
    # Reset for every file so values can never carry over from another family.
    pfamA_id = None
    thresholds = None
    clan_acc = np.nan
    with open(desc_path, "rt") as desc:
        for line in desc:
            if line.startswith("ID"):
                pfamA_id = _ID_LINE.findall(line)[0]
            if line.startswith("GA"):
                found = _GA_LINE.match(line)
                if found:
                    thresholds = found.groups()
            if line.startswith("CL"):
                clan_acc = _CL_LINE.findall(line)[0]

    if pfamA_id is None or thresholds is None:
        raise ValueError(f"Missing ID or GA line in {desc_path}")

    # NOTE(review): the first GA value is used as the domain threshold and the
    # second as the sequence threshold, as before. HMMER/Pfam may list the
    # sequence cut-off first — confirm; harmless while both values are equal.
    d_thr, s_thr = float(thresholds[0]), float(thresholds[1])
    return {"pfamA_id": pfamA_id, "domain_threshold": d_thr,
            "sequence_threshold": s_thr, "clan_acc": clan_acc}


def _read_renamed_hmm(hmm_path: str, domain: str) -> str:
    """Return the text of one HMM file with its ``NAME  SEED`` line renamed to ``domain``."""
    with open(hmm_path, "rt") as hmm:
        return "".join(
            f"NAME  {domain}\n" if "NAME  SEED" in line else line
            for line in hmm
        )

In [25]:
# Read the whitespace-separated hmmsearch table, skipping '#' comment lines.
# The column names are assigned here, positionally.
# NOTE(review): with sep=r"\s+" a target description containing spaces would
# spill into extra fields — fine while the description is "-", as in the
# rows displayed in this notebook.
pf_df = pd.read_csv(
    HMMSEARCH_OUTPUT,
    comment="#",
    sep=r"\s+",
    names=["target name", "accession1", "tlen", "query name", "accession2", "qlen",
           "E-value", "full_seq_score", "full_seq_bias",
           "#", "of", "c-Evalue", "i-Evalue", "domain_score", "bias", "hmm_from",
           "hmm_to", "ali_from", "ali_to", "from", "to", "acc", "description of target"],
)
domain_data = download_hmm()

# Keep, per family, the hits scoring above both of that family's thresholds.
# The pieces are collected in a list and concatenated once instead of growing
# a DataFrame inside the loop. A plain `for` loop is kept on purpose: later
# cells read `pfamA_acc` after the loop has finished.
# NOTE(review): the comparison is strict (>); confirm whether hits scoring
# exactly at the threshold should be kept (>=).
passing_hits = []
for pfamA_acc in domain_data:
    family = domain_data[pfamA_acc]
    above_thresholds = (
        (pf_df["full_seq_score"] > family["sequence_threshold"])
        & (pf_df["domain_score"] > family["domain_threshold"])
        & (pf_df["query name"] == pfamA_acc)
    )
    passing_hits.append(
        pf_df.loc[above_thresholds, ["target name", "query name", "from", "to"]]
        .rename(columns={"target name": "uniprot_acc", "query name": "pfamA_acc",
                         "from": "seq_start", "to": "seq_end"})
        .assign(pfamA_id=family["pfamA_id"])
    )

# pd.concat refuses an empty list, so fall back to an empty frame.
p_df = pd.concat(passing_hits) if passing_hits else pd.DataFrame()
p_df

Unnamed: 0,uniprot_acc,pfamA_acc,seq_start,seq_end,pfamA_id
1,A0A0C1E0D1,PF24346,41,148,DUF7507
2,A0A0C1E0D1,PF24346,159,266,DUF7507
3,A0A0C1E0D1,PF24346,277,384,DUF7507
4,A0A0C1E0D1,PF24346,395,502,DUF7507
5,A0A0C1E0D1,PF24346,513,620,DUF7507
...,...,...,...,...,...
82549,A0A1V5RIS2,PF25564,28,141,DUF7933
82558,A0A4Q3JR11,PF25564,297,395,DUF7933
82570,A0A840IDH3,PF25564,198,310,DUF7933
82581,A4A4U7,PF25564,90,188,DUF7933


In [15]:
# Debug check: sequence threshold of the family currently held in `pfamA_acc`.
# `pfamA_acc` is the variable left over from the `for pfamA_acc in domain_data`
# loop, so this cell only works after that loop has run.
domain_data[pfamA_acc]["sequence_threshold"]

26.5

In [23]:
# Debug check: fetch both cut-offs of the family left in `pfamA_acc` by the
# filtering loop, then show every hmmsearch row reported for that family.
seq_threshold, dom_threshold = (
    domain_data[pfamA_acc][field]
    for field in ("sequence_threshold", "domain_threshold")
)
pf_df.loc[pf_df["query name"].eq(pfamA_acc)]

Unnamed: 0,target name,accession1,tlen,query name,accession2,qlen,E-value,full_seq_score,full_seq_bias,#,...,domain_score,bias,hmm_from,hmm_to,ali_from,ali_to,from,to,acc,description of target
79564,A0A0K0Y022,-,1203,PF25564,-,123,5.600000e-224,743.5,109.5,1,...,106.1,10.2,1,122,26,146,26,147,0.99,-
79565,A0A0K0Y022,-,1203,PF25564,-,123,5.600000e-224,743.5,109.5,2,...,118.9,7.1,3,123,429,548,427,548,0.99,-
79566,A0A0K0Y022,-,1203,PF25564,-,123,5.600000e-224,743.5,109.5,3,...,150.6,6.5,1,123,552,673,552,673,1.00,-
79567,A0A0K0Y022,-,1203,PF25564,-,123,5.600000e-224,743.5,109.5,4,...,109.4,5.3,3,123,680,798,679,798,0.98,-
79568,A0A0K0Y022,-,1203,PF25564,-,123,5.600000e-224,743.5,109.5,5,...,96.9,14.1,1,123,803,921,803,921,0.98,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84095,A0A537XYP0,-,318,PF25564,-,123,9.900000e+00,19.3,6.5,1,...,18.0,6.5,16,122,212,313,204,314,0.82,-
84096,A0A7C3EG33,-,328,PF25564,-,123,9.900000e+00,19.3,3.9,1,...,4.6,0.2,13,91,46,117,38,128,0.68,-
84097,A0A7C3EG33,-,328,PF25564,-,123,9.900000e+00,19.3,3.9,2,...,15.2,0.4,13,103,219,305,209,318,0.80,-
84098,A0A7V1DPC2,-,328,PF25564,-,123,9.900000e+00,19.3,3.9,1,...,4.6,0.2,13,91,46,117,38,128,0.68,-


In [20]:
# Debug check: display the sequence threshold assigned in the inspection cell above.
seq_threshold

26.5

In [21]:
# Debug check: display the domain threshold assigned in the inspection cell above.
dom_threshold

26.5