# Extracting Data From ChEMBL29
## Defining Imports

In [1]:
from itertools import combinations
import pandas as pd
from tqdm.auto import tqdm

## Loading Preprocessed ChEMBL29 Medium Confidence Data
Medium confidence data, in contrast to high confidence data, contains data from proteins with mutations and is not restricted to single protein assays.

In [2]:
# Read the preprocessed medium-confidence ChEMBL29 table (gzip-compressed TSV).
chembl_mconf_df = pd.read_csv(
    "medium_conf_chembl29/final_data.tsv.gz",
    sep="\t",
)

# Restrict the records to human targets.
is_human = chembl_mconf_df["organism"] == "Homo sapiens"
chembl_human_df = chembl_mconf_df[is_human]

# Keep only records whose mean potency value (pPot_mean) is greater than 5.
is_active = chembl_human_df["pPot_mean"] > 5
chembl_active_df = chembl_human_df[is_active]

In [3]:
# Summary counts of the active human data set.
n_interactions = len(chembl_active_df)
n_targets = chembl_active_df["accession"].nunique()
n_cpds = chembl_active_df["nonstereo_aromatic_smiles"].nunique()

print("Number of interactions: {}".format(n_interactions))
print("Number of targets: {}".format(n_targets))
print("Number of cpds: {}".format(n_cpds))

Number of interactions: 356854
Number of targets: 2000
Number of cpds: 234320


For each target (identified by its UniProt ID), the active compounds are recorded as a set.

In [4]:
# Series indexed by accession; each value is the set of distinct
# non-stereo aromatic SMILES that are active against that target.
chembl_target_cpds = (
    chembl_active_df
    .groupby("accession")["nonstereo_aromatic_smiles"]
    .agg(set)
)

## Protein Information
Protein information was extracted from [UniProt](https://www.uniprot.org/) with this [tool](https://github.com/c-feldmann/UniProtClient).

In [5]:
# UniProt annotation table, indexed by its "entry" column.
protein_information_df = pd.read_csv(
    "medium_conf_chembl29/uniprot_info.tsv",
    sep="\t",
    index_col="entry",
)

### Protein Functions
Protein functions from [QuickGO](https://www.ebi.ac.uk/QuickGO/) were accessed with this [tool](https://github.com/c-feldmann/QuickGOProteinAnnotation). They are used to determine functionally unrelated proteins.  
The functions obtained for each protein are collected as a set.

In [6]:
# Long-format table of protein function annotations (one row per annotation).
protein_function_df = pd.read_csv(
    "medium_conf_chembl29/protein_functions.tsv",
    sep="\t",
)

# Series indexed by uniprot_id; each value is the set of functions of that protein.
protein_functions = (
    protein_function_df
    .groupby("uniprot_id")["protein_function"]
    .agg(set)
)

## Forming Pairs and Recording Intersection and Symmetric Difference of Both Proteins

In [7]:
unique_targets = chembl_active_df.accession.unique()

# Number of unordered target pairs, n * (n - 1) / 2, computed with exact
# integer arithmetic. It is only used to size the progress bar.
n_unique_targets = len(unique_targets)
total_pairs = n_unique_targets * (n_unique_targets - 1) // 2

# Column order of the resulting table. Passing it explicitly keeps the columns
# present even if there are fewer than two targets and therefore no pairs.
pair_columns = [
    "t1",
    "t2",
    "functionally_related",
    "n_chembl_t1",
    "n_chembl_t2",
    "dt_chembl",
]

# Rows are collected as plain dicts and converted into a DataFrame once at the end.
pair_records = []
for t1, t2 in tqdm(combinations(unique_targets, r=2), total=total_pairs):
    chembl_cpds_t1 = chembl_target_cpds[t1]
    chembl_cpds_t2 = chembl_target_cpds[t2]

    # NOTE: a target that is absent from `protein_functions` raises a KeyError here.
    functions_t1 = protein_functions[t1]
    functions_t2 = protein_functions[t2]

    # A "no_function" entry for either protein leaves the relationship undetermined;
    # otherwise two proteins are related as soon as they share one function.
    if "no_function" in functions_t1 or "no_function" in functions_t2:
        functionally_related = "undetermined"
    elif functions_t1 & functions_t2:
        functionally_related = "yes"
    else:
        functionally_related = "no"

    pair_records.append({
        "t1": t1,
        "t2": t2,
        "functionally_related": functionally_related,
        # Compounds active against only one of the two targets.
        "n_chembl_t1": len(chembl_cpds_t1 - chembl_cpds_t2),
        "n_chembl_t2": len(chembl_cpds_t2 - chembl_cpds_t1),
        # Compounds active against both targets (dual-target compounds).
        "dt_chembl": len(chembl_cpds_t1 & chembl_cpds_t2),
    })

out_df = pd.DataFrame(pair_records, columns=pair_columns)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1999000.0), HTML(value='')))




Adding protein names

In [8]:
# Attach the primary protein name of each target. The name Series is renamed
# before merging so the new columns arrive as "t1_name" and "t2_name" directly.
primary_names = protein_information_df["primary_name"]
out_df = (
    out_df
    .merge(primary_names.rename("t1_name"), left_on="t1", right_index=True)
    .merge(primary_names.rename("t2_name"), left_on="t2", right_index=True)
)

## Selecting Functionally Unrelated Pairs
Each pair must have at least:
 * 50 shared compounds (active against both targets)
 * 50 target-specific compounds for each of the two targets

In [9]:
# Minimum number of compounds required in every class of a pair: shared
# (dual-target) compounds and compounds specific to either target.
MIN_CPDS_PER_CLASS = 50

# Functionally unrelated pairs with enough compounds in all three classes.
# `@name` lets the query expression reference the Python variable.
distinct_pairs = out_df.query(
    "functionally_related == 'no'"
    " and dt_chembl >= @MIN_CPDS_PER_CLASS"
    " and n_chembl_t1 >= @MIN_CPDS_PER_CLASS"
    " and n_chembl_t2 >= @MIN_CPDS_PER_CLASS"
)
distinct_pairs

Unnamed: 0,t1,t2,functionally_related,n_chembl_t1,n_chembl_t2,dt_chembl,t1_name,t2_name
69382,P27338,P22303,no,2037,1933,70,Amine oxidase [flavin-containing] B,Acetylcholinesterase
6060,Q9Y5N1,P31645,no,2740,2724,180,Histamine H3 receptor,Sodium-dependent serotonin transporter
67440,P35462,P31645,no,2844,2843,61,D(3) dopamine receptor,Sodium-dependent serotonin transporter
120117,P14416,P31645,no,4001,2731,173,D(2) dopamine receptor,Sodium-dependent serotonin transporter
137515,P31645,P50406,no,2828,2838,76,Sodium-dependent serotonin transporter,5-hydroxytryptamine receptor 6
106541,Q01959,P08908,no,1450,2760,75,Sodium-dependent dopamine transporter,5-hydroxytryptamine receptor 1A
135596,P23975,P08908,no,1872,2737,98,Sodium-dependent noradrenaline transporter,5-hydroxytryptamine receptor 1A
137538,P31645,P34969,no,2791,1741,113,Sodium-dependent serotonin transporter,5-hydroxytryptamine receptor 7
69437,P27338,P29274,no,2055,3593,52,Amine oxidase [flavin-containing] B,Adenosine receptor A2a
137560,P31645,P25103,no,2836,542,68,Sodium-dependent serotonin transporter,Substance-P receptor


## Extracting Data for Selected Pairs
Selecting pairs that were analyzed in a previous study.
In addition, the pair of `Histamine H3 receptor` and `Sodium-dependent serotonin transporter` is selected because more data are available for it.

In [10]:
# Selected target pairs as (t1, t2) tuples of UniProt accessions.
pair_list = [
    ("P27338", "P29274"),  # Amine oxidase [flavin-containing] B / Adenosine receptor A2a
    ("P27338", "P22303"),  # Amine oxidase [flavin-containing] B / Acetylcholinesterase
    ("Q9Y5N1", "P31645"),  # Histamine H3 receptor / Sodium-dependent serotonin transporter
]
pair1, pair2, pair3 = pair_list

In [11]:
# Show the statistics of the selected pairs, looked up by their (t1, t2) key.
pairs_by_targets = distinct_pairs.set_index(["t1", "t2"])
pairs_by_targets.loc[pair_list]

Unnamed: 0_level_0,Unnamed: 1_level_0,functionally_related,n_chembl_t1,n_chembl_t2,dt_chembl,t1_name,t2_name
t1,t2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P27338,P29274,no,2055,3593,52,Amine oxidase [flavin-containing] B,Adenosine receptor A2a
P27338,P22303,no,2037,1933,70,Amine oxidase [flavin-containing] B,Acetylcholinesterase
Q9Y5N1,P31645,no,2740,2724,180,Histamine H3 receptor,Sodium-dependent serotonin transporter


In [12]:
# Write one TSV per selected pair, labelling every compound as dual-target
# or as specific to one of the two targets.
for t1, t2 in pair_list:
    chembl_cpds_t1 = chembl_target_cpds[t1]
    chembl_cpds_t2 = chembl_target_cpds[t2]

    dt_cpds = chembl_cpds_t1 & chembl_cpds_t2
    t1_cpds = chembl_cpds_t1 - chembl_cpds_t2
    t2_cpds = chembl_cpds_t2 - chembl_cpds_t1

    # Rows are emitted class by class: dual-target first, then t1-only, then t2-only.
    labelled_cpd_sets = [
        ("dual_target", dt_cpds),
        (t1, t1_cpds),
        (t2, t2_cpds),
    ]
    pair_name = f"{t1}_{t2}"

    # Set iteration order can differ between kernel restarts, so each class is
    # sorted to keep the written file reproducible.
    pair_records = [
        {
            "nonstereo_aromatic_smiles": cpd_smi,
            "target_pair": pair_name,
            "label": label,
        }
        for label, cpds in labelled_cpd_sets
        for cpd_smi in sorted(cpds)
    ]

    pair_df = pd.DataFrame(pair_records)
    pair_df.to_csv(f"./chembl29_dt_cpds_{t1}_{t2}.tsv", sep="\t", index=False)

## Predicting Target
Selecting two functionally unrelated targets without shared compounds. The ML task is to predict the correct target for a given compound.

In [13]:
# Functionally unrelated pairs without any shared compound,
# ordered so that the pairs with the most compounds come first.
no_overlap_pairs = out_df.query("dt_chembl == 0 & functionally_related == 'no'")
no_overlap_pairs.sort_values(["n_chembl_t1", "n_chembl_t2"], ascending=False).head(10)

Unnamed: 0,t1,t2,functionally_related,n_chembl_t1,n_chembl_t2,dt_chembl,t1_name,t2_name
120130,P14416,P42336,no,4174,3459,0,D(2) dopamine receptor,"Phosphatidylinositol 4,5-bisphosphate 3-kinase..."
120199,P14416,P56817,no,4174,3024,0,D(2) dopamine receptor,Beta-secretase 1
120257,P14416,Q13547,no,4174,2886,0,D(2) dopamine receptor,Histone deacetylase 1
120115,P14416,Q9Y233,no,4174,2798,0,D(2) dopamine receptor,"cAMP and cAMP-inhibited cGMP 3',5'-cyclic phos..."
120179,P14416,P28845,no,4174,2417,0,D(2) dopamine receptor,Corticosteroid 11-beta-dehydrogenase isozyme 1
120231,P14416,P42345,no,4174,2417,0,D(2) dopamine receptor,Serine/threonine-protein kinase mTOR
120200,P14416,P27487,no,4174,2414,0,D(2) dopamine receptor,Dipeptidyl peptidase 4
120303,P14416,O60674,no,4174,2389,0,D(2) dopamine receptor,Tyrosine-protein kinase JAK2
120510,P14416,O00763,no,4174,2117,0,D(2) dopamine receptor,Acetyl-CoA carboxylase 2
120826,P14416,Q15858,no,4174,2108,0,D(2) dopamine receptor,Sodium channel protein type 9 subunit alpha


In [14]:
# Target pairs for the target-prediction task, as [t1, t2] lists of UniProt accessions.
task2_pairlist = [
    # D(2) dopamine receptor and
    # Phosphatidylinositol 4,5-bisphosphate 3-kinase catalytic subunit alpha isoform
    ["P14416", "P42336"],
]
task2_pair1 = task2_pairlist[0]

In [15]:
# Write one TSV per pair in which every compound is labelled with the target
# it is active against.
for t1, t2 in task2_pairlist:
    chembl_cpds_t1 = chembl_target_cpds[t1]
    chembl_cpds_t2 = chembl_target_cpds[t2]

    pair_name = f"{t1}_{t2}"

    # Compounds of t1 come first, then those of t2; each set is sorted so the
    # row order does not depend on set iteration order.
    pair_records = [
        {
            "nonstereo_aromatic_smiles": cpd_smi,
            "target_pair": pair_name,
            "label": label,
        }
        for label, cpds in ((t1, chembl_cpds_t1), (t2, chembl_cpds_t2))
        for cpd_smi in sorted(cpds)
    ]

    pair_df = pd.DataFrame(pair_records)
    pair_df.to_csv(f"./chembl29_predicting_target_{t1}_{t2}.tsv", sep="\t", index=False)