# Extracting Data From CHEMBL29
## Defining Imports

In [1]:
from itertools import combinations
import numpy as np
from numpy import random
import pandas as pd
from tqdm.auto import tqdm
from typing import *

## Loading Preprocessed ChEMBL29 Medium Confidence Data
Medium confidence data, in contrast to high confidence data, contains data from proteins with mutations and is not restricted to single protein assays.

In [2]:
# Load the preprocessed medium-confidence ChEMBL29 table (gzip-compressed, tab-separated).
chembl_mconf_df = pd.read_csv("medium_conf_chembl29/final_data.tsv.gz", sep="\t")
# Keep only rows whose `organism` column is 'Homo sapiens'.
chembl_human_df =  chembl_mconf_df.query("organism == 'Homo sapiens'")
# Keep only interactions whose `pPot_mean` exceeds 5; these are treated as "active" below.
# NOTE(review): presumably pPot is -log10 of the molar potency (5 ~ 10 µM) — confirm.
chembl_active_df = chembl_human_df.query("pPot_mean > 5")

In [3]:
# Size of the active human subset: rows, distinct targets, distinct compounds.
print(f"Number of interactions: {len(chembl_active_df)}")
print(f"Number of targets: {chembl_active_df['accession'].nunique()}")
print(f"Number of cpds: {chembl_active_df['nonstereo_aromatic_smiles'].nunique()}")

Number of interactions: 356854
Number of targets: 2000
Number of cpds: 234320


For each target (identified by its UniProt ID), the active compounds are recorded as a set.

In [4]:
# Series indexed by target accession; each value is the set of distinct active SMILES.
chembl_target_cpds = chembl_active_df.groupby("accession")["nonstereo_aromatic_smiles"].agg(set)

## Protein Information
Protein information was extracted from [UniProt](https://www.uniprot.org/) with this [tool](https://github.com/c-feldmann/UniProtClient).

In [5]:
# UniProt annotation table, indexed by its `entry` column.
# Its `primary_name` column is joined onto the target pairs further below.
protein_information_df = pd.read_csv("medium_conf_chembl29/uniprot_info.tsv", sep="\t", index_col="entry")

### Protein Functions
Protein functions from [QuickGO](https://www.ebi.ac.uk/QuickGO/) were accessed with this [tool](https://github.com/c-feldmann/QuickGOProteinAnnotation). It is used to determine (functionally) unrelated proteins.  
The functions obtained for each protein are collected into a set.

In [6]:
# Long-format annotation table with (at least) the columns `uniprot_id` and `protein_function`.
protein_function_df = pd.read_csv("medium_conf_chembl29/protein_functions.tsv", sep="\t")
# Collapse the annotations into one set of functions per UniProt ID.
protein_functions = protein_function_df.groupby("uniprot_id")["protein_function"].agg(set)

In [7]:
class FunctionHandler:
    """Decide whether proteins are functionally related via shared function annotations.

    Two proteins count as related when their function sets intersect. A protein
    whose function set contains the marker ``"no_function"`` cannot be judged,
    so any comparison involving it is reported as ``"undetermined"``.
    """

    def __init__(self, function_dict: Dict[str, Set[str]]):
        """Store the mapping from UniProt ID to its set of function labels.

        Any mapping-like object that supports ``obj[key]`` and ``obj.keys()``
        works, since those are the only operations used by this class.
        """
        self.function_dict = function_dict

    def are_related(self, uniprot_id1, uniprot_id2):
        """Compare the function sets of two proteins.

        Returns
        -------
        str
            ``"undetermined"`` if either protein carries ``"no_function"``,
            ``"yes"`` if the two function sets share at least one entry,
            ``"no"`` otherwise.

        Raises
        ------
        KeyError
            If either ID is missing from the mapping (for a plain ``dict``).
        """
        first_functions = self.function_dict[uniprot_id1]
        second_functions = self.function_dict[uniprot_id2]

        # The marker means "nothing known", so relatedness cannot be decided.
        if "no_function" in first_functions or "no_function" in second_functions:
            return "undetermined"
        return "yes" if first_functions & second_functions else "no"

    @property
    def proteins(self) -> Set[str]:
        """All UniProt IDs known to this handler."""
        return set(self.function_dict.keys())

    def get_related_proteins(self, uniprot_id1, undertermined_equals_related=False) -> Set[str]:
        """Collect every known protein that is related to ``uniprot_id1``.

        Parameters
        ----------
        uniprot_id1
            Query protein; it is compared against every known protein,
            including itself.
        undertermined_equals_related
            When truthy, proteins with an ``"undetermined"`` verdict are
            included in addition to those with a ``"yes"`` verdict.
        """
        accepted_verdicts = {"yes"}
        if undertermined_equals_related:
            accepted_verdicts.add("undetermined")
        return {
            candidate
            for candidate in sorted(self.proteins)
            if self.are_related(uniprot_id1, candidate) in accepted_verdicts
        }

In [8]:
# `protein_functions` is a pandas Series (uniprot_id -> set of functions), not a dict.
# FunctionHandler only uses `[]` lookup and `.keys()`, both of which the Series provides.
protein_function_handler = FunctionHandler(protein_functions)

## Forming Pairs and Recording Intersection and Symmetric Difference of Both Proteins

In [None]:
# Every target that has at least one active compound.
unique_targets = chembl_active_df.accession.unique()
# n * (n - 1) / 2 unordered pairs; only needed so the progress bar knows its length.
total_pairs = len(unique_targets) * (len(unique_targets) - 1) // 2

# One record per unordered target pair, turned into a DataFrame in a single step afterwards.
pair_records = []
for t1, t2 in tqdm(combinations(unique_targets, r=2), total=total_pairs):
    chembl_cpds_t1 = chembl_target_cpds[t1]
    chembl_cpds_t2 = chembl_target_cpds[t2]
    functionally_related = protein_function_handler.are_related(t1, t2)

    pair_records.append(
        dict(
            t1=t1,
            t2=t2,
            functionally_related=functionally_related,
            # Compounds active against exactly one of the two targets.
            n_chembl_t1=len(chembl_cpds_t1.difference(chembl_cpds_t2)),
            n_chembl_t2=len(chembl_cpds_t2.difference(chembl_cpds_t1)),
            # Dual-target compounds: active against both.
            dt_chembl=len(chembl_cpds_t1.intersection(chembl_cpds_t2)),
        )
    )
out_df = pd.DataFrame(pair_records)

  0%|          | 0/1999000 [00:00<?, ?it/s]

Adding protein names

In [None]:
# Attach the primary protein name of each target as `t1_name` / `t2_name`.
# The default inner join is kept, so pairs whose accession is missing from
# `protein_information_df` are dropped.
primary_names = protein_information_df["primary_name"]
out_df = (
    out_df
    .merge(primary_names.rename("t1_name"), left_on="t1", right_index=True)
    .merge(primary_names.rename("t2_name"), left_on="t2", right_index=True)
)

## Selecting Functionally Unrelated Pairs
Each pair must have at least:
 * 50 shared compounds (active against both targets)
 * 50 target-specific compounds for each target

In [None]:
# Keep pairs with no shared annotated function ('undetermined' pairs are dropped as well)
# that have at least 50 dual-target compounds and at least 50 compounds specific to each target.
distinct_pairs = out_df.query("functionally_related == 'no' & dt_chembl >= 50 & n_chembl_t1 >= 50 & n_chembl_t2 >= 50")
distinct_pairs

## Extracting Data for Selected Pairs
Pairs analyzed in a previous study are selected.
In addition, the pair of `Histamine H3 receptor` and `Sodium-dependent serotonin transporter` is selected because of its increased data availability.

In [None]:
# Target pairs given as (t1, t2) tuples of UniProt accessions.
pair1 = ("P27338", "P29274")
pair2 = ("P27338", "P22303")
# Histamine H3 receptor / Sodium-dependent serotonin transporter pair named in the text above
# (presumably in that order — TODO confirm against the UniProt table).
pair3 = ("Q9Y5N1", "P31645")
pair_list = [pair1, pair2, pair3]

In [None]:
# Show the selected pairs among the functionally unrelated pairs.
# The lookup is orientation-sensitive: each tuple must match the (t1, t2) order of the pair table.
# NOTE(review): a missing pair is expected to raise a KeyError — confirm for the pandas version in use.
distinct_pairs.set_index(["t1", "t2"]).loc[pair_list]

Defining random number generator for random sample of ChEMBL CPDs

In [None]:
# Fixed seed so the random compound samples are reproducible.
seed = 20012022
# One generator is shared by all sampling cells below, so the samples drawn depend on
# the cells being executed once each, in notebook order, after this cell.
r_generator = random.default_rng(seed)

In [None]:
# Spot check: display the function set recorded for a single UniProt ID.
protein_functions["O08689"]

In [None]:
# Preview the first rows of the active human subset.
chembl_active_df.head()

In [None]:
# Universe of compounds from which the random background sample is drawn.
all_cpds = set(chembl_active_df.nonstereo_aromatic_smiles.unique())
for t1, t2 in pair_list:
    pair_df = []
    chembl_cpds_t1 = chembl_target_cpds[t1]
    chembl_cpds_t2 = chembl_target_cpds[t2]

    dt_cpds = chembl_cpds_t1 & chembl_cpds_t2  # active against both targets
    t1_cpds = chembl_cpds_t1 - chembl_cpds_t2  # active against t1 only
    t2_cpds = chembl_cpds_t2 - chembl_cpds_t1  # active against t2 only

    # Sets are not necessarily in the same order when restarting notebook. Sorting for reproducibility.
    for label, labelled_cpds in (("dual_target", dt_cpds), (t1, t1_cpds), (t2, t2_cpds)):
        for cpd_smi in sorted(labelled_cpds):
            pair_df.append({"nonstereo_aromatic_smiles": cpd_smi,
                            "target_pair": f"{t1}_{t2}",
                            "label": label
                           })

    # Random sample of ChEMBL CPDs.
    # Restriction: excluding CPDs active against either target or related targets.
    excluded_cpds = set()
    for t in [t1, t2]:
        # undertermined_equals_related is set to True in order to also remove CPDs
        # against proteins that are potentially related to the target.
        related_proteins = protein_function_handler.get_related_proteins(t, undertermined_equals_related=True)
        # The target itself must be covered, otherwise its own actives could be sampled.
        assert t in related_proteins, f"{t} is not reported as related to itself"
        for protein in related_proteins:
            if protein in chembl_target_cpds:
                excluded_cpds.update(chembl_target_cpds[protein])

    # Draw exactly once per pair, and only after the exclusions for BOTH targets are complete.
    # Sampling inside the loop above would draw twice and let the first draw include
    # compounds of t2 and its related proteins.
    available_cpds = sorted(all_cpds - excluded_cpds)
    sample_size = len(chembl_cpds_t1 | chembl_cpds_t2)
    sampled_cpds = r_generator.choice(available_cpds, size=sample_size, replace=False).tolist()
    for cpd_smi in sorted(sampled_cpds):
        pair_df.append({"nonstereo_aromatic_smiles": cpd_smi,
                        "target_pair": f"{t1}_{t2}",
                        "label": "random_cpd"
                       })

    pair_df = pd.DataFrame(pair_df)
    pair_df.to_csv(f"./chembl29_dt_cpds_{t1}_{t2}.tsv", sep="\t", index=False)

## Predicting Target
Selecting two functionally unrelated targets without shared CPDs. The ML task is to predict the correct target for a given compound.

In [None]:
# Candidate pairs for the target-prediction task: no shared compounds and no shared annotated
# function. Sorted so that pairs with the most t1-specific (then t2-specific) compounds come first.
out_df.query("dt_chembl == 0 & functionally_related == 'no'").sort_values(["n_chembl_t1", "n_chembl_t2"], ascending=False).head(10)

In [None]:
# Target pairs for the target-prediction task, each given as [t1, t2] UniProt accessions.
task2_pair1 = ["P14416", "P42336"] # D(2) dopamine receptor and Phosphatidylinositol 4,5-bisphosphate 3-kinase catalytic subunit alpha isoform
task2_pairlist= [task2_pair1]

In [None]:
for t1, t2 in task2_pairlist:
    chembl_cpds_t1 = chembl_target_cpds[t1]
    chembl_cpds_t2 = chembl_target_cpds[t2]

    # Labelled actives of each target; sorted so the row order is stable across kernel restarts.
    pair_df = [
        {"nonstereo_aromatic_smiles": smiles, "target_pair": f"{t1}_{t2}", "label": target}
        for target, target_cpds in ((t1, chembl_cpds_t1), (t2, chembl_cpds_t2))
        for smiles in sorted(target_cpds)
    ]

    # Background sample: compounds active against either target, or against any protein that
    # is (or might be) functionally related to one of them, must not be drawn.
    excluded_cpds = set()
    for t in (t1, t2):
        related_proteins = protein_function_handler.get_related_proteins(t, undertermined_equals_related=True)
        for protein in related_proteins:
            if protein in chembl_target_cpds:
                excluded_cpds |= chembl_target_cpds[protein]

    available_cpds = sorted(all_cpds - excluded_cpds)
    # Draw as many random compounds as there are distinct actives for the pair.
    sample_size = len(chembl_cpds_t1 | chembl_cpds_t2)
    sampled_cpds = r_generator.choice(available_cpds, size=sample_size, replace=False).tolist()
    pair_df.extend(
        {"nonstereo_aromatic_smiles": smiles, "target_pair": f"{t1}_{t2}", "label": "random_cpd"}
        for smiles in sorted(sampled_cpds)
    )

    pair_df = pd.DataFrame(pair_df)
    pair_df.to_csv(f"./chembl29_predicting_target_{t1}_{t2}.tsv", sep="\t", index=False)