# Merging Compound files distributed by CLUE

We consolidate the drug and sample resource information into a single file for easier downstream processing.

The data were originally retrieved from https://clue.io/repurposing.
See [`clue/README.md`](clue/README.md) for more details.

In [1]:
# Load the nb_black IPython extension.
# NOTE(review): presumably used only to auto-format code cells with black; nothing
# else in this notebook references it - confirm before removing.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
import numpy as np
import pandas as pd
import rdkit.Chem.inchi as inchi



<IPython.core.display.Javascript object>

## Load Data

In [3]:
# Directory that holds the CLUE repurposing downloads (see clue/README.md)
data_dir = "clue"
# Date stamp embedded in the input file names:
# repurposing_drugs_<date>.txt and repurposing_samples_<date>.txt
date = "20200324"

<IPython.core.display.Javascript object>

In [4]:
# Drug-level annotations: clinical phase, MOA, target, disease area, indication
drug_file = os.path.join(data_dir, f"repurposing_drugs_{date}.txt")
drug_df = pd.read_csv(
    drug_file,
    sep="\t",
    # Lines starting with "!" are skipped entirely.
    # NOTE(review): pandas also truncates any data row at a "!" that appears
    # mid-line - confirm that no field in this file contains "!".
    comment="!",
    encoding="ISO-8859-1",
)

print(drug_df.shape)
drug_df.head(2)

(6798, 6)


Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,


<IPython.core.display.Javascript object>

In [5]:
# Sample-level annotations: one row per broad_id (vendor, purity, structure, ...)
sample_file = os.path.join(data_dir, f"repurposing_samples_{date}.txt")
sample_df = pd.read_csv(
    sample_file,
    sep="\t",
    # Lines starting with "!" are skipped entirely.
    # NOTE(review): pandas also truncates any data row at a "!" that appears
    # mid-line (e.g. inside vendor_name) - confirm that no field contains "!".
    comment="!",
    encoding="ISO-8859-1",
)

print(sample_df.shape)
sample_df.head(2)

(13553, 12)


Unnamed: 0,broad_id,pert_iname,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
0,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,0,98.9,MedChemEx,HY-12723A,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,
1,BRD-K76022557-003-02-7,(R)-(-)-apomorphine,0,97.34,Tocris,2073,(R)-(-)-Apomorphine hydrochloride,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,


<IPython.core.display.Javascript object>

## Checking for `pert_iname` Discrepancies

In [6]:
# Assert that all pert_inames exist in both resources.
# The merge in the next section is an inner join on pert_iname, so any name that
# is present in only one resource would be dropped from the combined table.
drug_pert_inames = set(drug_df.pert_iname)
sample_pert_inames = set(sample_df.pert_iname)

drug_only_pert_inames = drug_pert_inames - sample_pert_inames
sample_only_pert_inames = sample_pert_inames - drug_pert_inames

# Report (a preview of) the offending names so a failure is actionable
assert not drug_only_pert_inames, (
    f"{len(drug_only_pert_inames)} pert_iname(s) found only in the drug file, e.g. "
    f"{sorted(drug_only_pert_inames, key=str)[:10]}"
)
assert not sample_only_pert_inames, (
    f"{len(sample_only_pert_inames)} pert_iname(s) found only in the sample file, e.g. "
    f"{sorted(sample_only_pert_inames, key=str)[:10]}"
)

<IPython.core.display.Javascript object>

## Merge the Samples and Drugs data

In [7]:
def inchikey14_from_identifier(identifier):
    """Return the first 14 characters of a compound's InChIKey.

    Parameters
    ----------
    identifier : str or missing value
        Entry of the `InChIKey` column. Values that start with "InChI" are first
        passed through `inchi.InchiToInchiKey`; all other strings are used as is.

    Returns
    -------
    str or missing value
        The 14-character prefix of the (converted) key. Missing values are
        returned unchanged instead of raising on `.startswith`.
    """
    if pd.isna(identifier):
        return identifier

    identifier = str(identifier)
    if identifier.startswith("InChI"):
        identifier = inchi.InchiToInchiKey(identifier)

    # str() is kept from the original code in case the conversion does not
    # return a string
    return str(identifier)[:14]


# Attach the drug-level annotations to every sample that shares its pert_iname
combined_df = drug_df.merge(sample_df, on="pert_iname", how="inner").reset_index(
    drop=True
)

# Move broad_id to first column
col_order = combined_df.columns.tolist()
col_order.insert(0, col_order.pop(col_order.index("broad_id")))
combined_df = combined_df.loc[:, col_order].assign(
    InChIKey14=combined_df.InChIKey.apply(inchikey14_from_identifier)
)

# Output to file
output_file = "repurposing_info"
combined_df.to_csv(f"{output_file}.tsv", sep="\t", index=False)

print(combined_df.shape)
combined_df.head()

(13553, 18)


Unnamed: 0,broad_id,pert_iname,clinical_phase,moa,target,disease_area,indication,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id,InChIKey14
0,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,98.9,MedChemEx,HY-12723A,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC
1,BRD-K76022557-003-02-7,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,97.34,Tocris,2073,(R)-(-)-Apomorphine hydrochloride,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC
2,BRD-K76022557-003-29-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,97.36,Tocris,2073,(R)-(-)-Apomorphine hydrochloride,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC
3,BRD-K76022557-001-03-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,95.8,Selleck,S4350,R-(-)-Apomorphine HCl Hemihydrate,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC
4,BRD-K75516118-001-04-1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,,0,93.92,Tocris,1349,(R)-(-)-Rolipram,275.152,COc1ccc(cc1OC1CCCC1)[C@@H]1CNC(=O)C1,HJORMJIFDVBMOB-LBPRGKRZSA-N,448055.0,,HJORMJIFDVBMOB


<IPython.core.display.Javascript object>

## Create a "Long" version where we split MOA and Target on their delimiters

Certain compounds have multiple MOA classes and targets that are delimited by pipes (`|`).
Each MOA class and target can be considered to have equal support (see https://github.com/broadinstitute/lincs-cell-painting/issues/5).

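Split the combined data on both MOA and target along each pipe and elongate the table.
Note that a sample whose compound has *m* MOA classes and *t* targets contributes *m* × *t* rows to the long table.
This is done to reduce the computational burden of multiple downstream analyses each performing the same splits.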

In [8]:
# Missing values break the split strategy used in the following cells, so give
# every missing MOA / target a placeholder string for now.
# The placeholder is turned back into a missing value once the long table is built.
for column in ("moa", "target"):
    combined_df[column] = combined_df[column].fillna("replace_with_na")

<IPython.core.display.Javascript object>

In [9]:
# Make sure the original index is preserved:
# this is the name of the column that stores each row's `combined_df` index in the
# split tables, and the key on which the splits are merged back onto `combined_df`.
# `output_file` is the output name stem set in the merge cell above.
split_col_index = f"{output_file}_index"

<IPython.core.display.Javascript object>

In [10]:
# One list of MOA classes per compound row, e.g. "a|b" -> ["a", "b"]
moa_lists = combined_df.moa.str.split("|").tolist()

# Wide layout: one column per MOA position, rows aligned to combined_df's index
moa_wide_df = pd.DataFrame(moa_lists, index=combined_df.index)

# Long layout: one row per (original index, MOA position) pair; stack() drops
# the empty padding cells of rows that have fewer MOA classes than the widest row
moa_split_df = moa_wide_df.stack().reset_index()
moa_split_df.columns = [split_col_index, "_", "moa_unique"]

print(moa_split_df.shape)
moa_split_df.head()

(14900, 3)


Unnamed: 0,repurposing_info_index,_,moa_unique
0,0,0,dopamine receptor agonist
1,1,0,dopamine receptor agonist
2,2,0,dopamine receptor agonist
3,3,0,dopamine receptor agonist
4,4,0,phosphodiesterase inhibitor


<IPython.core.display.Javascript object>

In [11]:
# One list of targets per compound row, e.g. "A|B" -> ["A", "B"]
target_lists = combined_df.target.str.split("|").tolist()

# Wide layout: one column per target position, rows aligned to combined_df's index
target_wide_df = pd.DataFrame(target_lists, index=combined_df.index)

# Long layout: one row per (original index, target position) pair; stack() drops
# the empty padding cells of rows that have fewer targets than the widest row
target_split_df = target_wide_df.stack().reset_index()
target_split_df.columns = [split_col_index, "_", "target_unique"]

print(target_split_df.shape)
target_split_df.head()

(32226, 3)


Unnamed: 0,repurposing_info_index,_,target_unique
0,0,0,ADRA2A
1,0,1,ADRA2B
2,0,2,ADRA2C
3,0,3,CALY
4,0,4,DRD1


<IPython.core.display.Javascript object>

In [12]:
# Keep only the merge key and the split value from each split table
moa_lookup = moa_split_df.loc[:, [split_col_index, "moa_unique"]]
target_lookup = target_split_df.loc[:, [split_col_index, "target_unique"]]

# First fan out every sample row over its MOA classes (matching the row index
# against the stored original index) ...
long_combined_df = combined_df.merge(
    moa_lookup, left_index=True, right_on=split_col_index, how="left"
)
# ... then over its targets, giving one row per (sample, MOA, target) combination
long_combined_df = long_combined_df.merge(
    target_lookup, on=split_col_index, how="left"
).reset_index(drop=True)

# Put back missing values
for column in ["moa", "moa_unique", "target", "target_unique"]:
    is_placeholder = long_combined_df[column] == "replace_with_na"
    long_combined_df.loc[is_placeholder, column] = np.nan

# Output to file
long_combined_df.to_csv("repurposing_info_long.tsv", sep="\t", index=False)

print(long_combined_df.shape)
long_combined_df.head()

(39471, 21)


Unnamed: 0,broad_id,pert_iname,clinical_phase,moa,target,disease_area,indication,qc_incompatible,purity,vendor,...,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id,InChIKey14,repurposing_info_index,moa_unique,target_unique
0,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,98.9,MedChemEx,...,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC,0,dopamine receptor agonist,ADRA2A
1,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,98.9,MedChemEx,...,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC,0,dopamine receptor agonist,ADRA2B
2,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,98.9,MedChemEx,...,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC,0,dopamine receptor agonist,ADRA2C
3,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,98.9,MedChemEx,...,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC,0,dopamine receptor agonist,CALY
4,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,98.9,MedChemEx,...,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC,0,dopamine receptor agonist,DRD1


<IPython.core.display.Javascript object>