In [1]:
import pandas as pd

In [2]:
# Load dataset

from tdc.multi_pred import TrialOutcome
data = TrialOutcome(name = 'phase1') # 'phase2' / 'phase3'
split = data.get_split()

Downloading...
100%|██████████| 7.75M/7.75M [00:00<00:00, 23.8MiB/s]
Loading...
Done!


In [3]:
template = """### Instructions: Answer the following question about clinical trials.
### Context: Clinical trial is the most time and cost-consuming step in the drug discovery process. Phase 1 clinical trials test the safety and basic properties of a new 
drug or treatment in a small group of people for the first time. Optimizing and designing trials with machine learning could drastically lead to the speedup of delivery 
of life-saving therapeutics to patients. Clinical trial outcome prediction is a machine learning task that aims to forecast the outcome of clinical trials, such as the 
approval rate of a drug or treatment. It utilizes various clinical trial features, including the drug’s molecular structure and patient disease.
### Question: Given a drug SMILES string and disease, predict if the phase 1 trial
(A) would not be approved (B) would be approved 
Drug SMILES: {}
Disease: {} 
### Answer: {}"""


In [4]:
split["train"].head()

Unnamed: 0,nctid,start_date,complete_date,drug_molecules,icdcodes,eligibility_criteria,Y
0,NCT00002790,March 1996,,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\\C=C\\C...,D89.810__D89.811__D89.813__D89.812__C95.91__C9...,DISEASE CHARACTERISTICS: See General Eligibili...,0
1,NCT00002863,June 1996,July 2000,[H][C@@]12N(C)C3=CC(OC)=C(C=C3[C@@]11CCN3CC=C[...,C96.A__C46.9__C96.22__C46.0__C46.2__C92.31__C9...,DISEASE CHARACTERISTICS: Biopsy-proven soft ti...,0
2,NCT00003005,December 1997,March 2001,[H][C@@]1(CO)C[C@@]([H])(O)[C@@]([H])(O1)N1C=N...,C95.91__C95.92__Z80.6__Z85.6__C90.11__C90.12__...,DISEASE CHARACTERISTICS: TdT positive acute ly...,0
3,NCT00003060,March 1995,"February 22, 2001",CS(=O)(=O)OCCCCOS(C)(=O)=O__ClCCN(CCCl)P1(=O)N...,C43.51__C43.9__C43.52__D03.51__C43.8__Z85.820_...,DISEASE CHARACTERISTICS: Biopsy proven relapse...,0
4,NCT00003194,July 1997,"December 19, 2002",N[C@@H](CCCNC(N)=N)C(O)=O__ClCCN(CCCl)P1(=O)NC...,H01.009__H02.209__H02.009__H02.109__H04.209__H...,DISEASE CHARACTERISTICS: - Histologically prov...,0


In [9]:
split["train"].loc[0,"icdcodes"]

'D89.810__D89.811__D89.813__D89.812__C95.91__C95.92__Z80.6__Z85.6__C90.11__C90.12__C91.01__D46.9__D46.C__D46.Z'

In [10]:
split["train"].loc[0,"nctid"]

'NCT00002790'

In [11]:
split["train"].loc[0,"eligibility_criteria"]

'DISEASE CHARACTERISTICS: See General Eligibility Criteria PATIENT CHARACTERISTICS: Age: 13 and over Performance status: Not specified Hematopoietic: Not specified Hepatic: Not specified Renal: Creatinine no greater than 2.0 mg/dL Cardiovascular: No cardiac disease No clinically significant cardiac abnormality No ischemia No recent injury on EKG Other: No intolerance or unresponsiveness to rapamycin No hypersensitivity to macrolide antibiotics, e.g., erythromycin, azithromycin, clarithromycin No requirement for medications that may significantly affect rapamycin metabolism, i.e.: Carbamazepine Ketoconazole Primidone Cimetidine Nicardipine Rifampin Diltiazem Phenobarbital Valproic acid Erythromycin Phenytoin Verapamil No uncontrolled systemic infection No pregnant or nursing women Negative pregnancy test required of fertile women Effective contraception required of fertile patients during and for 3 months after study Able to tolerate less than 400 mL of liquid oral intake PRIOR CONCURRE

In [5]:
# process dataset

def formatting_prompts_func(examples):
    peptide = examples["Peptide"]
    mhc = examples["MHC"]
    outputs       = examples["Y"]
    texts = []
    for p,m,o in zip(peptide, mhc, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = template.format(p,m, "(B)" if o else "(A)")
        texts.append(text)
    return { "text" : texts, "peptide":peptide, "mhc":mhc, "outputs":outputs, "ds_ident":"TrialOutcome_phase1"}

def formatting_prompts_func_test(examples):
    peptide = examples["Peptide"]
    mhc = examples["MHC"]
    outputs       = examples["Y"]
    texts = []
    for p,m,o in zip(peptide, mhc, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = template.format(p,m, "")
        texts.append(text)
    return { "text" : texts, "peptide":peptide, "mhc":mhc, "outputs":outputs, "ds_ident":"TrialOutcome_phase1"}

In [6]:
train_dataset = formatting_prompts_func(split["train"])
valid_dataset = formatting_prompts_func_test(split["valid"])
test_dataset = formatting_prompts_func_test(split["test"])

In [7]:
len(train_dataset["text"]), len(valid_dataset["text"]), len(test_dataset["text"])

(130190, 18598, 37197)

In [8]:
import pickle
import os


def dump_file(obj, path, file_name):
    os.makedirs(path, exist_ok=True)
    file_name = os.path.join(path,file_name)
    with open(file_name, "wb") as f0:
        pickle.dump(obj,f0)

In [9]:
dump_file(train_dataset, "processed", "train.pkl")
dump_file(valid_dataset, "processed", "valid.pkl")
dump_file(test_dataset, "processed", "test.pkl")