In [1]:
from model import *

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0


In [2]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

stop_list = ["\n\n", "\n\n\n", "Task:\nBelow"]
# stop_list = ['\nHuman:', '\n```\n']
stop_token_ids = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
print(stop_token_ids)

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

[tensor([29871,    13,    13], device='cuda:0'), tensor([29871,    13,    13,    13], device='cuda:0'), tensor([ 9330, 29901,    13, 21140,   340], device='cuda:0')]


In [3]:
from langchain.llms import HuggingFacePipeline

generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=256,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    do_sample=True,
    # streamer = transformers.TextStreamer(tokenizer)
)

llm = HuggingFacePipeline(pipeline=generate_text)

In [24]:
%load_ext autoreload
%autoreload 2

from prompt_examples.chunked_examples import prompt, prompt_zero, examples


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
print(prompt_zero.format(criteria="""Inclusion Criteria:
    •	Adults with a confirmed diagnosis of unresectable, locally advanced and/or metastatic Stage IIIB/IV NSCLC, Stage III/IV PDAC and/or Stage III/IV CRC with no curative-intent treatment options and documented activating KRAS mutation (without known additional actionable driver mutations such as EGFR, ALK or ROS1)
    •	Documented progression and measurable disease after ‚â• 1 prior line of systemic therapy (‚â• 2 and ‚â§ 4 prior lines for NSCLC) with adequate washout period and resolution of treatment-related toxicities to ‚â§ Grade 2
    •	ECOG PS of 0-2 (0-1 for PDAC) and a life expectancy > 3 months in the opinion of the Investigator
    •	Adequate hematological, liver, and renal function
    •	Men and women of childbearing potential must use adequate birth control measures for the duration of the trial and at least 90 days after discontinuing study treatment
    •	Symptomatic and/or untreated CNS or brain metastasis, pre-existing ILD or pericardial/pleural effusion of ‚â• grade 2 or requiring chronic oxygen therapy for COPD or pleural effusions
    •	Serious concomitant disorder including infection
    •	Known positive test for HIV, HCV, HBV surface antigen

    Exclusion Criteria:
    •	Concurrent malignancy in the previous 2 years
    •	Prior menin inhibitor therapy
    •	Requiring treatment with a strong or moderate CYP3A inhibitor/inducer
    •	Significant cardiovascular disease or QTcF or QTcB prolongation.
    •	Major surgery within 4 weeks prior to first dose
    •	Women who are pregnant or lactating.
"""))

You are in the role of an abstractor who will analyze eligibility criteria for a clinical trial and represent the information as a list of individual criteria in a tabular format that will contain the following columns: 
Type: listing whether criterion is an Exclusion or Inclusion criterion
Original Text: the original text of the criterion
Disease/Condition: If the criterion contains a disease or condition name it by its canonical name
Procedure: If the criterion contains a therapeutic procedure name it by its canonical name
Drug:  If the criterion contains a therapeutic drug name it by its canonical name
Biomarker:  If the criterion contains a biomarker name it by its canonical name
Computable Rule: Translate the criteria into a logical expression that could be interpreted programmatically
    Inclusion Criteria:
    •	Adults with a confirmed diagnosis of unresectable, locally advanced and/or metastatic Stage IIIB/IV NSCLC, Stage III/IV PDAC and/or Stage III/IV CRC with no curative-in

In [8]:
from token_counting import *
globalize_token_metrics(examples)
print('  avg prompt:', AVG_PROMPT_LEN)
print('  min prompt:', MIN_PROMPT_LEN)
print('  max prompt:', MAX_PROMPT_LEN)
print('avg response:', AVG_RES_LEN)

  avg prompt: 794
  min prompt: 702
  max prompt: 947
avg response: 220


In [9]:
from chunking import *

In [26]:
from langchain.chains import LLMChain

llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain_zero = LLMChain(llm=llm, prompt=prompt_zero)

In [27]:
import langchain
langchain.debug = True
langchain.verbose = True

n = '01'
folder = f'test_results/trial{n}'

original_doc = parse_file(f'{folder}/unstructured_ec.txt')
doc_chunks = chunk_ec(original_doc)
doc_reassembled = ECDoc([], [])

for idx, doc in enumerate(doc_chunks):
    results = llm_chain_zero.invoke(input={'criteria': str(doc)})
    with open(f'{folder}/chunking_task_zeroshot_{idx}_output.txt', 'w', encoding='utf-8') as fileout:
        fileout.write(results['text'])

    # doc_reassembled.inc.extend(doc.inc)
    # doc_reassembled.exc.extend(doc.exc)

# with open(f'{folder}/unstructured_ec_reassembled.txt', 'w') as docfile:
#         docfile.write(str(doc_reassembled).rstrip())

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[1:chain:LLMChain] Entering Chain run with input:
[0m{
  "criteria": "Inclusion Criteria\n•\tCOHORT A: At least one measurable CNS metastasis, defined as >= 10 mm in at least one dimension\n•\tCOHORT A: Unequivocal evidence of new and/or progressive brain metastases, and at least one of the following scenarios:\n•\tTreated with stereotactic radiosurgery (SRS) or surgery with residual un-treated lesions remaining. Such participants are eligible for immediate enrollment on this study providing that at least one untreated lesion is measurable\n•\tParticipants who have had prior whole brain radiotherapy (WBRT) and/or SRS and then whose lesions have subsequently progressed or who have new lesions are also eligible. In this case, lesions which have been treated with SRS may be considered as target lesions if there is unequivocal evidence, in the opinion of the treating physician, of progression following SRS\n•\tParticipants who have not previously been tr

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[1:chain:LLMChain > 2:llm:HuggingFacePipeline] [21.31s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Inclusion Criteria\n•\tAge ≥ 18 years\n•\tEastern Cooperative Oncology Group performance status ≤2\n•\tLife expectancy >3 months\n•\tAdequate bone marrow function (absolute neutrophil count ≥1500/mm3; platelets ≥100,000/mm3)\n•\tAdequate hepatic function (total bilirubin <1.5 x ULN; AST/ALT <2.5 x ULN)\n•\tAdequate renal function (creatinine clearance >60 ml/min)\n•\tAble to swallow tablets\n•\tAble to understand and willing to sign a written informed consent document\n\nExclusion Criteria\n•\tPregnant or breastfeeding women\n•\tUncontrolled hypertension\n•\tActive central nervous system (CNS) metastases (i.e., patients with active brain metastases should be treated with steroids and anticonvulsants to optimize CNS management before entering this trial)\n•\tHistory of myocard",
        "generation_info": null,
      

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[1:chain:LLMChain > 2:llm:HuggingFacePipeline] [20.96s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "### Dose Limiting Toxicity\nThe maximum tolerated dose (MTD) is defined as the highest dose level at which no more than one of six patients experience a DLT during cycle 1. The MTD is determined using the Bayesian logistic regression model described below.\n\n### Dose Escalation\nA starting dose of 10mg twice daily is planned. This dose is selected based on the results from the phase I study of GDC-0084 in combination with paclitaxel in patients with advanced solid tumors. The starting dose of 10mg twice daily is also supported by preclinical data demonstrating that this dose is well tolerated in cynomolgus monkeys.\n\n### Dose Reduction\nDose reduction may occur due to adverse events or abnormalities in laboratory values. Dose reductions are made according to the following guidelines:\n\n### Dose Modifications\nDose

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[1:chain:LLMChain > 2:llm:HuggingFacePipeline] [21.00s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Inclusion Criteria\n•\tThe participant is >= 18 years old\n•\tAbsolute neutrophil count >= 1,000/ul\n•\tPlatelets >= 75,000/ul\n•\tHemoglobin >= 9 g/dL\n•\tTotal bilirubin =< 1.5 mg/dL (upper limit of normal) except subject with documented Gilbert's syndrome (=< 5 x upper limit of normal [ULN]) or liver metastasis, who must have a baseline total bilirubin =< 3.0 mg/dL\n•\tAspartate aminotransferase (AST) (serum glutamic oxaloacetic transaminase [SGOT])/alanine aminotransferase (ALT) (serum glutamate pyruvate transaminase [SGPT]) =< 2.5 x institutional ULN OR =< 5.0 x institutional ULN for patients with documented liver metastases\n\nExclusion Criteria\n•\tThe subject has an un",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/e

In [28]:
# Copy the prompts that Langchain logs
prompts_used = [
    "You are in the role of an abstractor who will analyze eligibility criteria for a clinical trial and represent the information as a list of individual criteria in a tabular format that will contain the following columns: \nType: listing whether criterion is an Exclusion or Inclusion criterion\nOriginal Text: the original text of the criterion\nDisease/Condition: If the criterion contains a disease or condition name it by its canonical name\nProcedure: If the criterion contains a therapeutic procedure name it by its canonical name\nDrug:  If the criterion contains a therapeutic drug name it by its canonical name\nBiomarker:  If the criterion contains a biomarker name it by its canonical name\nComputable Rule: Translate the criteria into a logical expression that could be interpreted programmatically\n    Inclusion Criteria\n•\tCOHORT A: At least one measurable CNS metastasis, defined as >= 10 mm in at least one dimension\n•\tCOHORT A: Unequivocal evidence of new and/or progressive brain metastases, and at least one of the following scenarios:\n•\tTreated with stereotactic radiosurgery (SRS) or surgery with residual un-treated lesions remaining. Such participants are eligible for immediate enrollment on this study providing that at least one untreated lesion is measurable\n•\tParticipants who have had prior whole brain radiotherapy (WBRT) and/or SRS and then whose lesions have subsequently progressed or who have new lesions are also eligible. In this case, lesions which have been treated with SRS may be considered as target lesions if there is unequivocal evidence, in the opinion of the treating physician, of progression following SRS\n•\tParticipants who have not previously been treated with cranial radiation (e.g., WBRT or SRS) are eligible to enter the study, but such participants must be asymptomatic from their CNS metastases and not requiring corticosteroids for symptom control\n•\tParticipants who present with systemic stable/absent or progressive disease are eligible to this trial, as long as they fulfill one of the above criteria\n\nExclusion Criteria\n•\tVisceral crisis or impending visceral crisis at time of screening\n•\tCNS complications for whom urgent neurosurgical intervention is indicated (e.g., resection, shunt placement)\n•\tKnown leptomeningeal metastases (defined as positive CSF cytology and/or unequivocal radiological evidence of clinically significant leptomeningeal involvement. CSF sampling is not required in the absence of suggestive symptoms to exclude leptomeningeal involvement)\n•\tPatients with known contraindication to magnetic resonance imaging (MRI) (e.g., due to pacemaker, ferromagnetic implants, claustrophobia, extreme obesity, hypersensitivity, etc.). However, head computed tomography (CT) with contrast may be used in place of MRI at baseline and throughout the trial if MRI is contraindicated and a participant’s brain metastases are clearly measurable by head CT",
    "You are in the role of an abstractor who will analyze eligibility criteria for a clinical trial and represent the information as a list of individual criteria in a tabular format that will contain the following columns: \nType: listing whether criterion is an Exclusion or Inclusion criterion\nOriginal Text: the original text of the criterion\nDisease/Condition: If the criterion contains a disease or condition name it by its canonical name\nProcedure: If the criterion contains a therapeutic procedure name it by its canonical name\nDrug:  If the criterion contains a therapeutic drug name it by its canonical name\nBiomarker:  If the criterion contains a biomarker name it by its canonical name\nComputable Rule: Translate the criteria into a logical expression that could be interpreted programmatically\n    Inclusion Criteria\n•\tCOHORT B: New and/or progressive brain metastasis(es) with clinical indication for resection\n•\tPathologically confirmed HER2-positive MBC by local laboratory with the following requirements: HER2 overexpressed or amplified (immunohistochemistry of 3+ or HER2 gene amplification by in situ hybridization with a ratio of HER2-gene signals to centromere 17 signals >= 2.0 or average HER2 copy number >= 6.0 signals/cells)\n•\tEastern Cooperative Oncology Group (ECOG) performance status of =< 2\n•\tLeft ventricular ejection fraction (LVEF) >= 50% by echocardiogram (ECHO) or multigated acquisition (MUGA) scan\n•\tStable or decreasing corticosteroid dose for at least 7 days prior to initiation of treatment\n•\tConcurrent administration of other anti-cancer therapy during the course of this study is not allowed. Note that concurrent use of supportive care medications (e.g. anti-resorptive agents, pain medications) is allowed\n\nExclusion Criteria\n•\tChemotherapy or targeted therapy within 14 days prior to initiation of protocol therapy. No washout is required for trastuzumab\n•\tHas received prior therapy with a PI3K or mTOR inhibitor\n•\tNo washout is required for endocrine therapy. If a patient has been on ovarian suppression for at least 28 days prior to initiation of study treatment, continuation of ovarian suppression is permitted on protocol. Starting a new endocrine therapy during protocol therapy is not permitted\n•\tCurrent use or history of receiving a non-approved, investigational treatment within 14 days prior to initiation of protocol therapy\n•\tSubjects with a history of hypersensitivity to compounds of similar biologic composition to paxalisib (GDC-0084) or any constituent of the product",
    "You are in the role of an abstractor who will analyze eligibility criteria for a clinical trial and represent the information as a list of individual criteria in a tabular format that will contain the following columns: \nType: listing whether criterion is an Exclusion or Inclusion criterion\nOriginal Text: the original text of the criterion\nDisease/Condition: If the criterion contains a disease or condition name it by its canonical name\nProcedure: If the criterion contains a therapeutic procedure name it by its canonical name\nDrug:  If the criterion contains a therapeutic drug name it by its canonical name\nBiomarker:  If the criterion contains a biomarker name it by its canonical name\nComputable Rule: Translate the criteria into a logical expression that could be interpreted programmatically\n    Inclusion Criteria\n•\tThe participant is >= 18 years old\n•\tAbsolute neutrophil count >= 1,000/ul\n•\tPlatelets >= 75,000/ul\n•\tHemoglobin >= 9 g/dL\n•\tTotal bilirubin =< 1.5 mg/dL (upper limit of normal) except subject with documented Gilbert's syndrome (=< 5 x upper limit of normal [ULN]) or liver metastasis, who must have a baseline total bilirubin =< 3.0 mg/dL\n•\tAspartate aminotransferase (AST) (serum glutamic oxaloacetic transaminase [SGOT])/alanine aminotransferase (ALT) (serum glutamate pyruvate transaminase [SGPT]) =< 2.5 x institutional ULN OR =< 5.0 x institutional ULN for patients with documented liver metastases\n\nExclusion Criteria\n•\tThe subject has an uncontrolled intercurrent illness, including, but not limited to, ongoing or active infection, uncontrolled hypertension, unstable angina pectoris, uncontrolled cardiac arrhythmia, congestive heart failure-New York Heart Association class III or IV, active ischemic heart disease, myocardial infarction within the previous six months, uncontrolled diabetes mellitus (DM), gastric or duodenal ulceration diagnosed within the previous 6 months, chronic liver or renal disease, or severe malnutrition. If a participant has controlled DM but is unable to monitor blood sugars at home, they will be excluded from the trial\n•\tThe subject is pregnant or breast-feeding\n•\tNo active, second potentially life-threatening cancer\n•\tHas had major surgery within 21 days before initiation of protocol therapy\n•\tActive infection requiring IV antibiotics at the time of protocol therapy initiation",
    "You are in the role of an abstractor who will analyze eligibility criteria for a clinical trial and represent the information as a list of individual criteria in a tabular format that will contain the following columns: \nType: listing whether criterion is an Exclusion or Inclusion criterion\nOriginal Text: the original text of the criterion\nDisease/Condition: If the criterion contains a disease or condition name it by its canonical name\nProcedure: If the criterion contains a therapeutic procedure name it by its canonical name\nDrug:  If the criterion contains a therapeutic drug name it by its canonical name\nBiomarker:  If the criterion contains a biomarker name it by its canonical name\nComputable Rule: Translate the criteria into a logical expression that could be interpreted programmatically\n    Inclusion Criteria\n•\tFasting glucose =< 140 mg/dL and glycosylated hemoglobin measurement (HbA1c) =< 7%\n•\tSerum creatinine =< 1.5 mg/dL (or glomerular filtration rate >= 30 ml/min as determined by the Cockcroft-Gault equation)\n•\tFemale subjects of childbearing potential must have a negative serum or urine pregnancy test within 8 days of initiating protocol therapy | \n•\tThe effects of paxalisib (GDC-0084) on the developing human fetus are unknown and radiotherapy has known teratogenic effects so women of child-bearing potential and men must agree to use adequate contraception (barrier method of birth control; abstinence) prior to study entry and for the duration of study participation and 7 months after completion of trastuzumab administration per recommendations from the trastuzumab package insert\n•\tThe subject is capable of understanding and complying with the protocol and has signed the informed consent document\n•\tParticipant must be able to swallow and retain oral medication\n\nExclusion Criteria\n•\tSymptomatic intrinsic lung disease or extensive tumor involvement of the lungs, resulting in dyspnea at rest\n•\tKnown intolerance to trastuzumab that persists after appropriate medical management. Patients who have a history of prior intolerance to trastuzumab that is controlled after medical management and who tolerate trastuzumab thereafter without reactions are eligible to participate\n•\tCorrected QT (QTc) interval time of >= 470 msec\n•\tNote: The correction may be made using any method of QTc calculation\n•\tParticipants receiving any medications or substances that are strong inhibitors or strong inducers of CYP3A4 are ineligible. Should a participant be taking one of these agents and is able to discontinue the therapy or switch to a different agent, no washout will be required prior to starting study medication. Corticosteroids, which are weak CYP3A4 inducers are allowed. Because the lists of these agents are constantly changing, it is important to regularly consult a frequently-updated list; medical reference texts such as the Physiciansâ€™ Desk Reference may also provide this information. As part of the enrollment/informed consent procedures, the patient will be counseled on the risk of interactions with other agents, and what to do if new medications need to be prescribed or if the patient is considering a new over-the-counter medicine or herbal product"
]

for idx, prompt_used in enumerate(prompts_used):
    prompt_len = get_token_len(prompt_used)
    with open(f'{folder}/chunking_task_zeroshot_{idx}_prompt.txt', 'w', encoding='utf-8') as fileout:
        fileout.write("""[metrics]
length={prompt_len}

[prompt]
{prompt}""".format(prompt=prompt_used, prompt_len=prompt_len))