In [1]:
from model import *

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0


In [2]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

# Text sequences that should end generation as soon as the model emits them.
stop_list = ["\n\n", "\n\n\n", "Task:\nBelow"]
# stop_list = ['\nHuman:', '\n```\n']

# Tokenize each stop string without special tokens and move the ids onto the
# model's device so they can be compared directly against generated ids.
# NOTE(review): in the recorded output the newline-only sequences start with an
# extra leading id (29871). If that is a prefix token added by the tokenizer,
# these sequences may not match newlines generated mid-text — confirm the stops
# actually fire.
stop_token_ids = [
    torch.LongTensor(tokenizer(stop_text, add_special_tokens=False)['input_ids']).to(device)
    for stop_text in stop_list
]
print(stop_token_ids)

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stop generation when the sequence so far ends with a known stop sequence.

    The stop sequences are read from the module-level ``stop_token_ids`` list
    (1-D token-id tensors) on every call. Only the first sequence of the batch
    (``input_ids[0]``) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if ``input_ids[0]`` ends with any entry of ``stop_token_ids``.

        Args:
            input_ids: token ids generated so far; indexed as ``input_ids[0]``.
            scores: unused; accepted to satisfy the ``StoppingCriteria`` call signature.
            **kwargs: unused.
        """
        sequence = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # A stop sequence that is empty or longer than the text so far cannot
            # be a suffix. Without this guard the slice below has a different
            # length than `stop_ids`, and torch.eq either raises on the shape
            # mismatch or broadcasts a length-1 tail into a spurious match.
            if stop_len == 0 or stop_len > len(sequence):
                continue
            if torch.eq(sequence[-stop_len:], stop_ids).all():
                return True
        return False

# Wrap the custom criterion in the list container that is handed to the
# text-generation pipeline.
stopping_criteria = StoppingCriteriaList(
    [StopOnTokens()]
)

[tensor([29871,    13,    13], device='cuda:0'), tensor([29871,    13,    13,    13], device='cuda:0'), tensor([ 9330, 29901,    13, 21140,   340], device='cuda:0')]


In [3]:
from langchain.llms import HuggingFacePipeline

# Hugging Face text-generation pipeline around the already-loaded model/tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # langchain expects the full text (prompt + completion) back
    return_full_text=True,
    # --- generation parameters are passed here as well ---
    do_sample=True,
    temperature=0.1,  # low value keeps sampling close to deterministic
    repetition_penalty=1.1,  # without this output begins repeating
    max_new_tokens=256,  # upper bound on the number of generated tokens
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    # streamer = transformers.TextStreamer(tokenizer)
)

# Expose the pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=generate_text)

In [4]:
%load_ext autoreload
%autoreload 2

from prompt_examples.chunked_examples import prompt, examples


In [None]:
# Preview the fully rendered prompt for one sample block of eligibility criteria.
# The comparison signs in the pasted sample had been mis-decoded ("≥" and "≤"
# showed up as three-character garbage); they are restored here so the preview
# reads as intended.
print(prompt.format(criteria="""Inclusion Criteria:
    •	Adults with a confirmed diagnosis of unresectable, locally advanced and/or metastatic Stage IIIB/IV NSCLC, Stage III/IV PDAC and/or Stage III/IV CRC with no curative-intent treatment options and documented activating KRAS mutation (without known additional actionable driver mutations such as EGFR, ALK or ROS1)
    •	Documented progression and measurable disease after ≥ 1 prior line of systemic therapy (≥ 2 and ≤ 4 prior lines for NSCLC) with adequate washout period and resolution of treatment-related toxicities to ≤ Grade 2
    •	ECOG PS of 0-2 (0-1 for PDAC) and a life expectancy > 3 months in the opinion of the Investigator
    •	Adequate hematological, liver, and renal function
    •	Men and women of childbearing potential must use adequate birth control measures for the duration of the trial and at least 90 days after discontinuing study treatment
    •	Symptomatic and/or untreated CNS or brain metastasis, pre-existing ILD or pericardial/pleural effusion of ≥ grade 2 or requiring chronic oxygen therapy for COPD or pleural effusions
    •	Serious concomitant disorder including infection
    •	Known positive test for HIV, HCV, HBV surface antigen

    Exclusion Criteria:
    •	Concurrent malignancy in the previous 2 years
    •	Prior menin inhibitor therapy
    •	Requiring treatment with a strong or moderate CYP3A inhibitor/inducer
    •	Significant cardiovascular disease or QTcF or QTcB prolongation.
    •	Major surgery within 4 weeks prior to first dose
    •	Women who are pregnant or lactating.
"""))

In [8]:
from token_counting import *
# Compute token statistics over the few-shot examples, then report them.
# NOTE(review): the *_LEN names printed below are not assigned in this notebook;
# presumably globalize_token_metrics publishes them — confirm in token_counting.
globalize_token_metrics(examples)
print(f'  avg prompt: {AVG_PROMPT_LEN}')
print(f'  min prompt: {MIN_PROMPT_LEN}')
print(f'  max prompt: {MAX_PROMPT_LEN}')
print(f'avg response: {AVG_RES_LEN}')

  avg prompt: 794
  min prompt: 702
  max prompt: 947
avg response: 220


In [9]:
from chunking import *

In [10]:
from langchain.chains import LLMChain

# Chain pairing the prompt template with the pipeline-backed LLM; it is invoked
# later with a 'criteria' input.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
import langchain

# Turn on LangChain's debug/verbose logging for the runs below (the prompts it
# logs are copied out in a later cell).
langchain.debug = True
langchain.verbose = True

# Trial to process; all inputs and outputs live under this folder.
n = '06'
folder = f'test_results/trial{n}'

original_doc = parse_file(f'{folder}/unstructured_ec.txt')
doc_chunks = chunk_ec(original_doc)
# Accumulates the inclusion/exclusion items of every chunk, in processing order.
doc_reassembled = ECDoc([], [])

for idx, doc in enumerate(doc_chunks):
    # Run the chain on this chunk and store the raw model text, one file per chunk.
    results = llm_chain.invoke(input={'criteria': str(doc)})
    with open(f'{folder}/chunking_task_{idx}_output.txt', 'w', encoding='utf-8') as fileout:
        fileout.write(results['text'])

    doc_reassembled.inc.extend(doc.inc)
    doc_reassembled.exc.extend(doc.exc)

# Write the reassembled document with an explicit UTF-8 encoding, matching the
# per-chunk outputs above; the criteria text contains non-ASCII characters, so
# relying on the platform default encoding is not portable.
with open(f'{folder}/unstructured_ec_reassembled.txt', 'w', encoding='utf-8') as docfile:
    docfile.write(str(doc_reassembled).rstrip())

In [23]:
# Prompts copied verbatim from the LangChain debug log; entry i is written out as the prompt for chunk i.
prompts_used = [
    "You are in the role of an abstractor who will analyze eligibility criteria for a clinical trial and represent the information as a list of individual criteria in a tabular format that will contain the following columns: \nType: listing whether criterion is an Exclusion or Inclusion criterion\nOriginal Text: the original text of the criterion\nDisease/Condition: If the criterion contains a disease or condition name it by its canonical name\nProcedure: If the criterion contains a therapeutic procedure name it by its canonical name\nDrug:  If the criterion contains a therapeutic drug name it by its canonical name\nBiomarker:  If the criterion contains a biomarker name it by its canonical name\nComputable Rule: Translate the criteria into a logical expression that could be interpreted programmatically\n    Inclusion Criteria\n    •\tParticipants must be ‚â• 18 years of age\n    •\tHistologically or cytologically confirmed diagnosis of metastatic solid tumors\n    •\tEastern Cooperative Oncology Group (ECOG) performance status 0-1\n    •\tAll participants should have at least 1 measurable disease per RECIST v1.1. 
An irradiated lesion can be considered measurable only if progression has been demonstrated on the irradiated lesion.\n    •\tBody weight within [45 - 150 kg] (inclusive)\n    •\tAll Contraceptive use by men and women should be consistent with local regulations regarding the methods of contraception for those participating in clinical studies.\n    •\tCapable of giving signed informed consent\n    •\tAny clinically significant cardiac disease\n    •\tHistory of or current interstitial lung disease or pneumonitis\n\n    Exclusion Criteria\n    •\tUncontrolled or unresolved acute renal failure\n    •\tPrior solid organ or hematologic transplant.\n    •\tKnown positivity with human immunodeficiency virus (HIV), known active hepatitis A, B, and C, or uncontrolled chronic or ongoing infectious requiring parenteral treatment.\n    •\tReceipt of a live-virus vaccination within 28 days of planned treatment start\n    •\tParticipation in a concurrent clinical study in the treatment period.\n    •\tInadequate hematologic, hepatic and renal function\n    •\tParticipant not suitable for participation, whatever the reason, as judged by the Investigator, including medical or clinical conditions.\n\n| Type | Original Text | Disease/Condition | Procedure | Drug | Biomarker | Computable Rule |\n| --- | --- | --- | --- | --- | --- | --- |\n| Inclusion | Histologically or cytologically confirmed diagnosis of metastatic solid tumors | Metastatic solid tumor | | | | diagnosis == \"Metastatic solid tumor\" |\n| Exclusion | Prior solid organ or hematologic transplant. 
| | Solid organ transplantation | | | Solid organ transplantation is True |\n\n\nYou are in the role of an abstractor who will analyze eligibility criteria for a clinical trial and represent the information as a list of individual criteria in a tabular format that will contain the following columns: \nType: listing whether criterion is an Exclusion or Inclusion criterion\nOriginal Text: the original text of the criterion\nDisease/Condition: If the criterion contains a disease or condition name it by its canonical name\nProcedure: If the criterion contains a therapeutic procedure name it by its canonical name\nDrug:  If the criterion contains a therapeutic drug name it by its canonical name\nBiomarker:  If the criterion contains a biomarker name it by its canonical name\nComputable Rule: Translate the criteria into a logical expression that could be interpreted programmatically\n    Inclusion Criteria\n•\tProvision of signed and dated informed consent form\n•\tStated willingness to comply with all study procedures and availability for the duration of the study\n•\tMale patients aged 18 years and older\n•\tIn good general health as evidenced by medical history to be a candidate for curative-intent prostate cancer treatment \n•\tAbility to receive pelvic radiotherapy and be willing to adhere to the SUPR-SABR regimen \n•\tPreviously untreated prostate cancer (with cytotoxic chemotherapy, cryotherapy, surgical or radiation therapy)\n•\tLocalized adenocarcinoma of the prostate with the following features\n\tcT1-T2c\n\tProstate specific antigen (PSA) < 20\n\t** Patients receiving a 5-alpha reductase inhibitor must have a PSA < 10\n\tGrade group 1-3\n\tLymph node negative\n\tNegative for distant metastases\n•\tProstate volume < 120 cc \n•\tHistory and physical including a digital rectal exam 90 days prior to registration \n•\tEastern Cooperative Oncology Group (ECOG) performance status 0-2\n•\tBe able to undergo MRI prostate and pelvis as a component of radiation 
therapy (RT) planning\n•\tBone and soft tissue imaging as clinically indicated (for unfavorable intermediate risk or symptomatic patients only) within 120 days prior to registration\n•\tInternational Prostate Symptom Score (IPSS) score =< 20 at time of initial history and physical with treating radiation oncologist\n\nExclusion Criteria\n\tConcurrent use of testosterone supplementation as it is contraindicated during prostate cancer treatment\n\tKnown homozygous for ATM pathogenic mutation\n\tPrior pelvic RT \n\tPre-existing conditions or overall health status which disqualifies the patient from curative- intent RT. Patients with life expectancy less than 10 years are not eligible\n\tPrior or concurrent invasive pelvic malignancy (except non-melanomatous skin cancer) or lymphomatous or hematogenous malignancy, unless disease free for a minimum of 5 years\n\tPrior prostatectomy, cryotherapy, high-intensity focused ultrasound directed towards the prostate for any prostate disease or condition",
]

# For each logged prompt, save a small report next to the model output:
# a [metrics] section with the token count, then a [prompt] section with the text.
for idx, prompt_used in enumerate(prompts_used):
    prompt_len = get_token_len(prompt_used)
    with open(f'{folder}/chunking_task_{idx}_prompt.txt', 'w', encoding='utf-8') as fileout:
        fileout.write(f"[metrics]\nlength={prompt_len}\n\n[prompt]\n{prompt_used}")