In [None]:
!pip install -r requirements.txt

In [3]:
from torch import cuda, bfloat16
import transformers

model_id = "epfl-llm/meditron-7b"

# Device used by this notebook for tensors it builds by hand (and for the log
# line at the end of the cell). Model placement itself is delegated to
# `device_map='auto'` below, so the two are chosen independently.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Quantization settings so the large model loads with less GPU memory:
# 4-bit NF4 weights, double quantization, bfloat16 compute.
# Requires the `bitsandbytes` library.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
)

# (add `token=...` to the `from_pretrained` calls below if the checkpoint
# requires authentication)
model_config = transformers.AutoConfig.from_pretrained(model_id)

# NOTE(review): `trust_remote_code=True` allows code shipped with the
# checkpoint to run locally - confirm it is actually needed for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Put the model in evaluation mode before running inference.
model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0


In [4]:
def get_token_len(text: str):
    """Return the number of token ids the notebook-level `tokenizer` yields for `text`.

    The count is whatever `tokenizer.encode` returns with its default
    arguments, so any tokens that call adds by default are included.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [5]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

# Strings that should end generation once they show up at the tail of the output.
stop_list = ["\n\n", "\n\n\n", "Task:\nBelow"]

# Encode every stop string (no special tokens) and keep the ids as a
# LongTensor on `device`, ready to be compared against generated ids.
# NOTE(review): the printed ids for the newline-only strings start with 29871;
# presumably that is a leading-space piece added by the tokenizer, which may
# not occur in generated text - confirm these sequences really match.
stop_token_ids = [
    torch.LongTensor(tokenizer(stop_text, add_special_tokens=False)['input_ids']).to(device)
    for stop_text in stop_list
]
print(stop_token_ids)

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the output ends with a stop sequence.

    The sequences come from the notebook-level `stop_token_ids` list. Only the
    first sequence of the batch (`input_ids[0]`) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if `input_ids[0]` ends with any entry of `stop_token_ids`.

        Args:
            input_ids: token ids produced so far; only row 0 is checked.
            scores: unused, accepted for interface compatibility.
            **kwargs: unused, accepted for interface compatibility.
        """
        sequence = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # A sequence shorter than the stop sequence cannot end with it, and
            # comparing tensors of different lengths would raise; an empty stop
            # sequence is skipped for the same reason.
            if stop_len == 0 or sequence.shape[0] < stop_len:
                continue
            tail = sequence[-stop_len:]
            # Compare on the device the generated ids live on, which is not
            # necessarily the device the stop tensors were created on.
            if torch.eq(tail, stop_ids.to(tail.device)).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

[tensor([29871,    13,    13], device='cuda:0'), tensor([29871,    13,    13,    13], device='cuda:0'), tensor([ 9330, 29901,    13, 21140,   340], device='cuda:0')]


In [6]:
from langchain.llms import HuggingFacePipeline

# Text-generation pipeline built around the model and tokenizer loaded above.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    # generation parameters are passed through the pipeline as well
    do_sample=True,
    temperature=0.1,  # 'randomness' of sampled outputs; lower is less random
    repetition_penalty=1.1,  # without this output begins repeating
    max_new_tokens=256,  # max number of tokens to generate in the output
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    streamer=transformers.TextStreamer(tokenizer),
)
# Expose the pipeline to langchain as an LLM object.
llm = HuggingFacePipeline(pipeline=generate_text)

In [7]:
from prompt_examples.summarization_examples import prompt

In [8]:
# Render the prompt template with a sample criteria document and print the
# result, so the exact text that would be sent to the model can be inspected.
print(prompt.format(criteria="""
Inclusion Criteria
•	COHORT A: At least one measurable CNS metastasis, defined as >= 10 mm in at least one dimension
•	COHORT A: Unequivocal evidence of new and/or progressive brain metastases, and at least one of the following scenarios:
•	Treated with stereotactic radiosurgery (SRS) or surgery with residual un-treated lesions remaining. Such participants are eligible for immediate enrollment on this study providing that at least one untreated lesion is measurable
•	Participants who have had prior whole brain radiotherapy (WBRT) and/or SRS and then whose lesions have subsequently progressed or who have new lesions are also eligible. In this case, lesions which have been treated with SRS may be considered as target lesions if there is unequivocal evidence, in the opinion of the treating physician, of progression following SRS
•	Participants who have not previously been treated with cranial radiation (e.g., WBRT or SRS) are eligible to enter the study, but such participants must be asymptomatic from their CNS metastases and not requiring corticosteroids for symptom control
•	Participants who present with systemic stable/absent or progressive disease are eligible to this trial, as long as they fulfill one of the above criteria
•	COHORT B: New and/or progressive brain metastasis(es) with clinical indication for resection
•	Pathologically confirmed HER2-positive MBC by local laboratory with the following requirements: HER2 overexpressed or amplified (immunohistochemistry of 3+ or HER2 gene amplification by in situ hybridization with a ratio of HER2-gene signals to centromere 17 signals >= 2.0 or average HER2 copy number >= 6.0 signals/cells)
•	Eastern Cooperative Oncology Group (ECOG) performance status of =< 2
•	Left ventricular ejection fraction (LVEF) >= 50% by echocardiogram (ECHO) or multigated acquisition (MUGA) scan

Exclusion Criteria
•	Visceral crisis or impending visceral crisis at time of screening
•	CNS complications for whom urgent neurosurgical intervention is indicated (e.g., resection, shunt placement)
•	Known leptomeningeal metastases (defined as positive CSF cytology and/or unequivocal radiological evidence of clinically significant leptomeningeal involvement. CSF sampling is not required in the absence of suggestive symptoms to exclude leptomeningeal involvement)
•	Patients with known contraindication to magnetic resonance imaging (MRI) (e.g., due to pacemaker, ferromagnetic implants, claustrophobia, extreme obesity, hypersensitivity, etc.). However, head computed tomography (CT) with contrast may be used in place of MRI at baseline and throughout the trial if MRI is contraindicated and a participant’s brain metastases are clearly measurable by head CT
•	Chemotherapy or targeted therapy within 14 days prior to initiation of protocol therapy. No washout is required for trastuzumab
•	Has received prior therapy with a PI3K or mTOR inhibitor
•	No washout is required for endocrine therapy. If a patient has been on ovarian suppression for at least 28 days prior to initiation of study treatment, continuation of ovarian suppression is permitted on protocol. Starting a new endocrine therapy during protocol therapy is not permitted
•	Current use or history of receiving a non-approved, investigational treatment within 14 days prior to initiation of protocol therapy
•	Subjects with a history of hypersensitivity to compounds of similar biologic composition to paxalisib (GDC-0084) or any constituent of the product
•	The subject has an uncontrolled intercurrent illness, including, but not limited to, ongoing or active infection, uncontrolled hypertension, unstable angina pectoris, uncontrolled cardiac arrhythmia, congestive heart failure-New York Heart Association class III or IV, active ischemic heart disease, myocardial infarction within the previous six months, uncontrolled diabetes mellitus (DM), gastric or duodenal ulceration diagnosed within the previous 6 months, chronic liver or renal disease, or severe malnutrition. If a participant has controlled DM but is unable to monitor blood sugars at home, they will be excluded from the trial
"""))

Task:
Below is an example of ...




Task:
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your task is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    
Inclusion Criteria
•	COHORT A: At least one measurable CNS metastasis, defined as >= 10 mm in at least one dimension
•	COHORT A: Unequivocal evidence of new and/or progressive brain metastases, and at least one of the following scenarios:
•	Treated with stereotactic radiosurgery (SRS) or surgery with residual un-treated lesions remaining. Such participants are eligible for immediate enrollment on this study providing that at lea

## Example token count

In [7]:
# Token statistics over the prompt examples: each example contributes the
# token length of its 'context' plus the token length of its 'answer'.
# NOTE(review): `examples` is not defined in any cell visible here; presumably
# it comes from `prompt_examples.summarization_examples` - confirm, so that
# Restart & Run All works.
if len(examples) == 0:
    raise ValueError("`examples` is empty; cannot compute token statistics")

answer_lens = [get_token_len(example['answer']) for example in examples]
combined_lens = [
    get_token_len(example['context']) + answer_len
    for example, answer_len in zip(examples, answer_lens)
]

total = sum(combined_lens)
total_res = sum(answer_lens)
# Minimum and maximum are computed independently. Chaining them as
# `if ... elif ...` skips the maximum update whenever an element lowers the
# minimum, which is wrong for a single example or a descending run.
minm = min(combined_lens)
maxm = max(combined_lens)

AVG_EXAMPLE_LEN = total // len(examples)
# Token budget that chunking checks against.
# NOTE(review): presumably the model's context window - confirm.
MAX_W_SIZE = 2048
AVG_RES_LEN = total_res // len(examples)
print('         avg:', AVG_EXAMPLE_LEN)
print('         min:', minm)
print('         max:', maxm)
print('avg response:', AVG_RES_LEN)

         avg: 594
         min: 491
         max: 704
avg response: 72


In [22]:
class ECDoc:
    """Eligibility-criteria document kept as separate inclusion/exclusion line lists.

    Lines are concatenated as-is when the document is rendered, so each entry
    is expected to carry its own trailing newline.
    """
    # Lines of the inclusion section, in document order.
    inc: list[str]
    # Lines of the exclusion section, in document order.
    exc: list[str]
    # Layout used by __str__ to render the two sections.
    template = """Inclusion Criteria
{inclusion}

Exclusion Criteria
{exclusion}
"""
    # Sections with this many lines or fewer are not halved by split(); they
    # are repeated whole in both resulting documents instead.
    MIN_SPLIT_LEN = 5

    def __init__(self, inc: list[str], exc: list[str]):
        """Store the given inclusion and exclusion line lists (not copied)."""
        self.inc = inc
        self.exc = exc

    @property
    def size(self) -> int:
        """Token length of the rendered document, as measured by `get_token_len`."""
        return get_token_len(str(self))

    def __str__(self) -> str:
        """Render both sections through `template`, stripping trailing whitespace of each."""
        return self.template.format(inclusion=''.join(self.inc).rstrip(),
                                    exclusion=''.join(self.exc).rstrip())

    def _halve(self, lines: list[str]):
        """Return two line lists for `lines`: its halves, or two copies if it is short."""
        if len(lines) <= self.MIN_SPLIT_LEN:
            # Independent copies, so that changing one resulting document can
            # never alter its sibling or the document it was split from.
            return list(lines), list(lines)
        midpoint = len(lines) // 2
        return lines[:midpoint], lines[midpoint:]

    def split(self):
        """Split the document into two documents.

        Each section with more than `MIN_SPLIT_LEN` lines is cut at its
        midpoint; a shorter section is repeated in full in both results to
        prevent it from being split too small.

        Returns:
            A `(first, second)` tuple of `ECDoc` objects.
        """
        inc_chunk_1, inc_chunk_2 = self._halve(self.inc)
        exc_chunk_1, exc_chunk_2 = self._halve(self.exc)
        doc_chunk_1 = ECDoc(inc=inc_chunk_1, exc=exc_chunk_1)
        doc_chunk_2 = ECDoc(inc=inc_chunk_2, exc=exc_chunk_2)
        return doc_chunk_1, doc_chunk_2


def parse_file(filename: str) -> ECDoc:
    """Read a criteria text file into an `ECDoc`.

    A line whose stripped text starts with 'Inclusion Criteria' or
    'Exclusion Criteria' is treated as a section header: it is dropped and
    selects the section that the following lines are added to. Lines before
    any header go to the inclusion section. Blank and whitespace-only lines
    are skipped.

    Args:
        filename: path of the text file to read (decoded as UTF-8; a leading
            byte-order mark is tolerated so it cannot hide the first header).

    Returns:
        An `ECDoc` whose `inc`/`exc` lists hold the remaining lines, each
        ending with a newline.
    """
    inc = []
    exc = []
    current = inc
    with open(filename, encoding='utf-8-sig') as filein:
        for line in filein:
            stripped = line.strip()
            if not stripped:
                continue
            if stripped.startswith('Inclusion Criteria'):
                # Switch back as well, so a file with several
                # inclusion/exclusion groups is sorted correctly.
                current = inc
                continue
            if stripped.startswith('Exclusion Criteria'):
                current = exc
                continue
            # The last line of a file may lack a newline; normalise it so lines
            # never fuse when sections are concatenated later.
            current.append(line if line.endswith('\n') else line + '\n')
    return ECDoc(inc=inc, exc=exc)


def chunk_ec(doc: ECDoc) -> list[ECDoc]:
    """Split `doc` until every chunk fits the token budget.

    A chunk fits when its `size` is below
    `MAX_W_SIZE - AVG_RES_LEN - AVG_EXAMPLE_LEN`. While the largest chunk does
    not fit, every chunk of the current pass is split in two.

    Args:
        doc: the document to chunk.

    Returns:
        `[doc]` if it already fits, otherwise the chunks of the first pass in
        which the largest chunk fits, in document order.

    Raises:
        ValueError: if a pass fails to shrink the largest chunk while it is
            still over budget. This happens when `split()` can no longer
            reduce it; without the check the loop would never end and the
            chunk list would double on every pass.
    """
    # Tokens available for the criteria text itself.
    budget = MAX_W_SIZE - AVG_RES_LEN - AVG_EXAMPLE_LEN

    largest = doc.size
    if largest < budget:
        return [doc]

    last_pass_chunks = [doc]
    while True:
        new_chunks = []
        for chunk in last_pass_chunks:
            new_chunks.extend(chunk.split())

        previous_largest = largest
        largest = max(new_chunk.size for new_chunk in new_chunks)
        if largest < budget:
            return new_chunks
        if largest >= previous_largest:
            raise ValueError(
                f"cannot split the document into chunks below {budget} tokens: "
                f"the largest chunk stays at {largest} tokens"
            )
        last_pass_chunks = new_chunks

## Process and test trial

In [9]:
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain

# Chain that formats `prompt` with the supplied inputs and runs it on `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Chunk one trial's criteria, run every chunk through the chain and save the
# chunk text, the model output and a reassembled copy of the document.
trialN = '10'
trial_dir = f'test_results/trial{trialN}'

original_doc = parse_file(f'{trial_dir}/unstructured_ec.txt')
doc_chunks = chunk_ec(original_doc)
doc_reassembled = ECDoc([], [])
previous_inc = previous_exc = None
for idx, doc in enumerate(doc_chunks):
    # Explicit UTF-8 so criteria symbols and model output are always writable,
    # whatever the platform's default encoding is.
    with open(f'{trial_dir}/unstructured_ec_chunk{idx}', 'w', encoding='utf-8') as chunkfile:
        chunkfile.write(str(doc))
    output = llm_chain.run(criteria=str(doc))
    with open(f'{trial_dir}/output_chunk{idx}', 'w', encoding='utf-8') as chunkfile:
        chunkfile.write(output)
    # Splitting repeats a short section unchanged in neighbouring chunks; add
    # such a section only once so the reassembled text is not duplicated.
    # NOTE(review): two adjacent chunks whose sections are genuinely identical
    # would also be merged - presumably not a realistic input.
    if doc.inc != previous_inc:
        doc_reassembled.inc.extend(doc.inc)
    if doc.exc != previous_exc:
        doc_reassembled.exc.extend(doc.exc)
    previous_inc, previous_exc = doc.inc, doc.exc
with open(f'{trial_dir}/unstructured_ec_reassembled', 'w', encoding='utf-8') as docfile:
    docfile.write(str(doc_reassembled).rstrip())