In [1]:
from model import *

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0


In [2]:
from langchain.llms import HuggingFacePipeline

# Build a Hugging Face text-generation pipeline around the already-loaded
# model/tokenizer and expose it to LangChain through HuggingFacePipeline.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # LangChain expects the prompt to be echoed back with the completion.
    return_full_text=True,
    # --- generation settings forwarded to the model ---
    do_sample=True,  # sample from the distribution instead of greedy decoding
    temperature=0.1,  # low temperature keeps the sampled output close to deterministic
    repetition_penalty=1.1,  # without this the output begins repeating
    max_new_tokens=256,  # upper bound on the number of generated tokens
    # Left disabled on purpose:
    # stopping_criteria=stopping_criteria,  # without this the model rambles during chat
    # streamer=transformers.TextStreamer(tokenizer),
)

llm = HuggingFacePipeline(pipeline=generate_text)

In [3]:
%load_ext autoreload
%autoreload 2

from prompt_examples.single_criterion_examples import prompt, examples

In [4]:
# Render the prompt template with one sample eligibility criterion to eyeball the
# final text that will be sent to the model.
# NOTE(review): "squamous b NSCLC" looks like it contains a stray "b" - confirm
# against the source protocol before reusing this sample.
print(
    prompt.format(
        criterion="""Patient must have undergone complete surgical resection of their stage IIA, IIB, IIIA or IIIB non-squamous or squamous b NSCLC per American Joint Committee on Cancer (AJCC) 8th edition and have had negative margins. N3 disease is not allowed."""
    )
)

You are in the role of an abstractor who will analyze an eligibility criterion for a clinical trial and extract the relevant entities as described below.
Original Text: the original text of the criterion
Disease/Condition: If the criterion contains a disease or condition name it by its canonical name
Procedure: If the criterion contains a therapeutic procedure name it by its canonical name
Drug:  If the criterion contains a therapeutic drug name it by its canonical name
Biomarker:  If the criterion contains a biomarker name it by its canonical name
Computable Rule: Translate the criteria into a logical expression that could be interpreted programmatically

Criterion:
    Subject must not have any affected lymph nodes or metastatic disease

Output:
    Original Text: Subject must not have any affected lymph nodes or metastatic disease
    Disease/Condition: Lymph node-Metastases, Metastatic Disease
    Procedure: none
    Drug: none
    Biomarker: none
    Computable Rule: LK MTS == Fal

In [6]:
from token_counting import *
# Compute token statistics for the few-shot examples, then report them.
# NOTE(review): the *_LEN names reach this cell through the star import above;
# presumably globalize_token_metrics publishes them - confirm that the values
# printed here are refreshed by this call and not stale copies bound at import time.
globalize_token_metrics(examples)
print(
    f'  avg prompt: {AVG_PROMPT_LEN}',
    f'  min prompt: {MIN_PROMPT_LEN}',
    f'  max prompt: {MAX_PROMPT_LEN}',
    f'avg response: {AVG_RES_LEN}',
    sep='\n',
)

  avg prompt: 121
  min prompt: 66
  max prompt: 227
avg response: 89


In [12]:
from chunking import parse_file_with_pipes

In [None]:
import langchain
import time
from pathlib import Path
from loguru import logger
from langchain.chains import LLMChain


# Run the single-criterion extraction chain over every eligibility criterion of one
# trial and persist, per criterion, the raw model output plus a small YAML stats file.
# A trial-level stats.yaml with total and average timings is written at the end.

# Turn off LangChain's global debug/verbose tracing for this run.
langchain.debug = False
langchain.verbose = False

# All inputs and outputs of a run live under test_results_final/trial<n>.
n = '01'
folder = f'test_results_final/trial{n}'
folderp = Path(folder)
logfile = folderp / "outputs.log"
# The loguru sink and the LangChain file callback both write to the same log file.
logger.add(logfile, colorize=False, enqueue=True)
handler = langchain.callbacks.FileCallbackHandler(logfile)

llm_chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler], verbose=False)

criterions = parse_file_with_pipes(folderp / 'ec_with_pipes.txt')
start = time.time()
invoke_times = []
# Output files are numbered from 01, hence start=1.
for idx, criterion in enumerate(criterions, start=1):
    invoke_start = time.time()
    results = llm_chain.invoke(input={'criterion': criterion.value})
    # Measure once, right after the call, so the per-criterion stat and the
    # average below are based on the same number (file writes are excluded).
    invoke_elapsed = time.time() - invoke_start
    invoke_times.append(invoke_elapsed)
    with open(folderp / f'{idx:02}_output.txt', 'w', encoding='utf-8') as fileout:
        fileout.write(results['text'])
    # Inside a YAML single-quoted scalar an apostrophe must be doubled; without
    # this, a criterion such as "patient's ..." yields an unparsable stats file.
    original_text_yaml = criterion.value.replace("'", "''")
    with open(folderp / f'{idx:02}_stats.yaml', 'w', encoding='utf-8') as fileout:
        fileout.write(f"""elapsed_time: {int(invoke_elapsed)}s
was_input_captured: {criterion.value in results['text']}
original_text: '{original_text_yaml}'
inclusion: {criterion.inclusion}
""")
# Guard the average: an input file without criteria would otherwise raise
# ZeroDivisionError before the trial-level stats are written.
avg_invoke_time = sum(invoke_times) // len(invoke_times) if invoke_times else 0.0
with open(folderp / 'stats.yaml', 'w', encoding='utf-8') as fileout:
    fileout.write(f"""total_time: {(time.time() - start) // 60}min
avg_invoke_time: {avg_invoke_time}s
""")

In [10]:
# Copy the prompts that Langchain logs
import json

# Split the prompts that LangChain logged into one annotated text file per prompt.
# Read with an explicit UTF-8 encoding so the result does not depend on the
# platform's default locale (the files below are written as UTF-8 as well).
with open(f'{folder}/single_criterion_task_prompts.json', encoding='utf-8') as filein:
    prompts_used = json.load(filein)

# NOTE(review): files here are numbered from 00, while the per-criterion output
# files are numbered from 01 - confirm the offset is intended.
for idx, logged_entry in enumerate(prompts_used):
    # Each logged entry carries a 'prompts' list; only its first element is exported.
    prompt_used = logged_entry['prompts'][0]
    prompt_len = get_token_len(prompt_used)
    with open(f'{folder}/single_criterion_task_{idx:02}_prompt.txt', 'w', encoding='utf-8') as fileout:
        # str.format only interprets braces in the template, so braces inside
        # the prompt text itself are written out unchanged.
        fileout.write("""[metrics]
length={prompt_len}

[prompt]
{prompt}""".format(prompt=prompt_used, prompt_len=prompt_len))

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0
