In [None]:
!pip install -U langchain transformers accelerate bitsandbytes chromadb sentence_transformers

In [1]:
from torch import cuda, bfloat16
import transformers

token=""
model_id="epfl-llm/meditron-7b"

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=token
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=token
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=token
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0


In [23]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

stop_list = ["\n\n", "\n\n\n", "Task:\nBelow"]
# stop_list = ['\nHuman:', '\n```\n']
stop_token_ids = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
print(stop_token_ids)

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

[tensor([29871,    13,    13], device='cuda:0'), tensor([29871,    13,    13,    13], device='cuda:0'), tensor([ 9330, 29901,    13, 21140,   340], device='cuda:0')]


In [34]:
from langchain.llms import HuggingFacePipeline

generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=256,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    do_sample=True,
    streamer = transformers.TextStreamer(tokenizer)
)
llm = HuggingFacePipeline(pipeline=generate_text)

In [4]:
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

examples = [
    {
        "context": """
Below is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.
Criteria:
    Must have histologically or cytologically confirmed adenocarcinoma of the prostate
    Progressed on androgen deprivation therapy (ADT) and at least one prior secondary hormonal therapy approved for castration-resistant prostate cancer (CRPC)
    Eastern Cooperative Oncology Group (ECOG) performance status (PS) 0 or 1
    Prior treatment with an androgen receptor (AR) degrader
    Concurrent malignancy (present during screening) requiring treatment or history of prior malignancy active within 1 year prior to the first dose of IP
    Clinically significant venous thromboembolism within 3 months prior to the first dose of IP
    Any significant medical condition, such as uncontrolled infection, laboratory abnormality, or psychiatric illness
""",
        "answer": """
Diseases: adenocarcinoma of the prostate, castration-resistant prostate cancer
Biomarkers: none
Prior Therapies: androgen receptor (AR) degrader
""",
    },
    {
        "context": """
Below is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.
Criteria:
    Signed informed consent must be obtained prior to performing any specific pre-screening and screening procedure
    Male or female >= 18 years of age at the time of informed consent
    Histologically or cytologically confirmed diagnosis of advanced/metastatic differentiated thyroid cancer
    Radio active iodine refractory disease
    BRAFV600E mutation positive tumor sample as per Novartis designated central laboratory result
    Has progressed on at least 1 but not more than 2 prior VEGFR targeted therapy
    Eastern Cooperative Oncology Group performance status >= 2
    At least one measurable lesion as defined by RECIST 1.1
    Anaplastic or medullary carcinoma of the Tyroid
    Previous treatment with BRAF inhibitor and/or MEK inhibitor
    Concomitant RET Fusion Positive Thyroid cancer
    Receipt of any type of small molecule kinase inhibitor within 2 weeks before randomization
    Receipt of any type of cancer antibody or systemic chemotherapy within 4 weeks before randomization
    Receipt of radiation therapy for bone metastasis within 2 weeks or any other radiation therapy within 4 weeks before randomization
    A history or current evidence/risk of retinal vein occlusion or central serous retinopathy
""",
        "answer": """
Diseases: advanced/metastatic differentiated thyroid cancer, Anaplastic or medullary carcinoma of the Tyroid, RET Fusion Positive Thyroid cancer
Biomarkers: BRAFV600E, RET Fusion Positive
Prior Therapies: BRAF inhibitor and/or MEK inhibitor
""",
    },
]

In [21]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain_community.vectorstores import Chroma

example_selector = SemanticSimilarityExampleSelector.from_examples(
    # This is the list of examples available to select from.
    examples,
    # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
    HuggingFaceEmbeddings(),
    # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
    Chroma,
    # This is the number of examples to produce.
    k=1,
)

example_prompt = PromptTemplate(
    input_variables=["context", "answer"], template="Task: {context}{answer}"
)

prompt = FewShotPromptTemplate(
    example_selector=example_selector,
    example_prompt=example_prompt,
    suffix="""Task:
Below is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.
Criteria:
    {criteria}
""",
    input_variables=["criteria"],
)

In [25]:
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain

llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain.run(criteria="""Histologically or cytologically confirmed metastatic Stage IV colorectal adenocarcinoma.
    Documented evidence of a BRAF V600E mutation in tumor tissue or blood
    Presence of measurable disease per RECIST version 1.1 guidelines.
    Disease progression after 1 or 2 previous systemic regimens for metastatic disease
    Adequate bone marrow function
    Adequate hepatic and renal function
    Documented clinical disease progression or radiographic disease progression during the screening period
    Leptomeningeal disease.
    Symptomatic brain metastasis.
    Presence of acute or chronic pancreatitis.
    Unable to swallow, retain, and absorb oral medications.
    Clinically significant cardiovascular diseases
    Evidence of active noninfectious pneumonitis.
    Evidence of active and uncontrolled bacterial or viral infection, within 2 weeks prior to start of any of the study interventions
    Participants with known positivity for HIV
    Active hepatitis B or hepatitis C infection
    Concurrent or previous other malignancy within 2 years of study entry
    Has had an allogeneic tissue/solid organ transplant
    Pregnant or females of childbearing potential who have a positive Œ≤-hCG laboratory test result within 14 days prior to enrollment or is breastfeeding
""")

#Diseases: metastatic Stage IV colorectal adenocarcinoma 1/1
#Biomarkers: BRAF V600E mutation in tumor tissue or blood 1/1
#Prior Therapies: 1 or 2 previous systemic regimens for metastatic disease 0/1

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Task: 
Below is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.
Criteria:
    Signed informed consent must be obtained prior to performing any specific pre-screening and screening procedure
    Male or female >= 18 years of age at the time of informed consent
    Histologically or cytologically confirmed diagnosis of advanced/metastatic differentiated thyroid cancer
    Radio active iodine refractory disease
    BRAFV600E mutation positive tumor sample as per Novartis designated central laboratory result
    Has progressed on at least 1 but not more than 2 prior VEGFR targeted therapy
    Eastern Cooperative Oncology Group performance status >= 2

'Diseases: metastatic Stage IV colorectal adenocarcinoma\nBiomarkers: BRAF V600E mutation in tumor tissue or blood\nPrior Therapies: 1 or 2 previous systemic regimens for metastatic disease\n\n\nTask:\nBelow is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.\nCriteria:\n    Patients must meet all of the following inclusion criteria to be eligible for this study:\n    Age ≥ 18 years\n    Confirmed histological diagnosis of high grade glioma (WHO Grade III or IV)\n    Karnofsky Performance Status score ≥ 70%\n    Life expectancy >'

In [None]:
example_selector.add_example({
    "context": """
Below is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.
Criteria:
    Histologically or cytologically confirmed metastatic Stage IV colorectal adenocarcinoma.
    Documented evidence of a BRAF V600E mutation in tumor tissue or blood
    Presence of measurable disease per RECIST version 1.1 guidelines.
    Disease progression after 1 or 2 previous systemic regimens for metastatic disease
    Adequate bone marrow function
    Adequate hepatic and renal function
    Documented clinical disease progression or radiographic disease progression during the screening period
    Leptomeningeal disease.
    Symptomatic brain metastasis.
    Presence of acute or chronic pancreatitis.
    Unable to swallow, retain, and absorb oral medications.
    Clinically significant cardiovascular diseases
    Evidence of active noninfectious pneumonitis.
    Evidence of active and uncontrolled bacterial or viral infection, within 2 weeks prior to start of any of the study interventions
    Participants with known positivity for HIV
    Active hepatitis B or hepatitis C infection
    Concurrent or previous other malignancy within 2 years of study entry
    Has had an allogeneic tissue/solid organ transplant
    Pregnant or females of childbearing potential who have a positive Œ≤-hCG laboratory test result within 14 days prior to enrollment or is breastfeeding
""", 
    "answer": """
Diseases: metastatic Stage IV colorectal adenocarcinoma
Biomarkers: BRAF V600E mutation in tumor tissue or blood
Prior Therapies: allogeneic tissue/solid organ transplant
"""
})

In [26]:
res = llm_chain.run(criteria="""Histologically confirmed adenocarcinoma of the breast that is HER2+ (IHC 3+ or gene amplification by ISH or NGS).
    Have received 2 or more prior lines of anti-HER2-directed therapies, at least 1 in the metastatic setting and including trastuzumab deruxtecan.
    Measurable disease as determined by RECIST v.1.1.
    Eastern Cooperative Oncology Group (ECOG) performance status of 0 or 1.
    Have life expectancy of greater than 12 weeks per the Investigator.
    All subjects must agree to have a biopsy prior to enrollment. If, in the judgment of the Investigator, a biopsy is not safely accessible or clinically feasible an archival tumor tissue sample must be submitted in lieu of a freshly collected specimen.
    History of severe hypersensitivity to any ingredient of BDC-1001 or pertuzumab.
    Previous treatment with a small molecule TLR7/8 agonist or TLR7/8 agonist that has been conjugated to tumor-targeting antibody such as ISACs within 12 months before starting study treatment.
    Impaired cardiac function or history of clinically significant cardiac disease.
    Human Immunodeficiency virus (HIV) infection, active hepatitis B infection, or hepatitis C infection.
    Central nervous system metastases with the exception of disease that is asymptomatic, clinically stable, and has not required steroids for at least 28 days before starting study treatment.
""")

#Diseases: adenocarcinoma of the breast that is HER2+ 1/1
#Biomarkers: HER2 1/1
#Prior Therapies: trastuzumab deruxtecan 1/1

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Task: 
Below is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.
Criteria:
    Signed informed consent must be obtained prior to performing any specific pre-screening and screening procedure
    Male or female >= 18 years of age at the time of informed consent
    Histologically or cytologically confirmed diagnosis of advanced/metastatic differentiated thyroid cancer
    Radio active iodine refractory disease
    BRAFV600E mutation positive tumor sample as per Novartis designated central laboratory result
    Has progressed on at least 1 but not more than 2 prior VEGFR targeted therapy
    Eastern Cooperative Oncology Group performance status >= 2

'Diseases: adenocarcinoma of the breast that is HER2+\nBiomarkers: HER2\nPrior Therapies: trastuzumab deruxtecan\n\n\nTask:\nBelow is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.\nCriteria:\n    Patients must have histologically or cytologically confirmed non-small cell lung cancer (NSCLC), adenocarcinoma subtype, with EGFR exon 19 deletions or exon 21 L858R substitution mutations.\n    Patients must have locally advanced or metastatic NSCLC with disease progression during or after platinum-based chemotherapy.\n    Patients must'

In [33]:
res = llm_chain.run(criteria="""Adults with a confirmed diagnosis of unresectable, locally advanced and/or metastatic Stage IIIB/IV NSCLC, Stage III/IV PDAC and/or Stage III/IV CRC with no curative-intent treatment options and documented activating KRAS mutation (without known additional actionable driver mutations such as EGFR, ALK or ROS1)
    Documented progression and measurable disease after ‚â• 1 prior line of systemic therapy (‚â• 2 and ‚â§ 4 prior lines for NSCLC) with adequate washout period and resolution of treatment-related toxicities to ‚â§ Grade 2
    ECOG PS of 0-2 (0-1 for PDAC) and a life expectancy > 3 months in the opinion of the Investigator
    Adequate hematological, liver, and renal function
    Men and women of childbearing potential must use adequate birth control measures for the duration of the trial and at least 90 days after discontinuing study treatment
    Symptomatic and/or untreated CNS or brain metastasis, pre-existing ILD or pericardial/pleural effusion of ‚â• grade 2 or requiring chronic oxygen therapy for COPD or pleural effusions
    Serious concomitant disorder including infection
    Known positive test for HIV, HCV, HBV surface antigen
    Concurrent malignancy in the previous 2 years
    Prior menin inhibitor therapy
    Requiring treatment with a strong or moderate CYP3A inhibitor/inducer
    Significant cardiovascular disease or QTcF or QTcB prolongation.
    Major surgery within 4 weeks prior to first dose
    Women who are pregnant or lactating.
""")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Task: 
Below is an example of clinical trial eligibility criteria. Your job is to identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with.
Criteria:
    Must have histologically or cytologically confirmed adenocarcinoma of the prostate
    Progressed on androgen deprivation therapy (ADT) and at least one prior secondary hormonal therapy approved for castration-resistant prostate cancer (CRPC)
    Eastern Cooperative Oncology Group (ECOG) performance status (PS) 0 or 1
    Prior treatment with an androgen receptor (AR) degrader
    Concurrent malignancy (present during screening) requiring treatment or history of prior malignancy active within 1 year prior to the first dose of IP
    Clinically significant venous thrombo