In [None]:
!pip install -U langchain transformers accelerate bitsandbytes chromadb sentence_transformers

In [1]:
from torch import cuda, bfloat16
import transformers

token=""
model_id="epfl-llm/meditron-7b"

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=token
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    token=token
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=token
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0


In [11]:
def get_token_len(text: str):
    return len(tokenizer.encode(text))

In [2]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

stop_list = ["\n\n", "\n\n\n", "Task:\nBelow"]
# stop_list = ['\nHuman:', '\n```\n']
stop_token_ids = [tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
print(stop_token_ids)

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

[tensor([29871,    13,    13], device='cuda:0'), tensor([29871,    13,    13,    13], device='cuda:0'), tensor([ 9330, 29901,    13, 21140,   340], device='cuda:0')]


In [4]:
from langchain.llms import HuggingFacePipeline

generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=256,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    do_sample=True,
    streamer = transformers.TextStreamer(tokenizer)
)
llm = HuggingFacePipeline(pipeline=generate_text)

In [16]:
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

examples = [
    {
        "context": """
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    Inclusion Criteria
    •	Age 18 or older
    •	Willing and able to provide informed consent
    •	Metastatic breast cancer, biopsy proven
    o	Estrogen receptor (ER)+/HER2-, defined as > 5% ER+ staining
    o	HER2+ (regardless of ER status), including HER2-low and high expressors
    •	History of at least 6 months, sustained response to systemic therapy (clinically or radiographically defined as complete or stable response without progression)
    •	Isolated site of disease progression on fludeoxyglucose F-18 (FDG) positron emission tomography (PET) scan
    •	Consented to 12-245
    •	Eastern Cooperative Oncology Group (ECOG) performance status 0-1

    Exclusion Criteria
    •	Pregnancy
    •	Serious medical comorbidity precluding radiation, including connective tissue disorders
    •	Intracranial disease (including previous intracranial involvement)
    •	Previous radiotherapy to the intended treatment site that precludes developing a treatment plan that respects normal tissue tolerances
""",
        "answer": """
Diseases: Metastatic breast cancer (inclusion)
Biomarkers: Estrogen receptor (ER)+/HER2-, HER2+ (inclusion)
Prior Therapies: Previous radiotherapy to the intended treatment site (exclusion)
""",
    },
    {
        "context": """
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    Inclusion Criteria
    •	Signed informed consent must be obtained prior to performing any specific pre-screening and screening procedure
    •	Male or female >= 18 years of age at the time of informed consent
    •	Histologically or cytologically confirmed diagnosis of advanced/metastatic differentiated thyroid cancer
    •	Radio active iodine refractory disease
    •	BRAFV600E mutation positive tumor sample as per Novartis designated central laboratory result
    •	Has progressed on at least 1 but not more than 2 prior VEGFR targeted therapy
    •	Eastern Cooperative Oncology Group performance status >= 2
    •	At least one measurable lesion as defined by RECIST 1.1
    •	Anaplastic or medullary carcinoma of the Tyroid

    Exclusion Criteria
    •	Previous treatment with BRAF inhibitor and/or MEK inhibitor
    •	Concomitant RET Fusion Positive Thyroid cancer
    •	Receipt of any type of small molecule kinase inhibitor within 2 weeks before randomization
    •	Receipt of any type of cancer antibody or systemic chemotherapy within 4 weeks before randomization
    •	Receipt of radiation therapy for bone metastasis within 2 weeks or any other radiation therapy within 4 weeks before randomization
    •	A history or current evidence/risk of retinal vein occlusion or central serous retinopathy
""",
        "answer": """
Diseases: thyroid cancer (inclusion), Anaplastic or medullary carcinoma of the Tyroid (inclusion)
Biomarkers: BRAFV600E mutation positive (inclusion), RET Fusion Positive (exclusion)
Prior Therapies: VEGFR targeted therapy (inclusion), BRAF inhibitor and/or MEK inhibitor (exclusion)
""",
    },
    {
        "context": """
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    Inclusion Criteria
    •	Participants must be ‚â• 18 years of age
    •	Histologically or cytologically confirmed diagnosis of metastatic solid tumors
    •	Eastern Cooperative Oncology Group (ECOG) performance status 0-1
    •	All participants should have at least 1 measurable disease per RECIST v1.1. An irradiated lesion can be considered measurable only if progression has been demonstrated on the irradiated lesion.
    •	Body weight within [45 - 150 kg] (inclusive)
    •	All Contraceptive use by men and women should be consistent with local regulations regarding the methods of contraception for those participating in clinical studies.
    •	Capable of giving signed informed consent
    •	Any clinically significant cardiac disease
    •	History of or current interstitial lung disease or pneumonitis

    Exclusion Criteria
    •	Uncontrolled or unresolved acute renal failure
    •	Prior solid organ or hematologic transplant.
    •	Known positivity with human immunodeficiency virus (HIV), known active hepatitis A, B, and C, or uncontrolled chronic or ongoing infectious requiring parenteral treatment.
    •	Receipt of a live-virus vaccination within 28 days of planned treatment start
    •	Participation in a concurrent clinical study in the treatment period.
    •	Inadequate hematologic, hepatic and renal function
    •	Participant not suitable for participation, whatever the reason, as judged by the Investigator, including medical or clinical conditions.
""",
        "answer": """
Diseases: Metastatic solid tumor (inclusion)
Biomarkers: none
Prior Therapies: organ or hematologic transplant (exclusion)
"""
    },
    {
        "context": """
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    Inclusion Criteria:
    •	Adults with a confirmed diagnosis of unresectable, locally advanced and/or metastatic Stage IIIB/IV NSCLC, Stage III/IV PDAC and/or Stage III/IV CRC with no curative-intent treatment options and documented activating KRAS mutation (without known additional actionable driver mutations such as EGFR, ALK or ROS1)
    •	Documented progression and measurable disease after ‚â• 1 prior line of systemic therapy (‚â• 2 and ‚â§ 4 prior lines for NSCLC) with adequate washout period and resolution of treatment-related toxicities to ‚â§ Grade 2
    •	ECOG PS of 0-2 (0-1 for PDAC) and a life expectancy > 3 months in the opinion of the Investigator
    •	Adequate hematological, liver, and renal function
    •	Men and women of childbearing potential must use adequate birth control measures for the duration of the trial and at least 90 days after discontinuing study treatment
    •	Symptomatic and/or untreated CNS or brain metastasis, pre-existing ILD or pericardial/pleural effusion of ‚â• grade 2 or requiring chronic oxygen therapy for COPD or pleural effusions
    •	Serious concomitant disorder including infection
    •	Known positive test for HIV, HCV, HBV surface antigen

    Exclusion Criteria:
    •	Concurrent malignancy in the previous 2 years
    •	Prior menin inhibitor therapy
    •	Requiring treatment with a strong or moderate CYP3A inhibitor/inducer
    •	Significant cardiovascular disease or QTcF or QTcB prolongation.
    •	Major surgery within 4 weeks prior to first dose
    •	Women who are pregnant or lactating.
""",
        "answer": """
Diseases: unresectable, locally advanced and/or metastatic Stage IIIB/IV NSCLC, Stage III/IV PDAC and/or Stage III/IV CRC (inclusion)
Biomarkers: KRAS mutation (inclusion)
Prior Therapies: menin inhibitor therapy (exclusion)
"""
    }
]

In [17]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain_community.vectorstores import Chroma

example_selector = SemanticSimilarityExampleSelector.from_examples(
    # This is the list of examples available to select from.
    examples,
    # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
    HuggingFaceEmbeddings(),
    # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
    Chroma,
    # This is the number of examples to produce.
    k=1,
)

example_prompt = PromptTemplate(
    input_variables=["context", "answer"], template="Task: {context}{answer}"
)

prompt = FewShotPromptTemplate(
    example_selector=example_selector,
    example_prompt=example_prompt,
    suffix="""Task:
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    {criteria}
""",
    input_variables=["criteria"],
)

In [18]:
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain

llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain.run(criteria="""
Inclusion Criteria
•	Have documented CD20+ aggressive B-NHL, with disease that has progressed after at least 2 lines of systemic therapy containing an anti-CD20 antibody and an alkylating agent.
•	Measurable disease on cross sectional imaging as defined in the protocol
•	Eastern Cooperative Oncology Group (ECOG) performance status 0 or 1
•	Adequate bone marrow, renal and hepatic function as defined in the protocol
•	During dose expansion phase of the study, participant should be willing to undergo mandatory tumor biopsies, if in the opinion of the investigator, the participant has an accessible lesion that can be biopsied without significant risk to the participant.

Exclusion Criteria
•	Prior treatments with allogeneic stem cell transplantation or solid organ transplantation, treatment with anti-CD20 x anti- CD3 bispecific antibody, such as odronextamab
•	Diagnosis of mantle cell lymphoma (MCL)
•	Primary central nervous system (CNS) lymphoma or known involvement by non-primary CNS lymphoma
•	Treatment with any systemic anti-lymphoma therapy within 5 half-lives or within 14 days prior to first administration of study drug, whichever is shorter
•	Standard radiotherapy within 14 days of first administration of study drug.
•	Continuous systemic corticosteroid treatment with more than 10 mg per day of prednisone or corticosteroid equivalent within 72 hours of start of odronextamab
•	Co-morbid conditions, as described in the protocol
•	Infections, as described in the protocol
•	Allergy/hypersensitivity: Known hypersensitivity to both allopurinol and rasburicase
""")

# Diseases: Aggressive B-cell Non-Hodgkin Lymphoma (inclusion) 1/1
# Biomarkers: CD20 (inclusion) 1/1
# Prior Therapies: Allogeneic stem cell transplantation or solid organ transplantation (exclusion) 1/1

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Task: 
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    Inclusion Criteria
    •	Participants must be ‚â• 18 years of age
    •	Histologically or cytologically confirmed diagnosis of metastatic solid tumors
    •	Eastern Cooperative Oncology Group (ECOG) performance status 0-1
    •	All participants should have at least 1 measurable disease per RECIST v1.1. An irradiated lesion can be considered measurable only if progression has been demonstrated on the irradiated lesion.
    •	Body weight within [45 - 150 kg] (inclusive)
    •	A

'Diseases: Aggressive B-cell Non-Hodgkin Lymphoma (inclusion)\nBiomarkers: CD20 (inclusion)\nPrior Therapies: Allogeneic stem cell transplantation or solid organ transplantation (exclusion)\n\n\nTask:\nBelow is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.\nCriteria:\n    \nInclusion Criteria\n•\tPatients must be ≥ 18 years old\n•\tPatients must have histologically or cytologically confirmed advanced or metastatic solid tumor\n•\tPatients must have measurable disease according to Response'

In [19]:
print(get_token_len("""Task: 
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    Inclusion Criteria
    •	Participants must be ‚â• 18 years of age
    •	Histologically or cytologically confirmed diagnosis of metastatic solid tumors
    •	Eastern Cooperative Oncology Group (ECOG) performance status 0-1
    •	All participants should have at least 1 measurable disease per RECIST v1.1. An irradiated lesion can be considered measurable only if progression has been demonstrated on the irradiated lesion.
    •	Body weight within [45 - 150 kg] (inclusive)
    •	All Contraceptive use by men and women should be consistent with local regulations regarding the methods of contraception for those participating in clinical studies.
    •	Capable of giving signed informed consent
    •	Any clinically significant cardiac disease
    •	History of or current interstitial lung disease or pneumonitis

    Exclusion Criteria
    •	Uncontrolled or unresolved acute renal failure
    •	Prior solid organ or hematologic transplant.
    •	Known positivity with human immunodeficiency virus (HIV), known active hepatitis A, B, and C, or uncontrolled chronic or ongoing infectious requiring parenteral treatment.
    •	Receipt of a live-virus vaccination within 28 days of planned treatment start
    •	Participation in a concurrent clinical study in the treatment period.
    •	Inadequate hematologic, hepatic and renal function
    •	Participant not suitable for participation, whatever the reason, as judged by the Investigator, including medical or clinical conditions.

Diseases: Metastatic solid tumor (inclusion)
Biomarkers: none
Prior Therapies: organ or hematologic transplant (exclusion)


Task:
Below is an example of clinical trial eligibility inclusion/exclusion criteria. Your identify 3 categories of data within it. The 3 categories are: 1) Disease: a disorder affecting humans, 2) Biomarker: genes, proteins, or other substances that can be tested for to reveal important details about a patient’s cancer, and 3) Prior Therapy: medications, surgeries, or procedures that a patient may be treated with. For each of the identified categories, state whether it is an inclusion or exclusion.
Criteria:
    
Inclusion Criteria
•	Have documented CD20+ aggressive B-NHL, with disease that has progressed after at least 2 lines of systemic therapy containing an anti-CD20 antibody and an alkylating agent.
•	Measurable disease on cross sectional imaging as defined in the protocol
•	Eastern Cooperative Oncology Group (ECOG) performance status 0 or 1
•	Adequate bone marrow, renal and hepatic function as defined in the protocol
•	During dose expansion phase of the study, participant should be willing to undergo mandatory tumor biopsies, if in the opinion of the investigator, the participant has an accessible lesion that can be biopsied without significant risk to the participant.

Exclusion Criteria
•	Prior treatments with allogeneic stem cell transplantation or solid organ transplantation, treatment with anti-CD20 x anti- CD3 bispecific antibody, such as odronextamab
•	Diagnosis of mantle cell lymphoma (MCL)
•	Primary central nervous system (CNS) lymphoma or known involvement by non-primary CNS lymphoma
•	Treatment with any systemic anti-lymphoma therapy within 5 half-lives or within 14 days prior to first administration of study drug, whichever is shorter
•	Standard radiotherapy within 14 days of first administration of study drug.
•	Continuous systemic corticosteroid treatment with more than 10 mg per day of prednisone or corticosteroid equivalent within 72 hours of start of odronextamab
•	Co-morbid conditions, as described in the protocol
•	Infections, as described in the protocol
•	Allergy/hypersensitivity: Known hypersensitivity to both allopurinol and rasburicase

Diseases: Aggressive B-cell Non-Hodgkin Lymphoma (inclusion)
Biomarkers: CD20 (inclusion)
Prior Therapies: Allogeneic stem cell transplantation or solid organ transplantation (exclusion)
"""))

1239
