In [1]:
import os

# Hugging Face access token. Read it from the HF_TOKEN environment variable so
# the credential never has to be written into the notebook; falls back to the
# empty string (the previous hard-coded value) when the variable is unset.
token = os.environ.get("HF_TOKEN", "")
# Hub identifier of the checkpoint loaded by the cells below.
model_id = "epfl-llm/meditron-7b"

In [2]:
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Load the configuration, tokenizer and weights for `model_id`, authenticating
# with `token`.
model_config = AutoConfig.from_pretrained(model_id, token=token)
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
# 8-bit loading is requested through an explicit quantization config; passing
# `load_in_8bit=True` directly to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

### Settings

In [None]:
from transformers import set_seed

# Seed value used for this notebook's runs; kept in one named place so it is
# easy to change.
SEED = 0
set_seed(SEED)

In [3]:
import torch
from transformers import StoppingCriteria, StoppingCriteriaList

# Candidate textual stop markers (the start of the next numbered prompt line).
stop_list = ["\n12", "\n12:", "\n12:", "\n12:   Sentence:"]
# Tokenize each marker without special tokens and keep the ids as a CUDA
# LongTensor, one tensor per marker.
stop_token_ids = [
    torch.LongTensor(tokenizer(marker, add_special_tokens=False)['input_ids']).to('cuda')
    for marker in stop_list
]

# Custom stopping criterion: halt generation once the output ends with a stop sequence.
class StopOnTokens(StoppingCriteria):
    """Stop generation when the first sequence ends with a known token suffix.

    Args:
        stop_sequences: Optional iterable of token-id sequences (lists, tuples
            or 1-D tensors). Generation stops as soon as the tail of
            ``input_ids[0]`` equals any of them. When omitted, the single
            suffix ``[29896, 29906, 29901]`` is used, which is the suffix the
            previous implementation was hard-wired to.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.

    NOTE(review): the tokenizer-derived ``stop_token_ids`` printed elsewhere in
    this notebook all start with ids ``29871, 13``; presumably those leading
    ids do not appear the same way inside generated text, so passing them
    unchanged may never match -- confirm before relying on them.
    """

    # Suffix checked when no explicit stop sequences are supplied.
    DEFAULT_STOP_SEQUENCES = ((29896, 29906, 29901),)

    def __init__(self, stop_sequences=None):
        super().__init__()
        if stop_sequences is None:
            stop_sequences = self.DEFAULT_STOP_SEQUENCES
        # Build the tensors once here instead of on every generation step.
        self.stop_sequences = [
            torch.as_tensor(seq, dtype=torch.long).reshape(-1)
            for seq in stop_sequences
        ]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when ``input_ids[0]`` ends with one of the stop sequences."""
        generated = input_ids[0]
        for stop_ids in self.stop_sequences:
            width = stop_ids.numel()
            # Skip empty stop sequences and outputs shorter than the suffix;
            # slicing would otherwise compare tensors of different lengths.
            if width == 0 or generated.numel() < width:
                continue
            # Compare on the device of the generated ids so the criterion does
            # not depend on CUDA being the device in use.
            if torch.equal(generated[-width:], stop_ids.to(generated.device)):
                return True
        return False

# Wrap the custom criterion in the list container; it is currently not
# included in `settings` below.
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Keyword arguments forwarded to `model.generate`.
# NOTE(review): `penalty_alpha` combined with a small `top_k` presumably
# selects contrastive search -- confirm against the transformers generation docs.
settings = dict(
    max_new_tokens=256,
    penalty_alpha=0.5,
    top_k=4,
    pad_token_id=tokenizer.eos_token_id,
)
# Alternatives tried earlier and currently disabled:
#   sampling: temperature=0.6, top_p=0.5, top_k=35, do_sample=True,
#             repetition_penalty=1.1
#   extras:   stopping_criteria=stopping_criteria, return_dict_in_generate=True

In [4]:
# Inspect the tokenized stop markers: length of the first one, the raw id
# tensors, and their round-trip back to text.
first_stop_length = len(stop_token_ids[0])
print(first_stop_length)
print(stop_token_ids)
decoded_stops = tokenizer.batch_decode(stop_token_ids)
print(decoded_stops)

4
[tensor([29871,    13, 29896, 29906], device='cuda:0'), tensor([29871,    13, 29896, 29906, 29901], device='cuda:0'), tensor([29871,    13, 29896, 29906, 29901], device='cuda:0'), tensor([29871,    13, 29896, 29906, 29901,   259, 28048,   663, 29901],
       device='cuda:0')]
['\n12', '\n12:', '\n12:', '\n12:   Sentence:']


### Outputs

In [4]:
from transformers import TextStreamer
import gc

def do_generate(prompt: str, use_stream=True):
    """Run `model.generate` on `prompt` and return the decoded sequences.

    Uses the notebook-level `tokenizer`, `model` and `settings` objects and
    prints the prompt length, the number of newly generated tokens and the
    total length (all in tokens) of the first returned sequence.

    Args:
        prompt: Text to encode and feed to the model.
        use_stream: When true, a `TextStreamer` is attached to the generation
            call so tokens are echoed as they are produced. When false, no
            streamer is used.

    Returns:
        The list produced by `tokenizer.batch_decode` for the sequences
        returned by `generate`, with special tokens skipped.
    """
    # The streamer must be handed to `generate` to have any effect; `None`
    # disables streaming.
    streamer = TextStreamer(tokenizer) if use_stream else None
    # Place the encoded prompt on the model's own device instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    len_inputs = len(inputs['input_ids'][0])
    print(' len(input): ', len_inputs)
    outputs = model.generate(**inputs, streamer=streamer, **settings)
    len_total = len(outputs[0])
    print('len(output): ', len_total - len_inputs)
    print(' len(total): ', len_total)
    # Collect garbage after each generation call, as the original did.
    gc.collect()
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Few-shot prompt: numbered Sentence / biomarkers pairs, ending on an open
# "biomarkers:" line for the model to complete.
example_prompt = """
0:    Sentence:      Signed informed consent must be obtained prior to performing any specific pre-screening and screening procedure.
1:    biomarkers:    none
2:    Sentence:      Male or female >= 18 years of age at the time of informed consent
3:    biomarkers:    none
4:    Sentence:      Histologically or cytologically confirmed diagnosis of advanced/metastatic differentiated thyroid cancer
5:    biomarkers:    none
6:    Sentence:      BRAFV600E mutation positive tumor sample as per Novartis designated central laboratory result
7:    biomarkers:    BRAFV600E
8:    Sentence:      Concomitant RET Fusion Positive Thyroid cancer
9:    biomarkers:    RET Fusion Positive
10:   Sentence:      Estrogen receptor (ER)+/HER2-, defined as > 5% ER+ staining
11:   biomarkers:    """
# Generate without streaming and show the first decoded sequence.
example_completions = do_generate(example_prompt, use_stream=False)
print(example_completions[0])

In [None]:
# Decode two overlapping 3-token windows to see which text they correspond to.
probe_token_ids = [
    [29896, 29906, 29901],
    [29906, 29901, 259],
]
tokenizer.batch_decode(probe_token_ids)

In [6]:
# Few-shot prompt template: five worked Sentence / biomarkers pairs followed by
# a slot for a new sentence and an open "biomarkers:" line for the model to
# complete. Whitespace inside the literal is part of the prompt.
# NOTE(review): `{sentence}` is presumably filled in with `str.format`; the
# call site is not visible in this part of the notebook.
call_prompt = """
0:    Sentence:      Signed informed consent must be obtained prior to performing any specific pre-screening and screening procedure.
1:    biomarkers:    none
2:    Sentence:      Male or female >= 18 years of age at the time of informed consent
3:    biomarkers:    none
4:    Sentence:      Histologically or cytologically confirmed diagnosis of advanced/metastatic differentiated thyroid cancer
5:    biomarkers:    none
6:    Sentence:      BRAFV600E mutation positive tumor sample as per Novartis designated central laboratory result
7:    biomarkers:    BRAFV600E
8:    Sentence:      Concomitant RET Fusion Positive Thyroid cancer
9:    biomarkers:    RET Fusion Positive
10:   Sentence:      {sentence}
11:   biomarkers:    """