In [1]:
from transformers import AutoModelForCausalLM, TrainingArguments,GenerationConfig, AutoTokenizer, LlamaConfig
from peft import LoraModel, LoraConfig, get_peft_model, PeftModel
import pandas as pd
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from sklearn.model_selection import train_test_split
import datetime
import torch
import evaluate
from datasets import Dataset
import wandb
import pandas as pd

In [2]:
def compute_metrics(test,model,tokenizer):
    max_tokens = 2000
    rouge = evaluate.load('rouge')
    bleu = evaluate.load("bleu")
    meteor = evaluate.load('meteor')
    answers = list()
    references = list()
    for i,instance in enumerate(test):
        prompt = instance['prompt']
        inputs = tokenizer.encode(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(
                    inputs,max_length = max_tokens,
                    pad_token_id=tokenizer.pad_token_id)
        result = tokenizer.decode(outputs[0])[len(prompt)+3:-4]
        answers.append(result)
        references.append(instance['completion'])
        if i % 100 == 0:
            print(i,datetime.datetime.now())
    bleu_score = bleu.compute(predictions = answers, references = references)
    rouge_score = rouge.compute(predictions = answers, references = references)
    meteor_score = meteor.compute(predictions = answers, references = references)
    print(bleu_score)
    print(rouge_score)
    print(meteor_score)

In [3]:
def make_cfg():
    config = LoraConfig(
        r=64,
        lora_alpha=16,
        bias='none',
        lora_dropout=0.1,
        task_type="CAUSAL_LM",
        use_rslora=True
    )
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf",token = "",trust_remote_code = True)
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf",token = "",
                                                torch_dtype=torch.float32,
                                                 device_map='auto')
  
    
    tokenizer.add_special_tokens({"pad_token":"<pad>"})
    tokenizer.padding_side = "right"
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    model = get_peft_model(model, config, "default")
    return model, tokenizer

In [4]:
def make_trainer(train, test, model, tokenizer):
    args = TrainingArguments(
        output_dir="ft_April_2",
        auto_find_batch_size=True,
        evaluation_strategy="steps",
        eval_steps=50,
        logging_steps=1,
        num_train_epochs=1,
        weight_decay=0.1,
        bf16=False,
        warmup_steps=70,
        group_by_length=True,
        lr_scheduler_type="cosine",
        learning_rate=2e-4,
        save_steps=8000,
        fp16=False,
        push_to_hub=False,
        report_to="wandb",
        adam_beta2 = 0.95,
        adam_epsilon = 1e-5,
        #neftune_noise_alpha=5,
        #remove_unused_columns=False
    )
    response_template = "\n[/INST]"
    response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
    collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer,mlm=False)
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=args,
        data_collator=collator,
        train_dataset=train,
        eval_dataset=test,
        packing=False,
        max_seq_length=4096,
        dataset_text_field='text',
    )
    return trainer

In [5]:
prompt1 = '''[INST] <<SYS>>
Du är en hjälpsam medicinsk assistent som hjälper läkare och sjuksköterskor genom att sammanfatta information om patienter.
Svara på svenska.
<</SYS>>
Nedan ges anamnes för en patient under en dag
<anamnes>'''

prompt2 = '''
</anamnes>
Du ska plocka ut information som passar i mallen nedan. Undvik onödig information och plocka endast ut sådant som rör varje rubrik. Om relevant information saknas så lämnar du rubriken tom.
Formattera ditt svar enligt mallen. Ingen information kan finnas under flera rubriker.
<mall>
*Sjukdomshistoria (Patientens diagnoser, sjukdomshistorik och riskfaktorer (t.ex. sjukdomar i familjen))*

*Sökorsaker (Patientens symtom och/eller datum för ingrepp)*

*Åtgärder (Planerade undersökningar, behandlingar och åtgärder)*
</mall>
[/INST]'''
data = pd.read_parquet("../../OpenAI/synthetic_5th_april.parquet")
#data2 = pd.read_parquet("../../OpenAI/synthetic_bertil.parquet")
#data3 = pd.read_parquet("../../OpenAI/synthetic_2nd.parquet")
#data4 = pd.read_parquet("../../OpenAI/synthetic_211.parquet")

#data = pd.concat([data1,data2,data3],ignore_index=True)

formatted = []
for index, row in data.iterrows():
    #print(row)
    elem = {'text':f"<s> {prompt1}{row['description']}{prompt2} {row['summary']} </s>"}
    formatted.append(elem)
    

train, test = train_test_split(formatted,test_size = 0.05,random_state = 42)
train = Dataset.from_list(train)
test = Dataset.from_list(test)

In [6]:
print(data.iloc[212]['summary'])


*Sjukdomshistoria*
Diagnoser: Inga tidigare allvarliga sjukdomar.
Riskfaktorer: Far med liknande symtom under medelåldern.

*Sökorsaker*
Känsla av att inte kunna hålla benen stilla, vilket förvärras på kvällen och påverkar sömnen.

*Åtgärder*
Påbörjad medicinsk behandling med dopaminagonist för Restless Legs Syndrome och återbesök om 6 veckor för uppföljning.


In [7]:
model, tokenizer = make_cfg()
#compute_metrics(test,model,tokenizer)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
trainer = make_trainer(train,test,model, tokenizer)
trainer.evaluate()
trainer.train()
trainer.evaluate()
trainer.save_model('../../../data/finetuned/lora_5th_apr_nodomain')
#trainer.model.save_pretrained('../../../data/finetuned/lora_no_domain_adaptation/model')
#trainer.tokenizer.save_pretrained('../../../data/finetuned/lora_no_domain_adaptation/tokenizer')

wandb.finish()

Map:   0%|          | 0/434 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mlundalb[0m ([33mmaster-t[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,0.9975,0.6831
100,1.0029,0.64254
150,0.8044,0.613558
200,0.7696,0.601643
250,0.4403,0.583947
300,0.6294,0.57645
350,0.3934,0.5647
400,1.2353,0.558035



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-13b-chat-hf/resolve/main/config.json.
Repo model meta-llama/Llama-2-13b-chat-hf is gated. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-13b-chat-hf.


VBox(children=(Label(value='0.026 MB of 0.026 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▂▂▂▁▁▁▁▁▁
eval/runtime,█▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁████████▇
eval/steps_per_second,▁██████▇█▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▁▂▃▅▅▇███████▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
train/loss,▆▆▄▅▃▆▂▁█▅▂▄▄▄▄▃▂▃▄▃▂▄▃▄▂▄▅▃▂▄▂▁▇▂▃▃▄▆▂▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.55782
eval/runtime,41.494
eval/samples_per_second,0.554
eval/steps_per_second,0.072
train/epoch,1.0
train/global_step,434.0
train/learning_rate,0.0
train/loss,1.1364
train/total_flos,3.330604539260928e+16
train/train_loss,0.67491
