In [8]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb
%pip install jsonschema datamodel_code_generator

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from huggingface_hub import login
import os

def load_token(token_path = ".vscode/.huggingface_token"):
    """Read an API token from a text file.

    Args:
        token_path: Path of the file that holds the token. A leading ``~`` is
            expanded to the current user's home directory.

    Returns:
        The file's contents with leading/trailing whitespace removed.

    Raises:
        FileNotFoundError: If no file exists at ``token_path``. This is a
            subclass of ``Exception``, so handlers written against the
            previous bare ``Exception`` keep working.
    """
    token_path = os.path.expanduser(token_path)
    try:
        with open(token_path, 'r') as file:
            return file.read().strip()
    except FileNotFoundError as err:
        # Report the path that was actually tried; this helper is used for
        # more than one service, so the message stays service-neutral.
        raise FileNotFoundError(
            f"Token file not found. Please ensure it exists at {token_path}."
        ) from err

# Read both API tokens from the local credential files.
hf_token = load_token(".vscode/.huggingface_token")
wandb_token = load_token(".vscode/.wandb_token")
# Never echo the secret itself: cell output is stored with the notebook, so a
# printed token leaks to everyone the file is shared with.
print("Hugging Face token loaded.")
login(token=hf_token)
wandb.login(key=wandb_token)

# Open the W&B run that training metrics are reported to.
# NOTE(review): the project name mentions "Llama 3" while the model cell loads
# openbmb/MiniCPM3-4B — confirm the intended project name.
run = wandb.init(
    project='Fine-tune Llama 3 4B on Rappel Dataset',
    job_type="training",
    anonymous="allow"
)

Hugging Face Token: hf_********************************** [REDACTED — revoke and rotate this token; it was exposed in saved output]
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/lujun/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/lujun/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlilujun588588[0m ([33maistar[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# Hub id of the checkpoint to fine-tune, and the name used for the fine-tuned output.
base_model = "openbmb/MiniCPM3-4B"
new_model = f"{base_model}-ft-insurance"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16
attn_implementation = "eager"

# QLoRA config: 4-bit NF4 weights with double quantization, computing in torch_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model. trust_remote_code=True runs the modelling code shipped in the Hub
# repository, so review it (or pin a revision) before executing.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
    trust_remote_code=True,
)

# Load tokenizer. It is loaded with the same trust_remote_code setting as the
# model so a repository-provided tokenizer class can be resolved.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Only install the ChatML format when the tokenizer has no chat template of its
# own: setup_chat_format refuses (ValueError) to overwrite an existing template
# in recent TRL releases, and it also adds tokens / resizes the embeddings.
if getattr(tokenizer, "chat_template", None) is None:
    model, tokenizer = setup_chat_format(model, tokenizer)

# LoRA config: adapters on the attention and MLP projection layers.
# NOTE(review): prepare_model_for_kbit_training is imported but never applied
# to the quantized model before wrapping — confirm whether that is intended.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)
model = get_peft_model(model, peft_config)

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM3-4B:
- configuration_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM3-4B:
- modeling_minicpm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


KeyboardInterrupt: 

In [None]:
# Importing the dataset
# NOTE(review): `dataset_name` is not assigned in any cell visible in this
# notebook; it must be defined (ideally in a config cell near the top) before
# this cell can run on a fresh kernel.
dataset = load_dataset(dataset_name, split="all")
# Only use (at most) 1000 shuffled samples for a quick demo. The cap keeps
# select() from indexing past the end of a dataset with fewer than 1000 rows.
dataset = dataset.shuffle(seed=65)
dataset = dataset.select(range(min(1000, len(dataset))))

def format_chat_template(row):
    """Attach a chat-formatted ``text`` field to one dataset row.

    The row's ``Patient`` value is used as the user turn and its ``Doctor``
    value as the assistant turn. The two-turn conversation is rendered to a
    string (``tokenize=False``) with the module-level ``tokenizer``'s chat
    template and stored under ``row["text"]``. The same row object is returned.
    """
    user_turn = {"role": "user", "content": row["Patient"]}
    assistant_turn = {"role": "assistant", "content": row["Doctor"]}
    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
    )
    row["text"] = rendered
    return row

# Render every row into a single chat-formatted "text" column.
dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)
# The trainer below reads dataset["train"] and dataset["test"], so the single
# Dataset has to be split into a DatasetDict first; without this the string
# lookups fail because "train"/"test" are not columns.
dataset = dataset.train_test_split(test_size=0.1, seed=65)

training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    # A float below 1 is interpreted as a fraction of the total training steps.
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

# NOTE(review): max_seq_length / dataset_text_field / tokenizer / packing are
# accepted as SFTTrainer keyword arguments only by some TRL versions (newer
# releases expect them on SFTConfig) — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

trainer.train()

In [None]:
# Close the W&B run that was opened with wandb.init(...) in the login cell.
wandb.finish()
# Switch the model config's use_cache flag on ahead of the generation cell
# (presumably to re-enable key/value caching for inference — confirm).
model.config.use_cache = True

In [None]:
# Single-turn conversation to send to the fine-tuned model.
# NOTE(review): the user content is an empty placeholder — fill in a real
# question before running.
messages = [
    {
        "role": "user",
        "content": ""
    }
]
# Render the conversation and append the assistant header so the model answers.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Place the inputs on whatever device the model was loaded onto rather than a
# hard-coded "cuda".
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(model.device)
outputs = model.generate(**inputs, max_length=150, num_return_sequences=1)
# generate() returns prompt + continuation; slice off the prompt tokens so only
# the reply is decoded. Splitting the decoded string on "assistant" raised
# IndexError when the marker was absent and cut replies containing that word.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(text)