In [1]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from typing import Dict, List

import matplotlib as mpl
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub id of the base checkpoint, shared by the tokenizer and the model load.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
# Dedicated padding token registered with the tokenizer below.
PAD_TOKEN = "<|pad|>"

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)

# Register the pad token as a special token and pad on the right.
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right"

# Resize the embedding matrix to the enlarged vocabulary (rounded up to a
# multiple of 8); kept as the last expression so the new shape is displayed.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


Embedding(128264, 4096)

In [3]:
# Map the local JSON file onto a single "train" split.
train_files = {"train": "train.json"}
dataset = load_dataset("json", data_files=train_files)

Generating train split: 3884 examples [00:00, 87117.78 examples/s]


In [4]:
# Marker string handed to the completion-only collator as its response template.
# NOTE(review): presumably this is the header terminator that precedes the
# assistant turn in the training text — confirm against the data format.
response_template = "<|end_header_id|>"
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

In [5]:
# Module names that receive LoRA adapters: the attention projections and the
# MLP projections.
lora_target_modules = [
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    "mlp.gate_proj",
    "mlp.up_proj",
    "mlp.down_proj",
]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Prepare the quantized model for k-bit training, then wrap it with the adapters.
# Note: this rebinds `model`, so re-running the cell wraps the model again.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [6]:
# Supervised fine-tuning configuration: checkpoints go to ./chatbot_07 and the
# training text is read from the dataset's "text" column.
sft_config = SFTConfig(
    output_dir="./chatbot_07",
    dataset_text_field="text",
    max_seq_length=512,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    # NOTE(review): the base model is quantized with a bfloat16 compute dtype;
    # consider bf16=True instead on hardware that supports it — confirm.
    fp16=True,
    warmup_ratio=0.1,
    save_safetensors=True,
    dataset_kwargs={
        # NOTE(review): presumably the "text" field already contains the chat
        # template's special tokens, so none are added here — confirm.
        "add_special_tokens": False,
        "append_concat_token": False,
    },
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    # Pass the completion-only collator defined in the earlier cell. It was
    # previously constructed but never handed to the trainer, so it had no
    # effect on training.
    data_collator=collator,
)

Map: 100%|██████████| 3884/3884 [00:00<00:00, 8482.49 examples/s]


In [None]:
# Run fine-tuning with the trainer configured above; the call's return value is
# the cell's displayed output.
trainer.train()

In [None]:
# Write the (PEFT-wrapped) model and the tokenizer into the same directory.
results_dir = './results'
model.save_pretrained(results_dir)
tokenizer.save_pretrained(results_dir)