<a href="https://colab.research.google.com/github/Frinkles/Simple-Exercises/blob/main/phi3_mini_midiam_traning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets #Not needed with custom dataset
!pip install peft
!pip install -U transformers accelerate
!pip install trl
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install flash-attn --no-build-isolation
!pip install huggingface_hub

In [None]:
import sys
# import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Keyword arguments handed unchanged to TrainingArguments below.
training_config = dict(
    bf16=True,
    do_eval=False,
    learning_rate=5.0e-06,
    log_level="info",
    logging_steps=50,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    max_steps=-1,
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,
    # Batch sizes are kept small to lower memory usage.
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    remove_unused_columns=True,
    save_steps=100,
    save_total_limit=1,
    seed=0,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    gradient_accumulation_steps=1,
    warmup_ratio=0.2,
)

# Keyword arguments handed unchanged to LoraConfig below.
peft_config = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    modules_to_save=None,
)

# Materialise the two config objects consumed by the trainer cell.
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

In [None]:
################
# Model Loading
################
# checkpoint_path = "microsoft/Phi-3-medium-4k-instruct"
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

# Pick the device first so the attention backend can follow it: FlashAttention-2
# needs a CUDA device, so requesting it unconditionally would make the CPU
# fallback below fail inside from_pretrained.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
attn_implementation = "flash_attention_2" if device.type == "cuda" else "eager"

model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    attn_implementation=attn_implementation,  # flash-attention only when a GPU is present
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

# Move the model to the selected device. Assigning the result (instead of leaving
# a bare expression as the last line) keeps the cell from echoing the full
# module repr into the notebook output.
model = model.to(device)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.63s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3FlashAttention2(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_feature

In [None]:
##################
# Data Processing
##################
def apply_chat_template(example, tokenizer):
    """Render one conversation to a single training string.

    Args:
        example: Mapping with a "messages" entry holding a list of
            ``{"role": ..., "content": ...}`` dicts.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=...,
            add_generation_prompt=...)``.

    Returns:
        The same ``example`` mapping with a new "text" entry containing the
        rendered conversation.

    Raises:
        KeyError: If ``example`` has no "messages" entry.
    """
    # Work on a copy so the caller's message list is not modified in place.
    messages = list(example["messages"])
    # Add an empty system message if there is none; an empty conversation is
    # treated the same way instead of failing on messages[0].
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False)
    return example

# Pull the dataset and pick out its two SFT splits.
raw_dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
train_dataset, test_dataset = raw_dataset["train_sft"], raw_dataset["test_sft"]
column_names = list(train_dataset.features)

# Shrink the training data: split with test_size=0.00481 (seed 42) and train on
# the small 'test' side of that split. An earlier variant used test_size=1/40.
train_dataset_split = train_dataset.train_test_split(test_size=0.00481, seed=42)
reduced_train_dataset = train_dataset_split['test']


def _render_chat_text(dataset, desc):
    """Apply the chat template to every row of `dataset`, dropping the original columns."""
    return dataset.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        remove_columns=column_names,
        desc=desc,
    )


processed_train_dataset = _render_chat_text(
    reduced_train_dataset, "Applying chat template to train_sft"
)
processed_test_dataset = _render_chat_text(
    test_dataset, "Applying chat template to test_sft"
)

In [None]:
###########
# Training
###########
# Build the trainer from the model/tokenizer, the two config objects and the
# chat-templated datasets prepared in the earlier cells.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    packing=True,
)

# Run training, then print and persist the resulting metrics and trainer state.
train_result = trainer.train()
metrics = train_result.metrics
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("train", metrics)
trainer.save_state()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 0 examples [00:00, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2503 > 2048). Running this sequence through the model will result in indexing errors
Generating train split: 15526 examples [00:40, 382.90 examples/s]
Using auto half precision backend
***** Running training *****
  Num examples = 679
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 170
  Number of trainable parameters = 25,165,824
 29%|██▉       | 50/170 [06:57<13:04,  6.54s/it] 

{'loss': 1.211, 'grad_norm': 0.396484375, 'learning_rate': 4.83118057351089e-06, 'epoch': 0.29}


 59%|█████▉    | 100/170 [13:22<08:05,  6.93s/it]Saving model checkpoint to ./checkpoint_dir\checkpoint-100


{'loss': 1.1613, 'grad_norm': 0.359375, 'learning_rate': 2.6154586466143495e-06, 'epoch': 0.59}


loading configuration file config.json from cache at C:\Users\frink\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\ff07dc01615f8113924aed013115ab2abd32115b\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05

{'loss': 1.1759, 'grad_norm': 0.3515625, 'learning_rate': 2.620917716123444e-07, 'epoch': 0.88}


100%|██████████| 170/170 [23:01<00:00, 10.42s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 170/170 [23:01<00:00,  8.13s/it]

{'train_runtime': 1381.7771, 'train_samples_per_second': 0.491, 'train_steps_per_second': 0.123, 'train_loss': 1.1797744750976562, 'epoch': 1.0}
***** train metrics *****
  epoch                    =        1.0
  total_flos               = 29121994GF
  train_loss               =     1.1798
  train_runtime            = 0:23:01.77
  train_samples_per_second =      0.491
  train_steps_per_second   =      0.123





In [None]:
#############
# Evaluation
#############
# Evaluation runs with left padding (training used right padding).
tokenizer.padding_side = "left"

# Evaluate, then record how many processed test rows exist next to the metrics.
metrics = {**trainer.evaluate(), "eval_samples": len(processed_test_dataset)}
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 15526
  Batch size = 4
 61%|██████▏   | 2378/3882 [1:23:07<1:57:54,  4.70s/it]

KeyboardInterrupt: 

In [None]:
#############
# Save model
#############
# Write the trained model into the output directory configured on train_conf.
trainer.save_model(output_dir=train_conf.output_dir)

Saving model checkpoint to ./checkpoint_dir
loading configuration file config.json from cache at C:\Users\frink\.cache\huggingface\hub\models--microsoft--Phi-3-mini-4k-instruct\snapshots\d269012bea6fbe38ce7752c8940fea010eea3383\config.json
Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms

In [None]:
import os

from huggingface_hub import login

# Never commit an access token in the notebook source. Read it from the HF_TOKEN
# environment variable instead; when that is unset, token=None makes login()
# prompt for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\frink\.cache\huggingface\token
Login successful


In [None]:
# Save the fine-tuned LoRA adapter and the tokenizer.
# Saving goes through trainer.model (the PEFT-wrapped model built by SFTTrainer
# from peft_conf) rather than the bare `model` object: saving `model` directly
# wrote a multi-GB sharded full checkpoint instead of the adapter weights.
# NOTE(review): this only writes to a local "Frinkles/Phi3AdapterModel" directory;
# publishing to the Hub would additionally need push_to_hub -- confirm intent.
trainer.model.save_pretrained("Frinkles/Phi3AdapterModel")
tokenizer.save_pretrained("Frinkles/Phi3AdapterModel")

Configuration saved in Frinkles/Phi3AdapterModel\config.json
Configuration saved in Frinkles/Phi3AdapterModel\generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at Frinkles/Phi3AdapterModel\model.safetensors.index.json.
