## Params

In [1]:
from typing import Optional, List, Dict
from dataclasses import dataclass, field
import torch

In [2]:
@dataclass
class ModelArguments:
    """Settings that select the model, its tokenizer and the output location.

    Every field has a default, so ``ModelArguments()`` is valid. Fields whose
    default is ``None`` are annotated ``Optional[...]`` so the annotation
    matches the value that is actually stored.
    """

    # Checkpoint to load: a local path or a Hugging Face Hub identifier.
    model_name_or_path: str = field(
        default="meta-llama/Llama-2-7b-chat-hf",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
    )
    # Optional separate tokenizer location; None means "not specified".
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
    )
    # Optional list of adapter paths; None means "no adapters".
    adapter_paths: Optional[List[str]] = field(
        default=None,
        metadata={"help": "A list of paths to multi model adapters."},
    )
    model_max_length: int = field(
        default=512,
        metadata={"help": "Max length of the tokenizer"},
    )
    # Never commit a real token; supply it at runtime instead.
    huggingface_token: Optional[str] = field(
        default=None,
        metadata={"help": "Huggingface token for private model"},
    )
    use_fast_tokenizer: bool = field(
        default=False,
        metadata={"help": "Whether to use fast tokenizer"},
    )
    output_dir: str = field(
        default="./",
        metadata={"help": "Output directory for the model"},
    )

In [3]:
@dataclass
class DataArguments:
    """Settings that describe the training data and how prompts are formed.

    ``train_file`` defaults to ``None`` (no file chosen yet), so it is
    annotated ``Optional[str]`` to match the value that is actually stored.
    """

    # Path of the jsonl training file; None until the caller provides one.
    train_file: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the train data in jsonl format."}
    )
    dataset_text_field: Optional[str] = field(
        default="instruction",
        metadata={"help": "The field in dataset for completion pretraining. Mandatory if 'pre_sft' is True."}
    )
    user_prompt_format: Optional[str] = field(
        default="llama2",
        metadata={"help": "The name of a known llm model prompt format or a the user custom prompt. Mandatory if 'instruct_sft' is True."}
    )
    user_response_sentence: Optional[str] = field(
        default=None,
        metadata={"help": "The response sentence for instruction pretraining"}
    )

In [4]:
@dataclass
class TrainingArguments:
    """Training-mode switches and optimisation / logging / checkpoint settings.

    Every field has a default, so ``TrainingArguments()`` is valid.
    """

    pre_sft: bool = field(
        default=False,
        metadata={"help": "Whether to pretrain the model"},
    )
    instruct_sft: bool = field(
        default=True,
        metadata={"help": "Whether to instruct the model"},
    )
    packing: bool = field(
        default=True,
        metadata={"help": "Whether to pack the data into constant length batches to accelerate training"},
    )
    fp16: bool = True
    bf16: bool = False
    num_epochs: int = 3
    batch_size: int = 16
    learning_rate: float = 5e-5
    optim: str = "adamw_apex_fused"
    gradient_accumulation_steps: int = 1
    logging_strategy: str = "steps"
    logging_steps: int = 100
    # No trailing comma here: a trailing comma would turn the default into a
    # one-element tuple instead of the intended string.
    logging_dir: str = "/app/logging_finetuning/"
    save_strategy: str = "no"
    save_steps: int = 1000

In [5]:
@dataclass
class AdapterArguments:
    """LoRA adapter settings; every field has a default."""

    # TODO: How to get the optimal parameters for lora?
    use_lora: bool = field(default=True, metadata={"help": "Whether to use lora"})
    # A factory is used so each instance gets its own list object.
    lora_target_modules: List[str] = field(
        default_factory=lambda: ["qkv_proj"], metadata={"help": "Target modules for lora"}
    )
    lora_r: int = field(default=8, metadata={"help": "R for lora"})
    lora_alpha: int = field(default=16, metadata={"help": "Alpha for lora"})
    lora_dropout: float = field(default=0.05, metadata={"help": "Dropout for lora"})
    lora_bias: str = field(default="none", metadata={"help": "Bias for lora"})

## Train

In [6]:
from typing import Dict

import transformers
import huggingface_hub

from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

from params import (
    ModelArguments, 
    DataArguments, 
    TrainingArguments, 
    AdapterArguments
)
from prompt_template import PromptFormatter
from callbacks import SavingCallback, SavingCallbackHalf
from utils import merge_peft_model

import pandas as pd
import json

In [7]:
import os

# SECURITY: the access token is read from the HF_TOKEN environment variable
# instead of being written into the notebook. Any token that was previously
# committed in this cell should be treated as leaked and revoked.
# When HF_TOKEN is unset the value is None.
model_args = ModelArguments(
    model_name_or_path="../../home/drossini/models/phi_mean_pooled/",
    tokenizer_name_or_path="microsoft/Phi-3-mini-128k-instruct",
    adapter_paths=None,
    model_max_length=2048,
    huggingface_token=os.environ.get("HF_TOKEN"),
    use_fast_tokenizer=True,
    output_dir="../../home/drossini/models/"
)

In [8]:
# Data configuration for this run: raw-text training on the "text" field of
# the jsonl file; the prompt format and response sentence are left unset.
data_args = DataArguments(
    user_prompt_format=None,
    user_response_sentence=None,
    dataset_text_field="text",
    train_file="../SlimPajama_chunk/slim_pajama_chunk_of_chunk.jsonl",
)

In [9]:
# Training configuration for this run, grouped by concern.
training_args = TrainingArguments(
    # training mode
    pre_sft=True,
    instruct_sft=False,
    packing=True,
    # numeric precision
    fp16=True,
    bf16=False,
    # optimisation
    optim="adamw_hf",
    learning_rate=3e-5,
    num_epochs=3,
    batch_size=2,
    gradient_accumulation_steps=2,
    # logging
    logging_dir="../Fine_tuning/pre_sft_logging_finetuning/",
    logging_strategy="steps",
    logging_steps=500,
    # checkpointing
    save_strategy="steps",
    save_steps=1000,
)

In [10]:
# LoRA configuration for this run; the adapter targets the "qkv_proj" module.
adapter_args = AdapterArguments(
    use_lora=True,
    lora_r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    lora_bias="none",
    lora_target_modules=["qkv_proj"],
)

In [11]:
def prepare_args(
    model_args: ModelArguments,
    data_args: DataArguments,
    training_args: TrainingArguments,
    adapter_args: AdapterArguments,
    args: Dict
):
    """Build a ``transformers.TrainingArguments`` from the notebook's settings.

    Args:
        model_args: supplies ``output_dir``.
        data_args: accepted for signature symmetry; not read here.
        training_args: supplies precision, epoch, batch, optimiser, logging
            and checkpoint settings.
        adapter_args: accepted for signature symmetry; not read here.
        args: extra keyword arguments forwarded verbatim. A key that repeats
            one of the settings mapped below raises ``TypeError``, because the
            two mappings are unpacked into the same call.

    Returns:
        The constructed ``transformers.TrainingArguments`` instance.
    """
    # Translate our field names to the names the Hugging Face class expects.
    mapped_settings = {
        "output_dir": model_args.output_dir,
        "fp16": training_args.fp16,
        "bf16": training_args.bf16,
        "num_train_epochs": training_args.num_epochs,
        "per_device_train_batch_size": training_args.batch_size,
        "learning_rate": training_args.learning_rate,
        "optim": training_args.optim,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "logging_dir": training_args.logging_dir,
        "logging_strategy": training_args.logging_strategy,
        "logging_steps": training_args.logging_steps,
        "save_strategy": training_args.save_strategy,
        "save_steps": training_args.save_steps,
    }
    return transformers.TrainingArguments(**mapped_settings, **args)

In [12]:
def train_clm(
    model_args: ModelArguments, 
    data_args: DataArguments,
    training_args: TrainingArguments,
    adapter_args: AdapterArguments
    ):
    """Fine-tune a causal language model with TRL's ``SFTTrainer``.

    Steps, in order: validate the configuration, optionally log in to the
    Hugging Face Hub, load the model (merging adapters if given), build the
    optional LoRA config, load the tokenizer, build the trainer arguments,
    load the jsonl dataset, construct the trainer for the selected mode and
    call ``trainer.train()``.

    Args:
        model_args: model / tokenizer locations, max sequence length, optional
            Hub token and output directory.
        data_args: training file, text field and prompt-format settings.
        training_args: mode switches (``pre_sft`` takes precedence over
            ``instruct_sft`` when both are set) and trainer hyper-parameters.
        adapter_args: LoRA settings; ``use_lora`` also selects which saving
            callback is attached.

    Returns:
        None.

    Raises:
        ValueError: if neither ``pre_sft`` nor ``instruct_sft`` is set, or if
            ``data_args.train_file`` is empty. Both checks run before any
            model or tokenizer is loaded.
    """
    # Fail fast: reject an unusable configuration before the expensive
    # login / model / tokenizer work instead of after it.
    if not (training_args.pre_sft or training_args.instruct_sft):
        raise ValueError("Please specify the training mode: pre_sft or instruct_sft")
    if not data_args.train_file:
        raise ValueError("Please specify data_args.train_file: path to the train data in jsonl format")

    # Extra keyword arguments forwarded to transformers.TrainingArguments.
    extra_training_kwargs = {}

    # huggingface login
    if model_args.huggingface_token:
        huggingface_hub.login(token=model_args.huggingface_token)

    # load model
    # NOTE(review): trust_remote_code=True is passed through to from_pretrained;
    # confirm the model source is trusted.
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        trust_remote_code=True
    )

    # apply adapter
    if model_args.adapter_paths:
        model = merge_peft_model(model, model_args.adapter_paths)

    lora_config = None
    if adapter_args.use_lora:
        lora_config = LoraConfig(
            r=adapter_args.lora_r,
            lora_alpha=adapter_args.lora_alpha,
            lora_dropout=adapter_args.lora_dropout,
            target_modules=adapter_args.lora_target_modules,
            bias=adapter_args.lora_bias
        )
        extra_training_kwargs["remove_unused_columns"] = False

    # load tokenizer; fall back to the model location when none is given
    tokenizer_name_or_path = model_args.tokenizer_name_or_path or model_args.model_name_or_path
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name_or_path,
        padding_side="right",
        use_fast=model_args.use_fast_tokenizer,
        trust_remote_code=True,
        truncation=True
    )

    # The EOS token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    # set TrainingArguments for huggingface trainer
    hf_training_args = prepare_args(
        model_args, data_args, training_args, adapter_args, extra_training_kwargs
    )

    # Both training modes read the same jsonl file and use the same callback.
    dataset = load_dataset('json', data_files=data_args.train_file, split="train")
    callbacks = [SavingCallback() if adapter_args.use_lora else SavingCallbackHalf()]

    if training_args.pre_sft:
        # completion pretraining
        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            peft_config=lora_config,
            train_dataset=dataset,
            dataset_text_field=data_args.dataset_text_field,
            packing=training_args.packing,
            max_seq_length=model_args.model_max_length,
            args=hf_training_args,
            callbacks=callbacks
        )
    else:
        # instruction pretraining (instruct_sft is guaranteed by the check above)
        # format to correct prompt template
        prompt_formatter = PromptFormatter(data_args.user_prompt_format)
        response_template = data_args.user_response_sentence
        if response_template is None:
            response_template = prompt_formatter.response_template

        collator = DataCollatorForCompletionOnlyLM(
            response_template=response_template, 
            tokenizer=tokenizer
        )

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=dataset,
            peft_config=lora_config,
            max_seq_length=model_args.model_max_length,
            args=hf_training_args,
            formatting_func=prompt_formatter.formatting_train_prompts_func,
            data_collator=collator,
            callbacks=callbacks
        )

    trainer.train()

In [13]:
import os

# Set PYTORCH_CUDA_ALLOC_CONF for this process before training is started.
# NOTE(review): presumably this must be set before PyTorch first initialises
# its CUDA allocator to take effect — confirm against the PyTorch docs.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

In [14]:
# Start the training run with the four configuration objects built above.
train_clm(model_args, data_args, training_args, adapter_args)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/drossini/.cache/huggingface/token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating train split: 0 examples [00:00, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


KeyboardInterrupt: 