In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install 'transformers==4.51.3'
!pip install 'numpy==2.1.2'
!pip install 'datasets==3.5.0'
!pip install 'huggingface_hub[cli]==0.30.2'
!pip install 'deepspeed==0.16.7' --use-pep517
!pip install 'accelerate==1.6.0'

In [12]:
import os

# Configure the PyTorch CUDA allocator through its environment variable.
# This cell runs ahead of the cells that load the model.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [14]:
# Central configuration for the notebook: every tunable value is read from this dict
# by the cells below.
default_environment_variables = dict(
    # --- output location / experiment tracking ---
    output_dir="./training_output",
    wandb_account_name="arekunoimar-deepspeed",
    wandb_project_name="llama-3-2-1b",
    # --- model and dataset ---
    model_name="meta-llama/Llama-3.2-1B",
    dataset="alpaca_data.json",
    dataset_max_length=512,
    apply_dataset_rate=0.1,        # fraction of the dataset that is sampled
    dataset_train_rate=0.8,        # share of the sample used for training
    dataset_validation_rate=0.1,   # share of the sample used for validation
    # --- optimisation ---
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
    optim="adamw_torch",
    logging_steps=1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=700,
    seed=1024,
    # --- precision ---
    fp16=False,
    bf16=True,
    # --- checkpointing / evaluation ---
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    eval_strategy="steps",
    eval_steps=1000,
    do_eval=True,
    logging_distance_time=1,
    # --- currently disabled DeepSpeed stage switches ---
    # deepspeed_zero0=False,
    # deepspeed_train_config_zero0_path="deepspeed_train_config_zero0.json",
    # deepspeed_zero1=False,
    # deepspeed_train_config_zero1_path="deepspeed_train_config_zero1.json",
    # deepspeed_zero2=True,
    # deepspeed_train_config_zero2_path="deepspeed_train_config_zero2.json",
    # deepspeed_zero3=False,
    # deepspeed_train_config_zero3_path="deepspeed_train_config_zero3.json",
    # deepspeed_zero3_infinity=False,
    # deepspeed_train_config_zero3_infinity_path="deepspeed_train_config_zero3_infinity.json",
)

In [None]:
# DeepSpeed settings handed to TrainingArguments(deepspeed=...); selects ZeRO stage 2.
# NOTE(review): `eval_batch_size` does not look like a documented DeepSpeed config key,
# and the batch size is hard-coded here as well as in the training config — confirm.
deepspeed_zero2_config = dict(
  zero_optimization=dict(stage=2),
  train_batch_size=1,
  eval_batch_size=1,
)

In [None]:
# Download the model snapshot named in the config into the local Hugging Face cache.
from huggingface_hub import snapshot_download, login
# login()  # uncomment to authenticate first if the Hub rejects the download
# `local_dir_use_symlinks` is no longer passed: the pinned huggingface_hub release
# treats it as deprecated and ignores it, so it only produced a warning.
snapshot_download(repo_id=default_environment_variables["model_name"], revision="main")

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the pretrained tokenizer and causal-LM model named in the config.
tokenizer = AutoTokenizer.from_pretrained(
    default_environment_variables["model_name"],
)
model = AutoModelForCausalLM.from_pretrained(
    default_environment_variables["model_name"],
)

# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [25]:
# read dataset
import pandas
import json

def load_alpaca_dataset(path=None):
  """Read an Alpaca-style JSON file into a DataFrame.

  Args:
    path: JSON file to read. When omitted, the file named by
      ``default_environment_variables["dataset"]`` is used.

  Returns:
    A DataFrame restricted to the ``instruction``, ``input`` and ``output``
    columns, in that order.

  Raises:
    KeyError: if any of the three expected columns is missing from the file.
  """
  if path is None:
    path = default_environment_variables["dataset"]

  with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  # The former bare `dataframe.head(100)` call discarded its result and had no
  # effect, so it was removed; all rows are returned, exactly as before.
  return pandas.DataFrame(data)[['instruction', 'input', 'output']]

In [None]:
# Load the Alpaca table and print a preview of its first five rows.
dataframe = load_alpaca_dataset()
print(dataframe.head(n=5))

In [27]:
# apply dataformat

def apply_dataset_dataformat(dataframe):
  """Turn every row into a dict keyed by '###instruction', '###input' and '###output'.

  The values are taken from the row's ``instruction``, ``input`` and ``output``
  columns; the result of the row-wise ``apply`` is returned unchanged.
  """
  def to_prompt_record(row):
    # One record per row, using the prompt markers as keys.
    return {
      '###instruction': row['instruction'],
      '###input': row['input'],
      '###output': row['output'],
    }

  return dataframe.apply(to_prompt_record, axis=1)


In [None]:
# Apply the prompt format to the loaded table and print the first five records.
formated_dataframe = apply_dataset_dataformat(dataframe)
print(formated_dataframe.head(n=5))

In [31]:
# split dataset

def split_alpaca_dataset(dataframe, apply_rate=None, train_rate=None, validation_rate=None, seed=None):
  """Sample a fraction of ``dataframe`` and cut it into train/validation/test parts.

  Every optional argument falls back to the matching entry of
  ``default_environment_variables`` when left as ``None``.

  Args:
    dataframe: table to sample from.
    apply_rate: fraction of rows to keep (config key ``apply_dataset_rate``).
    train_rate: share of the sampled rows used for training
      (config key ``dataset_train_rate``).
    validation_rate: share of the sampled rows used for validation
      (config key ``dataset_validation_rate``).
    seed: random state for the sampling (config key ``seed``). Seeding the
      sample makes the split identical on every run.

  Returns:
    ``(train_dataset, validation_dataset, test_dataset)``; the test part is
    whatever remains after the train and validation rows.
  """
  if apply_rate is None:
    apply_rate = default_environment_variables["apply_dataset_rate"]
  if train_rate is None:
    train_rate = default_environment_variables["dataset_train_rate"]
  if validation_rate is None:
    validation_rate = default_environment_variables["dataset_validation_rate"]
  if seed is None:
    seed = default_environment_variables["seed"]

  # sample() also shuffles, so the positional slices below are random subsets.
  sampled = dataframe.sample(frac=apply_rate, random_state=seed)

  train_size = int(len(sampled) * train_rate)
  validation_size = int(len(sampled) * validation_rate)
  validation_end = train_size + validation_size

  train_dataset = sampled.iloc[:train_size]
  validation_dataset = sampled.iloc[train_size:validation_end]
  test_dataset = sampled.iloc[validation_end:]

  return train_dataset, validation_dataset, test_dataset

In [None]:
# Reload the raw table, split it, and report the size of each part.
train_dataset, validation_dataset, test_dataset = split_alpaca_dataset(
    load_alpaca_dataset()
)
print(
    f"Train dataset size: {len(train_dataset)}",
    f"Validation dataset size: {len(validation_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

In [33]:
from transformers import pipeline

In [None]:
# Check the model's output before fine-tuning.
# NOTE(review): the prompt spells the last marker "### output:" (with a space) while the
# other two markers, and the keys built by apply_dataset_dataformat, have no space —
# confirm this is intended.
test_qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
prompt = "###instruction: Please answer the questions.  ###input: what is AI ?  ### output:"
# Ask for a single sequence and keep only its generated text.
generate_text = test_qa_pipeline(
    prompt,
    max_length=100,
    num_return_sequences=1,
    temperature=0.8,
)[0]["generated_text"]
print(generate_text)

In [None]:
from transformers import TrainingArguments

In [None]:
# Build the Trainer configuration from the central config dict.
training_arguments = TrainingArguments(
    # output location and run length
    output_dir=default_environment_variables["output_dir"],
    num_train_epochs=default_environment_variables["num_train_epochs"],
    # batching
    per_device_train_batch_size=default_environment_variables["per_device_train_batch_size"],
    per_device_eval_batch_size=default_environment_variables["per_device_eval_batch_size"],
    gradient_accumulation_steps=default_environment_variables["gradient_accumulation_steps"],
    # optimiser and schedule
    max_grad_norm=default_environment_variables["max_grad_norm"],
    optim=default_environment_variables["optim"],
    learning_rate=default_environment_variables["learning_rate"],
    # The config dict defines no "weight_decay" entry, so indexing it raised KeyError.
    # Fall back to 0.0 (no weight decay) unless the key is added to the config.
    weight_decay=default_environment_variables.get("weight_decay", 0.0),
    lr_scheduler_type=default_environment_variables["lr_scheduler_type"],
    warmup_steps=default_environment_variables["warmup_steps"],
    logging_steps=default_environment_variables["logging_steps"],
    seed=default_environment_variables["seed"],
    # precision
    fp16=default_environment_variables["fp16"],
    bf16=default_environment_variables["bf16"],
    # DeepSpeed config is passed as an in-memory dict
    deepspeed=deepspeed_zero2_config,
    # checkpointing
    save_strategy=default_environment_variables["save_strategy"],
    save_steps=default_environment_variables["save_steps"],
    save_total_limit=default_environment_variables["save_total_limit"],
    # evaluation
    eval_strategy=default_environment_variables["eval_strategy"],
    eval_steps=default_environment_variables["eval_steps"],
    do_eval=default_environment_variables["do_eval"]
)

In [None]:
from transformers import DataCollatorForLanguageModeling

In [None]:
# data_collator
def data_collator(tokenizer):
  """Create a DataCollatorForLanguageModeling for ``tokenizer``.

  The collator is built with ``mlm=False`` and ``pad_to_multiple_of=8``.
  """
  return DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
  )