In [1]:
password = input("Enter the password: ")
!echo {password} | sudo -S apt-get install mpich -y

パッケージリストを読み込んでいます... 完了0%
依存関係ツリーを作成しています... 完了%         
状態情報を読み取っています... 完了          
mpich はすでに最新バージョン (4.0-3) です。
アップグレード: 0 個、新規インストール: 0 個、削除: 0 個、保留: 24 個。


In [2]:
# set env
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["LOCAL_RANK"] = "0"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29500"

In [3]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install 'transformers==4.51.3'
!pip install 'numpy==1.26.4'
!pip install 'datasets==3.5.0'
!pip install 'huggingface_hub[cli]==0.30.2'
!pip install 'deepspeed==0.16.7' --use-pep517
!pip install 'trl==0.19.0'
!pip install 'mpi4py'
!pip install 'peft==0.15.2'

Looking in indexes: https://download.pytorch.org/whl/cu124


In [4]:
# Set Environment Variables
default_environment_variables = {
    "output_dir": "./output/llama-3-2-1b-alpaca-lora",
    "wandb_account_name": "arekunoimar-deepspeed",
    "wandb_project_name": "llama-3-2-1b",
    "model_name": "meta-llama/Llama-3.2-1B",
    "dataset": "../dataset/alpaca_data.json",
    "dataset_max_length": 512,
    "apply_dataset_rate": 0.6,
    "dataset_train_rate": 0.8,
    "dataset_validation_rate": 0.1,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 1,
    "max_grad_norm": 1.0,
    "optim": "adamw_torch",
    "logging_steps": 1,
    "learning_rate": 1e-4,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 700,
    "seed": 1024,
    "fp16": False,
    "bf16": True,
    "save_strategy": "steps",
    "save_steps": 1000,
    "save_total_limit": 1,
    "eval_strategy": "steps",
    "eval_steps": 1000,
    "do_eval": True,
    "logging_distance_time": 1,
    "weight_decay": 0.01,
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    "lora_bias": "none",
    # "deepspeed_zero0": False,
    # "deepspeed_train_config_zero0_path":"deepspeed_train_config_zero0.json",
    # "deepspeed_zero1": False,
    # "deepspeed_train_config_zero1_path":"deepspeed_train_config_zero1.json",
    # "deepspeed_zero2": True,
    # "deepspeed_train_config_zero2_path":"deepspeed_train_config_zero2.json",
    # "deepspeed_zero3": False,
    # "deepspeed_train_config_zero3_path":"deepspeed_train_config_zero3.json",
    # "deepspeed_zero3_infinity": False,
    # "deepspeed_train_config_zero3_infinity_path":"deepspeed_train_config_zero3_infinity.json",
}

In [5]:
# set deepspeed config
deepspeed_zero2_config = {
  "zero_optimization": {
    "stage": 2
  },
  "train_batch_size": 1,
  "eval_batch_size": 1
}

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# load model, tokenizer
tokenizer = AutoTokenizer.from_pretrained(default_environment_variables["model_name"])
model = AutoModelForCausalLM.from_pretrained(default_environment_variables["model_name"])

[2025-06-24 16:51:17,136] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [8]:
# set tokenizer special token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [9]:
from peft import LoraConfig, get_peft_model, TaskType

In [10]:
# lora setting
peft_config = LoraConfig(
    r=default_environment_variables["lora_r"],
    lora_alpha=default_environment_variables["lora_alpha"],
    target_modules=default_environment_variables["lora_target_modules"],
    lora_dropout=default_environment_variables["lora_dropout"],
    bias=default_environment_variables["lora_bias"],
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, peft_config)

In [11]:
# check trainable parameters
if hasattr(model, 'print_trainable_parameters'):
    print("\n=== PEFT Model Information ===")
    print(model.print_trainable_parameters())


=== PEFT Model Information ===
trainable params: 45,088,768 || all params: 1,280,903,168 || trainable%: 3.5201
None


In [12]:
# read dataset
import pandas
import json

def load_alpaca_dataset():
  with open(default_environment_variables["dataset"], 'r', encoding='utf-8') as f:
    data = json.load(f)

  dataframe = pandas.DataFrame(data)
  dataframe = dataframe[['instruction', 'input', 'output']]
  dataframe.head(100)
  return dataframe

In [13]:
dataframe = load_alpaca_dataset()
print(dataframe.head())

                                         instruction input  \
0               Give three tips for staying healthy.         
1                 What are the three primary colors?         
2                 Describe the structure of an atom.         
3                   How can we reduce air pollution?         
4  Describe a time when you had to make a difficu...         

                                              output  
0  1.Eat a balanced diet and make sure to include...  
1  The three primary colors are red, blue, and ye...  
2  An atom is made up of a nucleus, which contain...  
3  There are a number of ways to reduce air pollu...  
4  I had to make a difficult decision when I was ...  


In [14]:
# apply dataformat
def apply_dataset_dataformat(dataframe):
    def format_instruction(row):
        if row['input']:
            text = f"###instruction:\n{row['instruction']}\n###input:\n{row['input']}\n###output:\n{row['output']}"
        else:
            text = f"###instruction:\n{row['instruction']}\n###output:\n{row['output']}"
        return text
    
    formated_dataframe = dataframe.apply(format_instruction, axis=1)
    return formated_dataframe

In [15]:
formated_dataframe = apply_dataset_dataformat(dataframe)
print(formated_dataframe.head())

0    ###instruction:\nGive three tips for staying h...
1    ###instruction:\nWhat are the three primary co...
2    ###instruction:\nDescribe the structure of an ...
3    ###instruction:\nHow can we reduce air polluti...
4    ###instruction:\nDescribe a time when you had ...
dtype: object


In [16]:
# split dataset
def split_alpaca_dataset(dataframe):
  total_size_dataset = dataframe.sample(frac=default_environment_variables["apply_dataset_rate"])
  total_size_count = len(total_size_dataset)

  train_size = int(len(total_size_dataset) * default_environment_variables["dataset_train_rate"])
  validation_size = int(len(total_size_dataset) * default_environment_variables["dataset_validation_rate"])
  test_size = total_size_count - (train_size + validation_size)

  train_dataset = total_size_dataset.iloc[:train_size]
  validation_dataset = total_size_dataset.iloc[train_size:train_size + validation_size]
  test_dataset = total_size_dataset.iloc[train_size + validation_size:]

  return train_dataset, validation_dataset, test_dataset

In [17]:
train_dataset, validation_dataset, test_dataset = split_alpaca_dataset(load_alpaca_dataset())
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 24960
Validation dataset size: 3120
Test dataset size: 3121


In [18]:
from transformers import pipeline

In [19]:
# check befor model output
test_qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=100, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
prompt = """
###instruction:\nPlease answer the questions.\n###input:\nWhat is AI?\n###output:
"""
generate_text = test_qa_pipeline(prompt, max_length=100, num_return_sequences=1, temperature=0.8)[0]["generated_text"]
print(generate_text)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoFo


###instruction:
Please answer the questions.
###input:
What is AI?
###output:
AI is the acronym for artificial intelligence. It is the ability of machines to think and learn for themselves. This is not the same as human intelligence, which is the ability to reason and make decisions. AI is more closely related to human intelligence than it is to human cognitive abilities, but it has also been called the 'first artificial intelligence' because it was one of the first applications of computers to solve


In [20]:
from trl import DataCollatorForCompletionOnlyLM

In [21]:
# DataCollator setting
collator = DataCollatorForCompletionOnlyLM(response_template="###output:\n", tokenizer=tokenizer)

In [22]:
from transformers import TrainingArguments

In [23]:
# set training_args
training_arguments = TrainingArguments(
    output_dir=default_environment_variables["output_dir"],
    num_train_epochs=default_environment_variables["num_train_epochs"],
    per_device_train_batch_size=default_environment_variables["per_device_train_batch_size"],
    per_device_eval_batch_size=default_environment_variables["per_device_eval_batch_size"],
    gradient_accumulation_steps=default_environment_variables["gradient_accumulation_steps"],
    max_grad_norm=default_environment_variables["max_grad_norm"],
    optim=default_environment_variables["optim"],
    learning_rate=default_environment_variables["learning_rate"],
    weight_decay=default_environment_variables["weight_decay"],
    lr_scheduler_type=default_environment_variables["lr_scheduler_type"],
    warmup_steps=default_environment_variables["warmup_steps"],
    logging_steps=default_environment_variables["logging_steps"],
    seed=default_environment_variables["seed"],
    fp16=default_environment_variables["fp16"],
    bf16=default_environment_variables["bf16"],
    deepspeed=deepspeed_zero2_config,
    save_strategy=default_environment_variables["save_strategy"],
    save_steps=default_environment_variables["save_steps"],
    save_total_limit=default_environment_variables["save_total_limit"],
    eval_strategy=default_environment_variables["eval_strategy"],
    eval_steps=default_environment_variables["eval_steps"],
    do_eval=default_environment_variables["do_eval"]
)

[2025-06-24 16:51:21,923] [INFO] [comm.py:669:init_distributed] cdb=None
[2025-06-24 16:51:21,923] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


RuntimeError: The server socket has failed to listen on any local network address. port: 29500, useIpv6: 0, code: -98, name: EADDRINUSE, message: address already in use

In [None]:
# format dataset
train_formatted = apply_dataset_dataformat(train_dataset)
print(f"train_formatted: {train_formatted}")
validation_formatted = apply_dataset_dataformat(validation_dataset)
print(f"validation_formatted: {validation_formatted}")

In [25]:
from trl import SFTTrainer
from datasets import Dataset

In [None]:
# check dataset values
train_hf_dataset = Dataset.from_dict({"text": train_formatted.tolist()})
validation_hf_dataset = Dataset.from_dict({"text": validation_formatted.tolist()})
print(f"train_hf_dataset: {train_hf_dataset}")
print(f"validation_hf_dataset: {validation_hf_dataset}")

print('-'*10 + 'train_hf_dataset' + '-'*10)
for i in range(10):
    print(train_hf_dataset[i]['text'])
print('-'*10 + 'validation_hf_dataset' + '-'*10)
for i in range(10):
    print(validation_hf_dataset[i]['text'])

In [None]:
# SFTTrainer setting
trainer = SFTTrainer(
    model=model,
    train_dataset=train_hf_dataset,
    eval_dataset=validation_hf_dataset,
    args=training_arguments,
    data_collator=collator,
)

In [None]:
# train
trainer.train()

In [29]:
# trainer.save_model() # save model
trainer.save_state() # save metrics