In [None]:
# Install MPICH through apt; `sudo -S` reads the password from stdin.
import getpass
import subprocess

# getpass does not echo the typed password, and sending it over stdin keeps it
# out of any shell command line (interpolating it into an `echo ... | sudo`
# shell string breaks on quotes/metacharacters and exposes the secret).
password = getpass.getpass("Enter the password: ")
try:
    apt_result = subprocess.run(
        ["sudo", "-S", "apt-get", "install", "mpich", "-y"],
        input=password + "\n",
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
finally:
    # Do not keep the sudo password alive in the kernel namespace.
    del password

# Show the apt log in the cell output, then fail loudly on a non-zero exit code.
print(apt_result.stdout)
apt_result.check_returncode()

In [2]:
# Process environment: CUDA allocator option plus the rank / world-size /
# rendezvous variables, all set in a single update.
import os

os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "RANK": "0",
        "WORLD_SIZE": "1",
        "LOCAL_RANK": "0",
        "MASTER_ADDR": "localhost",
        "MASTER_PORT": "29500",
    }
)

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install 'transformers==4.51.3'
!pip install 'numpy==1.26.4'
!pip install 'datasets==3.5.0'
!pip install 'huggingface_hub[cli]==0.30.2'
!pip install 'deepspeed==0.16.7' --use-pep517
!pip install 'trl==0.19.0'
!pip install 'mpi4py'
!pip install 'wandb==0.19.10'
!pip install 'peft==0.15.2'
!pip install ipykernel jupyter_client

In [4]:
# Run configuration: every tunable value of this notebook lives in this one
# dictionary and is looked up by key in the cells below.
default_environment_variables = dict(
    # --- output / experiment tracking ---
    output_dir="./output/llama-3-2-1b-alpaca",
    wandb_account_name="arekunoimar-deepspeed",
    wandb_project_name="llama-3-2-1b",
    # --- model and data ---
    model_name="meta-llama/Llama-3.2-1B",
    dataset="../dataset/alpaca_data.json",
    dataset_max_length=512,
    apply_dataset_rate=1.0,
    dataset_train_rate=0.8,
    dataset_validation_rate=0.2,
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
    optim="adamw_torch",
    logging_steps=1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    seed=1024,
    # --- precision ---
    fp16=False,
    bf16=True,
    # --- checkpointing / evaluation ---
    save_strategy="steps",
    save_steps=10000,
    save_total_limit=5,
    eval_strategy="steps",
    eval_steps=10000,
    do_eval=True,
    logging_distance_time=1,
    weight_decay=0.001,
)

In [None]:
import time
import wandb

# Run name = fixed prefix + wall-clock timestamp taken when this cell runs.
wandb_name = f"llama-3.2-1b-alpaca-deepspeed-zero2-{time.strftime('%Y-%m-%d_%H-%M-%S')}"

# Start the W&B run under the project/entity named in the configuration.
wandb.init(
    project=default_environment_variables["wandb_project_name"],
    entity=default_environment_variables["wandb_account_name"],
    name=wandb_name,
)

In [6]:
# DeepSpeed configuration (ZeRO stage 2), passed to TrainingArguments(deepspeed=...).
# The batch-size related entries are "auto" so the Hugging Face integration
# fills them from TrainingArguments; hard-coded numbers here have to be kept in
# sync by hand and raise a mismatch error as soon as the per-device batch size
# or gradient accumulation is changed in the run configuration.
deepspeed_zero2_config = {
  "zero_optimization": {
    "stage": 2
  },
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  # NOTE(review): "eval_batch_size" does not look like a documented DeepSpeed
  # option — kept unchanged; confirm and drop if it is ignored.
  "eval_batch_size": 1
}

In [7]:
import torch

# Allow TF32 both for matrix multiplications (cuda.matmul) and for cuDNN
# operations (convolutions, etc.).
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Instantiate the tokenizer first, then the causal-LM, both from the checkpoint
# id stored under "model_name" in the run configuration.
tokenizer, model = (
    AutoTokenizer.from_pretrained(default_environment_variables["model_name"]),
    AutoModelForCausalLM.from_pretrained(default_environment_variables["model_name"]),
)

In [10]:
# Use the EOS token id as the tokenizer's padding token id.
# NOTE(review): presumably the checkpoint's tokenizer ships without a pad token — confirm.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [11]:
# read dataset
import pandas
import json

def load_alpaca_dataset(path=None):
  """Read an Alpaca-style JSON file into a DataFrame.

  Args:
    path: Location of the JSON file. When omitted, the "dataset" entry of
      ``default_environment_variables`` is used.

  Returns:
    pandas.DataFrame restricted to the 'instruction', 'input' and 'output'
    columns, in that order.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if the file is not valid JSON.
    KeyError: if one of the three expected columns is absent.
  """
  if path is None:
    path = default_environment_variables["dataset"]

  with open(path, 'r', encoding='utf-8') as f:
    records = json.load(f)

  # Keep only the three fields used downstream; any extra keys are dropped.
  return pandas.DataFrame(records)[['instruction', 'input', 'output']]

In [None]:
# Load the dataset and preview its first rows; ending the cell with the
# expression lets the notebook render the frame as a table instead of plain text.
dataframe = load_alpaca_dataset()
dataframe.head()

In [13]:
# apply dataformat
def apply_dataset_dataformat(dataframe):
    """Render every row as a "###instruction / ###input / ###output" prompt.

    Args:
        dataframe: pandas DataFrame with 'instruction', 'input' and 'output'
            columns.

    Returns:
        pandas Series (indexed like ``dataframe``) holding one formatted string
        per row. The "###input:" section is left out when the row's input is
        empty or missing.
    """
    def has_input(value):
        # A missing input shows up as None/NaN, and NaN is truthy: a plain
        # truthiness test would render a literal "nan" input section.
        if pandas.api.types.is_scalar(value) and pandas.isna(value):
            return False
        return bool(value)

    def format_instruction(row):
        if has_input(row['input']):
            text = f"###instruction:\n{row['instruction']}\n###input:\n{row['input']}\n###output:\n{row['output']}"
        else:
            text = f"###instruction:\n{row['instruction']}\n###output:\n{row['output']}"
        return text

    # DataFrame.apply(axis=1) hands back a DataFrame (not a Series) when there
    # are no rows; return an empty Series so callers can always use .tolist().
    if len(dataframe) == 0:
        return pandas.Series([], index=dataframe.index, dtype=object)

    formatted = dataframe.apply(format_instruction, axis=1)
    return formatted

In [None]:
# Format the whole frame and preview the first entries as the cell's output
# (last expression) rather than through print().
formated_dataframe = apply_dataset_dataformat(dataframe)
formated_dataframe.head()

In [15]:
# split dataset
def split_alpaca_dataset(dataframe, apply_rate=None, train_rate=None, validation_rate=None, seed=None):
  """Shuffle ``dataframe`` and cut it into train / validation / test parts.

  Args:
    dataframe: pandas DataFrame to split.
    apply_rate: Fraction of the rows to keep before splitting. Defaults to the
      "apply_dataset_rate" configuration entry.
    train_rate: Fraction of the kept rows used for training. Defaults to the
      "dataset_train_rate" configuration entry.
    validation_rate: Fraction of the kept rows used for validation. Defaults to
      the "dataset_validation_rate" configuration entry.
    seed: Random state for the shuffle. Defaults to the "seed" configuration
      entry (if present) so repeated runs produce the same split.

  Returns:
    Tuple ``(train, validation, test)`` of DataFrames; the test part receives
    whatever rows remain after the first two.
  """
  if apply_rate is None:
    apply_rate = default_environment_variables["apply_dataset_rate"]
  if train_rate is None:
    train_rate = default_environment_variables["dataset_train_rate"]
  if validation_rate is None:
    validation_rate = default_environment_variables["dataset_validation_rate"]
  if seed is None:
    seed = default_environment_variables.get("seed")

  # Seeded sampling: an unseeded shuffle yields a different split on every run.
  shuffled = dataframe.sample(frac=apply_rate, random_state=seed)
  total = len(shuffled)

  train_end = int(total * train_rate)
  validation_end = train_end + int(total * validation_rate)

  train_dataset = shuffled.iloc[:train_end]
  validation_dataset = shuffled.iloc[train_end:validation_end]
  test_dataset = shuffled.iloc[validation_end:]

  return train_dataset, validation_dataset, test_dataset

In [None]:
# Reload the raw data, split it, and report how many rows each part received.
train_dataset, validation_dataset, test_dataset = split_alpaca_dataset(load_alpaca_dataset())
print(
    f"Train dataset size: {len(train_dataset)}",
    f"Validation dataset size: {len(validation_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

In [17]:
from transformers import pipeline

In [None]:
# Sanity check: generate from the model before any fine-tuning.
test_qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Same instruction/input/output layout as the training text, left open after
# the output marker so the model writes the answer.
prompt = (
    "\n"
    "###instruction:\nPlease answer the questions.\n"
    "###input:\nWhat is AI?\n"
    "###output:\n"
)

generate_text = test_qa_pipeline(
    prompt,
    max_length=512,
    num_return_sequences=1,
    temperature=0.8,
)[0]["generated_text"]
print(generate_text)

In [19]:
from trl import DataCollatorForCompletionOnlyLM

In [20]:
# Completion-only collator, keyed on the "###output:\n" marker used by the
# prompt format in this notebook.
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template="###output:\n",
)

In [21]:
from transformers import TrainingArguments

In [None]:
# Build TrainingArguments: each listed option is copied from the run
# configuration under the same key; the DeepSpeed config is passed separately.
training_arguments = TrainingArguments(
    deepspeed=deepspeed_zero2_config,
    **{
        option: default_environment_variables[option]
        for option in (
            "output_dir",
            "num_train_epochs",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "gradient_accumulation_steps",
            "max_grad_norm",
            "optim",
            "learning_rate",
            "weight_decay",
            "lr_scheduler_type",
            "warmup_ratio",
            "logging_steps",
            "seed",
            "fp16",
            "bf16",
            "save_strategy",
            "save_steps",
            "save_total_limit",
            "eval_strategy",
            "eval_steps",
            "do_eval",
        )
    },
)

In [None]:
# Turn the train and validation splits into prompt strings, then show both.
train_formatted, validation_formatted = (
    apply_dataset_dataformat(train_dataset),
    apply_dataset_dataformat(validation_dataset),
)
print("train_formatted:", train_formatted)
print("validation_formatted:", validation_formatted)

In [24]:
from trl import SFTTrainer
from datasets import Dataset

In [None]:
# Wrap the formatted strings in Hugging Face datasets (single "text" column)
# and print a short preview of each.
train_hf_dataset = Dataset.from_dict({"text": train_formatted.tolist()})
validation_hf_dataset = Dataset.from_dict({"text": validation_formatted.tolist()})
print(f"train_hf_dataset: {train_hf_dataset}")
print(f"validation_hf_dataset: {validation_hf_dataset}")

# Preview at most 10 rows per split; bounding by the split length avoids an
# IndexError when a split holds fewer than 10 examples.
for _split_name, _split in (
    ("train_hf_dataset", train_hf_dataset),
    ("validation_hf_dataset", validation_hf_dataset),
):
    print('-'*10 + _split_name + '-'*10)
    for _row in range(min(10, len(_split))):
        print(_split[_row]['text'])

In [None]:
# SFTTrainer setup.
# The notebook's tokenizer is passed explicitly so the trainer works with the
# very instance the collator uses (pad token id already set) instead of
# loading a second tokenizer on its own.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_hf_dataset,
    eval_dataset=validation_hf_dataset,
    args=training_arguments,
    data_collator=collator,
)

In [None]:
# Run the fine-tuning; whatever trainer.train() returns becomes this cell's output.
trainer.train()

In [28]:
# Persist the results of the run through the trainer's two save calls.
trainer.save_model() # save model
trainer.save_state() # save trainer state (NOTE(review): previously described as "metrics" — confirm what save_state writes)

In [None]:
# Close the W&B run that was opened with wandb.init() earlier in the notebook.
wandb.finish()

In [None]:
import ipykernel
from jupyter_client import KernelManager
import os

def stop_kernel():
    """Attempt to shut down the Jupyter kernel and print the outcome.

    Any exception raised along the way is caught and printed; nothing is
    re-raised and nothing is returned.

    NOTE(review): the KernelManager below is newly constructed and nothing in
    this function attaches it to the running kernel's connection file, and
    ``kernel_spec`` is assigned a kernel-id string. It is unclear that
    ``shutdown_kernel()`` affects the current kernel — verify.
    """
    try:
        # Derive the current kernel id from the connection file name
        # ("kernel-<id>.json" -> "<id>").
        connection_file = ipykernel.get_connection_file()
        kernel_id = os.path.basename(connection_file).replace('kernel-', '').replace('.json', '')
        
        # Ask a kernel manager to stop the kernel.
        km = KernelManager(kernel_name='python3')
        km.kernel_spec = kernel_id
        km.shutdown_kernel()
        print("カーネルを正常に停止しました。")
    except Exception as e:
        # Best effort: report the failure instead of propagating it.
        print(f"カーネル停止エラー: {e}")

# Announce the start, run the kernel shutdown helper, then announce the end.
print("プログラム開始")
# Put the processing to run here
stop_kernel()
print("プログラム終了")