# No Quantization
# No LoRA

In [None]:
# Install MPICH, an MPI implementation (mpi4py is pip-installed in the next cell).
# SECURITY: the sudo password used to be hard-coded on this line. It is now
# requested interactively so it never lands in the notebook or its history.
# The previously committed password should be treated as leaked and rotated.
import getpass
import subprocess

subprocess.run(
    ["sudo", "-S", "apt-get", "install", "mpich", "-y"],
    # `sudo -S` reads the password from stdin, terminated by a newline.
    input=getpass.getpass("sudo password: ") + "\n",
    text=True,
)

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124  # PyTorch libraries for deep learning, including vision and audio processing, built for CUDA 12.4
!pip install 'transformers==4.51.3'  # Library for state-of-the-art natural language processing models, including BERT, GPT, and others
!pip install 'ninja==1.11.1.4'  # Build system designed for speed, useful for compiling extensions quickly
!pip install 'wheel==0.37.1'  # Python package format for faster installation and distribution
!pip install 'flash-attn==2.7.4.post1' --no-build-isolation  # Efficient attention mechanism implementation for transformer models, optimized for speed and memory usage
!pip install 'accelerate==1.6.0'  # Library to simplify distributed training and inference of machine learning models
!pip install 'huggingface_hub[cli]==0.30.2'  # Client library and CLI for interacting with Hugging Face Hub, a platform for sharing ML models and datasets
!pip install 'bitsandbytes==0.45.5'  # Library for efficient quantization and memory optimization in deep learning models
!pip install 'peft==0.15.2'  # Parameter-efficient fine-tuning library for transformer models
!pip install 'python-dotenv==1.1.0'  # Library for managing environment variables from .env files
!pip install 'wandb==0.19.10'  # For machine learning experiment tracking and visualization
!pip install 'GPUtil==1.4.0'  # For GPU monitoring and manual logging of GPU utilization
!pip install 'psutil==7.0.0'  # For monitoring system resources and manual logging of CPU and memory usage
!pip install 'datasets==3.5.0'  # Library for easily accessing and managing datasets for machine learning
!pip install 'numpy==2.1.2'  # Fundamental library for numerical computations in Python, supports arrays and mathematical operations
!pip install 'ipywidgets==8.1.6'  # Library for creating interactive widgets in Jupyter Notebook
!pip install 'pandas==2.2.3'  # Library for data analysis and manipulation, provides DataFrame for tabular data
!pip install 'matplotlib==3.10.1'  # Library for data visualization, can draw various graphs such as line and scatter plots
!pip install 'seaborn==0.13.2'  # Data visualization library based on Matplotlib, easily creates statistical graphs
!pip install 'scipy==1.15.2'  # Library for scientific and technical computing, specialized in advanced numerical calculations and optimization
!pip install 'PyPDF2==3.0.1'  # Library for reading and writing PDF files, can extract and merge pages
!pip install 'openai==1.76.0'  # Library for using OpenAI APIs, can utilize GPT models and other AI services
!pip install 'langchain==0.3.24'  # Library for developing applications utilizing language models, supports chaining and prompt management
!pip install 'langgraph==0.3.34'  # Library for graph structure analysis and workflows involving language data and models
!pip install 'psycopg2==2.9.10'  # PostgreSQL database adapter for Python, enables database connectivity and operations
!pip install 'deepspeed==0.16.7' --use-pep517  # Deep learning optimization library for distributed training, supports large-scale models and efficient resource utilization
!pip install 'pyarrow==19.0.1'  # Apache Arrow bindings for Python (columnar in-memory data format)
!pip install 'setuptools==68.1.2' # Pinned for virtual environments on Python 3.12 or newer (NOTE(review): confirm this is what "3.12~" meant)
!pip install 'pynvml==11.5.0'  # Python bindings for NVML, used below to read GPU power draw and temperature
!pip install 'mpi4py'  # MPI bindings for Python; version is not pinned



In [3]:
import time
# Wall-clock start of the whole notebook run; the last cell subtracts this
# from the end time to print the total elapsed seconds.
program_start_time = time.time()

In [4]:
# Run configuration: every class and cell below reads its settings from this mapping.

default_environment_variables = {
    # --- Weights & Biases run identity ---
    "wandb_account_name": "arekunoimar-deepspeed",
    "wandb_project_name": "llama-3-2-1b",

    # --- Model and dataset ---
    "model_name": "meta-llama/Llama-3.2-1B",
    "dataset": "alpaca_data.json",
    "dataset_max_length": 2048,      # token length every example is padded/truncated to
    "apply_dataset_rate": 1.0,       # leading fraction of the file that is used at all
    "dataset_train_rate": 0.8,       # fraction of the used records that goes to training
    "dataset_validation_rate": 0.1,  # fraction that goes to validation; the rest is the test split

    # --- Optimisation (passed to TrainingArguments) ---
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 1,
    "optim": "adamw_torch",
    "logging_steps": 1,
    "learning_rate": 1e-4,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 500,
    "seed": 1024,
    "fp16": False,
    "bf16": True,

    # --- Checkpointing and evaluation (passed to TrainingArguments) ---
    "save_strategy": "steps",
    "save_steps": 1000,
    "save_total_limit": 1,
    "evaluation_strategy": "steps",
    "eval_steps": 1000,
    "do_eval": True,

    # --- Device-metric sampling ---
    "logging_distance_time": 1,      # seconds the sampler thread waits between samples

    # --- DeepSpeed stage selection: the first enabled flag, in this order, wins ---
    "deepspeed_zero0": False,
    "deepspeed_train_config_zero0_path": "deepspeed_train_config_zero0.json",
    "deepspeed_zero1": False,
    "deepspeed_train_config_zero1_path": "deepspeed_train_config_zero1.json",
    "deepspeed_zero2": True,
    "deepspeed_train_config_zero2_path": "deepspeed_train_config_zero2.json",
    "deepspeed_zero3": False,
    "deepspeed_train_config_zero3_path": "deepspeed_train_config_zero3.json",
    "deepspeed_zero3_infinity": False,
    "deepspeed_train_config_zero3_infinity_path": "deepspeed_train_config_zero3_infinity.json",
}

In [5]:
# from google.colab import drive

# drive.mount('/content/drive')

In [6]:

# import json
# df = pd.read_json(default_environment_variables["deepspeed"], encoding="UTF-8")
# df.head()

In [None]:
import pandas as pd
# Quick visual check of the raw dataset: load it into a DataFrame and show the first rows.
raw_dataset_path = default_environment_variables["dataset"]
df = pd.read_json(raw_dataset_path, encoding="UTF-8")
df.head()

In [None]:
from datasets import Dataset
import json
from transformers import LlamaForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, AutoModelForCausalLM

class TrainUtil:
    """Load the Alpaca JSON dataset, split it, and turn it into tokenized training examples.

    Keys read from the configuration mapping:
      * ``dataset`` - path of the Alpaca-style JSON file.
      * ``dataset_max_length`` - token length each example is padded/truncated to.
      * ``apply_dataset_rate`` - leading fraction of the file that is used at all.
      * ``dataset_train_rate`` / ``dataset_validation_rate`` - fractions of the
        used records assigned to training and validation; the remainder is the
        test split.
    """

    # How many records are echoed to stdout after loading, as a sanity check.
    PREVIEW_RECORDS = 100
    SYSTEM_PROMPT = "You are a helpful assistant."

    def __init__(self, environment_valiables):
        """Copy the dataset-related settings out of the configuration mapping."""
        self.alpaca_dataset = environment_valiables["dataset"]
        self.dataset_max_length = environment_valiables["dataset_max_length"]
        self.apply_dataset_rate = environment_valiables["apply_dataset_rate"]
        self.dataset_train_rate = environment_valiables["dataset_train_rate"]
        self.dataset_validation_rate = environment_valiables["dataset_validation_rate"]

    def read_alpaca_json_dataset(self) -> list:
        """Parse the JSON file, print its first records, and return the parsed content."""
        with open(self.alpaca_dataset, 'r', encoding='utf-8') as f:
            dataset = json.load(f)
        for i, record in enumerate(dataset[:self.PREVIEW_RECORDS]):
            print(f"dataset[{i}]: {record}")
        return dataset

    def load_alpaca_dataset(self, tokenizer=None):
        """Return ``(train, validation, test)`` as contiguous, non-overlapping slices.

        ``tokenizer`` is accepted only for call-site compatibility; splitting
        does not use it.
        """
        dataset = Dataset.from_list(self.read_alpaca_json_dataset())
        total_size = int(len(dataset) * self.apply_dataset_rate)
        train_size = int(total_size * self.dataset_train_rate)
        validation_size = int(total_size * self.dataset_validation_rate)
        validation_end = train_size + validation_size

        dataset = dataset.select(range(total_size))
        train_dataset = dataset.select(range(0, train_size))
        validation_dataset = dataset.select(range(train_size, validation_end))
        test_dataset = dataset.select(range(validation_end, total_size))
        return train_dataset, validation_dataset, test_dataset

    def _format_and_tokenize(self, tokenizer, dataset):
        """Render every record as prompt/completion text, then tokenize in batches."""
        formatted = dataset.map(self.format_alpaca_dataset)
        return formatted.map(
            lambda batch: self.alpaca_tokenize_function(batch, tokenizer),
            batched=True,
        )

    def load_alpaca_train_dataset(self, tokenizer: "AutoTokenizer", train_dataset):
        """Format and tokenize the training split."""
        return self._format_and_tokenize(tokenizer, train_dataset)

    def load_alpaca_validation_dataset(self, tokenizer: "AutoTokenizer", validation_dataset):
        """Format and tokenize the validation split."""
        return self._format_and_tokenize(tokenizer, validation_dataset)

    def load_alpaca_test_dataset(self, tokenizer: "AutoTokenizer", test_dataset):
        """Format and tokenize the test split."""
        return self._format_and_tokenize(tokenizer, test_dataset)

    def format_alpaca_dataset(self, example):
        """Render one Alpaca record into Llama-3 chat-formatted text.

        Args:
            example: mapping with ``instruction`` and ``output``; ``input`` is
                optional and treated as empty when missing or ``None``.

        Returns:
            ``{"prompt": ..., "completion": ...}``. The prompt ends with the
            opened assistant header; the completion is the stripped output
            followed by ``<|eot_id|>``.

        Raises:
            ValueError: if ``instruction`` or ``output`` is missing. This used
                to be swallowed and ``{}`` returned, which left the dataset
                without ``prompt``/``completion`` values for that record and
                only failed later, far from the bad record.
        """
        instruction = example.get("instruction")
        output = example.get("output")
        if instruction is None or output is None:
            raise ValueError(
                f"Alpaca record needs 'instruction' and 'output' fields: {dict(example)!r}"
            )

        user_input = (example.get("input") or "").strip()
        user_turn = f"{instruction}\n{user_input}" if user_input else f"{instruction}"
        prompt = (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            f"{self.SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
            f"{user_turn}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        completion = output.strip() + "<|eot_id|>"
        return {"prompt": prompt, "completion": completion}

    def alpaca_tokenize_function(self, example: dict, tokenizer: "AutoTokenizer") -> dict:
        """Tokenize a batch of formatted records into ``input_ids``/``labels``.

        Args:
            example: batched mapping (as produced by ``Dataset.map(batched=True)``)
                whose ``prompt`` and ``completion`` entries are lists of strings.
            tokenizer: callable tokenizer returning ``input_ids`` and
                ``attention_mask``.

        Returns:
            The tokenizer output with ``labels`` added. Labels equal the input
            ids on real tokens and -100 on padding. Prompt tokens are not
            masked, so the loss covers prompt and completion alike.
        """
        # The model must see the answer: train on prompt + completion, not the
        # prompt alone (previously the completion was never tokenized).
        texts = [
            prompt + completion
            for prompt, completion in zip(example["prompt"], example["completion"])
        ]
        tokens = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=self.dataset_max_length,
            # The text already starts with <|begin_of_text|>; letting the
            # tokenizer add its own special tokens would prepend a second BOS.
            add_special_tokens=False,
        )
        # Mask padding via the attention mask rather than by comparing against
        # pad_token_id: the pad token is set to EOS in this notebook, so an id
        # comparison would also hide genuine EOS tokens from the loss.
        tokens["labels"] = [
            [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
        # Kept for compatibility: the text columns are replaced by their ids.
        tokens["prompt"] = tokens["input_ids"]
        tokens["completion"] = tokens["labels"]
        return tokens

    def data_collator(self, tokenizer: "AutoTokenizer"):
        """Build a causal-LM (``mlm=False``) collator that pads to a multiple of 8."""
        return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)



In [9]:
class DeepSpeedUtil:
    """Pick the DeepSpeed config file and run label from the ``deepspeed_*`` flags.

    Each stage has an enable flag and a config-file path in the configuration
    mapping. When several flags are enabled, the first one in ``_STAGES`` order
    wins; when none is enabled both lookups return ``None``.
    """

    # (flag key, config-path key, label used in run/directory names), in priority order.
    _STAGES = (
        ("deepspeed_zero0", "deepspeed_train_config_zero0_path", "Deepspeed_Zero0"),
        ("deepspeed_zero1", "deepspeed_train_config_zero1_path", "Deepspeed_Zero1"),
        ("deepspeed_zero2", "deepspeed_train_config_zero2_path", "Deepspeed_Zero2"),
        ("deepspeed_zero3", "deepspeed_train_config_zero3_path", "Deepspeed_Zero3"),
        ("deepspeed_zero3_infinity", "deepspeed_train_config_zero3_infinity_path", "Deepspeed_Zero3_Infinity"),
    )

    def __init__(self, environment_valiables):
        """Expose every stage flag and config path as an attribute of the same name."""
        for flag_key, path_key, _label in self._STAGES:
            setattr(self, flag_key, environment_valiables[flag_key])
            setattr(self, path_key, environment_valiables[path_key])

    def _active_stage(self):
        """Return the first ``_STAGES`` entry whose flag is enabled, or ``None``."""
        for stage in self._STAGES:
            if getattr(self, stage[0]):
                return stage
        return None

    def deepspeed_zero_config(self):
        """Return the config-file path of the enabled stage, or ``None`` if no stage is enabled."""
        stage = self._active_stage()
        return getattr(self, stage[1]) if stage else None

    def deepspeed_valiable_name(self):
        """Return the label of the enabled stage, or ``None`` if no stage is enabled."""
        stage = self._active_stage()
        return stage[2] if stage else None

In [10]:
import time
from datetime import datetime
import socket
import random
import string
import os
import shutil
import wandb
import psutil
import GPUtil
from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetPowerUsage, nvmlDeviceGetTemperature
import threading
import csv

class LoggingUtil:
    """Run naming, output-directory layout, wandb session control and CSV logging of host metrics.

    Device metrics (CPU, GPU, RAM, disk, network) are sampled on a background
    thread that ``start_logging_device_metric`` launches and
    ``finish_logging_device_metric`` stops.
    """

    def __init__(self, environment_valiables):
        """Keep the configuration mapping and set up the background-sampler state."""
        self.stop_event = threading.Event()
        self.environment_valiables = environment_valiables
        # The sampler thread is owned by this instance (it used to be a module global).
        self._logging_thread = None
        self._nvml_initialized = False

    def date_time(self):
        """Return the local time formatted as ``YYYYMMDDHHMM``."""
        return time.strftime("%Y%m%d%H%M", time.localtime())

    def timestamp(self):
        """Return the current local time as an ISO-8601 string."""
        return datetime.now().isoformat()

    def get_hostname(self):
        """Return this machine's hostname."""
        return socket.gethostname()

    def split_model_name(self):
        """Return the part of ``model_name`` after the last ``/``."""
        return self.environment_valiables["model_name"].split('/')[-1]

    def generate_random_num_strings(self):
        """Return 10 random characters drawn from the digits 1-9 and ASCII letters."""
        alphabet = "123456789" + string.ascii_letters
        return "".join(random.choices(alphabet, k=10))

    def common_directory_path(self, split_model_name, deepspeed_valiable_name, wandb_run_id, date_time):
        """Create ``./output/<model>_<stage>_<run id>_<date>`` and return its path."""
        run_directory_name = f"{split_model_name}_{deepspeed_valiable_name}_{wandb_run_id}_{date_time}"
        run_directory = os.path.join('./output', run_directory_name)
        os.makedirs(run_directory, exist_ok=True)
        return run_directory

    def output_directory_path(self, common_directory_path):
        """Derive the per-run sub-paths.

        Returns:
            ``[wandb_output_dir, wandb_run_name, deepspeed_train_output,
            check_point_output, export_train_output]``. The wandb and export
            directories are created; the other two are only path strings.
        """
        wandb_output_directory_path = os.path.join(common_directory_path, 'wandb_output')
        wandb_run_name = common_directory_path
        deepspeed_train_output = os.path.join(common_directory_path, 'deepspeed_train_output')
        check_point_output = os.path.join(common_directory_path, 'check_point_output')
        export_train_output = os.path.join(common_directory_path, 'export_train_output')
        # Created up front so wandb.init(dir=...) is handed a directory that exists.
        os.makedirs(wandb_output_directory_path, exist_ok=True)
        os.makedirs(export_train_output, exist_ok=True)
        return [wandb_output_directory_path, wandb_run_name, deepspeed_train_output, check_point_output, export_train_output]

    def export_files(self, export_train_output):
        """Copy the notebook into the export directory and return the four CSV log paths.

        Returns:
            ``[train, device_metric, deepspeed_metric, validation]`` CSV paths.
        """
        # NOTE: the notebook file name is fixed; shutil.copy raises if it is not
        # present in the working directory.
        shutil.copy("llama_3_2_1b_instruct_tuning.ipynb", export_train_output)
        csv_names = (
            'train_loggings.csv',
            'device_metric_loggings.csv',
            'deepspeed_metric_loggings.csv',
            'validation_loggings.csv',
        )
        return [os.path.join(export_train_output, csv_name) for csv_name in csv_names]

    def init_wandb(self, wandb_output_directory_path, wandb_run_id, wandb_run_name):
        """Start an offline wandb run for the configured account/project and return it."""
        return wandb.init(
            entity=self.environment_valiables["wandb_account_name"],
            project=self.environment_valiables["wandb_project_name"],
            dir=wandb_output_directory_path,
            id=wandb_run_id,
            name=wandb_run_name,
            mode='offline',
        )

    def end_wandb(self):
        """Finish the active wandb run."""
        wandb.finish()

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or ``None`` when there are none."""
        values = list(values)
        return sum(values) / len(values) if values else None

    def logging_device_cpu(self):
        """Sample per-core CPU usage and clock speed plus one temperature reading.

        Averages are ``None`` when psutil reports no data for them. The clock
        percentage of a core is ``None`` when its maximum frequency is unknown,
        and such cores are left out of the average.
        """
        per_core_usage = psutil.cpu_percent(percpu=True)
        per_core_freqs = psutil.cpu_freq(percpu=True)
        # sensors_temperatures is not provided by psutil on every platform.
        read_temperatures = getattr(psutil, "sensors_temperatures", None)
        temps = read_temperatures().get('coretemp', []) if read_temperatures else []
        temp_val = temps[0].current if temps else None

        per_core_current_freqs = [freq.current for freq in per_core_freqs]
        per_core_freq_percentages = [
            (freq.current / freq.max) * 100 if freq.max else None
            for freq in per_core_freqs
        ]
        # Average only over cores with a known percentage. The previous
        # sum(filter(None, ...)) / len(all) also dropped genuine 0.0 readings
        # and divided by the count of all cores.
        known_percentages = [pct for pct in per_core_freq_percentages if pct is not None]

        data = {
            'TIMESTAMP': self.timestamp(),
            'CPU_AVG_USAGE_PERCENT': self._mean(per_core_usage),
            'CPU_AVG_CLOCK_MHZ': self._mean(per_core_current_freqs),
            'CPU_AVG_CLOCK_PERCENT': self._mean(known_percentages),
            'CPU_TEMP_C': temp_val
        }

        per_core = zip(per_core_usage, per_core_current_freqs, per_core_freq_percentages)
        for i, (usage, clock_mhz, clock_percent) in enumerate(per_core, start=1):
            data[f'CPU_CORE_{i}_USAGE_PERCENT'] = usage
            data[f'CPU_CORE_{i}_CLOCK_MHZ'] = clock_mhz
            data[f'CPU_CORE_{i}_CLOCK_PERCENT'] = clock_percent
        return data

    def _ensure_nvml_initialized(self):
        """Initialise NVML once per instance; NVML device queries fail before nvmlInit()."""
        if not self._nvml_initialized:
            from pynvml import nvmlInit
            nvmlInit()
            self._nvml_initialized = True

    def logging_device_gpu(self):
        """Sample load, memory, power draw and temperature for every GPU GPUtil reports."""
        gpus = GPUtil.getGPUs()
        gpu_data = {
            'TIMESTAMP': self.timestamp(),
        }
        if gpus:
            self._ensure_nvml_initialized()

        for i, gpu in enumerate(gpus):
            gpu_handle = nvmlDeviceGetHandleByIndex(i)
            gpu_data[f'GPU_{i}_USAGE_PERCENT'] = gpu.load * 100
            gpu_data[f'GPU_{i}_MEM_USED_MB'] = gpu.memoryUsed
            gpu_data[f'GPU_{i}_MEM_UTIL_PERCENT'] = gpu.memoryUtil * 100
            # NVML reports power in milliwatts.
            gpu_data[f'GPU_{i}_POWER_W'] = nvmlDeviceGetPowerUsage(gpu_handle) / 1000.0
            # Sensor 0 is NVML_TEMPERATURE_GPU (the GPU die sensor).
            gpu_data[f'GPU_{i}_TEMP_C'] = nvmlDeviceGetTemperature(gpu_handle, 0)
        return gpu_data

    def logging_device_ram(self):
        """Sample the system RAM usage percentage."""
        mem = psutil.virtual_memory()
        return {
            'TIMESTAMP': self.timestamp(),
            'RAM_USAGE_PERCENT': mem.percent,
        }

    def logging_device_disk(self):
        """Sample used space (GiB) and usage percentage of the ``/`` filesystem."""
        usage = psutil.disk_usage('/')
        return {
            'TIMESTAMP': self.timestamp(),
            'DISK_USED_GB': usage.used / (1024**3),
            'DISK_USAGE_PERCENT': usage.percent,
        }

    def logging_device_network(self):
        """Sample cumulative sent/received MiB for every network interface."""
        net = {'TIMESTAMP': self.timestamp()}
        for iface, counters in psutil.net_io_counters(pernic=True).items():
            net[f'NETWORK_{iface}_SENT_MB'] = counters.bytes_sent / (1024**2)
            net[f'NETWORK_{iface}_RECV_MB'] = counters.bytes_recv / (1024**2)
        return net

    def write_to_csv_log(self, data, output_dir):
        """Append one row to a CSV file, writing the header when the file is new or empty.

        Args:
            data: mapping of column name to value; its keys are the columns of
                this row. The header is taken from the first row only, so rows
                must keep the same keys to stay aligned with it.
            output_dir: path of the CSV *file* (the name is historical).
        """
        parent_directory = os.path.dirname(output_dir)
        # A bare file name has no parent component, and os.makedirs("") raises.
        if parent_directory:
            os.makedirs(parent_directory, exist_ok=True)

        needs_header = not os.path.isfile(output_dir) or os.path.getsize(output_dir) == 0

        with open(output_dir, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(data.keys()))
            if needs_header:
                writer.writeheader()
            writer.writerow(data)

    def loop_logging_metric_process(self, log_file, wandb_run_name, wandb_run_id):
        """Sample all device metrics and append them to ``log_file`` until ``stop_event`` is set."""
        while not self.stop_event.is_set():
            merged = {
                'wandb_account_name': self.environment_valiables["wandb_account_name"],
                'wandb_project_name': self.environment_valiables["wandb_project_name"],
                'wandb_run_name': wandb_run_name,
                'wandb_run_id': wandb_run_id,
                'hostname': self.get_hostname(),
                # Later mappings overwrite the shared 'TIMESTAMP' key, so the
                # row carries the timestamp of the last sampler (network).
                **self.logging_device_cpu(),
                **self.logging_device_gpu(),
                **self.logging_device_ram(),
                **self.logging_device_disk(),
                **self.logging_device_network(),
            }
            self.write_to_csv_log(data=merged, output_dir=log_file)
            # Event.wait returns as soon as stop is requested, so shutdown does
            # not have to sit out a full sampling interval.
            self.stop_event.wait(self.environment_valiables["logging_distance_time"])

    def start_logging_device_metric(self, log_file, wandb_run_name, wandb_run_id):
        """Start the background sampler, stopping a previous one from this instance first."""
        if self._logging_thread is not None and self._logging_thread.is_alive():
            self.finish_logging_device_metric()
        self.stop_event.clear()
        self._logging_thread = threading.Thread(
            target=self.loop_logging_metric_process,
            args=(log_file, wandb_run_name, wandb_run_id),
            # Daemon thread: a crash elsewhere must not keep the process alive
            # just because the sampler was never stopped.
            daemon=True,
        )
        self._logging_thread.start()

    def finish_logging_device_metric(self):
        """Signal the sampler to stop and wait for it; safe to call when none is running."""
        self.stop_event.set()
        thread, self._logging_thread = self._logging_thread, None
        if thread is not None:
            thread.join()

In [11]:
# Train Metrics Callback
from typing import Any, Dict, Optional
from transformers import TrainerCallback, TrainerState, TrainingArguments, TrainerControl

class TrainMetricsCallback(TrainerCallback):
    """Append the Trainer's training-step log entries to a CSV file.

    A row is written only for log events that contain all of ``loss``,
    ``grad_norm``, ``learning_rate`` and ``epoch``; other log events are
    ignored.
    """

    _REQUIRED_KEYS = ('loss', 'grad_norm', 'learning_rate', 'epoch')

    def __init__(self, wandb_account_name: str, wandb_project_name: str, wandb_run_name: str, wandb_run_id: str, hostname: str, log_file: str):
        """Store the run identity written into every row and the CSV path to append to."""
        self.wandb_account_name = wandb_account_name
        self.wandb_project_name = wandb_project_name
        self.wandb_run_name = wandb_run_name
        self.wandb_run_id = wandb_run_id
        self.hostname = hostname
        self.log_file = log_file
        # Built once here, as DeepSpeedMetricsCallback does, instead of on every log event.
        self.logging_util = LoggingUtil(environment_valiables=default_environment_variables)

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None:
        """Write one CSV row when ``logs`` is a complete training-step entry."""
        if not logs or any(key not in logs for key in self._REQUIRED_KEYS):
            return

        metrics = {
            'TIMESTAMP': self.logging_util.timestamp(),
            'wandb_account_name': self.wandb_account_name,
            'wandb_project_name': self.wandb_project_name,
            'wandb_run_name': self.wandb_run_name,
            'wandb_run_id': self.wandb_run_id,
            'hostname': self.hostname,
            'step': state.global_step,
            **{key: logs[key] for key in self._REQUIRED_KEYS},
        }
        self.logging_util.write_to_csv_log(data=metrics, output_dir=self.log_file)

In [12]:
# Validation Metrics Callback
class ValidationMetricsCallback(TrainerCallback):
    """Append the Trainer's evaluation log entries to a CSV file.

    A row is written only for log events that contain all of ``eval_loss``,
    ``eval_runtime``, ``eval_samples_per_second``, ``eval_steps_per_second``
    and ``epoch``; other log events are ignored.
    """

    _REQUIRED_KEYS = ('eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch')

    def __init__(self, wandb_account_name: str, wandb_project_name: str, wandb_run_name: str, wandb_run_id: str, hostname: str, log_file: str):
        """Store the run identity written into every row and the CSV path to append to."""
        self.wandb_account_name = wandb_account_name
        self.wandb_project_name = wandb_project_name
        self.wandb_run_name = wandb_run_name
        self.wandb_run_id = wandb_run_id
        self.hostname = hostname
        self.log_file = log_file
        # Built once here, as DeepSpeedMetricsCallback does, instead of on every log event.
        self.logging_util = LoggingUtil(environment_valiables=default_environment_variables)

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs: Optional[Dict[str, Any]] = None, **kwargs: Any) -> None:
        """Write one CSV row when ``logs`` is a complete evaluation entry."""
        # Requiring every key already implies that an 'eval_' key is present,
        # so no separate prefix check is needed.
        if not logs or any(key not in logs for key in self._REQUIRED_KEYS):
            return

        metrics = {
            'TIMESTAMP': self.logging_util.timestamp(),
            'wandb_account_name': self.wandb_account_name,
            'wandb_project_name': self.wandb_project_name,
            'wandb_run_name': self.wandb_run_name,
            'wandb_run_id': self.wandb_run_id,
            'hostname': self.hostname,
            'step': state.global_step,
            **{key: logs[key] for key in self._REQUIRED_KEYS},
        }
        self.logging_util.write_to_csv_log(data=metrics, output_dir=self.log_file)

In [13]:
# DeepSpeed Metrics Callback
class DeepSpeedMetricsCallback(TrainerCallback):
    """Periodically dump one CSV row per model parameter of the DeepSpeed engine.

    Each row records the parameter's name, element count, dtype and device
    type. Nothing is written while the trainer has no ``deepspeed`` engine.
    """

    def __init__(self, trainer, wandb_account_name: str, wandb_project_name: str, wandb_run_name: str, wandb_run_id: str, hostname: str, log_file: str, log_every_n_steps: int = 1000):
        """Store the trainer, run identity, CSV path and dump interval.

        Args:
            trainer: object whose ``deepspeed`` attribute (if any) is the engine to inspect.
            log_every_n_steps: dump the parameter table whenever ``global_step``
                is a multiple of this value. Defaults to 1000, the interval
                that used to be hard-coded.

        Raises:
            ValueError: if ``log_every_n_steps`` is smaller than 1.
        """
        if log_every_n_steps < 1:
            raise ValueError("log_every_n_steps must be at least 1")
        self.trainer = trainer
        self.wandb_account_name = wandb_account_name
        self.wandb_project_name = wandb_project_name
        self.wandb_run_name = wandb_run_name
        self.wandb_run_id = wandb_run_id
        self.hostname = hostname
        self.log_file = log_file
        self.log_every_n_steps = log_every_n_steps
        self.logging_util = LoggingUtil(environment_valiables=default_environment_variables)

    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs: Any) -> None:
        """Write the parameter table when the current step is due for a dump."""
        engine = getattr(self.trainer, "deepspeed", None)
        if engine is None or state.global_step % self.log_every_n_steps != 0:
            return

        # One timestamp for the whole dump so all rows of a step share it.
        timestamp = self.logging_util.timestamp()
        for model_param_name, model_param in engine.module.named_parameters():
            param_metrics = {
                'TIMESTAMP': timestamp,
                'wandb_account_name': self.wandb_account_name,
                'wandb_project_name': self.wandb_project_name,
                'wandb_run_name': self.wandb_run_name,
                'wandb_run_id': self.wandb_run_id,
                'hostname': self.hostname,
                'step': state.global_step,
                'model_param_name': model_param_name,
                # NOTE(review): under parameter partitioning (ZeRO-3) the local
                # element count may not be the full parameter size - confirm.
                'model_param_numel': model_param.numel(),
                'model_param_dtype': model_param.dtype,
                'model_param_device': model_param.device.type,
            }
            self.logging_util.write_to_csv_log(data=param_metrics, output_dir=self.log_file)

In [None]:
import warnings
import torch.distributed as dist
import subprocess
import glob

# Silence two known, repetitive warnings so they do not flood the training output.
for _noisy_message in (
    "torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly.",
    "MatMul8bitLt: inputs will be cast*",
):
    warnings.filterwarnings("ignore", message=_noisy_message)

# Helper objects, all driven by the same configuration mapping.
train_util = TrainUtil(default_environment_variables)
logging_util = LoggingUtil(default_environment_variables)
deepspeed_util = DeepSpeedUtil(default_environment_variables)

# Identity of this run: start time, model short name, host and a random run id.
date_time = logging_util.date_time()
split_model_name = logging_util.split_model_name()
hostname = logging_util.get_hostname()
timestamp = logging_util.timestamp()
wandb_run_id = logging_util.generate_random_num_strings()

# Output layout: one directory per run, with sub-paths for wandb, checkpoints and CSV exports.
common_directory_path = logging_util.common_directory_path(
    split_model_name=split_model_name,
    deepspeed_valiable_name=deepspeed_util.deepspeed_valiable_name(),
    wandb_run_id=wandb_run_id,
    date_time=date_time,
)
(
    wandb_output_directory_path,
    wandb_run_name,
    deepspeed_train_output,
    check_point_output,
    export_train_output,
) = logging_util.output_directory_path(common_directory_path)
(
    export_train_loggings,
    export_device_metric_loggings,
    export_deepspeed_metric_loggings,
    export_validation_loggings,
) = logging_util.export_files(export_train_output)

# Start the (offline) wandb run.
wandb_run = logging_util.init_wandb(
    wandb_output_directory_path=wandb_output_directory_path,
    wandb_run_id=wandb_run_id,
    wandb_run_name=wandb_run_name,
)

# Load tokenizer and model from the configured checkpoint name.
_model_name = default_environment_variables["model_name"]
tokenizer = AutoTokenizer.from_pretrained(_model_name)
model = LlamaForCausalLM.from_pretrained(_model_name)

# The KV cache is switched off for training.
model.config.use_cache = False
# No pad token is assigned yet, so the end-of-sequence token doubles as padding.
tokenizer.pad_token = tokenizer.eos_token
data_collator = train_util.data_collator(tokenizer=tokenizer)

In [None]:
# Show which DeepSpeed JSON config the stage flags select (None when no stage is enabled).
_selected_deepspeed_config = deepspeed_util.deepspeed_zero_config()
print(_selected_deepspeed_config)

In [15]:
# Build the TrainingArguments from the run configuration.
# Settings whose configuration key is spelled exactly like the TrainingArguments keyword.
_passthrough_keys = (
    "num_train_epochs",
    "per_device_train_batch_size",
    "per_device_eval_batch_size",
    "gradient_accumulation_steps",
    "optim",
    "logging_steps",
    "learning_rate",
    "lr_scheduler_type",
    "warmup_steps",
    "seed",
    "fp16",
    "bf16",
    "save_strategy",
    "save_steps",
    "save_total_limit",
    "eval_steps",
    "do_eval",
)
training_args = TrainingArguments(
    output_dir=check_point_output,
    deepspeed=deepspeed_util.deepspeed_zero_config(),
    # The configuration key is "evaluation_strategy"; the keyword passed here is eval_strategy.
    eval_strategy=default_environment_variables["evaluation_strategy"],
    **{key: default_environment_variables[key] for key in _passthrough_keys},
)

In [None]:
# Split the dataset and tokenize each split, timing the whole step.
load_dataset_start_time = time.time()

train_util = TrainUtil(environment_valiables=default_environment_variables)
train_dataset, validation_dataset, test_dataset = train_util.load_alpaca_dataset(
    tokenizer=tokenizer,
)
tokenized_train_dataset = train_util.load_alpaca_train_dataset(
    tokenizer=tokenizer,
    train_dataset=train_dataset,
)
tokenized_validation_dataset = train_util.load_alpaca_validation_dataset(
    tokenizer=tokenizer,
    validation_dataset=validation_dataset,
)
tokenized_test_dataset = train_util.load_alpaca_test_dataset(
    tokenizer=tokenizer,
    test_dataset=test_dataset,
)

load_dataset_end_time = time.time()
# The printed value is the elapsed time in seconds, not a clock reading.
_load_dataset_elapsed = load_dataset_end_time - load_dataset_start_time
print(f"load_dataset_end_time: {_load_dataset_elapsed}")

In [None]:
# Build the Trainer on the tokenized train/validation splits.
# NOTE: no data_collator is passed, so the Trainer uses its default collator;
# the datasets are already padded to a fixed length and carry their own labels.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    # `tokenizer=` is deprecated in the pinned transformers release (4.51.3);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
)

In [18]:
# Register the three CSV-logging callbacks; they share the same run identity.
_shared_callback_kwargs = dict(
    wandb_account_name=default_environment_variables["wandb_account_name"],
    wandb_project_name=default_environment_variables["wandb_project_name"],
    wandb_run_name=wandb_run_name,
    wandb_run_id=wandb_run_id,
    hostname=hostname,
)
train_metrics_callback = TrainMetricsCallback(
    log_file=export_train_loggings,
    **_shared_callback_kwargs,
)
deepspeed_metrics_callback = DeepSpeedMetricsCallback(
    trainer=trainer,
    log_file=export_deepspeed_metric_loggings,
    **_shared_callback_kwargs,
)
validation_metrics_callback = ValidationMetricsCallback(
    log_file=export_validation_loggings,
    **_shared_callback_kwargs,
)
for _callback in (train_metrics_callback, deepspeed_metrics_callback, validation_metrics_callback):
    trainer.add_callback(_callback)

In [19]:
# Launch the background thread that samples device metrics into the device-metrics CSV.
logging_util.start_logging_device_metric(
    log_file=export_device_metric_loggings,
    wandb_run_name=wandb_run_name,
    wandb_run_id=wandb_run_id,
)

In [None]:
# Run training and report how long it took (the printed value is elapsed seconds).
train_start_time = time.time()
trainer.train()
train_end_time = time.time()
_train_elapsed = train_end_time - train_start_time
print(f"train_end_time: {_train_elapsed}")

In [None]:
# Tear-down. The steps run in the original order, but each later step now runs
# even if an earlier one raises, so the metrics thread is always stopped.
try:
    # end wandb
    logging_util.end_wandb()
finally:
    try:
        # release resources: only destroy a process group that actually exists;
        # destroy_process_group() raises when none was initialised.
        if dist.is_available() and dist.is_initialized():
            dist.destroy_process_group()
    finally:
        # finish logging device metric
        logging_util.finish_logging_device_metric()

In [None]:
# Upload the offline wandb run: `wandb sync` is pointed at the parent directory of the run's dir.
wandb_sync_start_time = time.time()
wandb_offline_run_dir = os.path.dirname(wandb_run.dir)
_wandb_sync_command = ["wandb", "sync", wandb_offline_run_dir]
subprocess.run(_wandb_sync_command)
wandb_sync_end_time = time.time()

In [None]:
# Total wall-clock duration of the notebook run (the printed value is elapsed seconds).
program_end_time = time.time()
_program_elapsed = program_end_time - program_start_time
print(f"program_end_time: {_program_elapsed}")