### Using nemo-run for Finetuning on CCWS

**Step 1**: Load required python modules

In [None]:
"""Run a NeMo LLM inference job with specified parameters."""
import os
from pathlib import Path
from typing import Optional

import nemo_run as run
from dotenv import load_dotenv
from megatron.core.inference.common_inference_params import CommonInferenceParams
from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from utils.run_tools import slurm_executor

**Step 2**: Load required parameters from .env file

In [None]:
# Load the variables defined in the .env file before reading the settings.
load_dotenv()

# Cluster sizing and scheduling options; each falls back to the default shown.
# int() raises ValueError if the configured value is not a valid integer.
GPUS = int(os.getenv("GPUS_PER_NODE", "8"))
NODES = int(os.getenv("NUM_NODES", "2"))
GPU_QUEUE = os.getenv("GPU_PARTITION", "gpu")
WALLTIME = os.getenv("WALLTIME", "01:00:00")
CONTAINER = os.getenv("CONTAINER_IMAGE", "nvcr.io#nvidia/nemo:dev")

# Required paths: no default is applied to any of these, so each one is
# validated below. Empty values are rejected as well, because they would
# otherwise end up as broken "src:dst" mount specifications later on.
NEMO_HOME = os.getenv("NEMO_HOME")
CHECKPOINT_DIR = os.getenv("CHECKPOINT_DIR")
HF_TOKEN_PATH = os.getenv("HF_TOKEN_PATH")
HF_HOME = os.getenv("HF_HOME")

# verify nemo_home is set
if not NEMO_HOME:
    raise ValueError(
        "NEMO_HOME environment variable is not set. "
        "Please set it to the path of your NeMo installation."
    )

# verify checkpoint_dir is set
if not CHECKPOINT_DIR:
    raise ValueError(
        "CHECKPOINT_DIR environment variable is not set. "
        "Please set it to the path of your checkpoint directory."
    )

# verify hf_token_path is set
if not HF_TOKEN_PATH:
    raise ValueError(
        "HF_TOKEN_PATH environment variable is not set. "
        "Please set it to the path of your Hugging Face token."
    )

# verify hf_home is set
if not HF_HOME:
    raise ValueError(
        "HF_HOME environment variable is not set. "
        "Please set it to the path of your Hugging Face cache."
    )

**Step 3**: Configuration - loading the llama3 model from HuggingFace

In [None]:
# Converting a Hugging Face checkpoint to NeMo format is done through the
# nemo2 API llm.import_ckpt
def configure_checkpoint_conversion():
    """Build the deferred checkpoint-import task for Llama 3 8B.

    Returns:
        A ``run.Partial`` wrapping ``llm.import_ckpt`` that imports
        ``hf://meta-llama/Meta-Llama-3-8B`` into an ``llm.llama3_8b`` model
        with ``overwrite=False``.
    """
    conversion_kwargs = {
        "model": llm.llama3_8b.model(),
        "source": "hf://meta-llama/Meta-Llama-3-8B",
        "overwrite": False,
    }
    return run.Partial(llm.import_ckpt, **conversion_kwargs)


# Deferred checkpoint-conversion task that the experiment submits later
import_ckpt = configure_checkpoint_conversion()

# Slurm executor for the model import job: one node, one GPU. The NeMo and
# Hugging Face locations are passed as custom mounts using the same path as
# source and target.
import_exec = slurm_executor(
    account="",
    partition=GPU_QUEUE,
    nodes=1,
    devices=1,
    container_image=CONTAINER,
    gres="gpu:1",  # number of GPUs per node
    custom_mounts=[
        f"{location}:{location}"
        for location in (NEMO_HOME, HF_TOKEN_PATH, HF_HOME)
    ],
)

# Environment variables used for the model download from Hugging Face
import_exec.env_vars["NEMO_HOME"] = NEMO_HOME
import_exec.env_vars["HF_HOME"] = HF_HOME
import_exec.env_vars["HF_TOKEN_PATH"] = HF_TOKEN_PATH

**Step 4**: Configuration - finetuning the model


We will use the default recipe included in NeMo 2.0 for finetuning the Llama 3 model. The default recipe uses the `SquadDataModule` for the `data` argument. You can replace the `SquadDataModule` with your custom dataset. We will not cover custom datasets in this exercise, but we have included a template that shows how to use custom data with the recipe.

In [None]:
def configure_finetuning_recipe(
    checkpoint_path: str,
    nodes: int = 1,
    gpus_per_node: int = 1,
    peft_scheme: Optional[str] = None,
    model_name: str = "llama3_ccws",
):
    """Build the Llama 3 8B finetuning recipe with the workshop overrides.

    Args:
        checkpoint_path: Directory in which checkpoints are stored.
        nodes: Number of nodes handed to the recipe.
        gpus_per_node: Number of GPUs per node handed to the recipe.
        peft_scheme: PEFT scheme forwarded to the recipe unchanged
            (``None`` by default).
        model_name: Name given to the recipe/run.

    Returns:
        The recipe produced by ``llm.llama3_8b.finetune_recipe`` with the
        trainer, strategy and data overrides below applied.
    """
    recipe = llm.llama3_8b.finetune_recipe(
        name=model_name,
        dir=checkpoint_path,  # where checkpoints are written
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme=peft_scheme,
    )

    trainer_cfg = recipe.trainer
    strategy_cfg = trainer_cfg.strategy

    # Short run: 200 steps, validation every 200 steps, no sanity validation
    trainer_cfg.max_steps = 200
    trainer_cfg.val_check_interval = 200
    trainer_cfg.num_sanity_val_steps = 0

    # Async checkpointing doesn't work with PEFT
    strategy_cfg.ckpt_async_save = False
    # Note, the default is 2
    strategy_cfg.context_parallel_size = 1
    # This is currently required for LoRA/PEFT
    strategy_cfg.ddp = "megatron"

    recipe.data.delete_raw = False
    # Template for overriding the data argument with a custom dataset:
    # dataloader = a_function_that_configures_your_custom_dataset(
    #     gbs=gbs,
    #     mbs=mbs,
    #     seq_length=recipe.model.config.seq_length,
    # )
    # recipe.data = dataloader

    return recipe


MODEL_NAME = "llama3_ft_ccws"
# This will finetune the model using the number of nodes and GPUs specified in the .env file.
# The .env file will also be used to determine where to store the checkpoints.
finetune = configure_finetuning_recipe(
    checkpoint_path=CHECKPOINT_DIR,
    gpus_per_node=GPUS,
    nodes=NODES,
    model_name=MODEL_NAME,
)

# define the slurm executor to perform the finetuning as a slurm job;
# node and GPU counts are taken from the recipe so both stay in sync
# NOTE(review): CHECKPOINT_DIR is not listed in custom_mounts - confirm it is
# reachable from inside the container (e.g. located under NEMO_HOME or mounted
# by slurm_executor itself).
finetune_exec = slurm_executor(
    account="",
    partition=GPU_QUEUE,
    nodes=finetune.trainer.num_nodes,
    devices=finetune.trainer.devices,
    time=WALLTIME,
    container_image=CONTAINER,
    gres=f"gpu:{finetune.trainer.devices}",
    custom_mounts=[
        f"{NEMO_HOME}:{NEMO_HOME}",
        f"{HF_TOKEN_PATH}:{HF_TOKEN_PATH}",
        f"{HF_HOME}:{HF_HOME}",
    ],
)

finetune_exec.env_vars["NEMO_HOME"] = NEMO_HOME
finetune_exec.env_vars["HF_HOME"] = HF_HOME
# The token file is mounted above; export its location as well, exactly as the
# import executor does, so the job can locate it.
finetune_exec.env_vars["HF_TOKEN_PATH"] = HF_TOKEN_PATH

**Step 6**: Submit jobs

We will now use nemo-run experiments to finetune the model. The first job will load the model from Hugging Face; the second job will finetune the model as defined by the recipe.

In [None]:
# Register both tasks in one experiment: the checkpoint import first, then the
# finetuning job, each with its own slurm executor.
with run.Experiment("llama3-8b-nemoft-ccws") as exp:
    exp.add(import_ckpt, executor=import_exec, name="llama3-8b-ccws-import")
    exp.add(finetune, executor=finetune_exec, name="llama3-8b-ccws-finetune")
    # sequential=True so the tasks run in the order they were added; logs are
    # not tailed here (tail_logs=False) - use exp.status() / exp.logs() instead.
    exp.run(sequential=True, tail_logs=False, detach=True)

**Step 7**: Monitor jobs

Running exp.status() will let you know which tasks are being executed, and if they complete successfully. 

In [None]:
# Report the current state of the tasks in the experiment
exp.status()

# Uncomment the following lines to cancel the jobs if needed,
# then rerun the previous cell to resubmit the jobs
# exp.cancel("llama3-8b-ccws-finetune")
# exp.cancel("llama3-8b-ccws-import")

**Optional**: Review Logs for model import

As tasks are running, you can use exp.logs() to tail the logs in real-time

In [None]:
# Show the logs of the checkpoint import task (name given in exp.add)
exp.logs("llama3-8b-ccws-import")

**Optional**: Review logs for finetuning

In [None]:
# Show the logs of the finetuning task (name given in exp.add)
exp.logs("llama3-8b-ccws-finetune")

**Step 8**: Verify Results

The previous jobs should produce a checkpoint. We will try to find the checkpoint's path so that we can run inference against it. Do not proceed to the next steps if a path is not found. Verify that you have a checkpoint and ensure that `CKPT_PATH` is set to the path of the checkpoint.

In [None]:
# Find the "-last" checkpoint directory written by the finetuning job and use
# it for inference.
SEARCH_PATH = "".join([CHECKPOINT_DIR, "/", MODEL_NAME, "/"])
print(f"Searching for checkpoints in {SEARCH_PATH}")

# Collect every directory below SEARCH_PATH whose name ends in "-last".
_last_checkpoints = [
    d for d in Path(SEARCH_PATH).rglob("*") if d.is_dir() and d.name.endswith("-last")
]

# Fail loudly instead of continuing with the literal string "None" as a path,
# which would only surface later as a confusing failure in the inference job.
if not _last_checkpoints:
    raise FileNotFoundError(
        f"No checkpoint directory ending in '-last' was found under {SEARCH_PATH}. "
        "Make sure the finetuning job has completed successfully before continuing."
    )

# Several runs may have left a "-last" checkpoint behind; pick the most
# recently modified one so the choice does not depend on directory order.
CKPT_PATH = str(max(_last_checkpoints, key=lambda d: d.stat().st_mtime))

print(f"Checkpoint path: {CKPT_PATH}")

**Step 9**: Configure inferencing

In [None]:
def trainer(nodes: int, devices: int) -> run.Config[nl.Trainer]:
    """Build the trainer configuration used for inference.

    Args:
        nodes: Number of nodes, passed to the trainer as ``num_nodes``.
        devices: Number of devices, passed to the trainer as ``devices``.

    Returns:
        A ``run.Config`` for ``nl.Trainer`` on the GPU accelerator, using a
        ``nl.MegatronStrategy`` with ``tensor_model_parallel_size=1`` and the
        ``bf16_mixed()`` precision plugin.
    """
    return run.Config(
        nl.Trainer,
        accelerator="gpu",
        devices=devices,
        num_nodes=nodes,
        strategy=run.Config(nl.MegatronStrategy, tensor_model_parallel_size=1),
        plugins=bf16_mixed(),
    )


def configure_inference(
    prompts: list[str],
    sft_ckpt_path: str,
    output_path: str,
    nodes: int = 1,
    devices: int = 1,
    tokens: int = 1024,
    topp: float = 0.90,
    temp: float = 0.2,
):
    """Build the deferred ``llm.generate`` task for the finetuned checkpoint.

    Args:
        prompts: Prompts to generate completions for.
        sft_ckpt_path: Checkpoint path; converted with ``str()`` before use.
        output_path: Passed to ``llm.generate`` as ``output_path``.
        nodes: Number of nodes for the inference trainer.
        devices: Number of devices for the inference trainer.
        tokens: Value for ``num_tokens_to_generate``.
        topp: Value for ``top_p``.
        temp: Value for ``temperature``.

    Returns:
        A ``run.Partial`` wrapping ``llm.generate``.
    """
    inference_trainer = trainer(nodes, devices)
    sampling_params = CommonInferenceParams(
        num_tokens_to_generate=tokens,
        top_p=topp,
        temperature=temp,
    )
    generate_kwargs = {
        "path": str(sft_ckpt_path),
        "trainer": inference_trainer,
        "prompts": prompts,
        "inference_params": sampling_params,
        "output_path": output_path,
    }
    return run.Partial(llm.generate, **generate_kwargs)


# this will be the output path for the inference results
OUTPUT_PATH = f"{NEMO_HOME}/llama3-8b-ft-ccws-prediction.jsonl"

# this is the list of prompts to use for inference, feel free to edit them;
# CONTEXT and QUESTION are scratch variables reused for each prompt
CONTEXT = (
    "The Legend of Zelda is a video game series by Nintendo. Link is the main hero."
)
QUESTION = "Who is the main protagonist of the Legend of Zelda series?"
PROMPT1 = f"Context: {CONTEXT}\nQuestion: {QUESTION}\nAnswer:"

CONTEXT = "Queen Victoria ruled the United Kingdom from 1837 to 1901, marking the Victorian era."
QUESTION = "How long did Queen Victoria reign?"
PROMPT2 = f"Context: {CONTEXT}\nQuestion: {QUESTION}\nAnswer:"

CONTEXT = (
    "The Eiffel Tower was built in Paris, France in 1889 and stands at 324 meters tall."
)
QUESTION = "Where is the Eiffel Tower located?"
PROMPT3 = f"Context: {CONTEXT}\nQuestion: {QUESTION}\nAnswer:"

PROMPTS = [PROMPT1, PROMPT2, PROMPT3]

# deferred llm.generate task using the checkpoint found in the previous step
inference = configure_inference(
    prompts=PROMPTS, sft_ckpt_path=CKPT_PATH, output_path=OUTPUT_PATH
)

# define the slurm executor
# to perform the inferencing as a slurm job (one node, one GPU)
inference_exec = slurm_executor(
    account="",
    partition=GPU_QUEUE,
    nodes=1,
    devices=1,
    container_image=CONTAINER,
    gres="gpu:1",
    custom_mounts=[
        f"{NEMO_HOME}:{NEMO_HOME}",
        f"{HF_TOKEN_PATH}:{HF_TOKEN_PATH}",
        f"{HF_HOME}:{HF_HOME}",
    ],
)
inference_exec.env_vars["NEMO_HOME"] = NEMO_HOME
inference_exec.env_vars["HF_HOME"] = HF_HOME
# The token file is mounted above; export its location as well, exactly as the
# import executor does, so the job can locate it.
inference_exec.env_vars["HF_TOKEN_PATH"] = HF_TOKEN_PATH

**Step 10**: Inference the model

If the job completes successfully, you will find a file named `llama3-8b-ft-ccws-prediction.jsonl` in your `NEMO_HOME`, which contains the model's responses to the provided prompts.

In [None]:
# Submit the inference task as its own experiment.
# NOTE(review): this reuses the title "llama3-8b-nemoft-ccws" of the finetuning
# experiment - confirm that sharing the title is intended.
with run.Experiment("llama3-8b-nemoft-ccws") as infexp:
    infexp.add(
        inference,
        executor=inference_exec,
        name="llama3-8b-ft-ccws-inference",
    )
    # logs are not tailed here (tail_logs=False); use infexp.status() and
    # infexp.logs() to follow the job
    infexp.run(sequential=True, tail_logs=False, detach=True)

**Optional**: Check job status

In [None]:
# Report the current state of the inference task
infexp.status()
# Uncomment the following lines to cancel the jobs if needed,
# then rerun the previous cell to resubmit the jobs
# infexp.cancel("llama3-8b-ft-ccws-inference")

**Optional**: Review logs for inferencing

In [None]:
# Show the logs of the inference task (name given in infexp.add)
infexp.logs("llama3-8b-ft-ccws-inference")

Review the model responses from the following path:

In [None]:
# Show where the predictions were written, then dump the file contents
print(f"Output path: {OUTPUT_PATH}")

print("Inference results:")
print(Path(OUTPUT_PATH).read_text(encoding="utf-8"))

If you are happy with your model, you can deploy it using any strategy you prefer. 