### Clone the Megatron-LM repository

In [None]:
!git clone https://github.com/ROCm/Megatron-LM

### Load dataset
`datasets` library version 4.0.0 or later has deprecated support for loading datasets using Python scripts.
Use `datasets==3.6.0`.

In [1]:
from datasets import load_dataset

dataset = load_dataset("bookcorpus/bookcorpus", trust_remote_code=True, split="train")

print("Dataset Structure:", dataset)
print("Sample Data:", dataset[0])  # Access the first record

Dataset Structure: Dataset({
    features: ['text'],
    num_rows: 74004228
})
Sample Data: {'text': 'usually , he would be tearing around the living room , playing with his toys .'}


#### Convert to the JSONL format
Megatron-LM’s preprocessing script requires that the input be in JSONL format.

In [2]:
import json
from tqdm import tqdm

output_file = "bookcorpus.jsonl"

with open(output_file, "w") as f:
    for record in tqdm(dataset, desc="Saving dataset to JSONL", unit="record"):
        json.dump({"text": record["text"]}, f)
        f.write("\n")

print(f"Dataset saved to {output_file}")

Saving dataset to JSONL: 100% 74004228/74004228 [14:54<00:00, 82686.48record/s]

Dataset saved to bookcorpus.jsonl





In [3]:
# inspect convertion
with open(output_file, "r") as f:
    for i in range(5):
        print(json.loads(f.readline()))

{'text': 'usually , he would be tearing around the living room , playing with his toys .'}
{'text': 'but just one look at a minion sent him practically catatonic .'}
{'text': "that had been megan 's plan when she got him dressed earlier ."}
{'text': "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older ."}
{'text': 'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .'}


### Preprocess the data

In [5]:
!mkdir -p output
!python Megatron-LM/tools/preprocess_data.py \
    --input bookcorpus.jsonl \
    --json-keys text \
    --output-prefix output/bookcorpus \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model "meta-llama/Llama-3.1-8B"\
    --workers 4 \
    --append-eod \
    --partitions 2 \
    --log-interval 1000000 \
    --split-sentences

  def forward(ctx, input, weight, bias, allreduce_dgrad):
  def backward(ctx, grad_output):
  def forward(
  def backward(ctx, grad_output):
Opening bookcorpus_ss_0.jsonl
Opening bookcorpus_ss_1.jsonl
Time to startup: 0.6142144203186035
Time to startup: 0.6443839073181152
Processed 1000000 documents (37069.8632136344 docs/s, 2.791395988928492 MB/s).
Processed 1000000 documents (35234.65604983583 docs/s, 2.653956978394102 MB/s).
Processed 2000000 documents (38052.842540226724 docs/s, 2.7282299251748356 MB/s).
Processed 2000000 documents (36218.01116704971 docs/s, 2.597324092267478 MB/s).
Processed 3000000 documents (38118.880584453924 docs/s, 2.737452936387159 MB/s).
Processed 3000000 documents (36661.62678189521 docs/s, 2.63371720876368 MB/s).
Processed 4000000 documents (38249.964956032556 docs/s, 2.7380023805001623 MB/s).
Processed 4000000 documents (37073.37384286992 docs/s, 2.6544568517148686 MB/s).
Processed 5000000 documents (38424.923196949094 docs/s, 2.714886268089049 MB/s).
Pr

In [6]:
# validate
!ls output/

bookcorpus_0_text_sentence.bin	bookcorpus_1_text_sentence.idx
bookcorpus_0_text_sentence.idx	bookcorpus_text_sentence.bin
bookcorpus_1_text_sentence.bin	bookcorpus_text_sentence.idx


### Training

In [48]:
!export TOKENIZER_MODEL="meta-llama/Llama-3.1-8B"
!export DATA_PATH="output/bookcorpus_text_sentence"
!export SAVE_CKPT_PATH="checkpoints"

In [51]:
!cd Megatron-LM && TEE_OUTPUT=1 MBS=2 BS=64 TP=1 TE_FP8=0 SEQ_LENGTH=4096  \
TOKENIZER_MODEL='meta-llama/Llama-3.1-8B' MODEL_SIZE='8' \
bash examples/llama/train_llama3.sh --data-path ${DATA_PATH} --save ${SAVE_CKPT_PATH}

examples/llama/train_llama3.sh: line 31: export: --: invalid option
export: usage: export [-fn] [name[=value] ...] or export -p
examples/llama/train_llama3.sh: line 31: export: --: invalid option
export: usage: export [-fn] [name[=value] ...] or export -p
NO_TRAINING=0
Single node setup, skipping NCCL and GLOO socket interface settings.
experiment/1nodes_rank0_train_8B_mbs2_bs64_tp1_pp1_cp1_iter10/TE_FP8_0/2025-10-09_19-21-40/output_perf.log
  def forward(ctx, input, weight, bias, allreduce_dgrad):
  def backward(ctx, grad_output):
  def forward(
  def backward(ctx, grad_output):
using world size: 1, data-parallel size: 1, context-parallel size: 1, tensor-model-parallel size: 1, encoder-tensor-model-parallel size: 0, pipeline-model-parallel size: 1, encoder-pipeline-model-parallel size: 0
accumulate and all-reduce gradients in fp32 for bfloat16 data type.
using torch.bfloat16 for parameters ...
------------------------ arguments ------------------------
  accumulate_allreduce_grads_in_