### Clone the Megatron-LM repository

In [None]:
!git clone https://github.com/ROCm/Megatron-LM

### Load dataset
`datasets` library version 4.0.0 or later has deprecated support for loading datasets using Python scripts.
Use `datasets==3.6.0`.

In [1]:
from datasets import load_dataset

dataset = load_dataset("bookcorpus/bookcorpus", trust_remote_code=True, split="train")

print("Dataset Structure:", dataset)
print("Sample Data:", dataset[0])  # Access the first record

Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

Dataset Structure: Dataset({
    features: ['text'],
    num_rows: 74004228
})
Sample Data: {'text': 'usually , he would be tearing around the living room , playing with his toys .'}


#### Convert to the JSONL format
Megatron-LM’s preprocessing script requires that the input be in JSONL format.

In [2]:
import json
from tqdm import tqdm

output_file = "bookcorpus.jsonl"

with open(output_file, "w") as f:
    for record in tqdm(dataset, desc="Saving dataset to JSONL", unit="record"):
        json.dump({"text": record["text"]}, f)
        f.write("\n")

print(f"Dataset saved to {output_file}")

Saving dataset to JSONL: 100% 74004228/74004228 [15:52<00:00, 77656.02record/s]

Dataset saved to bookcorpus.jsonl





In [3]:
# inspect convertion
with open(output_file, "r") as f:
    for i in range(5):
        print(json.loads(f.readline()))

{'text': 'usually , he would be tearing around the living room , playing with his toys .'}
{'text': 'but just one look at a minion sent him practically catatonic .'}
{'text': "that had been megan 's plan when she got him dressed earlier ."}
{'text': "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older ."}
{'text': 'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .'}


### Preprocess the data
*Note*: This is an example if tokenizer loaded for preprocessing, otherwise use `TOKENIZER_MODEL` variable in trianing.

In [4]:
# download the tokenizer files
!wget https://huggingface.co/gpt2/resolve/main/vocab.json -O vocab.json
!wget https://huggingface.co/gpt2/resolve/main/merges.txt -O merges.txt

--2025-10-08 20:26:16--  https://huggingface.co/gpt2/resolve/main/vocab.json
Resolving huggingface.co (huggingface.co)... 3.171.171.65, 3.171.171.104, 3.171.171.128, ...
Connecting to huggingface.co (huggingface.co)|3.171.171.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1042301 (1018K) [text/plain]
Saving to: ‘vocab.json’


2025-10-08 20:26:16 (19.2 MB/s) - ‘vocab.json’ saved [1042301/1042301]

--2025-10-08 20:26:16--  https://huggingface.co/gpt2/resolve/main/merges.txt
Resolving huggingface.co (huggingface.co)... 3.171.171.128, 3.171.171.104, 3.171.171.6, ...
Connecting to huggingface.co (huggingface.co)|3.171.171.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 456318 (446K) [text/plain]
Saving to: ‘merges.txt’


2025-10-08 20:26:16 (16.0 MB/s) - ‘merges.txt’ saved [456318/456318]



In [6]:
!mkdir -p output
!python Megatron-LM/tools/preprocess_data.py \
    --input bookcorpus.jsonl \
    --json-keys text \
    --output-prefix output/bookcorpus \
    --tokenizer-type GPT2BPETokenizer \
    --vocab-file vocab.json \
    --merge-file merges.txt \
    --workers 4 \
    --append-eod \
    --partitions 2 \
    --log-interval 1000000 \
    --split-sentences

  def forward(ctx, input, weight, bias, allreduce_dgrad):
  def backward(ctx, grad_output):
  def forward(
  def backward(ctx, grad_output):
Opening bookcorpus_0.jsonl
Opening bookcorpus_1.jsonl
Processed 1000000 documents (192510.90471823156 docs/s, 14.12832597692662 MB/s).
Processed 1000000 documents (188024.37020884166 docs/s, 13.803066413068835 MB/s).
Processed 2000000 documents (192513.77201031064 docs/s, 13.43451984869808 MB/s).
Processed 2000000 documents (188936.2168688218 docs/s, 13.188253208343863 MB/s).
Processed 3000000 documents (192259.54275523528 docs/s, 13.439340757998671 MB/s).
Processed 3000000 documents (186379.0682592968 docs/s, 13.03297580236802 MB/s).
Processed 4000000 documents (193818.91850690029 docs/s, 13.503367995015601 MB/s).
Processed 4000000 documents (189192.20861101587 docs/s, 13.184529470645844 MB/s).
Processed 5000000 documents (194297.69896025097 docs/s, 13.356547727076713 MB/s).
Processed 5000000 documents (191847.32131163232 docs/s, 13.1914043663353

In [7]:
# validate
!ls output/

bookcorpus_0_text_sentence.bin	bookcorpus_1_text_sentence.idx
bookcorpus_0_text_sentence.idx	bookcorpus_text_sentence.bin
bookcorpus_1_text_sentence.bin	bookcorpus_text_sentence.idx


### Training

In [9]:
!export TOKENIZER_MODEL="meta-llama/Llama-3.1-8B"
!export DATA_PATH="output/bookcorpus_text_sentence"

In [17]:
!cd Megatron-LM && TEE_OUTPUT=1 MBS=2 BS=64 TP=1 TE_FP8=0 SEQ_LENGTH=4096  \
TOKENIZER_MODEL='meta-llama/Llama-3.1-8B' MODEL_SIZE='8' \
bash examples/llama/train_llama3.sh --data-path ${DATA_PATH}

examples/llama/train_llama3.sh: line 31: export: --: invalid option
export: usage: export [-fn] [name[=value] ...] or export -p
NO_TRAINING=0
Single node setup, skipping NCCL and GLOO socket interface settings.
experiment/1nodes_rank0_train_8B_mbs2_bs64_tp1_pp1_cp1_iter10/TE_FP8_0/2025-10-09_13-36-02/output_perf.log
  def forward(ctx, input, weight, bias, allreduce_dgrad):
  def backward(ctx, grad_output):
  def forward(
  def backward(ctx, grad_output):
using world size: 1, data-parallel size: 1, context-parallel size: 1, tensor-model-parallel size: 1, encoder-tensor-model-parallel size: 0, pipeline-model-parallel size: 1, encoder-pipeline-model-parallel size: 0
accumulate and all-reduce gradients in fp32 for bfloat16 data type.
using torch.bfloat16 for parameters ...
------------------------ arguments ------------------------
  accumulate_allreduce_grads_in_fp32 .............. True
  adam_beta1 ...................................... 0.9
  adam_beta2 ..................................