<a href="https://colab.research.google.com/github/Ducmanh9790/NLG/blob/main/NLG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Clone repo & install
!git clone https://github.com/microsoft/LoRA.git
%cd LoRA

# Cài loralib
!pip install -e .

# Cài dependencies NLG
!pip install transformers tqdm tensorboard progress numpy pyyaml

print("✓ Setup complete!")


Cloning into 'LoRA'...
remote: Enumerating objects: 2024, done.[K
remote: Counting objects: 100% (770/770), done.[K
remote: Compressing objects: 100% (413/413), done.[K
remote: Total 2024 (delta 517), reused 357 (delta 357), pack-reused 1254 (from 2)[K
Receiving objects: 100% (2024/2024), 34.72 MiB | 30.86 MiB/s, done.
Resolving deltas: 100% (814/814), done.
/content/LoRA
Obtaining file:///content/LoRA
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: loralib
  Running setup.py develop for loralib
Successfully installed loralib-0.1.2
Collecting progress
  Downloading progress-1.6.1-py3-none-any.whl.metadata (4.3 kB)
Downloading progress-1.6.1-py3-none-any.whl (9.8 kB)
Installing collected packages: progress
Successfully installed progress-1.6.1
✓ Setup complete!


In [2]:
# Cell 2: Download GPT-2 Medium
import os
os.makedirs("examples/NLG/pretrained_checkpoints", exist_ok=True)

!wget -O examples/NLG/pretrained_checkpoints/gpt2-medium-pytorch_model.bin \
  https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin

print("✓ GPT-2 Medium downloaded!")


--2025-12-04 07:16:02--  https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin
Resolving huggingface.co (huggingface.co)... 3.170.185.14, 3.170.185.33, 3.170.185.25, ...
Connecting to huggingface.co (huggingface.co)|3.170.185.14|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f17434b/6056cd022b933c356a256889bf854d6273e618fe8c5dd6980439c81c324ffa4a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251204%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251204T071602Z&X-Amz-Expires=3600&X-Amz-Signature=d2f3bfda6a3fb3e80135643e8c84cf7cc5e9acdd35fb984da705355e11b4be4e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&x-id=GetObject&Expires=1764836162&Policy=eyJTdGF0ZW1lbnQiOlt7IkNv

In [3]:
# Cell 3: Format E2E data
import subprocess
import os

os.chdir("examples/NLG")

# Format train
!python src/format_converting_e2e.py data/e2e/train.txt data/e2e/train_formatted.jsonl
!python src/gpt2_encode.py --vocab vocab --input data/e2e/train_formatted.jsonl \
  --output data/e2e/train.jsonl --add_bos --add_eos

# Format valid
!python src/format_converting_e2e.py data/e2e/valid.txt data/e2e/valid_formatted.jsonl
!python src/gpt2_encode.py --vocab vocab --input data/e2e/valid_formatted.jsonl \
  --output data/e2e/valid.jsonl --add_bos --add_eos

# Format test
!python src/format_converting_e2e.py data/e2e/test.txt data/e2e/test_formatted.jsonl
!python src/gpt2_encode.py --vocab vocab --input data/e2e/test_formatted.jsonl \
  --output data/e2e/test.jsonl --add_bos --add_eos

print("✓ E2E datasets formatted!")


✓ E2E datasets formatted!


In [4]:
# Cell 4: Patch gpu.py for Colab (single-GPU mode).
# Replaces the start of parse_gpu's 'local' branch, which initialises a NCCL
# process group, with a version that selects cuda:0 (or CPU) directly and
# records world_size = 1 and dist = None.
import os

gpu_py_path = "/content/LoRA/examples/NLG/src/gpu.py"

# Read current file
with open(gpu_py_path, 'r') as f:
    content = f.read()

# Replace parse_gpu function
old_parse_gpu = """def parse_gpu(args):
    torch.manual_seed(args.random_seed)

    if args.platform == 'local':
        dist.init_process_group(backend='nccl')
        local_rank = torch.distributed.get_rank()
        torch.cuda.set_device(local_rank)
        device = torch.device('cuda', local_rank)
        args.rank = local_rank
        args.device = device
        args.world_size = torch.distributed.get_world_size()
        args.dist = dist"""

new_parse_gpu = """def parse_gpu(args):
    torch.manual_seed(args.random_seed)

    if args.platform == 'local':
        # Single GPU mode - no distributed training
        if torch.cuda.is_available():
            device = torch.device('cuda', 0)
            torch.cuda.set_device(0)
        else:
            device = torch.device('cpu')
        args.rank = 0
        args.device = device
        args.world_size = 1
        args.local_rank = 0
        args.dist = None"""

# str.replace is a silent no-op when the pattern is absent, so distinguish the
# three possible states instead of reporting success unconditionally.
if new_parse_gpu in content:
    print("✓ gpu.py already patched for Colab.")
elif old_parse_gpu in content:
    content = content.replace(old_parse_gpu, new_parse_gpu)

    # Write back
    with open(gpu_py_path, 'w') as f:
        f.write(content)

    print("✓ gpu.py patched for Colab!")
else:
    raise RuntimeError(
        f"parse_gpu patch not applied: expected source text not found in {gpu_py_path}"
    )

✓ gpu.py patched for Colab!


In [5]:
# Cell 5: Patch gpt2_ft.py for Colab.
# Applies three textual patches:
#   1. build DistributedSampler only when world_size > 1,
#   2. call sampler.set_epoch only when the sampler provides it,
#   3. move the model with .to(args.device) instead of .cuda().
import os

gpt2_ft_path = "/content/LoRA/examples/NLG/src/gpt2_ft.py"

with open(gpt2_ft_path, 'r') as f:
    content = f.read()

# Fix DistributedSampler
old_sampler = """    train_loader = DataLoader(
        train_data, batch_size=args.train_batch_size, num_workers=0,
        shuffle=False, pin_memory=False, drop_last=True,
        sampler=torch.utils.data.distributed.DistributedSampler(train_data, seed=args.random_seed)
    )

    valid_loader = DataLoader(
        valid_data, batch_size=args.valid_batch_size, num_workers=0,
        shuffle=False, pin_memory=False, drop_last=False,
        sampler=torch.utils.data.distributed.DistributedSampler(valid_data, seed=args.random_seed)
    )"""

new_sampler = """    # Create sampler only for distributed training
    train_sampler = None
    valid_sampler = None
    if args.world_size > 1:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_data, seed=args.random_seed)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_data, seed=args.random_seed)

    train_loader = DataLoader(
        train_data, batch_size=args.train_batch_size, num_workers=0,
        shuffle=(train_sampler is None), pin_memory=False, drop_last=True,
        sampler=train_sampler
    )

    valid_loader = DataLoader(
        valid_data, batch_size=args.valid_batch_size, num_workers=0,
        shuffle=False, pin_memory=False, drop_last=False,
        sampler=valid_sampler
    )"""

# Fix set_epoch
old_set_epoch = "    train_loader.sampler.set_epoch(epoch)"
new_set_epoch = """    # Only set epoch for DistributedSampler
    if hasattr(train_loader.sampler, 'set_epoch'):
        train_loader.sampler.set_epoch(epoch)"""

# Fix .cuda() call
old_cuda = "    lm_net = lm_net.cuda()"
new_cuda = "    lm_net = lm_net.to(args.device)"

patches = [
    ("DistributedSampler", old_sampler, new_sampler),
    ("set_epoch", old_set_epoch, new_set_epoch),
    (".cuda()", old_cuda, new_cuda),
]

missing = []
for label, old, new in patches:
    if new in content:
        # Already applied. This check must come first: old_set_epoch is a
        # substring of the last line of new_set_epoch, so replacing again on a
        # re-run would nest the patch inside itself and leave an `if` with no
        # body in gpt2_ft.py.
        continue
    if old in content:
        content = content.replace(old, new)
    else:
        missing.append(label)

# str.replace is a silent no-op when the pattern is absent; fail loudly, and
# before writing, so the file is never left partially patched.
if missing:
    raise RuntimeError(
        f"gpt2_ft.py patch(es) not applied, expected source text not found: {missing}"
    )

with open(gpt2_ft_path, 'w') as f:
    f.write(content)

print("✓ gpt2_ft.py patched for Colab!")

✓ gpt2_ft.py patched for Colab!


In [6]:
# Cell 6: Patch distributed_opt in gpu.py.
# The only change is the second branch's condition: the model is wrapped in
# DistributedDataParallel only when args.world_size > 1.
gpu_py_path = "/content/LoRA/examples/NLG/src/gpu.py"

with open(gpu_py_path, 'r') as f:
    content = f.read()

# Fix distributed_opt
old_distributed_opt = """def distributed_opt(args, model, opt, grad_acc=1):
    if args.platform == 'azure':
        args.hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        opt = args.hvd.DistributedOptimizer(
            opt, named_parameters=model.named_parameters(), backward_passes_per_step=grad_acc
        )
    elif args.platform == 'philly' or args.platform == 'k8s' or args.platform == 'local':
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=False, broadcast_buffers=False
        )
    return model, opt"""

new_distributed_opt = """def distributed_opt(args, model, opt, grad_acc=1):
    if args.platform == 'azure':
        args.hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        opt = args.hvd.DistributedOptimizer(
            opt, named_parameters=model.named_parameters(), backward_passes_per_step=grad_acc
        )
    elif args.platform in ['philly', 'k8s', 'local'] and args.world_size > 1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=False, broadcast_buffers=False
        )
    return model, opt"""

# str.replace is a silent no-op when the pattern is absent, so distinguish the
# three possible states instead of reporting success unconditionally.
if new_distributed_opt in content:
    print("✓ distributed_opt already patched.")
elif old_distributed_opt in content:
    content = content.replace(old_distributed_opt, new_distributed_opt)

    with open(gpu_py_path, 'w') as f:
        f.write(content)

    print("✓ distributed_opt patched!")
else:
    raise RuntimeError(
        f"distributed_opt patch not applied: expected source text not found in {gpu_py_path}"
    )

✓ distributed_opt patched!


In [7]:
# Cell 7: Patch distributed_sync in gpu.py.
# Replaces the unconditional `else: args.dist.barrier()` with a branch that
# only calls barrier() when args.dist is not None.
gpu_py_path = "/content/LoRA/examples/NLG/src/gpu.py"

with open(gpu_py_path, 'r') as f:
    content = f.read()

# Fix distributed_sync
old_sync = """def distributed_sync(args):
    if args.platform == 'azure':
        args.hvd.allreduce(torch.tensor(0), name='barrier')
    else:
        args.dist.barrier()"""

new_sync = """def distributed_sync(args):
    if args.platform == 'azure':
        args.hvd.allreduce(torch.tensor(0), name='barrier')
    elif args.dist is not None:
        args.dist.barrier()"""

# str.replace is a silent no-op when the pattern is absent, so distinguish the
# three possible states instead of reporting success unconditionally.
if new_sync in content:
    print("✓ distributed_sync already patched.")
elif old_sync in content:
    content = content.replace(old_sync, new_sync)

    with open(gpu_py_path, 'w') as f:
        f.write(content)

    print("✓ distributed_sync patched!")
else:
    raise RuntimeError(
        f"distributed_sync patch not applied: expected source text not found in {gpu_py_path}"
    )

✓ distributed_sync patched!


In [8]:
# Cell 7: Giảm batch size
!python src/gpt2_ft.py \
    --train_data "./data/e2e/train.jsonl" \
    --valid_data "./data/e2e/valid.jsonl" \
    --train_batch_size 2 \
    --grad_acc 4 \
    --valid_batch_size 2 \
    --seq_len 256 \
    --model_card gpt2.md \
    --init_checkpoint "./pretrained_checkpoints/gpt2-medium-pytorch_model.bin" \
    --platform local \
    --clip 0.0 \
    --lr 0.0002 \
    --weight_decay 0.01 \
    --correct_bias \
    --adam_beta2 0.999 \
    --scheduler linear \
    --warmup_step 100 \
    --max_epoch 2 \
    --save_interval 500 \
    --lora_dim 4 \
    --lora_alpha 32 \
    --lora_dropout 0.1 \
    --label_smooth 0.1 \
    --work_dir "./trained_models/GPT2_M/e2e" \
    --random_seed 110

print("✓ Training with reduced batch size completed!")

myrank: 0 local_rank: 0 device_count: 1 world_size: 1
        - platform : local
        - local_rank : 0
        - rank : 0
        - device : cuda:0
        - world_size : 1
        - random_seed : 110
        - lr : 0.0002
        - weight_decay : 0.01
        - correct_bias : True
        - adam_epislon : 1e-06
        - no_decay_bias : False
        - adam_beta1 : 0.9
        - adam_beta2 : 0.999
        - scheduler : linear
        - max_step : None
        - max_epoch : 2
        - warmup_step : 100
        - i_steps : 0
        - i_lrs : 0.00025
        - train_data : ./data/e2e/train.jsonl
        - valid_data : ./data/e2e/valid.jsonl
        - train_batch_size : 2
        - valid_batch_size : 2
        - grad_acc : 4
        - clip : 0.0
        - seq_len : 256
        - model_card : gpt2.md
        - init_checkpoint : ./pretrained_checkpoints/gpt2-medium-pytorch_model.bin
        - fp16 : False
        - log_interval : 100
        - eval_interval : 2000
        - save_interv