In [1]:
import os
import yaml
import torch
from unsloth import FastLanguageModel
from huggingface_hub import login
import argparse
from peft import PeftModel

from dotenv import load_dotenv
load_dotenv()

def load_config(config_path):
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)

config = load_config("cpt_training_config.yaml")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


ü¶• Unsloth Zoo will now patch everything to make training faster!


Load Llama Base Model

In [2]:
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
        model_name=config['model']['base_model'],
        dtype=getattr(torch, config['model']['dtype']),
        load_in_4bit=config['model']['load_in_4bit'],
        device_map="cpu"
    )

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00,  9.79it/s]


Add LORA Adapters

In [3]:
peft_model_adapters = "ShethArihant/Llama-3.1-8B-us-army-fm-base"
cpt_base_model = PeftModel.from_pretrained(base_model, peft_model_adapters)
cpt_base_model = cpt_base_model.merge_and_unload()



In [6]:
cpt_base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1

Push the final base + CPT model

In [9]:
cpt_base_model.push_to_hub(
    "ShethArihant/Llama-3.1-8B-us-army-fm-base",
    commit_message="Replaced only LoRA Adapters with full base + cpt model (Base)"
)

Processing Files (4 / 4): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16.1GB / 16.1GB,  187MB/s  
New Data Upload: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14.0GB / 14.0GB,  187MB/s  


Saved model to https://huggingface.co/ShethArihant/Llama-3.1-8B-us-army-fm-base


Now, `cpt_base_model` is the final BASE model. Instruct weights need to be added to this.

`llama_instruct_weights` = LLama-Instruct - LLama-Base

Instruct Model = `cpt_base_model` + `llama_instruct_weights`

Creating CPT-Instruct Model

In [2]:
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
        model_name=config['model']['base_model'],
        dtype=getattr(torch, config['model']['dtype']),
        load_in_4bit=config['model']['load_in_4bit'],
        device_map="cpu"
    )

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 11.13it/s]


In [3]:
instruct_model, instruct_tokenizer = FastLanguageModel.from_pretrained(
        model_name=config['model']['instruct_model'],
        dtype=getattr(torch, config['model']['dtype']),
        load_in_4bit=config['model']['load_in_4bit'],
        device_map="cpu"
    )

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 11.71it/s]


In [4]:
base_state_dict = base_model.state_dict()
instruct_state_dict = instruct_model.state_dict()
# cpt_base_state_dict = cpt_base_model.state_dict()

In [5]:
del base_model, instruct_model
# del base_model, instruct_model, cpt_base_model

In [6]:
torch.cuda.empty_cache()

In [7]:
instruction_residuals = {}
residual_count = 0
skipped_count = 0

for key in base_state_dict.keys():
    if key in instruct_state_dict:
        # Check if shapes match before computing residual
        if base_state_dict[key].shape == instruct_state_dict[key].shape:
            instruction_residuals[key] = instruct_state_dict[key] - base_state_dict[key]
            residual_count += 1
        else:
            print(f"  ‚ö†Ô∏è  Skipping {key}: shape mismatch "
                    f"(base: {base_state_dict[key].shape} vs instruct: {instruct_state_dict[key].shape})")
            skipped_count += 1
    else:
        print(f"  ‚ö†Ô∏è  Key {key} not found in instruct model")

print(f"\n  ‚úÖ Computed {residual_count} instruction residuals")
if skipped_count > 0:
    print(f"  ‚ö†Ô∏è  Skipped {skipped_count} layers due to shape mismatches")


  ‚úÖ Computed 291 instruction residuals


In [8]:
del base_state_dict, instruct_state_dict

In [15]:
torch.save(instruction_residuals, "instruction_residuals.pt")

Final steps to add `instruction_residuals` to `cpt_base_model`

In [2]:
cpt_base_model_name = "ShethArihant/Llama-3.1-8B-us-army-fm-base"
cpt_base_model, cpt_base_tokenizer = FastLanguageModel.from_pretrained(
        model_name=cpt_base_model_name,
        dtype=getattr(torch, config['model']['dtype']),
        load_in_4bit=config['model']['load_in_4bit'],
        device_map="cuda"
    )

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [01:16<00:00, 19.02s/it]
Unsloth 2025.11.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
cpt_base_model = FastLanguageModel.for_inference(cpt_base_model)
cpt_base_model = cpt_base_model.merge_and_unload()

In [4]:
cpt_state_dict = cpt_base_model.state_dict()
cpt_state_dict.keys()

odict_keys(['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.mlp.gate_proj.weight', 'mod

In [7]:
del cpt_base_model

In [8]:
del cpt_base_tokenizer

In [5]:
import gc
gc.collect()

48

In [6]:
# Source - https://stackoverflow.com/a
# Posted by cshelly, modified by community. See post 'Timeline' for change history
# Retrieved 2025-11-22, License - CC BY-SA 4.0

import ctypes
libc = ctypes.CDLL("libc.so.6") # clearing cache 
libc.malloc_trim(0)

1

In [7]:
torch.cuda.empty_cache()

In [8]:
instruction_residuals = torch.load("instruction_residuals.pt")

In [9]:
cpt_state_dict.keys()

odict_keys(['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.mlp.gate_proj.weight', 'mod

In [10]:
instruction_residuals.keys()

dict_keys(['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.mlp.gate_proj.weight', 'mode

In [11]:
key = "model.embed_tokens.weight"

In [21]:
(cpt_state_dict[key].cpu() + instruction_residuals[key]).to("cuda")

tensor([[ 1.6327e-03,  5.6152e-03, -2.9297e-03,  ...,  4.4556e-03,
         -2.9144e-03, -3.2616e-04],
        [-3.8605e-03,  3.5858e-04, -1.5717e-03,  ...,  2.5177e-03,
         -1.1749e-03, -1.0529e-03],
        [ 1.3351e-03, -1.6968e-02,  3.1281e-03,  ...,  2.9907e-03,
          9.5215e-03,  4.8828e-03],
        ...,
        [ 2.2127e-23,  3.9033e-24,  2.1610e-23,  ...,  6.3693e-23,
         -2.6496e-24, -2.3575e-23],
        [ 2.2851e-23, -2.2101e-24, -2.2230e-23,  ...,  2.7917e-23,
          8.6854e-24, -3.7016e-23],
        [-8.8508e-23, -7.5687e-23,  6.4882e-24,  ...,  5.8937e-24,
         -6.4520e-23, -2.7142e-24]], device='cuda:0', dtype=torch.bfloat16)

In [12]:
torch.cuda.empty_cache()
gc.collect()
applied_count = 0
skipped_cpt_count = 0

for key in cpt_state_dict.keys():
    if key in instruction_residuals:
        # Double-check shape compatibility with CPT model
        if cpt_state_dict[key].shape == instruction_residuals[key].shape:
            cpt_state_dict[key] = (cpt_state_dict[key].cpu() + instruction_residuals[key]).to("cuda")
            torch.cuda.empty_cache()
            gc.collect()
            applied_count += 1
        else:
            print(f"  ‚ö†Ô∏è  Skipping {key}: shape mismatch with CPT model "
                    f"(CPT: {cpt_state_dict[key].shape} vs residual: {instruction_residuals[key].shape})")
            skipped_cpt_count += 1

print(f"\n  ‚úÖ Applied {applied_count} instruction residuals")
if skipped_cpt_count > 0:
    print(f"  ‚ö†Ô∏è  Skipped {skipped_cpt_count} layers due to shape mismatches with CPT model")

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 22.07 GiB of which 8.44 MiB is free. Including non-PyTorch memory, this process has 22.05 GiB memory in use. Of the allocated memory 21.71 GiB is allocated by PyTorch, and 27.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
cpt_base_model.load_state_dict(cpt_state_dict)