In [1]:
import argparse
import copy
import os

import yaml
import torch
# Keep unsloth ahead of peft: unsloth patches libraries at import time.
from unsloth import FastLanguageModel
from huggingface_hub import login
from peft import PeftModel
from dotenv import load_dotenv
load_dotenv()

def load_config(config_path):
    """Read a YAML file and return its parsed contents.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The object produced by ``yaml.safe_load`` for the document
        (``None`` when the file is empty).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

config = load_config("cpt_training_config.yaml")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


Load Llama Base Model

In [2]:
# Load the base checkpoint named in the config, placing the weights on the CPU.
model_cfg = config['model']
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_cfg['base_model'],
    # The config stores the dtype as the name of an attribute on `torch`.
    dtype=getattr(torch, model_cfg['dtype']),
    load_in_4bit=model_cfg['load_in_4bit'],
    device_map="cpu",
)

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  9.79it/s]


Merge the LoRA Adapters into the Base Model

In [3]:
# Hub repo that holds the LoRA adapters to merge.
peft_model_adapters = "ShethArihant/Llama-3.1-8B-us-army-fm-base"

# PEFT injects the adapter layers into the model object it is given, and
# merge_and_unload() folds the LoRA deltas into those same weight tensors.
# Passing `base_model` directly would leave `base_model` and `cpt_base_model`
# as one and the same merged model, so a later "instruct - base" would really
# be "instruct - (base + CPT)" and cancel the CPT update. Merge into a deep
# copy instead so `base_model` keeps the original weights (this costs one
# extra copy of the model in CPU RAM).
cpt_base_model = PeftModel.from_pretrained(copy.deepcopy(base_model), peft_model_adapters)
cpt_base_model = cpt_base_model.merge_and_unload()

# Guard against the aliasing described above.
assert cpt_base_model is not base_model, "merged model must not alias base_model"



In [6]:
# Bare expression as the cell's last line: the notebook shows the model's module tree.
cpt_base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1

Push the final base + CPT model

In [9]:
# Upload the merged model's weights to the Hugging Face Hub.
hub_repo_id = "ShethArihant/Llama-3.1-8B-us-army-fm-base"
hub_commit_message = "Replaced only LoRA Adapters with full base + cpt model (Base)"
cpt_base_model.push_to_hub(hub_repo_id, commit_message=hub_commit_message)

Processing Files (4 / 4): 100%|██████████| 16.1GB / 16.1GB,  187MB/s  
New Data Upload: 100%|██████████| 14.0GB / 14.0GB,  187MB/s  


Saved model to https://huggingface.co/ShethArihant/Llama-3.1-8B-us-army-fm-base


Now, `cpt_base_model` is the final BASE model. The instruct weights still need to be added to it.

`llama_instruct_weights` = Llama-Instruct − Llama-Base (per-parameter difference)

Instruct Model = `cpt_base_model` + `llama_instruct_weights`

In [8]:
# Load the instruct checkpoint named in the config with the same dtype,
# quantization flag and CPU placement used for the base model.
model_cfg = config['model']
instruct_model, instruct_tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_cfg['instruct_model'],
    # The config stores the dtype as the name of an attribute on `torch`.
    dtype=getattr(torch, model_cfg['dtype']),
    load_in_4bit=model_cfg['load_in_4bit'],
    device_map="cpu",
)

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  8.83it/s]


In [9]:
# Capture the name -> tensor mapping of each model, in the order
# base, instruct, CPT-merged base.
base_state_dict, instruct_state_dict, cpt_base_state_dict = (
    model.state_dict() for model in (base_model, instruct_model, cpt_base_model)
)

In [21]:
# Unbind the three model objects. The state dicts taken from them earlier are
# still bound, so presumably the weight tensors themselves stay alive and only
# the module wrappers become collectable (TODO confirm).
del base_model, instruct_model, cpt_base_model

In [25]:
# Ask PyTorch's CUDA caching allocator to release unused cached GPU memory.
# NOTE(review): both models were loaded with device_map="cpu", so this likely
# frees little here — confirm whether it is still needed.
torch.cuda.empty_cache()

In [11]:
# Per-parameter "instruction residual": instruct weight minus base weight,
# computed for every base key that also exists in the instruct model with an
# identical shape. Keys that are missing or mismatched are reported and left
# out of `instruction_residuals`.
instruction_residuals = {}
residual_count = 0
skipped_count = 0
missing_count = 0

for key, base_weight in base_state_dict.items():
    if key not in instruct_state_dict:
        print(f"  ⚠️  Key {key} not found in instruct model")
        missing_count += 1
        continue

    instruct_weight = instruct_state_dict[key]
    # Element-wise subtraction is only meaningful when the shapes agree.
    if base_weight.shape != instruct_weight.shape:
        print(f"  ⚠️  Skipping {key}: shape mismatch "
              f"(base: {base_weight.shape} vs instruct: {instruct_weight.shape})")
        skipped_count += 1
        continue

    instruction_residuals[key] = instruct_weight - base_weight
    residual_count += 1

print(f"\n  ✅ Computed {residual_count} instruction residuals")
if skipped_count > 0:
    print(f"  ⚠️  Skipped {skipped_count} layers due to shape mismatches")
if missing_count > 0:
    print(f"  ⚠️  {missing_count} base keys were not found in the instruct model")