## 1. Download the model weights (from Hugging Face)

In [1]:
from huggingface_hub import hf_hub_download
# Download the pretrained checkpoint (about 2.3 GB according to the progress
# bar) from the Hugging Face Hub. `local_dir="./"` places it under the current
# working directory, keeping the `parameters_300m/` sub-folder from `filename`,
# which is the relative path the model-loading cell later passes to torch.load.
hf_hub_download(repo_id="Aananda-giri/LLAMA3-Nepali", filename="parameters_300m/model_pg_398000_steps.pth", local_dir="./")


# downloaded to `parameters_300m/model_pg_398000_steps.pth`
# Sanity check that the file exists at the expected relative path
# (`ls` is a shell escape, so this line only works on Unix-like systems).
!ls parameters_300m/model_pg_398000_steps.pth

model_pg_398000_steps.pth:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

parameters_300m/model_pg_398000_steps.pth


## 2. Load the tokenizer

In [2]:
from transformers import PreTrainedTokenizerFast

# Load the tokenizer published alongside the model in the Hugging Face repo.
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
# Save a local copy: the model-loading cell builds its own `Tokenizer` wrapper
# from the file "NepaliBPE/tokenizer.json" written by this call.
# Keep this call as the last expression so the saved file paths are displayed.
tokenizer.save_pretrained("NepaliBPE")

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.98M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

('NepaliBPE/tokenizer_config.json',
 'NepaliBPE/special_tokens_map.json',
 'NepaliBPE/tokenizer.json')

In [3]:
# Fetch the vocabulary (token -> id mapping) once and reuse it, instead of
# asking the tokenizer to produce the whole dict twice just to print a preview.
vocab = tokenizer.get_vocab()
print(f"{type(vocab)}\n{list(vocab.items())[:15]}")

<class 'dict'>
[('पारिजात</w>', 49045), ('हठा', 37114), ('तनाव', 18239), ('झापाका</w>', 11209), ('लालपूर्जा</w>', 31517), ('हलिया</w>', 28308), ('खोजिरहेका</w>', 16874), ('राती', 37179), ('तिब्ब', 7846), ('दिल्ली</w>', 6131), ('गरिएसँगै</w>', 39805), ('बताइन्छ</w>', 8934), ('अन्यको</w>', 20782), ('शासकको</w>', 39011), ('कृष्णले</w>', 22608)]


## 3. Download `previous_chapters.py`

In [4]:
import requests

# Fetch the helper module that later cells import from
# (`from previous_chapters import ...`).
res = requests.get(
    r"https://raw.githubusercontent.com/Aananda-giri/LLAMA3-Nepali/main/4.%20inference/2_inference/previous_chapters.py",
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail loudly on 4xx/5xx responses. Without this check an error page such as
# "404: Not Found" would be written to disk as if it were the Python module,
# and the failure would only surface later as a confusing import error.
res.raise_for_status()

# Write with an explicit encoding so the result does not depend on the
# platform's default text encoding.
with open('previous_chapters.py', 'w', encoding='utf-8') as f:
    f.write(res.text)

In [7]:
!pip install datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/487.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m327.7/487.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━

## 4. Load the model weights

In [9]:
import torch

from previous_chapters import (
    Llama3Model,
    ChatFormat,
    Tokenizer,
    generate_and_print_sample
)

# Wrap the tokenizer file saved earlier ("NepaliBPE/tokenizer.json") with the
# helper classes from previous_chapters.py.
_tokenizer = Tokenizer("NepaliBPE/tokenizer.json")
chat_tokenizer = ChatFormat(_tokenizer)

print(f'---------------------\nDEBUG MODE=False (300M model)\n---------------------')
# Llama 3.2 architecture scaled down to ~300M parameters.
# The first number in each trailing comment is the value the original author
# recorded for the full-size Llama 3.2 model.
# These values have to match the downloaded checkpoint; do not tune them here.
LLAMA32_CONFIG = {
    "vocab_size": 50006,       # 128_256 originally; author's note: len(tokenizer.tokenizer) == 50006
    "context_length": 512,      # 131_072 originally; unrelated to model size, but a higher context length consumes more RAM
    "emb_dim": 1320,            # 2048 originally; embedding dimension
    "n_heads": 20,              # 32 originally; number of attention heads
    "n_layers": 10,             # 16 originally; number of layers
    "hidden_dim": 5280,         # 8192 originally; size of the intermediate dimension in FeedForward
    "n_kv_groups": 5,           # 8 originally; key-value groups for grouped-query attention
    "rope_base": 500_000.0,     # base in RoPE's "theta"; overwritten further down by rescale_theta(...)
    "dtype": torch.bfloat16,    # lower-precision dtype to reduce memory usage
    "rope_freq": {              # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}

# Context lengths used below to rescale the RoPE base.
old_context_length = 131_072    # original context length of the Llama 3.2 model
new_context_length = LLAMA32_CONFIG["context_length"]  # 512, the reduced context length used here

def rescale_theta(theta_old, context_length_old, context_length_new):
    """Linearly rescale a RoPE base ("theta") for a different context length.

    Args:
        theta_old: RoPE base used with the old context length.
        context_length_old: Context length that `theta_old` belongs to.
        context_length_new: Context length the model will actually use.

    Returns:
        `theta_old` multiplied by `context_length_new / context_length_old`.
    """
    # The ratio is computed first and then applied, i.e. plain linear scaling.
    return theta_old * (context_length_new / context_length_old)

# Shrink the RoPE base in proportion to the reduced context length
# (500_000 * 512 / 131_072 = 1953.125). The config dict is updated in place,
# before the model is constructed from it.
LLAMA32_CONFIG["rope_base"] = rescale_theta(
    LLAMA32_CONFIG["rope_base"],
    old_context_length,
    new_context_length
)

print("New RoPE theta (i.e. LLAMA32_CONFIG[\"rope_base\"]):", LLAMA32_CONFIG["rope_base"])

# Build the (still randomly initialised) model; the trained weights are loaded
# from the checkpoint at the end of this cell.
model = Llama3Model(LLAMA32_CONFIG)
# TODO: decide whether to compile at all (compiling is reported to speed up inference).
# Compile the model. The condition is hard-coded to True, so this always runs.
# NOTE(review): torch.compile wraps the module, and the wrapper's state-dict
# keys presumably carry an `_orig_mod.` prefix. The checkpoint loads cleanly
# into the compiled model below, so it was presumably saved from a compiled
# model too; skipping compilation may therefore break key matching — verify.
if True:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model  # keep a handle on the uncompiled module
    model = torch.compile(model) # requires PyTorch 2.0

model.eval()    # eval mode


# Buffer sharing check
# --------------------
# Compare the first and last transformer block: each of their `mask`, `cos`
# and `sin` attributes should be the very same object.
print('The following is expected to print True to confirm buffers are reused instead of being (wastefully) recreated:')
first_att, last_att = model.trf_blocks[0].att, model.trf_blocks[-1].att
for buffer_name in ("mask", "cos", "sin"):
    print(getattr(first_att, buffer_name) is getattr(last_att, buffer_name))

# Parameter counts
# ----------------
total_params = sum(weight.numel() for weight in model.parameters())
print(f"Total number of parameters: {total_params:,}")
# Account for weight tying: subtract the token-embedding matrix once.
tied_embedding_params = model.tok_emb.weight.numel()
total_params_normalized = total_params - tied_embedding_params
print(f"\nTotal number of unique parameters: {total_params_normalized:,}")

# Display model_memory_size
# -----------------------------------------------------------------------
def model_memory_size(model, input_dtype=torch.float32):
    """Estimate the memory needed to hold a model, in gigabytes.

    The estimate adds up the element counts of all parameters, one gradient
    element for every parameter element that has `requires_grad` set, and
    all buffers. It assumes every one of those elements is stored in
    `input_dtype`.

    Args:
        model: Module exposing `parameters()` and `buffers()`.
        input_dtype: torch dtype whose element size is used for the estimate.

    Returns:
        The estimated size as a float, in units of 1024**3 bytes.
    """
    weights = list(model.parameters())

    # Every parameter counts once for its values ...
    n_param_elems = sum(w.numel() for w in weights)
    # ... and once more for its gradient when it is trainable.
    n_grad_elems = sum(w.numel() for w in weights if w.requires_grad)
    # Buffers are non-parameter tensors that still occupy memory.
    n_buffer_elems = sum(b.numel() for b in model.buffers())

    # Bytes per element for the requested dtype.
    bytes_per_elem = torch.tensor(0, dtype=input_dtype).element_size()
    n_bytes = (n_param_elems + n_grad_elems + n_buffer_elems) * bytes_per_elem

    # Bytes -> gigabytes.
    return n_bytes / (1024**3)

# Report the estimated footprint for two candidate storage dtypes.
print(f"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB")
print(f"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB")
# -----------------------------------------------------------------------

# Pick the best available device: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model.to(device)
print(f'device: {device}')

# Relative path of the checkpoint downloaded in section 1.
latest_model_checkpoint = "parameters_300m/model_pg_398000_steps.pth"

# SECURITY: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code embedded in the file. Only load checkpoints
# from a source you trust.
# NOTE(review): if the checkpoint holds only tensors and plain containers,
# weights_only=True should work and is safer — TODO confirm.
checkpoint = torch.load(latest_model_checkpoint, map_location=device, weights_only=False)

# Restore the trained weights stored under the "model_state_dict" key.
# Keep this as the last expression of the cell so the key-matching report is displayed.
model.load_state_dict(checkpoint["model_state_dict"])

---------------------
DEBUG MODE=False (300M model)
---------------------
New RoPE theta (i.e. LLAMA32_CONFIG["rope_base"]): 1953.125
compiling the model... (takes a ~minute)
The following is expected to print True to confirm buffers are reused instead of being (wastefully) recreated:
True
True
True
Total number of parameters: 384,691,560

Total number of unique parameters: 318,683,640
float32 (PyTorch default): 2.87 GB
bfloat16: 1.43 GB
device: cpu


<All keys matched successfully>

## 5. Generate text sample

In [18]:
# Quick smoke test: continue a short Nepali prompt with the loaded model.
# This helper is given the `Tokenizer` wrapper (`_tokenizer`) built in the
# model-loading cell, not the Hugging Face `tokenizer` object.
generate_and_print_sample(
    PROMPT="रामले भात",
    tokenizer=_tokenizer,
    chat_tokenizer=chat_tokenizer,
    model=model,
    device=device,
    context_length=LLAMA32_CONFIG["context_length"],
)

Output text:
 रामले भात खाने चलन छ ।


## 6. Generation function with more features

In [42]:
from previous_chapters import generate_chat_optimized
import time

# Time one generation call. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals; time.time() is wall-clock time and can
# jump if the system clock is adjusted while the call is running.
start_time = time.perf_counter()
# NOTE(review): this call receives the Hugging Face `tokenizer`, whereas
# generate_and_print_sample above receives the `_tokenizer` wrapper. The cell
# ran successfully as written, so this is presumably intended — verify against
# generate_chat_optimized in previous_chapters.py.
# NOTE(review): no random seed is set; if generation samples (temperature /
# top_k), the text will differ from run to run.
output_text = generate_chat_optimized(
    prompt="रामले भात",
    tokenizer=tokenizer,
    chat_tokenizer=chat_tokenizer,
    model=model,
    max_new_tokens=20,
    # Read the context window from the config instead of repeating the literal 512.
    context_size=LLAMA32_CONFIG["context_length"],
    device=device,
    temperature=0.3,
    top_k=5,
    top_p=None,
    eos_id=None,
    repetition_penalty=1.2,
    penalize_len_below=10,
    batch_size=1
)

print(f"time:{time.perf_counter() - start_time}\n output_text: {output_text}")

time:30.017128944396973
 output_text: रामले भात र अन्य खानेकुरा समेत खान पाएका छैनन् । उनी भन्छन् " " म पनि खाना बनाउन सक्छु । तर खाना
