# Continual Pretraining of Llama 3.2 1B

In [1]:
from pprint import pprint
import math
import wandb

import datasets
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start a Weights & Biases run for this experiment. `config` is metadata attached
# to the run; no later cell in this notebook reads it back.
# NOTE(review): "batch_size": 4 does not match per_device_train_batch_size=2 in the
# TrainingArguments cell, and that cell passes report_to="none", which should stop
# the Trainer from logging to this run — confirm which settings are intended.
wandb.init(
  project='DLP-W4-CPT-Node-1',
  config={
    "batch_size":4,
    "dataset": "Sangraha",
  },
)

[34m[1mwandb[0m: Currently logged in as: [33m21f2000143[0m ([33m21f2000143-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Load a single parquet shard of the verified Tamil ("tam") subset of Sangraha by URL
# and take its 'train' split.
ds = load_dataset(
    'ai4bharat/sangraha',
    data_files='https://huggingface.co/datasets/ai4bharat/sangraha/resolve/main/verified/tam/data-0.parquet',
)['train']

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 149796 examples [00:06, 22677.78 examples/s]


In [4]:
ds

Dataset({
    features: ['doc_id', 'text', 'type'],
    num_rows: 149796
})

In [7]:
ds[0]

{'doc_id': 'd8467560df293124156db767026a1e930b96341d',
 'text': 'புதுச்சேரி-நேருக்கு நேர் விவாதம் செய்தால் மீசை அல்ல, மொட்டையே அடித்துக் கொள்ள நேரிடும் என முன்னாள் அமைச்சருக்கு, கல்யாணசுந்தரம் எம். எல். ஏ. , சவால் விடுத்துள்ளார்.\n அவரது அறிக்கைஃ\nகாலாப்பட்டில் கடந்த 9ம் தேதி நடந்த காங். , நிர்வாகிகள் ஆலோசனைக் கூட்டத்தில் பேசிய முன்னாள் அமைச்சர் ஷாஜகான், அவர் கொண்டு வந்த திட்டங்களை நான் கொண்டு வந்ததாக கூறி மக்களை ஏமாற்றி வருவதாக தெரிவித்துள்ளார்.\n மேலும், என்னுடன் நேருக்கு நேர் விவாதிக்க போவதாகவும், அவ்வாறு இல்லை என நிருபித்தால், ஒருபக்க மீசையை எடுத்துக் கொள்வதாக சவால் விட்டுள்ளார்.\n அவரது சவால், எனது மக்கள் நலத்திட்ட பணிகளை மேலும் மேம்படுத்திக் கொள்ள வாய்ப்பாக கருதுகிறேன்.\n எந்த திட்டத்தை யார் கொண்டு வந்தது என்பது காலாப்பட்டு தொகுதி மக்களுக்கு நன்றாகவே தெரியும், நான், அவருடன் நேருக்கு நேர் விவாதம் செய்தால் அவ் மீசையை மட்டும் அல்ல மொட்டை அடித்துக் கொள்ள நேரிடும்.\n பிள்ளைச்சாவடி கடல் அரிப்பு சம்பந்தமாக அவர் பேசியதற்கு, நான் கடந்த டிசம்பர் 13ம் தேதி நேருக்கு நேர் பொது மேடையில் விவாதி

# Log in to Hugging Face from the notebook

In [9]:
from transformers import AutoTokenizer

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()  # Automatically loads .env file from current directory

In [3]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token from the
# HUGGINGFACE_HUB_TOKEN environment variable (presumably supplied by the .env file
# loaded in the previous cell — confirm). os.getenv returns None when it is unset.
login(token=os.getenv("HUGGINGFACE_HUB_TOKEN"))

  from .autonotebook import tqdm as notebook_tqdm


# Set the token as environment variable

In [16]:
# Checkpoint id shared by the tokenizer here and the model loaded further down.
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Show the tokenizer's vocabulary size and its default maximum sequence length.
print(
    f'Vocab size: {tokenizer.vocab_size}',
    f'Context length: {tokenizer.model_max_length}',
    sep='\n',
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Vocab size: 128000
Context length: 131072


In [26]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Cap the tokenizer's maximum length at 1024 tokens (the default printed for this
# checkpoint is 131072); the tokenize functions below rely on it via truncation=True.
tokenizer.model_max_length = 1024

## Finding the Fertility score of the tokenizer

In [18]:
# Count whitespace-separated words in one document; this is the denominator of the
# tokens-per-word (fertility) figure computed in the following cells.
example = ds[1]
num_words = len(str.split(example['text']))
num_words

47

In [21]:
# Token ids for the same document; the cell displays how many there are.
# NOTE(review): encode() presumably adds special tokens by default, so the count
# may include a BOS token — confirm if an exact fertility figure matters.
input_ids = tokenizer.encode(example['text'])
len(input_ids)

521

In [22]:
# Fertility = tokens per whitespace-separated word, estimated from this one document.
print('Fertility score of the model is: {}'.format(len(input_ids)/num_words))

Fertility score of the model is: 11.085106382978724


In [34]:
def tokenize(example):
  """Tokenize the ``text`` field of ``example`` with the notebook-level ``tokenizer``.

  Sequences are truncated (``truncation=True``) and no padding is added.
  Returns whatever the tokenizer call returns.
  """
  encoded = tokenizer(example['text'], truncation=True, padding=False)
  return encoded

In [33]:
# Count the CPU cores reported by `multiprocess`; `num_cores` is later passed as
# `num_proc` to the parallel `ds.map` call.
import multiprocess

num_cores = multiprocess.cpu_count()
print(f"Available CPU cores for multiprocessing: {num_cores}")

Available CPU cores for multiprocessing: 8


In [39]:
# Single-process tokenization pass: run `tokenize` over batches and drop the raw
# columns so only the tokenizer outputs remain. num_proc=1 avoids the
# multiprocessing issues encountered with the tokenizer.
tokenized_ds = ds.map(
    tokenize,
    batched=True,
    num_proc=1,
    remove_columns=['doc_id', 'text', 'type'],
)
print(tokenized_ds)

Map: 100%|██████████| 149796/149796 [13:16<00:00, 188.13 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 149796
})





In [40]:
# Persist the tokenized dataset to the 'tamil_ds' directory; a later cell reloads it
# with load_from_disk('tamil_ds').
tokenized_ds.save_to_disk('tamil_ds')

Saving the dataset (2/2 shards): 100%|██████████| 149796/149796 [00:01<00:00, 130921.75 examples/s]


# Tokenizing with all CPU cores

In [41]:
# Write the tokenizer files to ./saved_tokenizer so that tokenize_with_worker_init
# can load the tokenizer from local disk inside each worker process.
tokenizer_save_path = "./saved_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)

('./saved_tokenizer\\tokenizer_config.json',
 './saved_tokenizer\\special_tokens_map.json',
 './saved_tokenizer\\tokenizer.json')

In [50]:
# Tokenize a batch inside a worker process, loading the tokenizer from disk at most
# once per process instead of once per batch.
def tokenize_with_worker_init(examples):
    """Tokenize ``examples["text"]`` with a tokenizer owned by the current process.

    The tokenizer is loaded from ``./saved_tokenizer`` (local files only) the first
    time this function runs in a process and is then cached in the module-level name
    ``_WORKER_TOKENIZER``. Previously it was reloaded on every call, i.e. once per
    batch handed over by ``Dataset.map``.

    Args:
        examples: mapping with a ``"text"`` entry (a batch of strings when used with
            ``batched=True``).

    Returns:
        The tokenizer output for ``examples["text"]``, truncated to 1024 tokens and
        without padding.

    Note:
        If this function is called in the notebook process itself, the cached
        tokenizer lives in the notebook namespace from then on.
    """
    global _WORKER_TOKENIZER
    try:
        worker_tokenizer = _WORKER_TOKENIZER
    except NameError:
        # First call in this process: load and configure the tokenizer, then cache it.
        from transformers import AutoTokenizer
        worker_tokenizer = AutoTokenizer.from_pretrained(
            './saved_tokenizer',
            local_files_only=True)
        worker_tokenizer.model_max_length = 1024
        worker_tokenizer.pad_token = worker_tokenizer.eos_token
        _WORKER_TOKENIZER = worker_tokenizer

    return worker_tokenizer(
        examples["text"],
        truncation=True,
        padding=False
    )

In [51]:
# Parallel tokenization: every worker process runs tokenize_with_worker_init on
# batches of `batch_size` rows. Lower `batch_size` if RAM is tight.
batch_size = 1000

tokenized_ds = ds.map(
    tokenize_with_worker_init,
    remove_columns=['doc_id', 'text', 'type'],
    num_proc=num_cores,  # one worker per available core
    batch_size=batch_size,
    batched=True,
)

print(tokenized_ds)

Map (num_proc=8): 100%|██████████| 149796/149796 [04:00<00:00, 623.17 examples/s] 


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 149796
})


In [52]:
# Reload the tokenized dataset from the 'tamil_ds' directory written by the earlier
# save_to_disk cell.
# NOTE(review): that directory holds the output of the single-process run; the
# multi-core `tokenized_ds` above is never saved. Despite the name `ds_chunked`,
# no chunking step is visible in this notebook — documents are only truncated.
ds_chunked = load_from_disk('tamil_ds')
ds_chunked

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 149796
})

In [53]:
# Hold out 0.1% of the rows as an evaluation set; the fixed seed keeps the split
# reproducible across runs.
ds_split = ds_chunked.train_test_split(seed=42, test_size=0.001)
print(ds_split)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 149646
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 150
    })
})


In [55]:
from transformers import DataCollatorForLanguageModeling

In [56]:
# Collator for causal language modeling (mlm=False); it batches and pads the
# variable-length tokenized examples using `tokenizer`.
# NOTE(review): pad_token was set to eos_token above, so padding positions and real
# EOS tokens share an id — confirm how the collator's label masking treats them.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Loading the model

In [59]:
from transformers import AutoModelForCausalLM

In [60]:
# Load the base causal LM and register EOS as its padding token id, mirroring the
# tokenizer's pad_token setting.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    pad_token_id=tokenizer.eos_token_id,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [61]:
configuration = model.config
configuration

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": 128001,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.52.4",
  "use_cache": true,
  "vocab_size": 128256
}

In [62]:
print(model.dtype)

torch.float32


In [66]:
# Total parameter count of the loaded model; reused by the memory estimates below.
num_parameters=model.num_parameters()

In [67]:
# Weight memory in GB at 4 bytes per parameter (the model reports torch.float32).
bytes_per_param = 4
mem_in_gb = num_parameters*bytes_per_param/1e9
print(mem_in_gb)

4.9432576


In [68]:
print(model.get_memory_footprint()/1e9)

4.943257728


## Estimating the memory required for training

In [71]:
# Rough memory estimate for training every parameter in fp32 (4 bytes each), in GB.
# NOTE(review): the LoRA setup later in this notebook trains only ~1.7M parameters,
# so gradient/optimizer memory for that run should be far smaller than this figure.
param_model = num_parameters*4/1e9  # model weights
adam_opt = 3*param_model # 3x the weights: presumably gradients + Adam's two moment buffers — TODO confirm
kernel = 1  # flat 1 GB allowance; NOTE(review): looks like framework/kernel overhead — confirm
bs = 1 # batch size
# NOTE(review): the whole sum is multiplied by `bs`, but weights and optimizer state
# do not grow with batch size, and activation memory is not modelled here. The result
# is only meaningful for bs = 1.
print(f'Total Memory requirement per sample: {(param_model+adam_opt+kernel)*bs} GB')

Total Memory requirement per sample: 20.7730304 GB


In [72]:
# Sanity check before any adaptation: sample an English continuation from the model.
prompt = "I was reading Feynman's lecture on physics. He talks about "
inputs = tokenizer(prompt, padding=True, return_tensors='pt')
outputs = model.generate(
    **inputs,
    do_sample=True,   # stochastic sampling, so the text differs between runs
    top_k=10,
    top_p=0.95,
    max_new_tokens=50,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


["I was reading Feynman's lecture on physics. He talks about 3 different types of particles. I am not sure if he is talking about quarks, electrons, and photons. The first type he talks about is the electron. He says that when a particle is created it has a momentum and a charge. He"]

In [73]:
prompt = "நான் ஃபைன்மானின் இயற்பியல் சொற்பொழிவை படித்தேன். அவர் பற்றி பேசுகிறார் "
# Same sampling settings as the English check, applied to the Tamil prompt above.
inputs = tokenizer(prompt, padding=True, return_tensors='pt')
outputs = model.generate(
    **inputs,
    do_sample=True,   # stochastic sampling, so the text differs between runs
    top_k=10,
    top_p=0.95,
    max_new_tokens=50,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['நான் ஃபைன்மானின் இயற்பியல் சொற்பொழிவை படித்தேன். அவர் பற்றி பேசுகிறார் 1 கணக்கை அறிய அவரது இயற்பியல் சொற்பொ�']

In [75]:
from peft import LoraConfig, TaskType, LoraModel

In [76]:
# LoRA adapter settings for causal-LM training: rank-16 adapters on the attention
# query and value projections only, scaling alpha 32, 5% adapter dropout.
lora_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  inference_mode=False,
  target_modules=['q_proj', 'v_proj'],
  r=16,
  lora_alpha=32,
  lora_dropout=0.05,
)

In [77]:
# Wrap the base model with the LoRA adapters described by `lora_config`, then print
# how many parameters are trainable versus the total.
from peft import get_peft_model
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [78]:
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128001)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (

In [80]:
pprint(lora_model.peft_config)

{'default': LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
                       peft_type=<PeftType.LORA: 'LORA'>,
                       auto_mapping=None,
                       base_model_name_or_path='meta-llama/Llama-3.2-1B',
                       revision=None,
                       inference_mode=False,
                       r=16,
                       target_modules={'v_proj', 'q_proj'},
                       exclude_modules=None,
                       lora_alpha=32,
                       lora_dropout=0.05,
                       fan_in_fan_out=False,
                       bias='none',
                       use_rslora=False,
                       modules_to_save=None,
                       init_lora_weights=True,
                       layers_to_transform=None,
                       layers_pattern=None,
                       rank_pattern={},
                       alpha_pattern={},
                       megatron_config=None,
                       megat

In [83]:
# Trainer configuration, grouped by concern.
# NOTE(review): report_to="none" is set although wandb.init() was called at the top of
# the notebook, and the batch size here (2) differs from the 4 logged there — confirm.
training_args = TrainingArguments(
  # output and checkpointing
  output_dir="lora_llma_1b_ct",
  save_steps=100,
  save_total_limit=20,
  # schedule and batch sizes
  num_train_epochs=1,
  per_device_train_batch_size=2,
  per_device_eval_batch_size=2,
  gradient_accumulation_steps=1,
  # optimizer
  learning_rate=2e-5,
  weight_decay=0.01,
  adam_beta1=0.9,
  adam_beta2=0.999,
  # numeric precision
  fp16=True,
  bf16=False,
  tf32=False,
  # evaluation
  eval_strategy="steps",
  eval_steps=100,
  # logging
  logging_dir='logs',
  logging_strategy='steps',
  logging_steps=100,
  report_to="none",
)

In [84]:
# Assemble the Trainer: the LoRA-wrapped model, the arguments above, the train/test
# splits, and the causal-LM collator.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test'],
)

In [None]:
results = trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 