# Continual Pretraining of Llama 3.2 1B

In [None]:
from pprint import pprint
import math
import wandb
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
import os
import multiprocess
from transformers import TrainingArguments, Trainer

import datasets
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from peft import LoraConfig, TaskType, LoraModel
import numpy as np
import pandas as pd
import torch
import random
from peft import get_peft_model


  from pandas.core import (


In [2]:
wandb.init(
  project='DLP-W4-CPT-Node-1',
  config={
    "batch_size":4,
    "dataset": "Sangraha",
  },
)

[34m[1mwandb[0m: Currently logged in as: [33m21f2000143[0m ([33m21f2000143-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


## 3. Set Random Seeds for Reproducibility

In [3]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(42)

In [4]:
train_ds = load_dataset('csv', data_files='competition_train.csv')
val_ds = load_dataset('csv', data_files='competition_val.csv')
test_ds = load_dataset('csv', data_files='competition_test.csv')

# Login to Hugging Face in Your Script/Notebook

In [5]:
load_dotenv()  # Automatically loads .env file from current directory

True

In [None]:
load_dotenv()  # Automatically loads .env file from current directory
login(token=os.getenv("HUGGINGFACE_HUB_TOKEN"))

### Create Prompt Templates for Different Languages

We'll create language-specific prompt templates to help the model understand the task.

In [7]:
def create_prompt(example, add_instruction=True):
    language = example['language']
    text = example['Sentence']
    emotion = example['emotion']
    
    # Create language-specific prompts
    if language == 'Santali':
        lang_desc = "Santali language text in OI Chiki script"
    elif language == 'Kashmiri':
        lang_desc = "Kashmiri language text in Arabic script"
    else:  # Manipuri
        lang_desc = "Manipuri language text in Meitei Mayek script"
    
    # First format
    if add_instruction:
        prompt = f"""Classify the emotion expressed in the following {lang_desc}. 
Choose from: fear, happy, surprise, sad, anger, disgust.

Text: {text}

Emotion: {emotion}"""
    else: # Second format
        prompt = f"""The following is {lang_desc}: {text}

The emotion expressed in this text is: {emotion}"""
    
    return prompt

# Create a few examples to see the prompts
example = train_ds['train'][0]
print(create_prompt(example))
print("\nAlternative prompt format:")
print(create_prompt(example, add_instruction=False))

Classify the emotion expressed in the following Kashmiri language text in Arabic script. 
Choose from: fear, happy, surprise, sad, anger, disgust.

Text: رُک، بہٕ چھس صرف یہ یقینی بناونٕچ کوٗشش کران ز مےٚ چھ فراہم کرنہٕ آمتیٚن خدماتن خٲطرٕ معقول قۭمت میلان۔

Emotion: disgust

Alternative prompt format:
The following is Kashmiri language text in Arabic script: رُک، بہٕ چھس صرف یہ یقینی بناونٕچ کوٗشش کران ز مےٚ چھ فراہم کرنہٕ آمتیٚن خدماتن خٲطرٕ معقول قۭمت میلان۔

The emotion expressed in this text is: disgust


# Set the token as environment variable

In [8]:
model_id = "google/gemma-3-1b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(f'Vocab size: {tokenizer.vocab_size}')
print(f'Context length: {tokenizer.model_max_length}')

Vocab size: 262144
Context length: 1000000000000000019884624838656


In [9]:
tokenizer.model_max_length = 1024
tokenizer.pad_token = tokenizer.eos_token

## Finding the Fertility score of the tokenizer

In [10]:
example = train_ds['train'][1]
num_words = len(example['Sentence'].split())
num_words

23

In [11]:
input_ids = tokenizer.encode(example['Sentence'])
len(input_ids)

66

In [12]:
print(f'Fertility score of the model is: {len(input_ids)/num_words}')

Fertility score of the model is: 2.869565217391304


In [13]:
def tokenize_function(examples):
    # Handle batch processing - create a list of prompts
    prompts = []
    
    # If this is a batch, process each example in the batch
    if isinstance(examples['language'], list):
        for i in range(len(examples['language'])):
            example = {
                'language': examples['language'][i],
                'Sentence': examples['Sentence'][i],
                'emotion': examples['emotion'][i]
            }
            prompts.append(create_prompt(example))
    else:
        # Handle single example case
        prompts = create_prompt(examples)
    
    tokenized = tokenizer(
        prompts,
        padding=False,
        truncation=True
    )
    
    return tokenized

In [14]:
num_cores = multiprocess.cpu_count()
print(f"Available CPU cores for multiprocessing: {num_cores}")

Available CPU cores for multiprocessing: 24


In [15]:
train_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'Sentence', 'language', 'emotion'],
        num_rows: 7176
    })
})

In [16]:
# Using num_proc=1 to avoid multiprocessing issues with the tokenizer
# tokenized_ds = ds.map(tokenize, batched=True, num_proc=1, remove_columns=['doc_id', 'text', 'type'])
# print(tokenized_ds)

# Tokenize datasets
tokenized_datasets = train_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=['id', 'Sentence', 'language', 'emotion']
)

print("Tokenized datasets:")
for split, dataset in tokenized_datasets.items():
    print(f"{split}: {dataset}")



Map:   0%|          | 0/7176 [00:00<?, ? examples/s]

Tokenized datasets:
train: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7176
})


In [17]:
tokenized_datasets.save_to_disk('nppe1')

Saving the dataset (0/1 shards):   0%|          | 0/7176 [00:00<?, ? examples/s]

# Using all cores of CPU

In [18]:
# Save the tokenizer to disk first (add this cell before your tokenize function)
tokenizer_save_path = "./nppe1_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)

('./nppe1_tokenizer/tokenizer_config.json',
 './nppe1_tokenizer/special_tokens_map.json',
 './nppe1_tokenizer/chat_template.jinja',
 './nppe1_tokenizer/tokenizer.model',
 './nppe1_tokenizer/added_tokens.json',
 './nppe1_tokenizer/tokenizer.json')

In [19]:
ds_chunked = load_from_disk('nppe1')
ds_chunked

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 7176
    })
})

In [20]:
# Check the features of the dataset
print("Dataset features:", ds_chunked['train'].features)
print("First example feature shapes:")
for key, value in ds_chunked['train'][0].items():
    if hasattr(value, "__len__"):
        print(f"  {key}: length = {len(value)}")
    else:
        print(f"  {key}: {value}")

Dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
First example feature shapes:
  input_ids: length = 91
  attention_mask: length = 91


In [21]:
# Re-tokenize the dataset with the improved tokenize_function
print("Re-tokenizing the dataset...")
fresh_tokenized_datasets = train_ds.map(
    tokenize_function,
    batched=True,
    remove_columns=['id', 'Sentence', 'language', 'emotion']
)

# Save the re-tokenized dataset
fresh_tokenized_datasets.save_to_disk('nppe1_fresh')

# Load the fresh dataset
ds_fresh = load_from_disk('nppe1_fresh')
print("Fresh dataset loaded:", ds_fresh)

# Compare original and fresh datasets
print("\nOriginal dataset sample:")
print(ds_chunked['train'][0])
print("\nFresh dataset sample:")
print(ds_fresh['train'][0])

Re-tokenizing the dataset...


Map:   0%|          | 0/7176 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7176 [00:00<?, ? examples/s]

Fresh dataset loaded: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 7176
    })
})

Original dataset sample:
{'input_ids': [2, 4335, 1891, 506, 21543, 8977, 528, 506, 2269, 201893, 5192, 1816, 528, 37369, 8948, 236761, 236743, 107, 24852, 699, 236787, 9891, 236764, 5293, 236764, 14089, 236764, 11019, 236764, 25046, 236764, 56107, 236761, 108, 2067, 236787, 2272, 237880, 236927, 237108, 21248, 251588, 40331, 236914, 47087, 11084, 174286, 31051, 2511, 251588, 237343, 5414, 248916, 68976, 4246, 1436, 5306, 995, 236980, 455, 392, 40331, 108721, 4246, 97207, 251588, 37596, 8458, 455, 392, 236872, 90912, 236872, 3268, 455, 416, 53661, 251588, 5951, 22563, 3417, 457, 411, 21497, 6142, 57248, 237407, 108, 221037, 236787, 56107], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [22]:
# ds_split = ds_chunked.train_test_split(test_size=0.001, seed=42)
# print(ds_split)

In [23]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Loading the model

In [24]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    pad_token_id=tokenizer.pad_token_id,
    device_map="auto",
    torch_dtype="auto"
)

In [25]:
print(next(model.parameters()).device)

cuda:0


In [26]:
configuration = model.config
configuration

Gemma3TextConfig {
  "architectures": [
    "Gemma3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": null,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    106
  ],
  "final_logit_softcapping": null,
  "head_dim": 256,
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 1152,
  "initializer_range": 0.02,
  "intermediate_size": 6912,
  "layer_types": [
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "s

In [27]:
num_parameters=model.num_parameters()

In [28]:
mem_in_gb = num_parameters*4/1e9
print(mem_in_gb)

3.999543808


In [29]:
print(model.get_memory_footprint()/1e9)

1.99977293


## Estimate of Memory size required

In [30]:
param_model = num_parameters*4/1e9
adam_opt = 3*param_model # for storing moments
kernel = 1
bs = 1 # batch size
print(f'Total Memory requirement per sample: {(param_model+adam_opt+kernel)*bs} GB')

Total Memory requirement per sample: 16.998175232 GB


In [31]:
# prompt = "I was reading Feynman's lecture on physics. He talks about "
# inputs = tokenizer(prompt, return_tensors='pt', padding=True)
# outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, top_k=10, top_p=0.95)
# tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [32]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,490,944 || all params: 1,001,376,896 || trainable%: 0.1489


In [33]:
ds_chunked['train'][0]
print("\nDataset keys:", list(ds_chunked['train'][0].keys()))
print("\nInput IDs shape:", len(ds_chunked['train'][0]['input_ids']))


Dataset keys: ['input_ids', 'attention_mask']

Input IDs shape: 91


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Calculate macro F1 score
    f1_macro = f1_score(labels, predictions, average='macro')
    
    # Calculate individual F1 scores for each class
    f1_per_class = f1_score(labels, predictions, average=None)
    f1_dict = {emotion: score for emotion, score in zip(label_mapping.keys(), f1_per_class)}
    
    # Create detailed classification report
    report = classification_report(labels, predictions, target_names=list(label_mapping.keys()), output_dict=True)
    
    metrics = {
        'f1_macro': f1_macro,
        **{f'f1_{emotion}': score for emotion, score in f1_dict.items()},
        'accuracy': (predictions == labels).mean()
    }
    
    return metrics

In [34]:
# Use the fresh dataset and proper configuration
training_args = TrainingArguments(
    output_dir="./gemma-emotion-lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
    # Added for better stability
    fp16=True,  # Use mixed precision training if available
    optim="adamw_torch",  # Use AdamW optimizer
)

# Create a new trainer with all proper components
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_chunked['train'],  # Use the fresh dataset
    tokenizer=tokenizer,
    data_collator=data_collator,  # Include the data_collator
)

# Start training
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
10,4.1242
20,3.8775
30,2.9567
40,2.2526
50,1.8418
60,1.9695
70,1.6996
80,1.8004
90,1.7526
100,1.5161


TrainOutput(global_step=2691, training_loss=1.3002480524971067, metrics={'train_runtime': 959.8328, 'train_samples_per_second': 22.429, 'train_steps_per_second': 2.804, 'total_flos': 1.7958991076181504e+16, 'train_loss': 1.3002480524971067, 'epoch': 3.0})

In [35]:
model.save_pretrained("./lora-gemma-emotion")
tokenizer.save_pretrained("./lora-gemma-emotion")

('./lora-gemma-emotion/tokenizer_config.json',
 './lora-gemma-emotion/special_tokens_map.json',
 './lora-gemma-emotion/chat_template.jinja',
 './lora-gemma-emotion/tokenizer.model',
 './lora-gemma-emotion/added_tokens.json',
 './lora-gemma-emotion/tokenizer.json')

In [36]:
from peft import PeftModel

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
model = PeftModel.from_pretrained(model, "./lora-gemma-emotion")