In [1]:
from huggingface_hub import login
import os
import wandb

from dotenv import load_dotenv
load_dotenv()
hf_token = os.getenv("HUGGINGFACE_TOKEN")
wandb_api_key = os.getenv("WANDB_API_KEY")
if hf_token:
    login(token=hf_token)
    print("Successfully logged in to Hugging Face Hub!")
else:
    print("Hugging Face token not found in .env file.")
if wandb_api_key:
    wandb.login(key=wandb_api_key)
    print("✅ Successfully logged in to WandB!")
else:
    print("❌ WandB API key not found in .env file.")

run = wandb.init(
    project='Fine-tune-Llama3.2 on Text Translation', 
    job_type="training", 
    anonymous="allow"
)


  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/aman/.netrc


Successfully logged in to Hugging Face Hub!


[34m[1mwandb[0m: Currently logged in as: [33mitsaman9868[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


✅ Successfully logged in to WandB!


In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 
dtype = None 
load_in_4bit = True 
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", 
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.8: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 Ti SUPER. Max memory: 15.693 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",  
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
    use_rslora = False,  
    loftq_config = None, 
)

Unsloth 2025.1.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [4]:
from datasets import load_dataset
dataset=load_dataset("Amanaccessassist/Hienglish_Translation",split="train")
dataset

Generating train split: 100%|██████████| 151281/151281 [00:00<00:00, 2276348.56 examples/s]
Generating validation split: 100%|██████████| 37821/37821 [00:00<00:00, 3691623.92 examples/s]


Dataset({
    features: ['id', 'translation'],
    num_rows: 151281
})

In [5]:
dataset['translation'][0]

{'input': 'casserole ke liye timer start kare',
 'target': 'start timer for the casserole'}

# Check Sample Prompt

In [9]:
eng_sentence = dataset['translation'][2].get("input")
hindi_sentence = dataset['translation'][2].get("target")
print(eng_sentence)
print(hindi_sentence)

instruction = """You are a top-rated customer service agent name Llama . 
Be polite to customers and answer all their questions.
Your task is to translate the given Hinglish sentence into proper English.
"""

def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": row["input"]},
        {"role": "assistant", "content": row["target"]}
    ]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Test with sample row
formatted_row = format_chat_template({'input': eng_sentence, 'target': hindi_sentence})

# Print formatted chat template
print("\nFormatted Chat Template:")
print(formatted_row["text"])

Mujhe kal oil changed karne ke liye yaad dilaye
Remind me to to get the oil changed tomorrow .

Formatted Chat Template:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 02 Feb 2025

You are a top-rated customer service agent name Llama . 
Be polite to customers and answer all their questions.
Your task is to translate the given Hinglish sentence into proper English.<|eot_id|><|start_header_id|>user<|end_header_id|>

Mujhe kal oil changed karne ke liye yaad dilaye<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Remind me to to get the oil changed tomorrow .<|eot_id|>


# Apply Template on Datset

In [19]:
EOS_TOKEN=tokenizer.eos_token
instruction="""You are a professional translator. 
Your task is to accurately translate the given Hinglish sentence into proper English."""
def final_chat_template(batch):
    batch_texts = []
    
    for row in batch["translation"]:  
        row_json = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": row["input"]},  
            {"role": "assistant", "content": row["target"]}
        ]
        formatted_text = tokenizer.apply_chat_template(row_json, tokenize=False) + EOS_TOKEN
        batch_texts.append(formatted_text)
    
    batch["text"] = batch_texts  
    return batch

dataset = dataset.map(
    final_chat_template,
    batched=True, 
)
dataset['text'][3]

Map: 100%|██████████| 151281/151281 [00:02<00:00, 52800.26 examples/s]


'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 02 Feb 2025\n\nYou are a professional translator. \nYour task is to accurately translate the given Hinglish sentence into proper English.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\ntraffic ke sath meri ghar ke liye drive kitni lambi hogi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nhow long will my drive home be with traffic<|eot_id|><|eot_id|>'

# Model Training

In [20]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, 
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb", 
    ),
)


trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)
trainer.train()

Map (num_proc=2): 100%|██████████| 151281/151281 [00:07<00:00, 21138.02 examples/s]
Map: 100%|██████████| 151281/151281 [00:03<00:00, 40605.38 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 151,281 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.9537
2,2.6756
3,3.7794
4,3.2119
5,2.2133
6,1.8348
7,1.3142
8,1.1434
9,1.6293
10,1.3112


TrainOutput(global_step=60, training_loss=0.9921244790156682, metrics={'train_runtime': 36.9142, 'train_samples_per_second': 13.003, 'train_steps_per_second': 1.625, 'total_flos': 736034440372224.0, 'train_loss': 0.9921244790156682, 'epoch': 0.00317288243148557})

# GPU Memory STats

In [21]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(
    torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3
)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = NVIDIA GeForce RTX 4070 Ti SUPER. Max memory = 15.693 GB.
2.838 GB of memory reserved.


In [23]:
model.push_to_hub_merged(
    "Amanaccessassist/Llama3.2HiEnglish-Back-Translation",
    tokenizer,
    save_method="merged_16bit",
    token=hf_token,
)

Unsloth: You are pushing to hub, but you passed your HF username = Amanaccessassist.
We shall truncate Amanaccessassist/Llama3.2HiEnglish-Back-Translation to Llama3.2HiEnglish-Back-Translation


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 58.77 out of 94.06 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 77.60it/s]


Unsloth: Saving tokenizer...

tokenizer.json: 100%|██████████| 17.2M/17.2M [00:01<00:00, 10.5MB/s]


 Done.


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

Done.
Saved merged model to https://huggingface.co/Amanaccessassist/Llama3.2HiEnglish-Back-Translation


In [None]:
from transformers import pipeline
pipe = pipeline("text-generation", model="Amanaccessassist/Llama3.2HiEnglish-Back-Translation",torch_dtype=torch.float16)
def generate_response(text):
    messages = [
        {
            "role": "user",
            "content": f"Translate the given Hinglish sentence into proper English.\nInput: {text}",
        },
    ]
    
    result = pipe(messages[0]['content'], max_length=100, num_return_sequences=1)
    
    return result[0]['generated_text'][-1]["content"].strip()

text = "casserole ke liye timer start kare"
response = generate_response(text)
print(response)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.31it/s]
Device set to use cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.47 GiB. GPU 0 has a total capacity of 15.69 GiB of which 437.56 MiB is free. Including non-PyTorch memory, this process has 14.79 GiB memory in use. Of the allocated memory 14.44 GiB is allocated by PyTorch, and 93.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)