<a href="https://colab.research.google.com/github/Aaryao9/meditranslate/blob/main/Meditranslate_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# FULL SINGLE-CELL LoRA FINE-TUNING (EN → NE MEDICAL)
# ============================================================

# ---------- 0. INSTALL / RESET ENV ----------
!pip uninstall -y transformers peft accelerate sentence-transformers -q
!pip install transformers==4.41.2 peft==0.10.0 accelerate==0.29.3 datasets -q

# ---------- 1. IMPORTS ----------
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType
from shutil import make_archive

# ---------- 2. CONFIG ----------
# Identifiers handed to `from_pretrained` / `load_dataset` further down:
# the base translation checkpoint and the EN/NE medical parallel corpus.
MODEL_NAME = "rujengelal/my_awesome_english_to_nepali_tst"
DATASET_NAME = "Bibek-Poudel/ENG_NEP_MED_PARALLEL"

# Token budgets used when truncating source and target sentences.
MAX_SOURCE_LENGTH = MAX_TARGET_LENGTH = 128

# Trainer output, the saved adapter and the tokenizer files all go here.
# Create the folder up front; this is a no-op when it already exists.
OUTPUT_DIR = "./eng-nep-med-lora"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- 3. LOAD DATASET ----------
# Fetch the parallel corpus and print its split layout for a quick sanity check.
dataset = load_dataset(DATASET_NAME)
print(dataset)

# ---------- 4. TOKENIZER & MODEL ----------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Model-config overrides applied before training. These are ordinary model
# settings, not LoRA hyper-parameters:
#   * decoder start / pad / eos ids are aligned with the tokenizer;
#   * the generation cache is switched off for the training run (the original
#     script marks this as required for its LoRA setup).
# NOTE(review): forcing decoder_start_token_id to the pad id replaces whatever
# value the checkpoint shipped with — confirm it matches how the base model
# was trained.
_config_overrides = (
    ("decoder_start_token_id", tokenizer.pad_token_id),
    ("eos_token_id", tokenizer.eos_token_id),
    ("pad_token_id", tokenizer.pad_token_id),
    ("use_cache", False),
)
for _attr, _value in _config_overrides:
    setattr(model.config, _attr, _value)

# ---------- 5. PREPROCESS ----------
def preprocess_function(batch):
    """Tokenize a batch of English/Nepali sentence pairs for seq2seq training.

    Args:
        batch: Mapping with parallel lists under the keys ``"en"`` (source
            sentences) and ``"ne"`` (target sentences), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the source sentences with an additional
        ``"labels"`` entry holding the target token ids. Sequences are
        truncated to ``MAX_SOURCE_LENGTH`` / ``MAX_TARGET_LENGTH`` and are
        intentionally left unpadded.
    """
    # No padding at this stage: the DataCollatorForSeq2Seq used by the trainer
    # pads each batch to its own longest example and fills label padding with
    # -100 (ignored by the loss). Padding every row to the maximum length up
    # front only adds wasted compute and storage, and makes a manual
    # pad -> -100 rewrite of the labels necessary.
    model_inputs = tokenizer(
        batch["en"],
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        text_target=batch["ne"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches. The raw text columns are dropped so that
# only the fields produced by preprocess_function remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

# ---------- 6. APPLY LoRA ----------
# Module names that should receive adapters. The list mixes two naming
# schemes (query/key/value and *_proj) — presumably so it matches whichever
# attention-projection names the checkpoint uses; verify against the model.
adapter_targets = [
    "query", "key", "value", "k_proj", "v_proj", "q_proj", "out_proj",
]

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=adapter_targets,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Wrap the base model with the adapters and report how many parameters
# will actually be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ---------- 7. TRAINING ARGS ----------
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    # 4 samples x 4 accumulation steps = 16 samples per optimizer step
    # on each device.
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # The trainer is given a validation split, but the evaluation strategy
    # defaults to "no", so it would never be scored. Evaluate once per epoch
    # to get a validation loss next to the training loss.
    # (`eval_strategy` is the argument name in the pinned transformers 4.41.x.)
    eval_strategy="epoch",
    save_steps=500,
    save_total_limit=3,
    logging_steps=100,
    report_to="none",
    remove_unused_columns=False,
)

# ---------- 8. DATA COLLATOR ----------
# Batches examples for the trainer; it is given the model as well as the
# tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ---------- 9. TRAIN ----------
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

# Run the fine-tuning loop.
trainer.train()

# ---------- 10. SAVE MODEL ----------
# Persist the trained model and the tokenizer files into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)
print("✅ LoRA fine-tuning finished and model saved to", OUTPUT_DIR)

# ---------- 11. ZIP MODEL FOR DOWNLOAD (COLAB/Temp Runtimes) ----------
# NOTE(review): the trainer also uses OUTPUT_DIR as its output_dir, so any
# intermediate checkpoints saved there are presumably bundled into the
# archive too — confirm whether that is intended.
make_archive(base_name="eng-nep-med-lora", format="zip", root_dir=OUTPUT_DIR)
print("✅ Model zipped as eng-nep-med-lora.zip for download")

# ---------- 12. QUICK TEST ----------
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the fine-tuned model to that device and put it in evaluation mode.
model.to(device)
model.eval()

def translate_en_to_ne(text, *, max_source_length=None, max_new_tokens=128,
                       num_beams=4):
    """Translate English text to Nepali with the fine-tuned model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: English input to translate.
        max_source_length: Token limit for the input; longer inputs are
            truncated. Defaults to ``MAX_SOURCE_LENGTH`` so inference
            truncates exactly like the training preprocessing.
        max_new_tokens: Upper bound on the number of generated tokens.
        num_beams: Beam-search width passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    if max_source_length is None:
        max_source_length = MAX_SOURCE_LENGTH

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_source_length,
    ).to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test the fine-tuned model on one sample sentence.
sample_sentence = "The patient is suffering from high blood pressure."
print(translate_en_to_ne(sample_sentence))


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m139.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m125.1 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/77.9M [00:00<?, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/58682 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7335 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7336 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ne', 'en'],
        num_rows: 58682
    })
    validation: Dataset({
        features: ['ne', 'en'],
        num_rows: 7335
    })
    test: Dataset({
        features: ['ne', 'en'],
        num_rows: 7336
    })
})


tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

Map:   0%|          | 0/58682 [00:00<?, ? examples/s]

Map:   0%|          | 0/7335 [00:00<?, ? examples/s]

Map:   0%|          | 0/7336 [00:00<?, ? examples/s]

trainable params: 2,359,296 || all params: 617,433,088 || trainable%: 0.38211363236820894


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
100,3.0778
200,1.2904
300,1.1166
400,1.1215
500,1.0263
600,0.9766
700,1.007
800,1.0314
900,0.9862
1000,0.922




✅ LoRA fine-tuning finished and model saved to ./eng-nep-med-lora
✅ Model zipped as eng-nep-med-lora.zip for download
बिरामीलाई उच्च रक्तचापको समस्या छ।
