<a href="https://colab.research.google.com/github/AKdevi99/XLmproject/blob/main/Helsinki_NLP_opus_mt_hi_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch torchvision torchaudio
!pip install transformers[torch] datasets sacrebleu accelerate bitsandbytes

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.6 MB/s[0m eta

In [1]:
!pip install sacrebleu



In [5]:
!pip install pandas



In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from datasets import load_dataset, Dataset, DatasetDict # Added Dataset
import sacrebleu
import numpy as np
import os
import pandas as pd # Added pandas

# --- NEW: Import PEFT classes ---
from peft import LoraConfig, get_peft_model, TaskType

# --- Colab Setup: Install Libraries ---
# Run this cell FIRST in your Colab notebook
# !pip install torch torchvision torchaudio
# !pip install transformers[torch] datasets sacrebleu accelerate bitsandbytes pandas
# !pip install peft # <--- NEW: Install PEFT library

# --- Cell 1: Environment Setup & Device Configuration for Colab GPU ---

print("🚀 Setting up environment for Google Colab GPU...")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("✅ CUDA (NVIDIA GPU) is available and will be used.")
else:
    device = torch.device("cpu")
    print("⚠️ No GPU backend found. Training will run on CPU, which will be significantly slower.")

print(f"Using device: {device}")

# --- Cell 2: Model and Tokenizer Loading with 4-bit Quantization and PEFT ---

model_name = "Helsinki-NLP/opus-mt-hi-en"
print(f"\nLoading base model and tokenizer: {model_name}")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)

# --- NEW: Configure and apply PEFT (LoRA) ---
print("\nConfiguring and applying PEFT (LoRA) for efficient fine-tuning...")

# For Encoder-Decoder models like MarianMT, target modules typically include query ('q') and value ('v')
# You might also include other linear layers like 'k' (key) or 'o' (output), or encoder/decoder specific layers.
# Common for T5/BART variants: q, v
lora_config = LoraConfig(
    r=16, # LoRA attention dimension (rank)
    lora_alpha=32, # Scaling factor for LoRA updates
    # --- FIX: Changed target_modules to 'q_proj' and 'v_proj' for MarianMT models ---
    target_modules=["q_proj", "v_proj"], # Modules to apply LoRA to (query and value projections)
    lora_dropout=0.05, # Dropout probability for LoRA layers
    bias="none", # Bias type (none, all, lora_only)
    task_type=TaskType.SEQ_2_SEQ_LM, # Task type for the model
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters to verify LoRA is active
model.print_trainable_parameters()


# --- Cell 3: Dataset Loading and Preparation ---

print("\nLoading custom dataset from CSV using pandas...")
csv_file_path = "my_hinglish_dataset.csv"

if not os.path.exists(csv_file_path):
    print(f"❌ ERROR: CSV file not found at {csv_file_path}.")
    print("Please upload 'my_hinglish_dataset.csv' to your Colab session storage (left sidebar -> folder icon -> upload icon).")
    exit()

try:
    df = pd.read_csv(csv_file_path)
    full_dataset = Dataset.from_pandas(df)
except Exception as e:
    print(f"❌ Error loading CSV with pandas: {e}")
    exit()

split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

raw_datasets = DatasetDict({
    'train': split_dataset['train'].rename_columns({"hi_en": "Hinglish", "en": "English"}),
    'validation': split_dataset['test'].rename_columns({"hi_en": "Hinglish", "en": "English"})
})

print(f"Dataset structure: {raw_datasets}")
print(f"An example from the training set:\n{raw_datasets['train'][0]}")

# --- Cell 4: Preprocessing the Dataset ---

print("\nPreprocessing and tokenizing dataset...")
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = [ex for ex in examples["Hinglish"]]
    targets = [ex for ex in examples["English"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# --- Cell 5: Data Collator and Training Arguments ---

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results-hinglish-translator-colab-gpu",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs-colab-gpu",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    report_to="tensorboard",
    fp16=False, # Still False for safety unless specifically needed and tested
    bf16=True if torch.cuda.is_bf16_supported() else False, # Enable bfloat16 if GPU supports it
    seed=42,
)

# --- Cell 6: Metrics Function ---

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.corpus_bleu(decoded_preds, decoded_labels)
    return {"bleu": result.score}

# --- Cell 7: Trainer Initialization and Training ---

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n🔥 Initializing Trainer and starting fine-tuning...")
try:
    trainer.train()
    print("\n✅ Training complete!")
except Exception as e:
    print(f"\n❌ An error occurred during training: {e}")
    print("If you encountered an out-of-memory error, try reducing `per_device_train_batch_size`.")

# --- Cell 8: Saving and Inference ---

final_model_path = "./hinglish_translator_final_model_colab_gpu"
print(f"\n💾 Saving the fine-tuned model to {final_model_path}...")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print("Model and tokenizer saved successfully!")

print("\n--- Inference Test ---")

def translate(text):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_sentences = [
    "Hello, kya haal hai?",
    "Main office jaa raha hoon, wahan milte hain.",
    "Aapki madad ke liye dhanyawaad.",
    "Party shuru ho gayi hai."
]

for sentence in test_sentences:
    translated = translate(sentence)
    print(f"Hinglish: {sentence}")
    print(f"English:  {translated}\n")

🚀 Setting up environment for Google Colab GPU...
✅ CUDA (NVIDIA GPU) is available and will be used.
Using device: cuda

Loading base model and tokenizer: Helsinki-NLP/opus-mt-hi-en

Configuring and applying PEFT (LoRA) for efficient fine-tuning...
trainable params: 589,824 || all params: 76,549,632 || trainable%: 0.7705

Loading custom dataset from CSV using pandas...
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['English', 'Hinglish'],
        num_rows: 6447
    })
    validation: Dataset({
        features: ['English', 'Hinglish'],
        num_rows: 1612
    })
})
An example from the training set:
{'English': 'Yeah, though I was really disappointed with Sulley for cheating.', 'Hinglish': 'haan, sulley ka cheating se kuch disappoint hein'}

Preprocessing and tokenizing dataset...


Map:   0%|          | 0/6447 [00:00<?, ? examples/s]

Map:   0%|          | 0/1612 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



🔥 Initializing Trainer and starting fine-tuning...


Epoch,Training Loss,Validation Loss



❌ An error occurred during training: CUDA out of memory. Tried to allocate 6.02 GiB. GPU 0 has a total capacity of 14.74 GiB of which 5.86 GiB is free. Process 36612 has 8.88 GiB memory in use. Of the allocated memory 6.25 GiB is allocated by PyTorch, and 2.50 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
If you encountered an out-of-memory error, try reducing `per_device_train_batch_size`.

💾 Saving the fine-tuned model to ./hinglish_translator_final_model_colab_gpu...
Model and tokenizer saved successfully!

--- Inference Test ---
Hinglish: Hello, kya haal hai?
English:  Hello, is it?

Hinglish: Main office jaa raha hoon, wahan milte hain.
English:  I don't know how to do it, so I don't want it.

Hinglish: Aapki madad ke liye dhanyawaad.
English:

In [4]:
#second version to batch size reduced
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from datasets import load_dataset, Dataset, DatasetDict
import sacrebleu
import numpy as np
import os
import pandas as pd

from peft import LoraConfig, get_peft_model, TaskType

# --- Colab Setup: Install Libraries ---
# Run this cell FIRST in your Colab notebook
# !pip install torch torchvision torchaudio
# !pip install transformers[torch] datasets sacrebleu accelerate bitsandbytes pandas
# !pip install peft

# --- Cell 1: Environment Setup & Device Configuration for Colab GPU ---

print("🚀 Setting up environment for Google Colab GPU...")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("✅ CUDA (NVIDIA GPU) is available and will be used.")
else:
    device = torch.device("cpu")
    print("⚠️ No GPU backend found. Training will run on CPU, which will be significantly slower.")

print(f"Using device: {device}")

# --- Cell 2: Model and Tokenizer Loading with 4-bit Quantization and PEFT ---

model_name = "Helsinki-NLP/opus-mt-hi-en"
print(f"\nLoading base model and tokenizer: {model_name}")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)

print("\nConfiguring and applying PEFT (LoRA) for efficient fine-tuning...")

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()


# --- Cell 3: Dataset Loading and Preparation ---

print("\nLoading custom dataset from CSV using pandas...")
csv_file_path = "my_hinglish_dataset.csv"

if not os.path.exists(csv_file_path):
    print(f"❌ ERROR: CSV file not found at {csv_file_path}.")
    print("Please upload 'my_hinglish_dataset.csv' to your Colab session storage (left sidebar -> folder icon -> upload icon).")
    exit()

try:
    df = pd.read_csv(csv_file_path)
    full_dataset = Dataset.from_pandas(df)
except Exception as e:
    print(f"❌ Error loading CSV with pandas: {e}")
    exit()

split_dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

raw_datasets = DatasetDict({
    'train': split_dataset['train'].rename_columns({"hi_en": "Hinglish", "en": "English"}),
    'validation': split_dataset['test'].rename_columns({"hi_en": "Hinglish", "en": "English"})
})

print(f"Dataset structure: {raw_datasets}")
print(f"An example from the training set:\n{raw_datasets['train'][0]}")

# --- Cell 4: Preprocessing the Dataset ---

print("\nPreprocessing and tokenizing dataset...")
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = [ex for ex in examples["Hinglish"]]
    targets = [ex for ex in examples["English"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# --- Cell 5: Data Collator and Training Arguments ---

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results-hinglish-translator-colab-gpu",
    num_train_epochs=3,
    per_device_train_batch_size=4, # <--- REDUCED BATCH SIZE
    per_device_eval_batch_size=4, # <--- REDUCED BATCH SIZE
    gradient_accumulation_steps=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs-colab-gpu",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    report_to="tensorboard",
    fp16=False,
    bf16=True if torch.cuda.is_bf16_supported() else False,
    seed=42,
)

# --- Cell 6: Metrics Function ---

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.corpus_bleu(decoded_preds, decoded_labels)
    return {"bleu": result.score}

# --- Cell 7: Trainer Initialization and Training ---

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer, # <--- UPDATED: Used processing_class instead of tokenizer
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("\n🔥 Initializing Trainer and starting fine-tuning...")
try:
    trainer.train()
    print("\n✅ Training complete!")
except Exception as e:
    print(f"\n❌ An error occurred during training: {e}")
    print("If you encountered an out-of-memory error, try reducing `per_device_train_batch_size` or increasing `gradient_accumulation_steps`.")

# --- Cell 8: Saving and Inference ---

final_model_path = "./hinglish_translator_final_model_colab_gpu"
print(f"\n💾 Saving the fine-tuned model to {final_model_path}...")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print("Model and tokenizer saved successfully!")

print("\n--- Inference Test ---")

def translate(text):
    inputs = tokenizer(text, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_sentences = [
    "Hello, kya haal hai?",
    "Main office jaa raha hoon, wahan milte hain.",
    "Aapki madad ke liye dhanyawaad.",
    "Party shuru ho gayi hai."
]

for sentence in test_sentences:
    translated = translate(sentence)
    print(f"Hinglish: {sentence}")
    print(f"English:  {translated}\n")


🚀 Setting up environment for Google Colab GPU...
✅ CUDA (NVIDIA GPU) is available and will be used.
Using device: cuda

Loading base model and tokenizer: Helsinki-NLP/opus-mt-hi-en





Configuring and applying PEFT (LoRA) for efficient fine-tuning...
trainable params: 589,824 || all params: 76,549,632 || trainable%: 0.7705

Loading custom dataset from CSV using pandas...
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['English', 'Hinglish'],
        num_rows: 6447
    })
    validation: Dataset({
        features: ['English', 'Hinglish'],
        num_rows: 1612
    })
})
An example from the training set:
{'English': 'Yeah, though I was really disappointed with Sulley for cheating.', 'Hinglish': 'haan, sulley ka cheating se kuch disappoint hein'}

Preprocessing and tokenizing dataset...


Map:   0%|          | 0/6447 [00:00<?, ? examples/s]

Map:   0%|          | 0/1612 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



🔥 Initializing Trainer and starting fine-tuning...


Epoch,Training Loss,Validation Loss



❌ An error occurred during training: CUDA out of memory. Tried to allocate 5.92 GiB. GPU 0 has a total capacity of 14.74 GiB of which 5.89 GiB is free. Process 36612 has 8.85 GiB memory in use. Of the allocated memory 6.26 GiB is allocated by PyTorch, and 2.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
If you encountered an out-of-memory error, try reducing `per_device_train_batch_size` or increasing `gradient_accumulation_steps`.

💾 Saving the fine-tuned model to ./hinglish_translator_final_model_colab_gpu...
Model and tokenizer saved successfully!

--- Inference Test ---
Hinglish: Hello, kya haal hai?
English:  Hello, is it?

Hinglish: Main office jaa raha hoon, wahan milte hain.
English:  I don't know how to do it, it's a good idea.

Hinglish