In [1]:
!pip install transformers datasets sentencepiece
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install bitsandbytes



Looking in indexes: https://download.pytorch.org/whl/cu121


In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments

!ls -lh final_data/en-te/train.*


-rw-r--r-- 1 pammiabhishikth pammiabhishikth 257M Apr 14  2021 final_data/en-te/train.en
-rw-r--r-- 1 pammiabhishikth pammiabhishikth 716M Apr 14  2021 final_data/en-te/train.te


In [3]:
with open("final_data/en-te/train.en", "r", encoding="utf-8") as f:
    en_lines = f.readlines()

with open("final_data/en-te/train.te", "r", encoding="utf-8") as f:
    te_lines = f.readlines()

print(f"{len(en_lines)} sentence pairs loaded")
print("Example:", en_lines[0].strip(), "→", te_lines[0].strip())

print("en lines:", len(en_lines))
print("te lines:", len(te_lines))


4841862 sentence pairs loaded
Example: Rise again. → మళ్లీ ఉదయిస్తాడు.
en lines: 4841862
te lines: 4841862


In [4]:
with open("train.en", "r", encoding="utf-8") as f_en, open("train.te", "r", encoding="utf-8") as f_te, open("train.en-te.tsv", "w", encoding="utf-8") as f_out:
    for en, te in zip(f_en, f_te):
        f_out.write(f"en\t{en.strip()}\tte\t{te.strip()}\n")


In [None]:
import torch
print(torch.__version__)
print("Torch:", torch.__file__)

import transformers
print("Transformers:", transformers.__file__)


In [5]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

model_name = "ai4bharat/indictrans2-en-indic-dist-200M"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16  # Use float16 for reduced memory usage
).to("cuda")  # Move model to GPU if available


In [8]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,         # Smallest to avoid OOM
    gradient_accumulation_steps=1,         # Lowered from 4 to reduce memory
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",
    bf16=False,
    fp16=False,
    report_to="none",
    optim="adamw_torch_fused",  # Slightly faster and may use less memory
    learning_rate = 2e-6,
    max_grad_norm=1.0,

)


In [None]:
!rm -rf ~/.cache/huggingface/datasets
!rm -rf ~/.cache/huggingface/transformers
!df -h



In [6]:
from datasets import load_dataset

# Load your TSV file
raw_dataset = load_dataset(
    "csv",
    data_files={"train": "train.en-te.tsv"},
    delimiter="\t",
    column_names=["lang1", "en", "lang2", "te"]
)

#  Shuffle and take a random 100,000-sample subset
raw_dataset["train"] = raw_dataset["train"].shuffle(seed=42).select(range(15000))

#  Tokenization using text_target (not deprecated)
def preprocess_function(examples):
    inputs = examples["en"]
    targets = examples["te"]
    
    #  Shortened max_length for speed and memory efficiency
    model_inputs = tokenizer(inputs, max_length=32, truncation=True)
    labels = tokenizer(text_target=targets, max_length=32, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

#  In-memory tokenization to avoid disk overhead
tokenized_dataset = raw_dataset["train"].map(
    preprocess_function,
    batched=True,
    keep_in_memory=True
)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer

# Enable memory-saving mode
model.gradient_checkpointing_enable()

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
  trainer = Seq2SeqTrainer(


In [23]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32,expandable_segments:True"



In [10]:
import torch
# clear any leftover memory

trainer.train()


  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,10.3996
20,6.5309
30,5.5992
40,4.4951
50,4.0408
60,3.7758
70,3.9926
80,4.06
90,4.2031
100,4.2738


TrainOutput(global_step=15000, training_loss=2.7326329427083333, metrics={'train_runtime': 3132.964, 'train_samples_per_second': 4.788, 'train_steps_per_second': 4.788, 'total_flos': 147700009476096.0, 'train_loss': 2.7326329427083333, 'epoch': 1.0})

In [11]:
model_dir = "./indictrans2-en-te-finetuned"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)


('./indictrans2-en-te-finetuned/tokenizer_config.json',
 './indictrans2-en-te-finetuned/special_tokens_map.json',
 './indictrans2-en-te-finetuned/dict.SRC.json',
 './indictrans2-en-te-finetuned/dict.TGT.json',
 './indictrans2-en-te-finetuned/model.SRC',
 './indictrans2-en-te-finetuned/model.TGT',
 './indictrans2-en-te-finetuned/added_tokens.json')

In [12]:
import sys
sys.path.append("./indictrans2-en-te-finetuned")

from tokenization_indictrans import IndicTransTokenizer
from transformers import AutoModelForSeq2SeqLM

# Load tokenizer
tokenizer = IndicTransTokenizer(
    src_vocab_fp="./indictrans2-en-te-finetuned/dict.SRC.json",
    tgt_vocab_fp="./indictrans2-en-te-finetuned/dict.TGT.json",
    src_spm_fp="./indictrans2-en-te-finetuned/model.SRC",
    tgt_spm_fp="./indictrans2-en-te-finetuned/model.TGT",
    do_lower_case=True,
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
    unk_token="<unk>"
)

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained("./indictrans2-en-te-finetuned")


The repository for ./indictrans2-en-te-finetuned contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/./indictrans2-en-te-finetuned.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


In [14]:
# Load full dataset again for eval selection
full_eval_data = load_dataset(
    "csv",
    data_files={"train": "train.en-te.tsv"},
    delimiter="\t",
    column_names=["lang1", "en", "lang2", "te"]
)

# Shuffle and select 2,100 rows for evaluation
eval_dataset_raw = full_eval_data["train"].shuffle(seed=42).select(range(3000))


In [15]:
tokenized_eval_dataset = eval_dataset_raw.map(
    preprocess_function,
    batched=True,
    keep_in_memory=True
)


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [16]:
tokenized_eval_dataset.save_to_disk("tokenized_eval_dataset_en_te")

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

In [17]:
eval_results = trainer.evaluate(eval_dataset=tokenized_eval_dataset)
print(eval_results)


{'eval_loss': 2.462890625, 'eval_runtime': 12.767, 'eval_samples_per_second': 234.982, 'eval_steps_per_second': 29.373, 'epoch': 1.0}


In [18]:
!pip install evaluate




In [None]:
!wget https://huggingface.co/ai4bharat/indictrans2-en-indic-dist-200M/resolve/main/tokenization_indictrans.py -P ./en-te-finetuned


In [24]:
import evaluate
import torch

# Load BLEU metric
metric = evaluate.load("sacrebleu")

# Use a small subset (only 10 to test stability first)
subset = tokenized_eval_dataset.select(range(10))

predictions_output = trainer.predict(subset)

# Extract predictions and labels correctlym 
pred_token_ids = predictions_output.predictions  # <- this is what you want
label_token_ids = predictions_output.label_ids


In [19]:
import numpy as np
import evaluate

def compute_bleu_score(trainer, tokenizer, dataset, num_samples=10):
    subset = dataset.select(range(num_samples))
    output = trainer.predict(subset)

    # Convert logits to token IDs
    logits = output.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    pred_token_ids = np.argmax(logits, axis=-1)

    label_token_ids = output.label_ids

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(pred_token_ids, skip_special_tokens=True)

    # Replace -100 with pad_token_id
    label_token_ids = [
        [(l if l != -100 else tokenizer.pad_token_id) for l in label]
        for label in label_token_ids
    ]
    decoded_labels = tokenizer.batch_decode(label_token_ids, skip_special_tokens=True)

    # Compute BLEU score
    metric = evaluate.load("sacrebleu")
    result = metric.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )

    print(f" BLEU score: {result['score']:.2f}")
    return result['score']


In [24]:
bleu = compute_bleu_score(trainer, tokenizer, tokenized_eval_dataset, num_samples=120)


✅ BLEU score: 7.74


In [29]:
for seq in pred_token_ids:
    for tok in seq:
        print(type(tok), tok, getattr(tok, 'shape', None))
    break


<class 'numpy.ndarray'> [-1.73  -1.745  5.902 ... -1.742 -1.766 -1.721] (122672,)
<class 'numpy.ndarray'> [-0.6733 -0.665   6.69   ... -0.664  -0.6807 -0.672 ] (122672,)
<class 'numpy.ndarray'> [-0.471  -0.467   6.1    ... -0.4663 -0.512  -0.468 ] (122672,)
<class 'numpy.ndarray'> [-0.9946 -0.9907  6.81   ... -0.9897 -1.021  -0.9917] (122672,)
<class 'numpy.ndarray'> [-0.849  -0.847   7.484  ... -0.8457 -0.907  -0.8457] (122672,)
<class 'numpy.ndarray'> [-1.584 -1.583  8.1   ... -1.581 -1.617 -1.582] (122672,)
<class 'numpy.ndarray'> [-0.6694 -0.66    7.66   ... -0.6597 -0.6777 -0.6685] (122672,)
<class 'numpy.ndarray'> [-0.6147 -0.613   7.406  ... -0.6113 -0.6533 -0.612 ] (122672,)
<class 'numpy.ndarray'> [-1.254 -1.251  7.71  ... -1.249 -1.28  -1.251] (122672,)
<class 'numpy.ndarray'> [-0.9346 -0.9326  7.996  ... -0.931  -0.9937 -0.931 ] (122672,)
<class 'numpy.ndarray'> [-1.591 -1.589  8.734 ... -1.588 -1.626 -1.588] (122672,)
<class 'numpy.ndarray'> [-0.6616 -0.6523  8.12   ... -0.

In [28]:
# If predictions is a tuple (e.g., logits), get the actual token IDs
if isinstance(pred_token_ids, tuple):
    pred_token_ids = pred_token_ids[0]

# Squeeze extra dimensions if needed (e.g., shape (10, 1, 128))
pred_token_ids = np.array(pred_token_ids)
if len(pred_token_ids.shape) == 3 and pred_token_ids.shape[1] == 1:
    pred_token_ids = np.squeeze(pred_token_ids, axis=1)

# Convert to list of ints
pred_token_ids = [
    [int(tok.item()) if hasattr(tok, 'item') else int(tok) for tok in seq]
    for seq in pred_token_ids
]

# ✅ Decode predictions
decoded_preds = tokenizer.batch_decode(pred_token_ids, skip_special_tokens=True)

# Replace -100 with pad_token_id in labels
label_token_ids = [
    [(l if l != -100 else tokenizer.pad_token_id) for l in label]
    for label in label_token_ids
]

# ✅ Decode labels
decoded_labels = tokenizer.batch_decode(label_token_ids, skip_special_tokens=True)

# ✅ Compute BLEU
result = metric.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
print(f"✅ BLEU score: {result['score']:.2f}")

ValueError: can only convert an array of size 1 to a Python scalar

In [None]:
import os
from pathlib import Path

def list_dir_info(directory):
    print(f"\n📁 Contents of: {directory}\n")
    total_size = 0
    for file in sorted(Path(directory).iterdir()):
        if file.is_file():
            size_mb = round(file.stat().st_size / (1024 * 1024), 2)
            total_size += size_mb
            print(f"📄 {file.name:30} | {size_mb:>6} MB | Type: {file.suffix}")
        elif file.is_dir():
            print(f"📂 {file.name}/ (folder)")
    print(f"\n📦 Total size of all files: {round(total_size, 2)} MB\n")

# Example usage
list_dir_info("./indictrans2-en-te-finetuned")


In [None]:
model_dir = "./en-te-finetuned"

# Save model weights and config properly
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Optional but useful: save training args if using Trainer
trainer.args.save(model_dir)


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

sentence = "This is a sample sentence."
inputs = tokenizer(sentence, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
!pip install --upgrade transformers
!pip install "accelerate>=0.26.0"


In [None]:
!du -h ./indictrans2-en-te-finetuned/config.json


In [None]:
import json

with open("./indictrans2-en-te-finetuned/config.json", "r") as f:
    config = json.load(f)

print(json.dumps(config, indent=2))
