<a href="https://colab.research.google.com/github/2303A510H5/batch30/blob/main/BERT_BERT0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install rouge-score package.
# `%pip` installs into the environment of the running kernel, and the version
# is pinned to the release this notebook was last executed with.
%pip install -q rouge-score==0.1.2

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=41b06fb6459588564dbb4cf5f3d3b60ff558ccb9e01ff6e1d97a5f2f6dc81950
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
# Explicit `%pip` magic instead of a bare `pip ...` line: the bare form only
# works through IPython automagic and breaks if a variable named `pip` exists.
%pip install -q rouge-score==0.1.2



In [None]:
# Install necessary packages if you haven't already (uncomment to run)
# !pip install -q transformers datasets sacrebleu nltk rouge-score

import os
os.environ["WANDB_DISABLED"] = "true"  # disable wandb
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    MT5ForConditionalGeneration,
    MT5TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
# Tokenizer data used by nltk.word_tokenize in the evaluation loop below.
# `punkt_tab` is fetched as well (the second experiment in this notebook does
# the same): without it word_tokenize raises LookupError on newer NLTK
# releases and no BLEU score can be computed.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# ---------- Settings ----------
# Everything a reader is likely to tune for the mT5 experiment lives here.
DATA_PATH = '/content/FinalDS.xlsx'            # Excel workbook with the parallel corpus
SOURCE_COL, TARGET_COL = 'TELUGU', 'ENGLISH'   # column names inside that workbook
MODEL_NAME = 'google/mt5-small'                # checkpoint to fine-tune
MAX_LENGTH = 128                               # token cap for tokenization and generation
NUM_EPOCHS = 3
SEED = 42
# ------------------------------

# Pick the compute device once and report it.
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if _cuda_ready else 'cpu')
print(f"Device: {device}")

# Batch size follows the device: 8 on GPU (raise it if memory allows), 4 on CPU.
PER_DEVICE_BATCH = 8 if _cuda_ready else 4

print(f"Per-device batch size set to: {PER_DEVICE_BATCH}")

# Load data
print("Loading dataset...")
df = pd.read_excel(DATA_PATH)

# Drop unwanted columns if present (same as your earlier cleaning)
df = df.drop(columns=['KANNADA', 'MALAYALAM'], errors='ignore')
if SOURCE_COL not in df.columns or TARGET_COL not in df.columns:
    raise ValueError(f"Required columns not found. Available: {list(df.columns)}")

# Keep only the sentence pair and give the columns model-agnostic names.
df = df[[SOURCE_COL, TARGET_COL]].rename(columns={SOURCE_COL: 'source', TARGET_COL: 'target'})
df = df.dropna(subset=['source', 'target'])
# Cells that Excel stores as numbers/dates come back as non-string objects.
# `.str.strip()` yields NaN for those, so they would slip past the
# empty-string filters and later break Arrow conversion / tokenization.
# Casting after dropna() makes every remaining cell real text.
df = df.astype({'source': str, 'target': str})
df = df[df['source'].str.strip() != '']
df = df[df['target'].str.strip() != '']
df = df.drop_duplicates(subset=['source', 'target']).reset_index(drop=True)

print(f"Total samples after cleaning: {len(df)}")

# Train/validation split (15% held out, reproducible through SEED)
train_df, val_df = train_test_split(df, test_size=0.15, random_state=SEED)
print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}")

# Convert to Hugging Face datasets; the index is reset so it is not carried
# along as an extra column.
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = MT5TokenizerFast.from_pretrained(MODEL_NAME)
model = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

# Tokenize function (we'll use dynamic padding via DataCollatorForSeq2Seq)
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: batch mapping with a ``'source'`` and a ``'target'`` entry,
            each a list of strings (the form ``Dataset.map(batched=True)``
            passes in).

    Returns:
        The tokenizer's encoding of the source texts with an added
        ``'labels'`` entry holding the token ids of the target texts. Both
        sides are truncated to ``MAX_LENGTH`` tokens; nothing is padded here
        because padding is left to the data collator.
    """
    # `text_target=` encodes the targets in the same call and stores their ids
    # under 'labels'. It replaces the deprecated `as_target_tokenizer()`
    # context manager.
    model_inputs = tokenizer(
        examples['source'],
        text_target=examples['target'],
        max_length=MAX_LENGTH,
        truncation=True,
    )
    return model_inputs

# Map datasets (this is fast enough). The raw text columns are removed so only
# the tokenizer outputs reach the trainer.
print("Tokenizing datasets (this may take a minute)...")
train_tokenized = train_ds.map(preprocess_function, batched=True, remove_columns=train_ds.column_names)
val_tokenized = val_ds.map(preprocess_function, batched=True, remove_columns=val_ds.column_names)

# Data collator for dynamic padding (fast & memory efficient); label padding
# uses -100.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100, pad_to_multiple_of=None)

# Mixed precision: do NOT use fp16 with mT5. T5-family checkpoints are known
# to overflow under fp16, which shows up as a training loss stuck at 0.0/NaN
# and garbage generations (exactly what the recorded run of this cell shows).
# bf16 has the dynamic range the model needs, so use it on GPUs with native
# support (compute capability >= 8) and fall back to full fp32 elsewhere.
use_bf16 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
    and torch.cuda.is_bf16_supported()
)
print(f"Mixed precision: {'bf16' if use_bf16 else 'disabled (fp32)'}")

# Training args - NO generation during training to speed it up
output_dir = './mt5_telugu_en_results'
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    per_device_eval_batch_size=PER_DEVICE_BATCH,
    predict_with_generate=False,    # key speed-up: disable generate during training
    do_train=True,
    do_eval=False,                  # we will evaluate manually after training
    learning_rate=3e-4,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_total_limit=3,
    fp16=False,                     # fp16 overflows with mT5 (see note above)
    bf16=use_bf16,
    gradient_accumulation_steps=1,
    dataloader_num_workers=2,
    seed=SEED,
    remove_unused_columns=False,
    save_strategy="epoch",
    report_to="none",
)

# Create trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=None,   # no eval during training
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
print("\n=== Starting training ===")
trainer.train()
print("=== Training complete ===\n")

# Save the trained model and tokenizer side by side so the directory can be
# reloaded with from_pretrained().
print("Saving model...")
trainer.save_model('./mt5_telugu_english_model')
tokenizer.save_pretrained('./mt5_telugu_english_model')
print("Model saved to './mt5_telugu_english_model'\n")

# ---------- Manual evaluation with beam search (slower but done only once) ----------
print("Starting final evaluation on validation set with beam search (num_beams=4)...")

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
smooth = SmoothingFunction().method1

bleu_scores = []
rouge_scores = []
# Metric errors are tallied and reported after the loop instead of being
# silently discarded: a systematic failure (for example a missing NLTK
# resource) would otherwise just look like "BLEU: 0.00%".
metric_failures = {'BLEU': 0, 'ROUGE-L': 0}
first_metric_error = None

# Evaluate in batches to avoid OOM
BATCH_SIZE_EVAL = PER_DEVICE_BATCH
val_texts = val_df['source'].tolist()
val_refs = val_df['target'].tolist()

model.eval()
for i in range(0, len(val_texts), BATCH_SIZE_EVAL):
    batch_src = val_texts[i:i+BATCH_SIZE_EVAL]
    batch_refs = val_refs[i:i+BATCH_SIZE_EVAL]

    # Tokenize with padding
    inputs = tokenizer(batch_src, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH).to(device)
    # Generate with beams for better quality
    generated_tokens = model.generate(
        **inputs,
        max_length=MAX_LENGTH,
        num_beams=4,
        early_stopping=True,
        length_penalty=1.0,
        no_repeat_ngram_size=3
    )

    preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    for pred, ref in zip(preds, batch_refs):
        p = pred.strip()
        r = str(ref).strip()
        # Pairs with an empty prediction or reference are skipped, i.e. they
        # do not enter either average.
        if len(p) == 0 or len(r) == 0:
            continue
        try:
            p_tokens = nltk.word_tokenize(p.lower())
            r_tokens = nltk.word_tokenize(r.lower())
            bleu = sentence_bleu([r_tokens], p_tokens, smoothing_function=smooth)
            bleu_scores.append(bleu * 100)
        except Exception as exc:
            metric_failures['BLEU'] += 1
            if first_metric_error is None:
                first_metric_error = exc
        try:
            rouge = scorer.score(r, p)['rougeL'].fmeasure
            rouge_scores.append(rouge * 100)
        except Exception as exc:
            metric_failures['ROUGE-L'] += 1
            if first_metric_error is None:
                first_metric_error = exc

# Surface any scoring problems so a zero score is never mistaken for a result.
if first_metric_error is not None:
    print(
        f"WARNING: metric computation failed for {metric_failures['BLEU']} BLEU and "
        f"{metric_failures['ROUGE-L']} ROUGE-L pairs; first error: {first_metric_error!r}"
    )

# Final aggregated scores (mean of the per-sentence scores, 0.0 if none were computed)
final_bleu = float(np.mean(bleu_scores)) if bleu_scores else 0.0
final_rouge = float(np.mean(rouge_scores)) if rouge_scores else 0.0

print(f"\n{'='*40}")
print(f"FINAL EVAL (on validation set)")
print(f"BLEU:  {final_bleu:.2f}%")
print(f"ROUGE-L: {final_rouge:.2f}%")
print(f"{'='*40}\n")

# Show the leading validation pairs next to what the model produces for them.
print("Sample predictions (first 8 examples):\n")
for k in range(min(8, len(val_texts), len(val_refs))):
    src, ref = val_texts[k], val_refs[k]
    # One sentence at a time, so no padding is needed here.
    encoded = tokenizer(src, return_tensors='pt', truncation=True, max_length=MAX_LENGTH).to(device)
    generated = model.generate(**encoded, max_length=MAX_LENGTH, num_beams=4, early_stopping=True)
    pred = tokenizer.decode(generated[0], skip_special_tokens=True)
    print("Source:", src)
    print("Reference:", ref)
    print("Predicted:", pred)
    print("-" * 80)

print("Evaluation complete.")


Device: cuda
Per-device batch size set to: 8
Loading dataset...
Total samples after cleaning: 101440
Train samples: 86224, Val samples: 15216
Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5TokenizerFast'.


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizing datasets (this may take a minute)...


Map:   0%|          | 0/86224 [00:00<?, ? examples/s]

Map:   0%|          | 0/15216 [00:00<?, ? examples/s]


=== Starting training ===


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


=== Training complete ===

Saving model...
Model saved to './mt5_telugu_english_model'

Starting final evaluation on validation set with beam search (num_beams=4)...

FINAL EVAL (on validation set)
BLEU:  0.00%
ROUGE-L: 0.00%

Sample predictions (first 8 examples):

Source: గరిష్ట ఉష్ణోగ్రత 32 సి మరియు కనిష్ట 20 సి ఉంటుంది.
Reference: the maximum temperature will be around 32c and the minimum 20c.
Predicted: <0x03>
--------------------------------------------------------------------------------
Source: చివరకు రోజు వచ్చింది.
Reference: finally the day came.
Predicted: <0x03>
--------------------------------------------------------------------------------
Source: వారు అక్కడ సరైన మార్గంలో వెళ్ళాలి.
Reference: they have to get there the right way.
Predicted: <0x03>
--------------------------------------------------------------------------------
Source: విక్రమ్స్ కుమారుడు ధ్రువ్ తెలుగు హిట్ అర్జున్ రెడ్డి యొక్క తమిళ రీమేక్‌లో అడుగుపెట్టనున్నారు.
Reference: vikrams son dhruv is set to debut 

In [None]:
# Install necessary packages
# !pip install torch transformers pandas scikit-learn nltk rouge-score openpyxl

import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
import torch
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import warnings
warnings.filterwarnings('ignore')

# Print transformers version
import transformers
print(f"Transformers version: {transformers.__version__}")

# Download necessary NLTK files
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True) # Added this line to download the missing resource

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Experiment knobs, named so the subset size and split can be changed in one place.
MAX_SAMPLES = 5000      # number of leading rows kept after cleaning
VAL_FRACTION = 0.15     # share of the kept rows held out for validation
SPLIT_SEED = 42         # random_state of the train/validation split
max_length = 64         # token cap for tokenization and generation (reduced from 128)

# Load and clean data
print("Loading data...")
df = pd.read_excel('/content/FinalDS.xlsx')

# Drop unnecessary columns
df = df.drop(columns=['KANNADA', 'MALAYALAM'], errors='ignore')

# Define source and target columns
source_col = 'TELUGU'
target_col = 'ENGLISH'

# Ensure columns exist
if source_col not in df.columns or target_col not in df.columns:
    raise ValueError(f"Column names not found. Available columns: {list(df.columns)}")

# Clean data - remove NaN and empty strings.
# The cast to str comes after dropna(): cells Excel stores as numbers would
# otherwise turn into NaN under `.str.strip()` and slip past the empty-string
# filters.
df = df.dropna(subset=[source_col, target_col])
df = df.astype({source_col: str, target_col: str})
df = df[df[source_col].str.strip() != '']
df = df[df[target_col].str.strip() != '']
df = df.rename(columns={source_col: 'source', target_col: 'target'})

# Remove duplicates
df = df.drop_duplicates(subset=['source', 'target'])

# -------------------------------
#    USE ONLY THE FIRST MAX_SAMPLES ROWS
# -------------------------------
df = df.head(MAX_SAMPLES)
print(f"Total samples used: {len(df)} (limited to {MAX_SAMPLES})")

# Split into train and validation sets
train_df, val_df = train_test_split(df, test_size=VAL_FRACTION, random_state=SPLIT_SEED)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

# Load tokenizer - using multilingual BERT
print("\nLoading tokenizer and model...")
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Create encoder-decoder model: the same multilingual BERT checkpoint is used
# for both the encoder and the decoder.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-multilingual-cased',
    'bert-base-multilingual-cased'
)

# Configure model: special-token ids come from the BERT tokenizer
# ([CLS] starts decoding, [SEP] ends it), followed by generation defaults.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = max_length
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

# Move model to device
model = model.to(device)

# -------------------------------
#     FIXED DATASET CLASS
# -------------------------------
class TranslationDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one source/target sentence pair per lookup.

    Rows are read from a DataFrame with ``'source'`` and ``'target'`` columns.
    Each item is a dict with ``input_ids`` and ``attention_mask`` for the
    source text and ``labels`` for the target text, all padded or truncated to
    ``max_length``. Pad positions in ``labels`` are replaced with -100.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        # str() guards against non-string cell values in the frame.
        src = self._encode(str(record['source']))
        tgt = self._encode(str(record['target']))

        # Drop the leading batch axis, then swap pad ids for -100.
        target_ids = tgt['input_ids'].squeeze()
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': src['input_ids'].squeeze(),
            'attention_mask': src['attention_mask'].squeeze(),
            'labels': labels,
        }

print("Creating datasets...")
# Both splits get a fresh 0..n-1 index before being wrapped.
train_dataset, val_dataset = (
    TranslationDataset(split.reset_index(drop=True), tokenizer, max_length)
    for split in (train_df, val_df)
)

# -------------------------------
#     TRAINING ARGUMENTS
# -------------------------------
training_args = Seq2SeqTrainingArguments(
    # --- output and checkpointing ---
    output_dir='./results',
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    logging_steps=50,
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=3e-5,
    warmup_steps=200,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=torch.cuda.is_available(),
    # --- evaluation: generate translations every epoch and score them ---
    eval_strategy="epoch",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=4,
    # --- keep the checkpoint with the highest BLEU ---
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
)

# -------------------------------
#       METRICS FUNCTION
# -------------------------------
def compute_metrics(eval_pred):
    """Score generated translations with sentence-level BLEU and ROUGE-L.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        Dict with ``"bleu"`` and ``"rouge"``: the mean per-sentence score
        scaled to 0-100. Pairs whose decoded prediction or reference is empty
        are skipped; if nothing is scored both values are 0.0.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks padding and is not a valid token id, so replace it before
    # decoding. Predictions get the same treatment as labels because they can
    # also carry -100 padding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    bleu_scores = []
    rouge_scores = []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    smoothing = SmoothingFunction().method1  # built once, reused for every pair

    for pred, label in zip(decoded_preds, decoded_labels):
        pred = pred.strip()
        label = label.strip()

        if pred == "" or label == "":
            continue

        pred_tokens = nltk.word_tokenize(pred.lower())
        label_tokens = nltk.word_tokenize(label.lower())

        bleu = sentence_bleu([label_tokens], pred_tokens, smoothing_function=smoothing)
        bleu_scores.append(bleu * 100)

        rouge = scorer.score(label, pred)['rougeL'].fmeasure
        rouge_scores.append(rouge * 100)

    return {
        "bleu": float(np.mean(bleu_scores)) if bleu_scores else 0.0,
        "rouge": float(np.mean(rouge_scores)) if rouge_scores else 0.0
    }

print("Initializing trainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# -------------------------------
#             TRAIN
# -------------------------------
print("\n========== STARTING TRAINING ==========\n")
trainer.train()

# -------------------------------
#        FINAL EVALUATION
# -------------------------------
print("\n========== FINAL EVALUATION ==========\n")
eval_results = trainer.evaluate()

print(eval_results)

# -------------------------------
#           SAVE MODEL
# -------------------------------
# Model and tokenizer go to the same directory so it can be reloaded as a unit.
print("\nSaving model...")
model.save_pretrained("./telugu_english_bert_5000samples_2epochs")
tokenizer.save_pretrained("./telugu_english_bert_5000samples_2epochs")
print("Model saved successfully!")

# -------------------------------
#     SAMPLE TRANSLATIONS
# -------------------------------
print("\n========== SAMPLE TRANSLATIONS ==========\n")
# Show up to 5 validation pairs; the min() keeps this from raising IndexError
# when the validation split is smaller than that.
for i in range(min(5, len(val_df))):
    text = val_df.iloc[i]['source']
    inputs = tokenizer(text, return_tensors='pt', max_length=64, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Source :", text)
    print("Target :", val_df.iloc[i]['target'])
    print("Predicted:", pred)
    print("-" * 50)

Transformers version: 4.57.1
Using device: cuda
Loading data...
Total samples used: 5000 (limited to 5000)
Training samples: 4250
Validation samples: 750

Loading tokenizer and model...


Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bia

Creating datasets...
Initializing trainer...




Epoch,Training Loss,Validation Loss,Bleu,Rouge
1,46.4772,4.504223,2.586215,11.171198
2,31.7895,3.7788,3.655036,16.914278


There were missing keys in the checkpoint model loaded: ['decoder.cls.predictions.decoder.weight', 'decoder.cls.predictions.decoder.bias'].






{'eval_loss': 3.7788002490997314, 'eval_bleu': 3.655035683789598, 'eval_rouge': 16.914278381914528, 'eval_runtime': 46.4164, 'eval_samples_per_second': 16.158, 'eval_steps_per_second': 1.013, 'epoch': 2.0}

Saving model...
Model saved successfully!


Source : టామ్‌కు స్పోర్ట్స్ కారు ఉందని నాకు తెలియదు.
Target : i didn't know tom had a sports car.
Predicted: i don't know what i've done to me.
--------------------------------------------------
Source : గురువు ప్రస్తుతం పరారీలో ఉన్నాడు.
Target : the teacher is presently absconding.
Predicted: the movie was shot at the same same location.
--------------------------------------------------
Source : విలీనం చేయడానికి బ్యాంక్ ఆఫ్ బరోడా, దేనా బ్యాంక్, విజయ బ్యాంక్
Target : bank of baroda, dena bank, vijaya bank to merge
Predicted: the indian indian railway railway railways and indiann rail railwayss are being built on the railways.
--------------------------------------------------
Source : ముందుకు వెళ్ళు.
Target : move ahead.
Predicted: it's g