In [1]:
# Install necessary libraries
%pip install transformers datasets torch pandas scikit-learn evaluate accelerate sentencepiece protobuf sacrebleu

import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    AutoModelForSeq2SeqLM, 
    DataCollatorWithPadding, 
    DataCollatorForSeq2Seq,
    TrainingArguments, 
    Trainer, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
import evaluate
from sklearn.preprocessing import LabelEncoder
import os

# Pick the compute device: use the GPU whenever PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [None]:
# Load Data
# The three splits are tab-separated files stored side by side in `data_dir`.
data_dir = "data"
# Fixed: the train split was read with the non-existent `pd.read_caddsv`,
# which raises AttributeError before any data is loaded.
train_df = pd.read_csv(os.path.join(data_dir, "train.tsv"), sep="\t")
val_df = pd.read_csv(os.path.join(data_dir, "validation.tsv"), sep="\t")
test_df = pd.read_csv(os.path.join(data_dir, "test.tsv"), sep="\t")

# Report (rows, columns) per split as a quick sanity check.
print(f"Train shape: {train_df.shape}")
print(f"Val shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")

# Display sample
train_df.head()

Train shape: (2993, 5)
Val shape: (1390, 5)
Test shape: (6513, 5)


Unnamed: 0,en_query,cs_query,en_parse,cs_parse,domain
0,Add a new weekly reminder for Sunday Brunch at...,9 : 30 am ko Sunday Brunch ke liye ek naya wee...,[IN:CREATE_ALARM Add a new [SL:PERIOD weekly ]...,[IN:CREATE_ALARM [SL:DATE_TIME 9 : 30 am ko ] ...,alarm
1,message danny and see if he wants to go to com...,danny ko message karo aur dekho ke he wants to...,[IN:SEND_MESSAGE message [SL:RECIPIENT danny ]...,[IN:SEND_MESSAGE [SL:RECIPIENT danny ] ko mess...,messaging
2,set alarm for 2 hours,do ghante ke liye alarm set kardo,[IN:CREATE_ALARM set alarm [SL:DATE_TIME for 2...,[IN:CREATE_ALARM [SL:DATE_TIME do ghante ke li...,alarm
3,kill the reminder for baking a cake for neil,neil ke liye cake bake karne ke reminder ko mi...,[IN:DELETE_REMINDER kill the reminder for [SL:...,[IN:DELETE_REMINDER [SL:TODO neil ke liye cake...,reminder
4,retrieve my chat requests please,Please mere chat requests ko retrieve kare,[IN:GET_MESSAGE retrieve my chat requests plea...,[IN:GET_MESSAGE Please mere chat requests ko r...,messaging


In [3]:
# --- Task 1: Error Detection (BERT) ---
import random
import string

# Define Augmentation Functions to create Synthetic Data
def introduce_spelling_error(text):
    """Simulate a typo by swapping two adjacent characters inside one word.

    The word and the swap position are only drawn from places where the swap
    actually changes the text (two *different* neighbouring characters). This
    avoids returning an untouched sentence that would still be labelled as a
    spelling error, e.g. when a one-letter word or a doubled letter is picked.

    Args:
        text: Input sentence; words are separated on whitespace.

    Returns:
        The words re-joined with single spaces, with one adjacent-character
        swap applied when any word allows one. ``text`` itself is returned
        unchanged when it contains no words at all.
    """
    words = text.split()
    if not words:
        return text

    # Map word index -> character positions whose swap with the next character is visible.
    swappable = {}
    for word_idx, word in enumerate(words):
        positions = [i for i in range(len(word) - 1) if word[i] != word[i + 1]]
        if positions:
            swappable[word_idx] = positions

    if swappable:
        # Pick the word first, then the position, so every eligible word is equally likely.
        word_idx = random.choice(list(swappable))
        i = random.choice(swappable[word_idx])
        chars = list(words[word_idx])
        chars[i], chars[i + 1] = chars[i + 1], chars[i]
        words[word_idx] = "".join(chars)

    return " ".join(words)

def introduce_grammar_error(text):
    """Simulate a grammar slip by deleting one randomly chosen word.

    Sentences with fewer than two words are left without a deletion. The
    result is always the remaining words joined by single spaces.
    """
    tokens = text.split()
    if len(tokens) <= 1:
        return " ".join(tokens)
    # Draw the index of the word to remove (both bounds inclusive).
    victim = random.randint(0, len(tokens) - 1)
    del tokens[victim]
    return " ".join(tokens)

def introduce_punctuation_error(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    All other characters, including whitespace, are kept in their original order.
    """
    punctuation_marks = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation_marks)

def introduce_word_choice_error(text):
    """Simulate a word-choice error by repeating one randomly chosen word.

    The chosen word appears twice in a row in the output. A text without any
    words is returned untouched.
    """
    tokens = text.split()
    if len(tokens) == 0:
        return text
    # Draw the position of the word to duplicate (both bounds inclusive).
    pos = random.randint(0, len(tokens) - 1)
    stuttered = tokens[:pos] + [tokens[pos]] + tokens[pos:]
    return " ".join(stuttered)

def introduce_needs_rewrite(text):
    """Scramble the word order so the sentence needs a full rewrite.

    A plain shuffle can land on the original order (half of the time for a
    two-word sentence), which would yield a clean sentence labelled as
    ``needs_rewrite``. The shuffle is therefore retried until the order
    changes, with a deterministic rotation as the last resort.

    Args:
        text: Input sentence; words are separated on whitespace.

    Returns:
        The same words joined by single spaces in a different order. When the
        text has fewer than two distinct words no reordering is visible, so
        the words are returned in their original order.
    """
    words = text.split()
    if len(set(words)) < 2:
        # Zero/one word, or all words identical: every permutation looks the same.
        return " ".join(words)

    original_order = list(words)
    for _ in range(10):
        random.shuffle(words)
        if words != original_order:
            break
    else:
        # Extremely unlikely; rotating by one differs whenever two words differ.
        words = original_order[1:] + original_order[:1]
    return " ".join(words)

def create_synthetic_dataset(df):
    """Expand every row of ``df`` into eight labelled variants of its ``cs_query``.

    For each input row the clean ``cs_query`` plus seven corrupted versions are
    emitted, each paired with the row's ``en_query`` and an ``error_type`` label.

    Args:
        df: DataFrame providing ``cs_query`` and ``en_query`` columns.

    Returns:
        A new DataFrame with columns ``cs_query``, ``en_query`` and
        ``error_type`` holding eight rows per input row, in input order.
    """
    records = []
    for _, record in df.iterrows():
        source = str(record["cs_query"])
        reference = str(record["en_query"])

        # (label, text) pairs; the tuple is evaluated top to bottom, so the
        # random helpers are invoked in this exact order.
        variants = (
            # 1. Hinglish Detection (Clean)
            ("hinglish_detection", source),
            # 2. Spelling Error
            ("spelling_error", introduce_spelling_error(source)),
            # 3. Grammar Error
            ("grammar_error", introduce_grammar_error(source)),
            # 4. Punctuation Error
            ("punctuation_error", introduce_punctuation_error(source)),
            # 5. Word Choice Error
            ("word_choice_error", introduce_word_choice_error(source)),
            # 6. Needs Rewrite
            ("needs_rewrite", introduce_needs_rewrite(source)),
            # 7. Slang (Simulated): lower-case the text and shorten a few substrings
            (
                "slang_or_informal",
                source.lower().replace("ing", "in").replace("you", "u").replace("hai", "h"),
            ),
            # 8. Transliteration (Simulated): collapse doubled vowels
            (
                "transliteration_error",
                source.replace("ee", "i").replace("oo", "u").replace("aa", "a"),
            ),
        )
        records.extend(
            {"cs_query": variant_text, "en_query": reference, "error_type": label}
            for label, variant_text in variants
        )

    return pd.DataFrame(records)

# Generate Synthetic Data
# Seed the RNG so the randomly corrupted samples are identical on every run;
# without it the train/val/test sets change with each execution.
SYNTHETIC_SEED = 42
random.seed(SYNTHETIC_SEED)

print("Generating synthetic data (this may take a moment)...")
# This cell overwrites its own inputs. Frames that already carry an
# `error_type` column have been augmented before, so they are skipped:
# augmenting them again would multiply the rows by 8 and stack corruptions.
if "error_type" not in train_df.columns:
    train_df = create_synthetic_dataset(train_df)
if "error_type" not in val_df.columns:
    val_df = create_synthetic_dataset(val_df)
if "error_type" not in test_df.columns:
    test_df = create_synthetic_dataset(test_df)

print("New Synthetic Distribution:")
print(train_df["error_type"].value_counts())

# Encode Labels
# Fit the encoder on the training labels only, then reuse that mapping for the other splits.
le = LabelEncoder()
train_df["label"] = le.fit_transform(train_df["error_type"])
for split_df in (val_df, test_df):
    split_df["label"] = le.transform(split_df["error_type"])

# Lookup tables between integer ids and label names, passed to the model config later.
label_list = le.classes_
num_labels = len(label_list)
id2label = dict(enumerate(label_list))
label2id = {label_name: label_id for label_id, label_name in enumerate(label_list)}

print(f"\nLabels: {label_list}")

# Create HF Datasets
# Only the input text and its integer label are carried over.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame[["cs_query", "label"]])
    for frame in (train_df, val_df, test_df)
)

# Tokenization
model_checkpoint_cls = "bert-base-multilingual-cased"
tokenizer_cls = AutoTokenizer.from_pretrained(model_checkpoint_cls)

def preprocess_function_cls(examples):
    """Tokenize a batch of ``cs_query`` texts, truncated and padded to 128 tokens."""
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer_cls(examples["cs_query"], **encoding_options)

# Apply the tokenizer to each split in batches (train, then validation, then test).
tokenized_train_cls, tokenized_val_cls, tokenized_test_cls = (
    split.map(preprocess_function_cls, batched=True)
    for split in (hf_train, hf_val, hf_test)
)

Generating synthetic data (this may take a moment)...
New Synthetic Distribution:
error_type
hinglish_detection       2993
spelling_error           2993
grammar_error            2993
punctuation_error        2993
word_choice_error        2993
needs_rewrite            2993
slang_or_informal        2993
transliteration_error    2993
Name: count, dtype: int64

Labels: ['grammar_error' 'hinglish_detection' 'needs_rewrite' 'punctuation_error'
 'slang_or_informal' 'spelling_error' 'transliteration_error'
 'word_choice_error']
New Synthetic Distribution:
error_type
hinglish_detection       2993
spelling_error           2993
grammar_error            2993
punctuation_error        2993
word_choice_error        2993
needs_rewrite            2993
slang_or_informal        2993
transliteration_error    2993
Name: count, dtype: int64

Labels: ['grammar_error' 'hinglish_detection' 'needs_rewrite' 'punctuation_error'
 'slang_or_informal' 'spelling_error' 'transliteration_error'
 'word_choice_error']


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23944/23944 [00:01<00:00, 22812.19 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23944/23944 [00:01<00:00, 22812.19 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11120/11120 [00:00<00:00, 23760.31 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11120/11120 [00:00<00:00, 23760.31 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 52104/52104 [00:02<00:00, 21985.36 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 52104/52104 [00:02<00:00, 21985.36 examples/s]


In [4]:
# Load Model
# The label maps are stored in the model config alongside the number of classes.
label_config = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
model_cls = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_cls, **label_config
)

# Metrics
accuracy = evaluate.load("accuracy")

def compute_metrics_cls(eval_pred):
    """Turn per-class scores into class ids (argmax over axis 1) and return accuracy."""
    scores, gold_labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=gold_labels)

# Training Arguments
# Fine-tuning configuration for the error-type classifier: 3 epochs, batch size 16
# for both training and evaluation, with evaluation and checkpointing once per epoch.
training_args_cls = TrainingArguments(
    output_dir="bert-domain-classifier",  # checkpoints are written under this directory
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is set, so "best" presumably follows
    # the library default (validation loss) — confirm.
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Trainer
# Trains on the tokenized train split and validates on the tokenized validation split;
# accuracy is reported through compute_metrics_cls.
trainer_cls = Trainer(
    model=model_cls,
    args=training_args_cls,
    train_dataset=tokenized_train_cls,
    eval_dataset=tokenized_val_cls,
    tokenizer=tokenizer_cls,
    compute_metrics=compute_metrics_cls,
)

# Train
trainer_cls.train()

# Evaluate
# Final numbers are computed on the held-out tokenized test split.
eval_results_cls = trainer_cls.evaluate(tokenized_test_cls)
print(f"BERT Evaluation: {eval_results_cls}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_cls = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6634,1.315541,0.496403
2,1.1576,1.10451,0.568435
3,0.9622,1.059104,0.586511


BERT Evaluation: {'eval_loss': 1.08975088596344, 'eval_accuracy': 0.5859434976201443, 'eval_runtime': 372.5984, 'eval_samples_per_second': 139.84, 'eval_steps_per_second': 8.741, 'epoch': 3.0}


In [None]:
# --- Task 2: Grammar Correction / Translation (Seq2Seq) ---
# We will train the model to take Hinglish (cs_query) and output corrected English (en_query).
import gc
import torch

# Clear memory from previous tasks
# Drop the notebook-level references to earlier models/trainers so their
# memory becomes collectable; names that were never defined are ignored.
for name in ['model_cls', 'trainer_cls', 'model_seq', 'trainer_seq']:
    globals().pop(name, None)

# Collect garbage first so objects kept alive only by reference cycles are
# really freed, then hand the now-unused cached GPU blocks back to the driver.
# (Previously empty_cache() ran before gc.collect(), leaving those blocks cached.)
gc.collect()
torch.cuda.empty_cache()

# Create HF Datasets
# Only the Hinglish input and the English target are needed for seq2seq training.
hf_train_seq = Dataset.from_pandas(train_df[["en_query", "cs_query"]])
hf_val_seq = Dataset.from_pandas(val_df[["en_query", "cs_query"]])
hf_test_seq = Dataset.from_pandas(test_df[["en_query", "cs_query"]])

# Tokenization
model_checkpoint_seq = "google/mt5-small"
tokenizer_seq = AutoTokenizer.from_pretrained(model_checkpoint_seq)

# Token limits applied when truncating inputs and targets.
max_input_length = 128
max_target_length = 128

def preprocess_function_seq(examples):
    """Tokenize a batch of Hinglish inputs and their English targets.

    Args:
        examples: Batch mapping with ``cs_query`` (model input) and
            ``en_query`` (target) lists of strings.

    Returns:
        The tokenized inputs, truncated to ``max_input_length``, with an added
        ``labels`` entry holding the target token ids truncated to
        ``max_target_length``.
    """
    inputs = examples["cs_query"] # Input: Hinglish
    targets = examples["en_query"] # Target: Corrected English
    model_inputs = tokenizer_seq(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: it tokenizes the strings as targets without the extra context.
    labels = tokenizer_seq(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches (train, then validation, then test).
tokenized_train_seq, tokenized_val_seq, tokenized_test_seq = (
    split.map(preprocess_function_seq, batched=True)
    for split in (hf_train_seq, hf_val_seq, hf_test_seq)
)

# Load Model
model_seq = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint_seq)

# Optimization for Low VRAM (6GB)
# The decoder cache is switched off because gradient checkpointing requires it.
model_seq.config.use_cache = False

# Data Collator
data_collator_seq = DataCollatorForSeq2Seq(tokenizer_seq, model=model_seq)

# Metrics
metric = evaluate.load("sacrebleu")

def compute_metrics_seq(eval_preds):
    """Decode generated ids and reference ids and score them with SacreBLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the sacrebleu result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer_seq.pad_token_id
    # Replace -100 as we can't decode it. Labels use it as the ignore index, and
    # generated predictions can be padded with it when batches are gathered,
    # which would otherwise make batch_decode fail.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer_seq.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer_seq.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: sacrebleu expects a list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# Training Arguments
# mT5 is numerically unstable under fp16 mixed precision (activations overflow
# and the loss turns into inf/NaN), so fp16 is no longer enabled here. bf16 is
# used when the GPU supports it; otherwise training runs in full precision.
# This matches the precision settings of the resume-from-checkpoint cell.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args_seq = Seq2SeqTrainingArguments(
    output_dir="mt5-hinglish-correction",
    learning_rate=2e-5,
    per_device_train_batch_size=1, # Lowest possible batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16, # Accumulate gradients to simulate batch size 16
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,     # Evaluate on generated text so BLEU can be computed
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    bf16=use_bf16,                  # Mixed precision that is safe for mT5 (saves memory)
    fp16=False,                     # fp16 overflows with mT5
    gradient_checkpointing=True,    # Trade compute for memory
    optim="adafactor",              # Use Adafactor optimizer (less memory than AdamW)
)

# Trainer
# Wires the mT5 model, the tokenized train/validation splits, the seq2seq
# collator and the BLEU metric function together.
trainer_seq = Seq2SeqTrainer(
    model=model_seq,
    args=training_args_seq,
    train_dataset=tokenized_train_seq,
    eval_dataset=tokenized_val_seq,
    tokenizer=tokenizer_seq,
    data_collator=data_collator_seq,
    compute_metrics=compute_metrics_seq,
)

# Train
trainer_seq.train()

# Re-enable cache for evaluation
# (it was switched off before training for gradient checkpointing)
model_seq.config.use_cache = True

# Evaluate
# Final metrics are computed on the held-out tokenized test split.
eval_results_seq = trainer_seq.evaluate(tokenized_test_seq)
print(f"Correction Evaluation: {eval_results_seq}")

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23944/23944 [00:00<00:00, 59912.27 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23944/23944 [00:00<00:00, 59912.27 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11120/11120 [00:00<00:00, 67714.10 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11120/11120 [00:00<00:00, 67714.10 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 52104/52104 [00:00<00:00, 52465.89 examples/s]

  trainer_seq = Seq2SeqTrainer(
  trainer_seq = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


In [None]:
# === RESUME TRAINING FROM CHECKPOINT (Optimized for 6GB GPU) ===
import os
import gc
import torch

# Step 1: Clean up GPU memory
print("üßπ Cleaning up GPU memory...")
# Remove any model/trainer objects left over from earlier cells; names that
# were never defined are simply skipped.
for stale_name in ("model_seq", "trainer_seq", "model_cls", "trainer_cls"):
    globals().pop(stale_name, None)

# Free unreachable Python objects, release cached GPU blocks, then wait for
# outstanding CUDA work to finish.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()

print(f"üíæ GPU Memory Free: {torch.cuda.get_device_properties(0).total_memory / 1024**3 - torch.cuda.memory_allocated() / 1024**3:.2f} GB")

# Step 2: Find the latest checkpoint
checkpoint_dir = "mt5-hinglish-correction"
# The output directory does not exist before the first training run; treat that
# as "no checkpoints" instead of letting os.listdir raise FileNotFoundError.
# Entries whose suffix is not a step number (e.g. "checkpoint-tmp") are skipped
# so the int() conversion below cannot fail.
checkpoints = [
    d
    for d in (os.listdir(checkpoint_dir) if os.path.isdir(checkpoint_dir) else [])
    if d.startswith("checkpoint-") and d.split("-")[1].isdigit()
]
if checkpoints:
    # The highest step number is the most recent checkpoint.
    latest_checkpoint = max(checkpoints, key=lambda x: int(x.split("-")[1]))
    checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
    print(f"‚úÖ Resuming from: {checkpoint_path}")
else:
    checkpoint_path = None
    print("‚ùå No checkpoint found!")

# Step 3: Prepare tokenizer and datasets
model_checkpoint_seq = "google/mt5-small"
tokenizer_seq = AutoTokenizer.from_pretrained(model_checkpoint_seq)

def preprocess_function_seq(examples):
    """Tokenize a batch of Hinglish inputs and their English targets.

    Args:
        examples: Batch mapping with ``cs_query`` (model input) and
            ``en_query`` (target) lists of strings.

    Returns:
        The tokenized inputs with an added ``labels`` entry holding the target
        token ids; both sides are truncated to 128 tokens.
    """
    inputs = examples["cs_query"]
    targets = examples["en_query"]
    model_inputs = tokenizer_seq(inputs, max_length=128, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer_seq(text_target=targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Recreate datasets
# Keep only the English target and the Hinglish input of each split.
seq_columns = ["en_query", "cs_query"]
hf_train_seq, hf_val_seq, hf_test_seq = (
    Dataset.from_pandas(frame[seq_columns])
    for frame in (train_df, val_df, test_df)
)

# Tokenize every split in batches (train, then validation, then test).
tokenized_train_seq, tokenized_val_seq, tokenized_test_seq = (
    split.map(preprocess_function_seq, batched=True)
    for split in (hf_train_seq, hf_val_seq, hf_test_seq)
)

# Step 4: Load model from checkpoint - use FP32 to avoid FP16 gradient issues
print(f"üì• Loading model from checkpoint...")
# checkpoint_path is None when no saved checkpoint was found; from_pretrained(None)
# would raise, so fall back to the base pretrained checkpoint in that case.
model_seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path or model_checkpoint_seq)
# The decoder cache is disabled during training because gradient checkpointing is enabled below.
model_seq.config.use_cache = False

# Step 5: Setup training components
data_collator_seq = DataCollatorForSeq2Seq(tokenizer_seq, model=model_seq)
metric = evaluate.load("sacrebleu")

def compute_metrics_seq(eval_preds):
    """Decode generated ids and reference ids and score them with SacreBLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the sacrebleu result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer_seq.pad_token_id
    # -100 cannot be decoded. Labels use it as the ignore index, and generated
    # predictions can be padded with it when batches are gathered, so both
    # arrays are cleaned before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer_seq.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer_seq.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    # sacrebleu expects a list of references per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# Check if BF16 is available (better than FP16 for training)
# True only when a CUDA device is present AND it reports bfloat16 support;
# the flag is passed to the training arguments below.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
print(f"üîß Using BF16: {use_bf16}")

# Ultra-aggressive settings for 6GB GPU
# Same output directory as the first run, so new checkpoints land next to the
# one being resumed; batch size 1 with 16 accumulation steps.
training_args_seq = Seq2SeqTrainingArguments(
    output_dir="mt5-hinglish-correction",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # Match original training
    weight_decay=0.01,
    save_total_limit=1,  # Keep only 1 checkpoint to save space
    num_train_epochs=6,  # Continue to 6 epochs
    predict_with_generate=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,  # Disable to save memory
    push_to_hub=False,
    bf16=use_bf16,  # Use BF16 if available
    fp16=False,  # Disable FP16 to avoid gradient scaling issues
    gradient_checkpointing=True,
    optim="adafactor",
    max_grad_norm=1.0,
    dataloader_num_workers=0,
    logging_steps=100,
    eval_accumulation_steps=1,
    generation_max_length=128,
)

# Trainer built on the model loaded above, validating on the tokenized
# validation split and reporting BLEU through compute_metrics_seq.
trainer_seq = Seq2SeqTrainer(
    model=model_seq,
    args=training_args_seq,
    train_dataset=tokenized_train_seq,
    eval_dataset=tokenized_val_seq,
    tokenizer=tokenizer_seq,
    data_collator=data_collator_seq,
    compute_metrics=compute_metrics_seq,
)

print(f"\nüöÄ Starting training from {checkpoint_path}")
print(f"üìä Settings: batch_size=1, grad_accum=16 (effective batch=16)")
print(f"üéØ Target: {training_args_seq.num_train_epochs} epochs\n")

# Step 6: Resume training - start fresh if checkpoint resume fails
# First attempt a full resume from `checkpoint_path`. If that raises, the error
# is printed and training is started again without a resume point, using the
# weights already loaded into `model_seq`.
# NOTE(review): the broad `except Exception` also catches unrelated failures
# (e.g. out-of-memory) and retries them — confirm this is intended.
try:
    trainer_seq.train(resume_from_checkpoint=checkpoint_path)
except Exception as e:
    print(f"‚ö†Ô∏è Could not resume from checkpoint: {e}")
    print("üîÑ Starting training from the loaded checkpoint weights instead...")
    trainer_seq.train()

# Step 7: Final evaluation
# Re-enable the decoder cache (disabled for training) before evaluating.
model_seq.config.use_cache = True
print("\nüß™ Evaluating on test set...")
# Evaluate on the held-out tokenized test split; the BLEU value from
# compute_metrics_seq is read back under the 'eval_bleu' key.
eval_results = trainer_seq.evaluate(tokenized_test_seq)
print(f"\n‚úÖ Final BLEU Score: {eval_results['eval_bleu']:.2f}")
print(f"üìà Full Results: {eval_results}")

üßπ Cleaning up GPU memory...
üíæ GPU Memory Free: 5.67 GB
‚úÖ Resuming from: mt5-hinglish-correction/checkpoint-1497


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23944/23944 [00:00<00:00, 55724.71 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23944/23944 [00:00<00:00, 55724.71 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11120/11120 [00:00<00:00, 54836.53 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11120/11120 [00:00<00:00, 54836.53 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 52104/52104 [00:01<00:00, 45958.93 examples/s]



üì• Loading model from checkpoint...
üîß Using BF16: True

üöÄ Starting training from mt5-hinglish-correction/checkpoint-1497
üìä Settings: batch_size=1, grad_accum=16 (effective batch=16)
üéØ Target: 6 epochs

üîß Using BF16: True

üöÄ Starting training from mt5-hinglish-correction/checkpoint-1497
üìä Settings: batch_size=1, grad_accum=16 (effective batch=16)
üéØ Target: 6 epochs



  trainer_seq = Seq2SeqTrainer(
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


‚ö†Ô∏è Could not resume from checkpoint: 'NoneType' object has no attribute 'load_state_dict'
üîÑ Starting training from the loaded checkpoint weights instead...


Epoch,Training Loss,Validation Loss
