### Importing required libraries 

In [1]:

import os

import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)
from transformers.trainer_utils import get_last_checkpoint

2025-04-10 13:23:12.731610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# %pip install "transformers[torch]" "accelerate>=0.26.0"

### Combining data from OPUS for model training 

In [None]:
# Combine the Russian and English TED2020 sentence files into a single workbook.
# The two files are paired purely by line position: line i of one file is
# treated as the translation of line i of the other.
ru_file = "TED2020.en-ru.ru"
en_file = "TED2020.en-ru.en"
output_file = "TED2020_translations.xlsx"

# Read both files line by line
with open(ru_file, "r", encoding="utf-8") as f:
    ru_sentences = f.readlines()

with open(en_file, "r", encoding="utf-8") as f:
    en_sentences = f.readlines()

# Sanity check: pairing is positional, so a differing line count means the
# pairs cannot be trusted. Stop here with a clear message instead of printing
# a warning and letting the DataFrame constructor fail on unequal lengths.
if len(ru_sentences) != len(en_sentences):
    raise ValueError(
        "Files contain a different number of lines: "
        f"{len(ru_sentences)} in '{ru_file}' vs {len(en_sentences)} in '{en_file}'"
    )
print(f"Loaded {len(ru_sentences)} sentence pairs.")

# Create a DataFrame with Russian and English sentences
# (strip() removes the trailing newline kept by readlines()).
df = pd.DataFrame({
    "Russian": [s.strip() for s in ru_sentences],
    "English": [s.strip() for s in en_sentences]
})

# Save the sentence pairs to an Excel file
df.to_excel(output_file, index=False, engine="openpyxl")
print(f"Data successfully saved to '{output_file}'")

### Split Data into Train/Validation/Test Sets

In [None]:
# Read the combined sentence pairs back from the workbook.
df = pd.read_excel("TED2020_translations.xlsx")

# Carve off 80% for training first; the remaining 20% is then halved, giving
# validation and test 10% of the full data each. Both splits use the same
# fixed random_state so the partition is repeatable.
train, temp = train_test_split(df, test_size=0.2, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

# Persist every split to its own workbook.
for split_frame, split_path in (
    (train, "train.xlsx"),
    (val, "val.xlsx"),
    (test, "test.xlsx"),
):
    split_frame.to_excel(split_path, index=False, engine="openpyxl")

print(f"Data prepared and saved: train({len(train)}), val({len(val)}), test({len(test)})")

### Clean Training and Validation Data

In [None]:
# Read each raw split, discard rows that have a missing cell, and cast the
# remaining values to str.
train_data = pd.read_excel("train.xlsx", engine="openpyxl").dropna().astype(str)
val_data = pd.read_excel("val.xlsx", engine="openpyxl").dropna().astype(str)

# Write the cleaned frames out as well (optional, handy for debugging).
for cleaned_frame, cleaned_path in (
    (train_data, "train_cleaned.xlsx"),
    (val_data, "val_cleaned.xlsx"),
):
    cleaned_frame.to_excel(cleaned_path, index=False, engine="openpyxl")

print("Data cleaned and saved as 'train_cleaned.xlsx' and 'val_cleaned.xlsx'")

### Fine-Tune mBART on Translation Task

In [None]:
# Load cleaned training and validation datasets.
# read_excel re-infers cell types on load, so re-apply the guarantees the
# tokenizer relies on (no missing cells, every value a str) rather than
# trusting whatever is on disk.
train_data = pd.read_excel("train_cleaned.xlsx", engine="openpyxl").dropna().astype(str)
val_data = pd.read_excel("val_cleaned.xlsx", engine="openpyxl").dropna().astype(str)

# Convert to Hugging Face Datasets. preserve_index=False keeps the pandas index
# (possibly non-contiguous after dropna) from being stored as an extra column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

output_dir = "mbart_translation_full"
base_model_name = "facebook/mbart-large-50-many-to-many-mmt"

# Load mBART tokenizer and set language codes
tokenizer = MBart50TokenizerFast.from_pretrained(base_model_name)
tokenizer.src_lang = "ru_RU"
tokenizer.tgt_lang = "en_XX"

# Resume from the newest checkpoint in output_dir when one exists; otherwise
# start from the pretrained base model. This replaces a hard-coded checkpoint
# directory, which made the cell fail on any machine or run where that exact
# directory was absent. The isdir guard is needed because get_last_checkpoint
# lists the directory and would raise if it does not exist yet.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
model = MBartForConditionalGeneration.from_pretrained(last_checkpoint or base_model_name)


# Tokenization and preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of Russian/English pairs for seq2seq training.

    Args:
        examples: Batch mapping with "Russian" (source) and "English"
            (target) lists of strings.

    Returns:
        The tokenizer output for the batch, with source and target both
        truncated/padded to 128 tokens and padding positions in "labels"
        replaced by -100.
    """
    model_inputs = tokenizer(
        examples["Russian"],
        text_target=examples["English"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    # padding="max_length" fills the labels with pad_token_id; swap those for
    # -100 (the index the loss ignores) so padding does not contribute to the
    # training or evaluation loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs


# Tokenize both datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Set training parameters
# NOTE(review): `evaluation_strategy` (and `tokenizer=` on the trainer below)
# have been renamed in recent transformers releases — confirm against the
# installed version before upgrading.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Set up Trainer with early stopping: training halts once eval_loss has failed
# to improve for 2 consecutive evaluations.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Resume from the discovered checkpoint, or train from the start when there is
# none (resume_from_checkpoint=True would raise in that case).
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save the final trained model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Training complete. Model saved in 'mbart_translation_full'")

### Use Trained mBART Model for Translation

In [None]:
# Read the workbook whose "Sentence" column holds the Russian sentences to translate.
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Fail fast when the expected input column is absent.
if "Sentence" not in df.columns:
    raise ValueError("The file does not contain a 'Sentence' column. Please check the structure.")

# Choose the compute device: GPU when CUDA is usable, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Restore the tokenizer and model saved under the training output directory.
model_path = "mbart_translation_full"
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Source is Russian, target is English.
tokenizer.src_lang = "ru_RU"
tokenizer.tgt_lang = "en_XX"

# Place the model on the selected device.
model.to(device)

# Define translation function (batched)
def batch_translate(sentences, model, tokenizer, num_beams=5):
    """Translate one batch of sentences with beam search.

    Args:
        sentences: List of source-language strings.
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer; its ``tgt_lang`` selects the output
            language.
        num_beams: Beam width passed to ``generate``.

    Returns:
        List of decoded translations, one per input sentence, in input order.
        An empty input yields an empty list without calling the model.
    """
    if not sentences:
        return []

    # Inputs are truncated to 128 tokens and moved to wherever the model lives,
    # so the function does not depend on a notebook-level `device` variable.
    inputs = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(model.device)

    translated_tokens = model.generate(
        **inputs,
        # mBART-50 selects the output language via the first generated token;
        # force it to the target language code instead of leaving the choice
        # to the model.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang),
        num_beams=num_beams,
        max_length=128,
        early_stopping=True
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# Translate sentences in batches
batch_size = 16

# astype(str) alone would turn missing cells into the literal text "nan",
# which would then be translated as if it were a sentence. Map missing cells
# to "" and translate only rows that contain text; blank rows keep an empty
# translation.
sentences = df["Sentence"].fillna("").astype(str).tolist()
pending = [(row, text) for row, text in enumerate(sentences) if text.strip()]

translations = [""] * len(sentences)
for start in range(0, len(pending), batch_size):
    chunk = pending[start:start + batch_size]
    outputs = batch_translate([text for _, text in chunk], model, tokenizer)
    # Write each result back to the row it came from so the column stays
    # aligned with the DataFrame even though blank rows were skipped.
    for (row, _), translated in zip(chunk, outputs):
        translations[row] = translated

# Add translations to DataFrame and save
# NOTE(review): the file name says "bean10", but batch_translate is called with
# its default beam width — confirm which beam setting this output is meant to
# represent (and whether "bean" should read "beam").
df["Translation_mBART"] = translations
output_file = "translated_assembly_WER_mBART_bean10.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"Translation completed and saved to '{output_file}'")