### Combining data from OPUS for model training 

In [None]:
"""

import pandas as pd

# –ü—É—Ç–∏ –∫ —Ñ–∞–π–ª–∞–º (–∑–∞–º–µ–Ω–∏—Ç–µ –Ω–∞ —Å–≤–æ–∏ –ª–æ–∫–∞–ª—å–Ω—ã–µ –ø—É—Ç–∏)
ru_file = "TED2020.en-ru.ru"
en_file = "TED2020.en-ru.en"
output_file = "TED2020_translations.xlsx"

# –ß–∏—Ç–∞–µ–º —Ñ–∞–π–ª—ã –ø–æ—Å—Ç—Ä–æ—á–Ω–æ
with open(ru_file, "r", encoding="utf-8") as f:
    ru_sentences = f.readlines()

with open(en_file, "r", encoding="utf-8") as f:
    en_sentences = f.readlines()

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç—Ä–æ–∫ —Å–æ–≤–ø–∞–¥–∞–µ—Ç
if len(ru_sentences) != len(en_sentences):
    print("–û—à–∏–±–∫–∞: –§–∞–π–ª—ã —Å–æ–¥–µ—Ä–∂–∞—Ç —Ä–∞–∑–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç—Ä–æ–∫!")
else:
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(ru_sentences)} –ø–∞—Ä –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π.")

# –°–æ–∑–¥–∞—ë–º DataFrame
df = pd.DataFrame({"Russian": [s.strip() for s in ru_sentences], 
                   "English": [s.strip() for s in en_sentences]})

# –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ Excel
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"üéâ –î–∞–Ω–Ω—ã–µ —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ —Ñ–∞–π–ª {output_file}")"

"""


### Data preparing 

In [None]:
"""

import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_excel("TED2020_translations.xlsx")

# Split into train, validation, and test sets
train, temp = train_test_split(df, test_size=0.2, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

# Save datasets as Excel files
train.to_excel("train.xlsx", index=False, engine="openpyxl")
val.to_excel("val.xlsx", index=False, engine="openpyxl")
test.to_excel("test.xlsx", index=False, engine="openpyxl")

print(f"Data prepared and saved: train({len(train)}), val({len(val)}), test({len(test)})")"

"""

Data prepared and saved: train(312011), val(39001), test(39002)


### Cleaning the data 

In [9]:
import pandas as pd

# Load training and validation data from Excel
train_data = pd.read_excel("train.xlsx", engine="openpyxl")
val_data = pd.read_excel("val.xlsx", engine="openpyxl")

# Drop NaN values and convert everything to strings
train_data = train_data.dropna().astype(str)
val_data = val_data.dropna().astype(str)

# Save cleaned data (optional step for debugging)
train_data.to_excel("train_cleaned.xlsx", index=False, engine="openpyxl")
val_data.to_excel("val_cleaned.xlsx", index=False, engine="openpyxl")

print("Data cleaned and saved as train_cleaned.xlsx & val_cleaned.xlsx")

Data cleaned and saved as train_cleaned.xlsx & val_cleaned.xlsx


### Train the MarianMT Model

In [2]:
!pip install transformers datasets torch sentencepiece

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transfo

In [None]:
#!pip install transformers[torch] accelerate>=0.26.0

In [2]:
### Small dataset for fast training

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_excel("train_cleaned.xlsx", engine="openpyxl")

# Take only a subset of data (e.g., 10,000 samples)
subset_size = 10000
df = df.sample(n=subset_size, random_state=42)  # Randomly select 10,000 rows

# Split into train/validation
train, val = train_test_split(df, test_size=0.2, random_state=42)

# Save the smaller dataset
train.to_excel("train_small.xlsx", index=False, engine="openpyxl")
val.to_excel("val_small.xlsx", index=False, engine="openpyxl")

print(f"‚úÖ Prepared smaller dataset with {len(train)} training and {len(val)} validation samples.")

‚úÖ Prepared smaller dataset with 8000 training and 2000 validation samples.


In [None]:
### Second try training with small dataset

In [3]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Load the smaller dataset
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load pre-trained MarianMT model
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenization function
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Define training arguments (optimized for faster training)
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model",  # ‚úÖ Save model in a clear directory
    num_train_epochs=2,  # ‚úÖ Reduce epochs for faster training
    per_device_train_batch_size=16,  # ‚úÖ Increase batch size to speed up training
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=1  # ‚úÖ Keep only the latest checkpoint
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

# Save trained model in a clear directory
model.save_pretrained("custom_translation_model")
tokenizer.save_pretrained("custom_translation_model")

print("üéâ Training completed! Model saved in 'custom_translation_model'")


2025-03-18 19:44:58.328579: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-18 19:44:58.341621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742327098.356305   18486 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742327098.360866   18486 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742327098.373535   18486 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.7951,0.730192
2,0.6911,0.678546




üéâ Training completed! Model saved in 'custom_translation_model'


In [None]:
### Second try, others hyperparametrs

In [2]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Load the smaller dataset
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load pre-trained MarianMT model
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenization function
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Optimized Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model_v2",  # ‚úÖ New folder for improved model
    num_train_epochs=4,  # ‚úÖ More epochs for better training
    per_device_train_batch_size=32,  # ‚úÖ Larger batch size (if enough memory)
    gradient_accumulation_steps=4,  # ‚úÖ Effective larger batch size if memory is limited
    learning_rate=3e-5,  # ‚úÖ Slower learning rate for stability
    weight_decay=0.01,  # ‚úÖ Regularization to prevent overfitting
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=1
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

# Save trained model
model.save_pretrained("custom_translation_model_v2")
tokenizer.save_pretrained("custom_translation_model_v2")

print("Training completed! Model saved in 'custom_translation_model_v2'")



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,2.1676,1.126219
2,1.1125,0.946095
3,0.9215,0.869714




Training completed! Model saved in 'custom_translation_model_v2'


In [None]:
### Third model

In [7]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Load dataset
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load model and tokenizer
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Preprocessing function with label smoothing support
def preprocess_function(examples):
    model_inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model_v1_5",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=3e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=1,
    predict_with_generate=True  # Required for generation-based eval
)


# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train
trainer.train()

# Save model
model.save_pretrained("custom_translation_model_v1_5")
tokenizer.save_pretrained("custom_translation_model_v1_5")

print("‚úÖ Improved model (v1.5) training complete and saved.")



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.3208,0.315128
2,0.2911,0.315201
3,0.277,0.316235




‚úÖ Improved model (v1.5) training complete and saved.


In [None]:
### With EarlyStopping

In [8]:
import pandas as pd
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset

# üìå Step 1: Load the dataset
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# üìå Step 2: Load model and tokenizer
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# üìå Step 3: Preprocessing
def preprocess_function(examples):
    model_inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# üìå Step 4: Training arguments with EarlyStopping setup
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model_v1_5_earlystop",
    num_train_epochs=10,  # max limit, early stopping will stop earlier if needed
    per_device_train_batch_size=32,
    learning_rate=3e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=1,
    predict_with_generate=True,
    load_best_model_at_end=True,  # ‚¨ÖÔ∏è restores best model
    metric_for_best_model="eval_loss",  # based on validation loss
    greater_is_better=False
)

# üìå Step 5: Trainer with EarlyStoppingCallback
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # ‚èπÔ∏è stop after 2 non-improving epochs
)

# üìå Step 6: Train and save
trainer.train()

model.save_pretrained("custom_translation_model_v1_5_earlystop")
tokenizer.save_pretrained("custom_translation_model_v1_5_earlystop")

print("‚úÖ Training complete! Best model saved in 'custom_translation_model_v1_5_earlystop'")



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.3214,0.315831
2,0.289,0.317355
3,0.2665,0.320317


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


‚úÖ Training complete! Best model saved in 'custom_translation_model_v1_5_earlystop'


In [2]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer

# üîπ –ó–∞–≥—Ä—É–∂–∞–µ–º —Ç–µ—Å—Ç–æ–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# üîπ –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ –∫–æ–ª–æ–Ω–∫–∞ —Å —Ç–µ–∫—Å—Ç–æ–º –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞
if "Sentence" not in df.columns:
    raise ValueError("‚ùå –í —Ñ–∞–π–ª–µ –Ω–µ—Ç –∫–æ–ª–æ–Ω–∫–∏ 'Sentence'. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—É —Ñ–∞–π–ª–∞!")

# üîπ –ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å v1_5_earlystop
model_path = "custom_translation_model_v1_5_earlystop"
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

# üîπ –ò—Å–ø–æ–ª—å–∑—É–µ–º GPU –µ—Å–ª–∏ –¥–æ—Å—Ç—É–ø–Ω–æ
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# üîπ –§—É–Ω–∫—Ü–∏—è –ø–∞–∫–µ—Ç–Ω–æ–≥–æ –ø–µ—Ä–µ–≤–æ–¥–∞
def batch_translate(sentences, model, tokenizer):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# üîπ –†–∞–∑–±–∏–≤–∞–µ–º –Ω–∞ –±–∞—Ç—á–∏
batch_size = 16
sentences = df["Sentence"].astype(str).tolist()
translations = [
    translation
    for i in range(0, len(sentences), batch_size)
    for translation in batch_translate(sentences[i:i+batch_size], model, tokenizer)
]

# üîπ –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç
df["Translation_v1_5"] = translations
df.to_excel("translated_assembly_WER_v1_5.xlsx", index=False, engine="openpyxl")

print("‚úÖ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω –∏ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ 'translated_assembly_WER_v1_5.xlsx'")



‚úÖ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω –∏ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ 'translated_assembly_WER_v1_5.xlsx'


In [13]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback
from datasets import Dataset

# ‚úÖ –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞–Ω–Ω—ã–µ
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

# ‚úÖ –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ–º –≤ Dataset —Ñ–æ—Ä–º–∞—Ç–∞ Hugging Face
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# ‚úÖ –ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å MarianMT (Transformer)
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# ‚úÖ –§—É–Ω–∫—Ü–∏—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# ‚úÖ –ü—Ä–∏–º–µ–Ω—è–µ–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—é
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# ‚úÖ –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –æ–±—É—á–µ–Ω–∏—è
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model_final",  # üèÅ –§–∏–Ω–∞–ª—å–Ω–∞—è –ø–∞–ø–∫–∞ –¥–ª—è –º–æ–¥–µ–ª–∏
    num_train_epochs=10,  # üî• –î–æ–ª—å—à–µ –æ–±—É—á–µ–Ω–∏–µ
    per_device_train_batch_size=16,  # ‚öñÔ∏è –û–ø—Ç–∏–º–∞–ª—å–Ω—ã–π —Ä–∞–∑–º–µ—Ä –±–∞—Ç—á–∞
    gradient_accumulation_steps=4,  # üß† –ù—É–∂–µ–Ω, –µ—Å–ª–∏ –ø–∞–º—è—Ç–∏ –º–∞–ª–æ
    learning_rate=2e-5,  # üöÄ –°—Ç–∞–±–∏–ª—å–Ω—ã–π –ª–µ—Ä–Ω–∏–Ω–≥ —Ä–µ–π—Ç
    weight_decay=0.01,  # üõ°Ô∏è –ó–∞—â–∏—Ç–∞ –æ—Ç –ø–µ—Ä–µ–æ–±—É—á–µ–Ω–∏—è
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=1,  # üóëÔ∏è –•—Ä–∞–Ω–∏–º —Ç–æ–ª—å–∫–æ –ø–æ—Å–ª–µ–¥–Ω–∏–π —á–µ–∫–ø–æ–π–Ω—Ç
    fp16=True,  # ‚ö° –ò—Å–ø–æ–ª—å–∑—É–µ–º Mixed Precision –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
    load_best_model_at_end=True,  # üèÜ –ó–∞–≥—Ä—É–∂–∞–µ–º –ª—É—á—à—É—é –º–æ–¥–µ–ª—å
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# ‚úÖ –î–æ–±–∞–≤–ª—è–µ–º Early Stopping (–µ—Å–ª–∏ loss –Ω–µ –ø–∞–¥–∞–µ—Ç 2 —ç–ø–æ—Ö–∏ ‚Äî —Å—Ç–æ–ø)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# üöÄ –ó–∞–ø—É—Å–∫ –æ–±—É—á–µ–Ω–∏—è
trainer.train()

# ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
model.save_pretrained("custom_translation_model_final")
tokenizer.save_pretrained("custom_translation_model_final")

print("‚úÖ –§–∏–Ω–∞–ª—å–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ 'custom_translation_model_final'")



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.2547,1.032827
2,0.9307,0.867239
3,0.8524,0.793226
4,0.785,0.750331
5,0.7364,0.723534
6,0.7002,0.705162
7,0.6907,0.6929
8,0.6715,0.684287
9,0.6553,0.680194
10,0.6434,0.678575


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


‚úÖ –§–∏–Ω–∞–ª—å–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ 'custom_translation_model_final'


In [16]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback
from datasets import Dataset

# ‚úÖ –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞–Ω–Ω—ã–µ
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

# ‚úÖ –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ–º –≤ Dataset —Ñ–æ—Ä–º–∞—Ç–∞ Hugging Face
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# ‚úÖ –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –º–æ–¥–µ–ª—å MarianMT —Å –Ω—É–ª—è (–±–µ–∑ –ø—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω—ã—Ö —á–µ–∫–ø–æ–π–Ω—Ç–æ–≤)
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# ‚úÖ –§—É–Ω–∫—Ü–∏—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# ‚úÖ –ü—Ä–∏–º–µ–Ω—è–µ–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—é
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# ‚úÖ –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –æ–±—É—á–µ–Ω–∏—è —Å –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏ –¥–ª—è —Å—Ç–∞–±–∏–ª—å–Ω–æ—Å—Ç–∏ –∏ —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model_v3",  # –ü–∞–ø–∫–∞ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –º–æ–¥–µ–ª–∏
    num_train_epochs=5,  # –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö
    per_device_train_batch_size=16,  # –†–∞–∑–º–µ—Ä –±–∞—Ç—á–∞
    gradient_accumulation_steps=4,  # –ù–∞–∫–æ–ø–ª–µ–Ω–∏–µ –≥—Ä–∞–¥–∏–µ–Ω—Ç–æ–≤ (–µ—Å–ª–∏ –ø–∞–º—è—Ç–∏ –Ω–µ —Ö–≤–∞—Ç–∞–µ—Ç)
    learning_rate=2e-5,  # –ú–µ–Ω—å—à–∏–π –ª–µ—Ä–Ω–∏–Ω–≥ —Ä–µ–π—Ç –¥–ª—è —Å—Ç–∞–±–∏–ª—å–Ω–æ—Å—Ç–∏
    weight_decay=0.01,  # –†–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏—è –¥–ª—è –ø—Ä–µ–¥–æ—Ç–≤—Ä–∞—â–µ–Ω–∏—è –ø–µ—Ä–µ–æ–±—É—á–µ–Ω–∏—è
    evaluation_strategy="epoch",  # –û—Ü–µ–Ω–∫–∞ –∫–∞–∂–¥—ã–π —ç–ø–æ—Ö
    save_strategy="epoch",  # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –∫–∞–∂–¥—ã–π —ç–ø–æ—Ö
    logging_steps=50,  # –õ–æ–≥–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –∫–∞–∂–¥—É—é 50-—é –∏—Ç–µ—Ä–∞—Ü–∏—é
    save_total_limit=1,  # –•—Ä–∞–Ω–∏—Ç—å —Ç–æ–ª—å–∫–æ –ø–æ—Å–ª–µ–¥–Ω–∏–π —á–µ–∫–ø–æ–π–Ω—Ç
    fp16=True,  # –ò—Å–ø–æ–ª—å–∑—É–µ–º mixed precision –¥–ª—è —É—Å–∫–æ—Ä–µ–Ω–∏—è
    load_best_model_at_end=True,  # –ó–∞–≥—Ä—É–∂–∞–µ–º –ª—É—á—à–∏–π —á–µ–∫–ø–æ–π–Ω—Ç –ø–æ –≤–∞–ª–∏–¥. –ø–æ—Ç–µ—Ä–µ
    metric_for_best_model="eval_loss",  # –ú–µ—Ç—Ä–∏–∫–∞ –¥–ª—è –≤—ã–±–æ—Ä–∞ –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏
    greater_is_better=False,  # –ß–µ–º –º–µ–Ω—å—à–µ eval_loss, —Ç–µ–º –ª—É—á—à–µ
)

# ‚úÖ –î–æ–±–∞–≤–ª—è–µ–º Early Stopping (–µ—Å–ª–∏ –º–æ–¥–µ–ª—å –Ω–µ —É–ª—É—á—à–∏–ª–∞—Å—å 2 —ç–ø–æ—Ö–∏ ‚Äî –æ—Å—Ç–∞–Ω–æ–≤–∏—Ç—å –æ–±—É—á–µ–Ω–∏–µ)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# üöÄ –ó–∞–ø—É—Å–∫ –æ–±—É—á–µ–Ω–∏—è —Å –Ω—É–ª—è
trainer.train()

# ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –ø–æ—Å–ª–µ –æ–±—É—á–µ–Ω–∏—è
model.save_pretrained("custom_translation_model_v3")
tokenizer.save_pretrained("custom_translation_model_v3")

print("‚úÖ –û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ 'custom_translation_model_v3'")



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.2636,1.046986
2,0.9542,0.892963
3,0.8879,0.831931
4,0.8419,0.803147
5,0.8115,0.794744


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


‚úÖ –û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ 'custom_translation_model_v3'


In [4]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer

# –ó–∞–≥—Ä—É–∂–∞–µ–º —Ç–µ—Å—Ç–æ–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ –∫–æ–ª–æ–Ω–∫–∞ —Å —Ç–µ–∫—Å—Ç–æ–º –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞
if "Sentence" not in df.columns:
    raise ValueError("‚ùå –í —Ñ–∞–π–ª–µ –Ω–µ—Ç –∫–æ–ª–æ–Ω–∫–∏ 'Sentence'. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—É —Ñ–∞–π–ª–∞!")

# –ó–∞–≥—Ä—É–∂–∞–µ–º –æ–±–µ –º–æ–¥–µ–ª–∏
model1_name = "custom_translation_model"
model2_name = "custom_translation_model_v2"

tokenizer1 = MarianTokenizer.from_pretrained(model1_name)
tokenizer2 = MarianTokenizer.from_pretrained(model2_name)

model1 = MarianMTModel.from_pretrained(model1_name)
model2 = MarianMTModel.from_pretrained(model2_name)

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model1.to(device)
model2.to(device)

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–∞–∫–µ—Ç–Ω–æ–≥–æ –ø–µ—Ä–µ–≤–æ–¥–∞
def batch_translate(sentences, model, tokenizer):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# –†–∞–∑–±–∏–≤–∞–µ–º —Ç–µ–∫—Å—Ç –Ω–∞ batch-–∏ –ø–æ 16 –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π
batch_size = 16

df["Translation_Model_1"] = [
    trans for i in range(0, len(df), batch_size)
    for trans in batch_translate(df["Sentence"].astype(str).tolist()[i:i + batch_size], model1, tokenizer1)
]

df["Translation_Model_2"] = [
    trans for i in range(0, len(df), batch_size)
    for trans in batch_translate(df["Sentence"].astype(str).tolist()[i:i + batch_size], model2, tokenizer2)
]

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç –≤ –Ω–æ–≤—ã–π Excel-—Ñ–∞–π–ª
output_file = "translated_assembly_WER.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"‚úÖ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω! –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ '{output_file}'")



‚úÖ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω! –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ 'translated_assembly_WER.xlsx'


In [2]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer

# –ó–∞–≥—Ä—É–∂–∞–µ–º —Ç–µ—Å—Ç–æ–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ –∫–æ–ª–æ–Ω–∫–∞ —Å —Ç–µ–∫—Å—Ç–æ–º –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞
if "Sentence" not in df.columns:
    raise ValueError("‚ùå –í —Ñ–∞–π–ª–µ –Ω–µ—Ç –∫–æ–ª–æ–Ω–∫–∏ 'Sentence'. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—É —Ñ–∞–π–ª–∞!")

# –ó–∞–≥—Ä—É–∂–∞–µ–º –æ–±–µ –º–æ–¥–µ–ª–∏
model1_name = "custom_translation_model"
model2_name = "custom_translation_model_v2"

tokenizer1 = MarianTokenizer.from_pretrained(model1_name)
tokenizer2 = MarianTokenizer.from_pretrained(model2_name)

model1 = MarianMTModel.from_pretrained(model1_name).to("cuda")  # ‚úÖ –°—Ä–∞–∑—É –Ω–∞ GPU
model2 = MarianMTModel.from_pretrained(model2_name).to("cuda")  # ‚úÖ –°—Ä–∞–∑—É –Ω–∞ GPU

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–∞–∫–µ—Ç–Ω–æ–≥–æ –ø–µ—Ä–µ–≤–æ–¥–∞
def batch_translate(sentences, model, tokenizer):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# –†–∞–∑–±–∏–≤–∞–µ–º —Ç–µ–∫—Å—Ç –Ω–∞ batch-–∏ –ø–æ 16 –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π
batch_size = 16

df["Translation_Model_1"] = [
    trans for i in range(0, len(df), batch_size)
    for trans in batch_translate(df["Sentence"].astype(str).tolist()[i:i + batch_size], model1, tokenizer1)
]

df["Translation_Model_2"] = [
    trans for i in range(0, len(df), batch_size)
    for trans in batch_translate(df["Sentence"].astype(str).tolist()[i:i + batch_size], model2, tokenizer2)
]

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç –≤ –Ω–æ–≤—ã–π Excel-—Ñ–∞–π–ª
output_file = "translated_assembly_WER.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"‚úÖ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω! –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ '{output_file}'")



‚úÖ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω! –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ 'translated_assembly_WER.xlsx'


In [None]:
### First try of training

In [1]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Load cleaned training and validation data from Excel
train_data = pd.read_excel("train_cleaned.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_cleaned.xlsx", engine="openpyxl")

# Convert Pandas DataFrame to Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load the MarianMT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Preprocessing function for tokenization
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization with batch processing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

# Save trained model
model.save_pretrained(".")
tokenizer.save_pretrained(".")

print("Training completed! Model saved")

2025-03-18 16:31:51.257754: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-18 16:31:51.270838: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742315511.285192   15960 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742315511.289643   15960 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742315511.302109   15960 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Map:   0%|          | 0/309030 [00:00<?, ? examples/s]

Map:   0%|          | 0/38613 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.4433,0.363478
2,0.367,0.337283
3,0.2935,0.327943




Training completed! Model saved


In [None]:
### FINAL TRAINING - !!!

In [3]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback
from datasets import Dataset

# Load the full training and validation data
train_data = pd.read_excel("train_cleaned.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_cleaned.xlsx", engine="openpyxl")

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load the MarianMT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenization function
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization to datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model_full",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=50,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Create the trainer with EarlyStopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Start training
trainer.train()

# Save the final model
model.save_pretrained("custom_translation_model_full")
tokenizer.save_pretrained("custom_translation_model_full")

print("Training completed. Model saved to 'custom_translation_model_full'")

INFO: PyTorch version 2.6.0 available.
INFO: TensorFlow version 2.19.0 available.


Map:   0%|          | 0/309030 [00:00<?, ? examples/s]

Map:   0%|          | 0/38613 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
0,0.4723,0.432152
1,0.4205,0.390194
2,0.4107,0.37144
3,0.3842,0.361543
4,0.3653,0.354061
5,0.3546,0.349739
6,0.3597,0.345944
7,0.3414,0.343992
8,0.3311,0.342949
9,0.3325,0.34244


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


Training completed. Model saved to 'custom_translation_model_full'


In [6]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer

# Load the input dataset
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Load the final trained translation model
model_path = "custom_translation_model_full"
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Batch translation function
def batch_translate(sentences, model, tokenizer):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# Translate in batches
batch_size = 16
sentences = df["Sentence"].astype(str).tolist()
translations = [
    translation
    for i in range(0, len(sentences), batch_size)
    for translation in batch_translate(sentences[i:i + batch_size], model, tokenizer)
]

# Save the translations
df["Translation"] = translations
df.to_excel("translated_assembly_WER_full.xlsx", index=False, engine="openpyxl")

print("Translation completed successfully. Output saved to 'translated_assembly_WER_full.xlsx'.")



Translation completed successfully. Output saved to 'translated_assembly_WER_full.xlsx'.


In [None]:
### Translation with beam=5

In [7]:
import pandas as pd
import torch
from transformers import MarianMTModel, MarianTokenizer

# Load the input Excel file
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Ensure the sentence column exists
if "Sentence" not in df.columns:
    raise ValueError("The file must contain a column named 'Sentence'.")

# Load your trained translation model
model_path = "custom_translation_model_full"
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Translation function with beam search
def batch_translate(sentences, model, tokenizer, num_beams=5):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    translated = model.generate(
        **inputs,
        num_beams=num_beams,
        early_stopping=True
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

# Translate in batches
batch_size = 16
sentences = df["Sentence"].astype(str).tolist()
translations = [
    translation
    for i in range(0, len(sentences), batch_size)
    for translation in batch_translate(sentences[i:i + batch_size], model, tokenizer, num_beams=5)
]

# Save results to new column and file
df["Translation_beam5"] = translations
df.to_excel("translated_assembly_WER_beam5.xlsx", index=False, engine="openpyxl")

print("Translation completed using beam search. Output saved to 'translated_assembly_WER_beam5.xlsx'.")



Translation completed using beam search. Output saved to 'translated_assembly_WER_beam5.xlsx'.


In [None]:
### Training with mBART with small dataset

In [3]:
import pandas as pd
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback
from datasets import Dataset

# Load the small train and validation sets
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load mBART model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Set source and target language
tokenizer.src_lang = "ru_RU"
target_lang = "en_XX"

# Tokenization function
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Training settings
training_args = Seq2SeqTrainingArguments(
    output_dir="mbart_translation_tuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=4,  # –Ω–æ –±—É–¥–µ—Ç —Ä–∞–Ω–Ω—è—è –æ—Å—Ç–∞–Ω–æ–≤–∫–∞
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=200,
    label_smoothing_factor=0.1,
    logging_steps=50,
    save_total_limit=1,
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available()
)


# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train
trainer.train()

# Save the model
model.save_pretrained("mbart_translation_tuned")
tokenizer.save_pretrained("mbart_translation_tuned")

print("mBART training on small dataset completed.")

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,2.4611,2.448836
2,2.3729,2.400036
3,2.3097,2.417953
4,2.2578,2.405698


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


mBART training on small dataset completed.


In [None]:
### Translaiting with small dataset

In [2]:
import pandas as pd
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the test file
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Ensure the sentence column exists
if "Sentence" not in df.columns:
    raise ValueError("The file must contain a column named 'Sentence'.")

# Load trained mBART model and tokenizer
model_path = "mbart_translation_small"
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Set source and target languages
tokenizer.src_lang = "ru_RU"
target_lang_code = "en_XX"

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Translation function with beam search
def batch_translate(sentences, tokenizer, model, batch_size=16, num_beams=5):
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        translated = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang_code),
            num_beams=num_beams,
            max_length=128,
            early_stopping=True
        )
        decoded = tokenizer.batch_decode(translated, skip_special_tokens=True)
        translations.extend(decoded)
    return translations

# Translate all sentences
sentences = df["Sentence"].astype(str).tolist()
translations = batch_translate(sentences, tokenizer, model)

# Save translations
df["Translation_mBART"] = translations
df.to_excel("translated_assembly_WER_mbart.xlsx", index=False, engine="openpyxl")

print("Translation completed using mBART. Output saved to 'translated_assembly_WER_mbart.xlsx'.")

Translation completed using mBART. Output saved to 'translated_assembly_WER_mbart.xlsx'.


In [None]:
### Training mBART with full dataset

In [9]:
import pandas as pd
import torch
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset

# Load full training and validation data
train_data = pd.read_excel("train_cleaned.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_cleaned.xlsx", engine="openpyxl")

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load tokenizer (original name is fine here)
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "ru_RU"
tokenizer.tgt_lang = "en_XX"

# Load model directly from checkpoint
model = MBartForConditionalGeneration.from_pretrained("mbart_translation_full/checkpoint-9656")

# Preprocessing function using `text_target` (modern API)
def preprocess_function(examples):
    return tokenizer(
        examples["Russian"],
        text_target=examples["English"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="mbart_translation_full",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Resume training
trainer.train(resume_from_checkpoint=True)

# Save final model
model.save_pretrained("mbart_translation_full")
tokenizer.save_pretrained("mbart_translation_full")

print("Training complete. Model saved in 'mbart_translation_full'")

Map:   0%|          | 0/309030 [00:00<?, ? examples/s]

Map:   0%|          | 0/38613 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss
2,0.195,0.22939
3,0.1768,0.234234


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training complete. Model saved in 'mbart_translation_full'


In [None]:
### Translation using mBART

In [1]:
import pandas as pd
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the Excel file
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Check that 'Sentence' column exists
if "Sentence" not in df.columns:
    raise ValueError("The file does not contain a 'Sentence' column. Please check the structure.")

# Load the trained mBART model and tokenizer
model_path = "mbart_translation_full"
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

# Set source and target languages
tokenizer.src_lang = "ru_RU"
tokenizer.tgt_lang = "en_XX"

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Translation function
def batch_translate(sentences, model, tokenizer, num_beams=10):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    translated_tokens = model.generate(
        **inputs,
        num_beams=num_beams,
        max_length=128,
        early_stopping=True
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# Perform translation in batches
batch_size = 16
sentences = df["Sentence"].astype(str).tolist()
translations = [
    translation
    for i in range(0, len(sentences), batch_size)
    for translation in batch_translate(sentences[i:i + batch_size], model, tokenizer)
]

# Add translations to DataFrame
df["Translation_mBART"] = translations

# Save to new Excel file
output_file = "translated_assembly_WER_mBART_bean10.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"Translation completed and saved to '{output_file}'")

2025-03-26 09:55:29.396922: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-26 09:55:29.477641: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742982929.515803      22 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742982929.528359      22 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742982929.601800      22 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Translation completed and saved to 'translated_assembly_WER_mBART_bean10.xlsx'
