### Combining data from OPUS for model training 

In [None]:
"""

import pandas as pd

# –ü—É—Ç–∏ –∫ —Ñ–∞–π–ª–∞–º (–∑–∞–º–µ–Ω–∏—Ç–µ –Ω–∞ —Å–≤–æ–∏ –ª–æ–∫–∞–ª—å–Ω—ã–µ –ø—É—Ç–∏)
ru_file = "TED2020.en-ru.ru"
en_file = "TED2020.en-ru.en"
output_file = "TED2020_translations.xlsx"

# –ß–∏—Ç–∞–µ–º —Ñ–∞–π–ª—ã –ø–æ—Å—Ç—Ä–æ—á–Ω–æ
with open(ru_file, "r", encoding="utf-8") as f:
    ru_sentences = f.readlines()

with open(en_file, "r", encoding="utf-8") as f:
    en_sentences = f.readlines()

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, —á—Ç–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç—Ä–æ–∫ —Å–æ–≤–ø–∞–¥–∞–µ—Ç
if len(ru_sentences) != len(en_sentences):
    print("–û—à–∏–±–∫–∞: –§–∞–π–ª—ã —Å–æ–¥–µ—Ä–∂–∞—Ç —Ä–∞–∑–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç—Ä–æ–∫!")
else:
    print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(ru_sentences)} –ø–∞—Ä –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π.")

# –°–æ–∑–¥–∞—ë–º DataFrame
df = pd.DataFrame({"Russian": [s.strip() for s in ru_sentences], 
                   "English": [s.strip() for s in en_sentences]})

# –°–æ—Ö—Ä–∞–Ω—è–µ–º –≤ Excel
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"üéâ –î–∞–Ω–Ω—ã–µ —É—Å–ø–µ—à–Ω–æ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ —Ñ–∞–π–ª {output_file}")"

"""


### Data preparing 

In [None]:
"""

import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_excel("TED2020_translations.xlsx")

# Split into train, validation, and test sets
train, temp = train_test_split(df, test_size=0.2, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

# Save datasets as Excel files
train.to_excel("train.xlsx", index=False, engine="openpyxl")
val.to_excel("val.xlsx", index=False, engine="openpyxl")
test.to_excel("test.xlsx", index=False, engine="openpyxl")

print(f"Data prepared and saved: train({len(train)}), val({len(val)}), test({len(test)})")"

"""

Data prepared and saved: train(312011), val(39001), test(39002)


### Cleaning the data 

In [9]:
import pandas as pd

# Load training and validation data from Excel
train_data = pd.read_excel("train.xlsx", engine="openpyxl")
val_data = pd.read_excel("val.xlsx", engine="openpyxl")

# Drop NaN values and convert everything to strings
train_data = train_data.dropna().astype(str)
val_data = val_data.dropna().astype(str)

# Save cleaned data (optional step for debugging)
train_data.to_excel("train_cleaned.xlsx", index=False, engine="openpyxl")
val_data.to_excel("val_cleaned.xlsx", index=False, engine="openpyxl")

print("Data cleaned and saved as train_cleaned.xlsx & val_cleaned.xlsx")

Data cleaned and saved as train_cleaned.xlsx & val_cleaned.xlsx


### Train the MarianMT Model

In [None]:
#!pip install transformers datasets torch sentencepiece

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transfo

In [None]:
#!pip install transformers[torch] accelerate>=0.26.0

In [2]:
### Small dataset for fast training

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_excel("train_cleaned.xlsx", engine="openpyxl")

# Take only a subset of data (e.g., 10,000 samples)
subset_size = 10000
df = df.sample(n=subset_size, random_state=42)  # Randomly select 10,000 rows

# Split into train/validation
train, val = train_test_split(df, test_size=0.2, random_state=42)

# Save the smaller dataset
train.to_excel("train_small.xlsx", index=False, engine="openpyxl")
val.to_excel("val_small.xlsx", index=False, engine="openpyxl")

print(f"‚úÖ Prepared smaller dataset with {len(train)} training and {len(val)} validation samples.")

‚úÖ Prepared smaller dataset with 8000 training and 2000 validation samples.


### Second try training with small dataset

In [3]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Load the smaller dataset
train_data = pd.read_excel("train_small.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_small.xlsx", engine="openpyxl")

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load pre-trained MarianMT model
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenization function
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Define training arguments (optimized for faster training)
training_args = Seq2SeqTrainingArguments(
    output_dir="custom_translation_model",  # ‚úÖ Save model in a clear directory
    num_train_epochs=2,  # ‚úÖ Reduce epochs for faster training
    per_device_train_batch_size=16,  # ‚úÖ Increase batch size to speed up training
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=1  # ‚úÖ Keep only the latest checkpoint
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

# Save trained model in a clear directory
model.save_pretrained("custom_translation_model")
tokenizer.save_pretrained("custom_translation_model")

print("üéâ Training completed! Model saved in 'custom_translation_model'")


2025-03-18 19:44:58.328579: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-18 19:44:58.341621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742327098.356305   18486 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742327098.360866   18486 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742327098.373535   18486 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.7951,0.730192
2,0.6911,0.678546




üéâ Training completed! Model saved in 'custom_translation_model'


### Third try, others hyperparametrs

In [1]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# –ó–∞–≥—Ä—É–∂–∞–µ–º –ø–æ–ª–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç
train_data = pd.read_excel("train_cleaned.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_cleaned.xlsx", engine="openpyxl")

# –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –≤ —Ñ–æ—Ä–º–∞—Ç Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# –ó–∞–≥—Ä—É–∂–∞–µ–º –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–Ω—É—é –º–æ–¥–µ–ª—å MarianMT –∏ –µ—ë —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# –§—É–Ω–∫—Ü–∏—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# –ü—Ä–∏–º–µ–Ω—è–µ–º —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—é –∫–æ –≤—Å–µ–º—É –¥–∞—Ç–∞—Å–µ—Ç—É
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# **–û–ø—Ç–∏–º–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ –≥–∏–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ã**
training_args = Seq2SeqTrainingArguments(
    output_dir="full_translation_model",  # ‚úÖ –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å –≤ –ø–æ–Ω—è—Ç–Ω—É—é –ø–∞–ø–∫—É
    num_train_epochs=4,  # üî• –£–≤–µ–ª–∏—á–∏–ª–∏ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —ç–ø–æ—Ö
    per_device_train_batch_size=8,  # üî• –£–º–µ–Ω—å—à–∏–ª–∏ batch size –¥–ª—è —Å—Ç–∞–±–∏–ª—å–Ω–æ—Å—Ç–∏
    per_device_eval_batch_size=8,
    learning_rate=3e-5,  # üî• –ë–æ–ª–µ–µ –º—è–≥–∫–æ–µ –æ–±—É—á–µ–Ω–∏–µ
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # ‚úÖ –•—Ä–∞–Ω–∏–º —Ç–æ–ª—å–∫–æ –ø–æ—Å–ª–µ–¥–Ω—é—é –º–æ–¥–µ–ª—å
    gradient_accumulation_steps=4,  # üî• –≠—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ–µ –æ–±–Ω–æ–≤–ª–µ–Ω–∏–µ –≤–µ—Å–æ–≤
    fp16=True,  # ‚úÖ –£—Å–∫–æ—Ä–µ–Ω–∏–µ –¥–ª—è GPU (–µ—Å–ª–∏ –∏—Å–ø–æ–ª—å–∑—É–µ—Ç—Å—è)
    logging_dir="./logs",
    logging_steps=100
)

# –°–æ–∑–¥–∞—ë–º Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# –ó–∞–ø—É—Å–∫–∞–µ–º –æ–±—É—á–µ–Ω–∏–µ
trainer.train()

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —É–ª—É—á—à–µ–Ω–Ω—É—é –º–æ–¥–µ–ª—å
model.save_pretrained("full_translation_model")
tokenizer.save_pretrained("full_translation_model")

print("Training completed! Model saved in 'full_translation_model'")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-packages/tornado

AttributeError: _ARRAY_API not found

RuntimeError: Failed to import transformers.models.marian.modeling_marian because of the following error (look up to see its traceback):
numpy.core.multiarray failed to import

In [None]:
### First try of training

In [1]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Load cleaned training and validation data from Excel
train_data = pd.read_excel("train_cleaned.xlsx", engine="openpyxl")
val_data = pd.read_excel("val_cleaned.xlsx", engine="openpyxl")

# Convert Pandas DataFrame to Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load the MarianMT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Preprocessing function for tokenization
def preprocess_function(examples):
    inputs = tokenizer(examples["Russian"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["English"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply tokenization with batch processing
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

# Save trained model
model.save_pretrained(".")
tokenizer.save_pretrained(".")

print("Training completed! Model saved")

2025-03-18 16:31:51.257754: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-18 16:31:51.270838: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742315511.285192   15960 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742315511.289643   15960 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742315511.302109   15960 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Map:   0%|          | 0/309030 [00:00<?, ? examples/s]

Map:   0%|          | 0/38613 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.4433,0.363478
2,0.367,0.337283
3,0.2935,0.327943




Training completed! Model saved


In [None]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

# –ó–∞–≥—Ä—É–∂–∞–µ–º —Ç–µ—Å—Ç–æ–≤—ã–π –¥–∞—Ç–∞—Å–µ—Ç
file_path = "assembly_WER.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, –µ—Å—Ç—å –ª–∏ –∫–æ–ª–æ–Ω–∫–∞ —Å —Ç–µ–∫—Å—Ç–æ–º –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞
if "Sentence" not in df.columns:
    raise ValueError("‚ùå –í —Ñ–∞–π–ª–µ –Ω–µ—Ç –∫–æ–ª–æ–Ω–∫–∏ 'Sentence'. –ü—Ä–æ–≤–µ—Ä—å—Ç–µ —Å—Ç—Ä—É–∫—Ç—É—Ä—É —Ñ–∞–π–ª–∞!")

# –ó–∞–≥—Ä—É–∂–∞–µ–º –ø–µ—Ä–≤—É—é –º–æ–¥–µ–ª—å
model1_name = "custom_translation_model"
tokenizer1 = MarianTokenizer.from_pretrained(model1_name)
model1 = MarianMTModel.from_pretrained(model1_name)

# –ó–∞–≥—Ä—É–∂–∞–µ–º –≤—Ç–æ—Ä—É—é –º–æ–¥–µ–ª—å
model2_name = "custom_translation_model_v2"
tokenizer2 = MarianTokenizer.from_pretrained(model2_name)
model2 = MarianMTModel.from_pretrained(model2_name)

# –§—É–Ω–∫—Ü–∏—è –¥–ª—è –ø–µ—Ä–µ–≤–æ–¥–∞
def translate(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**inputs)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

# –ü—Ä–∏–º–µ–Ω—è–µ–º –ø–µ—Ä–µ–≤–æ–¥
df["Translation_Model_1"] = df["Sentence"].astype(str).apply(lambda x: translate(x, model1, tokenizer1))
df["Translation_Model_2"] = df["Sentence"].astype(str).apply(lambda x: translate(x, model2, tokenizer2))

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç –≤ –Ω–æ–≤—ã–π Excel-—Ñ–∞–π–ª
output_file = "translated_assembly_WER.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")

print(f"‚úÖ –ü–µ—Ä–µ–≤–æ–¥ –∑–∞–≤–µ—Ä—à—ë–Ω! –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ '{output_file}'")