### Combining data from OPUS for model training 

In [None]:
import pandas as pd

# One-off data preparation step: merges the two line-aligned OPUS TED2020 text
# files into a single spreadsheet. It stays disabled by default (the original
# cell was wrapped in a string literal and therefore never ran); set the flag to
# True to regenerate the spreadsheet.
REBUILD_CORPUS = False

# Paths to the files (replace with your own local paths)
ru_file = "TED2020.en-ru.ru"
en_file = "TED2020.en-ru.en"
output_file = "TED2020_translations.xlsx"


def build_parallel_frame(ru_sentences, en_sentences):
    """Pair Russian and English lines into a two-column DataFrame.

    Args:
        ru_sentences: list of Russian lines (trailing newlines allowed).
        en_sentences: list of English lines, aligned index-by-index with
            ``ru_sentences``.

    Returns:
        A DataFrame with columns "Russian" and "English"; every value has its
        leading/trailing whitespace stripped.

    Raises:
        ValueError: if the two lists do not contain the same number of lines.
    """
    # Fail loudly here: previously the mismatch was only printed and the
    # DataFrame constructor was still called with unequal-length columns.
    if len(ru_sentences) != len(en_sentences):
        raise ValueError("Ошибка: Файлы содержат разное количество строк!")

    return pd.DataFrame({
        "Russian": [s.strip() for s in ru_sentences],
        "English": [s.strip() for s in en_sentences],
    })


if REBUILD_CORPUS:
    # Read both files line by line
    with open(ru_file, "r", encoding="utf-8") as f:
        ru_sentences = f.readlines()

    with open(en_file, "r", encoding="utf-8") as f:
        en_sentences = f.readlines()

    # Build the DataFrame (also verifies that the line counts match)
    df = build_parallel_frame(ru_sentences, en_sentences)
    print(f"Загружено {len(df)} пар предложений.")

    # Save to Excel
    df.to_excel(output_file, index=False, engine="openpyxl")

    print(f"🎉 Данные успешно сохранены в файл {output_file}")


### Data preparation

In [None]:
import pandas as pd

# One-off step that writes train.xlsx / val.xlsx / test.xlsx. It stays disabled
# by default (the original cell was wrapped in a string literal and therefore
# never ran); set the flag to True to regenerate the three files.
REBUILD_SPLITS = False


def split_train_val_test(df, random_state=42):
    """Split a DataFrame into train / validation / test parts (80 / 10 / 10).

    Args:
        df: the full dataset.
        random_state: seed passed to both shuffling splits so the partition is
            reproducible.

    Returns:
        A ``(train, val, test)`` tuple of DataFrames.
    """
    # Imported lazily so the cell can be executed with the split disabled even
    # when scikit-learn is not installed.
    from sklearn.model_selection import train_test_split

    # First hold out 20% of the rows, then cut that hold-out in half.
    train, temp = train_test_split(df, test_size=0.2, random_state=random_state)
    val, test = train_test_split(temp, test_size=0.5, random_state=random_state)
    return train, val, test


if REBUILD_SPLITS:
    # Load dataset
    df = pd.read_excel("TED2020_translations.xlsx")

    # Split into train, validation, and test sets
    train, val, test = split_train_val_test(df)

    # Save datasets as Excel files
    train.to_excel("train.xlsx", index=False, engine="openpyxl")
    val.to_excel("val.xlsx", index=False, engine="openpyxl")
    test.to_excel("test.xlsx", index=False, engine="openpyxl")

    print(f"Data prepared and saved: train({len(train)}), val({len(val)}), test({len(test)})")

Data prepared and saved: train(312011), val(39001), test(39002)


### Cleaning the data 

In [9]:
import pandas as pd

# Read the raw training split first, then the validation split.
train_data, val_data = (
    pd.read_excel(source, engine="openpyxl")
    for source in ("train.xlsx", "val.xlsx")
)

# Remove every row that has a missing value, then cast all cells to str.
train_data, val_data = (
    frame.dropna().astype(str) for frame in (train_data, val_data)
)

# Persist the cleaned frames (optional, handy when debugging).
for frame, target in (
    (train_data, "train_cleaned.xlsx"),
    (val_data, "val_cleaned.xlsx"),
):
    frame.to_excel(target, index=False, engine="openpyxl")

print("Data cleaned and saved as train_cleaned.xlsx & val_cleaned.xlsx")

Data cleaned and saved as train_cleaned.xlsx & val_cleaned.xlsx


### Train the MarianMT Model

In [None]:
#!pip install transformers datasets torch sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp38-cp38-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp38-cp38-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.7 kB ? eta -:--:--
   --------------------------------------- 991.7/991.7 kB 11.6 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [None]:
#%pip install "transformers[torch]" "accelerate>=0.26.0"  # quoted: an unquoted ">" is treated by the shell as output redirection

In [None]:
import pandas as pd
from transformers import (
    DataCollatorForSeq2Seq,
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import Dataset

# NOTE(review): the last recorded run of this cell ended with
# "ImportError: cannot import name 'computed_field' from 'pydantic'".
# `computed_field` is a pydantic 2 API, so the environment presumably has
# pydantic 1.x installed - confirm and upgrade pydantic before re-running.

# Maximum number of tokens kept per sentence; longer sentences are truncated.
MAX_LENGTH = 128

# Load cleaned training and validation data from Excel.
# read_excel turns blank cells and NA-like strings back into NaN, so the
# clean-up is repeated here to guarantee the tokenizer only receives strings.
train_data = pd.read_excel("train_cleaned.xlsx", engine="openpyxl").dropna().astype(str)
val_data = pd.read_excel("val_cleaned.xlsx", engine="openpyxl").dropna().astype(str)

# Convert Pandas DataFrame to Dataset format.
# preserve_index=False keeps the pandas index from becoming an extra column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

# Load the MarianMT model and tokenizer
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize a batch of Russian/English sentence pairs.

    Args:
        examples: a batch from ``Dataset.map(batched=True)`` with the columns
            "Russian" (source text) and "English" (target text).

    Returns:
        The tokenizer output for the Russian text, with the token ids of the
        English text stored under "labels".
    """
    # `text_target` makes the tokenizer encode the English side in target mode
    # and store the result under "labels". No padding is applied here; the data
    # collator pads each batch dynamically.
    return tokenizer(
        examples["Russian"],
        text_target=examples["English"],
        max_length=MAX_LENGTH,
        truncation=True,
    )


# Apply tokenization with batch processing; the raw text columns are dropped
# because only the tokenized fields are needed for training.
train_dataset = train_dataset.map(
    preprocess_function, batched=True, remove_columns=train_dataset.column_names
)
val_dataset = val_dataset.map(
    preprocess_function, batched=True, remove_columns=val_dataset.column_names
)

# Pads inputs to the longest sequence in each batch and pads labels with -100,
# the value ignored by the loss, so padding positions do not contribute to it.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save trained model
model.save_pretrained(".")
tokenizer.save_pretrained(".")

print("Training completed! Model saved")

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 309030/309030 [02:30<00:00, 2050.64 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 38613/38613 [00:18<00:00, 2050.05 examples/s]


ImportError: cannot import name 'computed_field' from 'pydantic' (c:\Users\artjo\anaconda3\envs\block_b\lib\site-packages\pydantic\__init__.cp38-win_amd64.pyd)

: 