In [3]:
# ✅ Fine-tuning MarianMT (Helsinki-NLP/opus-mt-en-uk) на базі HPLT
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
import torch
import os

# 🔘 1. Визначення середовища
USE_COLAB = "COLAB_GPU" in os.environ

if USE_COLAB:
    from google.colab import drive
    print("📂 Підключення Google Drive...")
    drive.mount('/content/drive')

# 📄 2. Шлях до TSV з правильним порядком: "en \t uk"
file_path = "/content/drive/MyDrive/train/en-uk_dataset.tsv"  # або ваш локальний шлях

# 📊 3. Завантаження і підсічка датафрейму
target_sample_size = 100000
chunk_size = 50000
samples = []
total_collected = 0

for chunk in pd.read_csv(file_path, sep="\t", names=["en", "uk"], quoting=3, chunksize=chunk_size):
    chunk = chunk.dropna().query("en.str.len() > 3 and uk.str.len() > 3", engine="python")
    available = len(chunk)
    need = target_sample_size - total_collected
    if available > need:
        chunk = chunk.sample(n=need, random_state=42)
    samples.append(chunk)
    total_collected += len(chunk)
    if total_collected >= target_sample_size:
        break

df = pd.concat(samples).reset_index(drop=True)
print(f"✅ Отримано {len(df)} пар для тренування.")

# 📊 4. HuggingFace Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.05)

# 🧠 5. Завантаження моделі та токенізатора `en → uk`
model_name = "Helsinki-NLP/opus-mt-en-uk"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# ✏️ 6. Препроцесинг
def preprocess(example):
    inputs = tokenizer(example["en"], max_length=128, padding="max_length", truncation=True)
    targets = tokenizer(example["uk"], max_length=128, padding="max_length", truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

# ⚙️ 7. Параметри тренування
training_args = Seq2SeqTrainingArguments(
    output_dir="./marianmt-en-uk-hplt",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
)

# 🧪 8. Trainer
!pip install wandb
import os
os.environ["WANDB_DISABLED"] = "true"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

# 🚀 9. Запуск тренування
trainer.train()

# 💾 10. Збереження
output_path = "./marianmt-en-uk-hplt-final"
trainer.save_model(output_path)

# 📦 11. Архівація та завантаження (тільки для Colab)
if USE_COLAB:
    !zip -r marianmt-en-uk-hplt-final.zip marianmt-en-uk-hplt-final
    from google.colab import files
    files.download("marianmt-en-uk-hplt-final.zip")


📂 Підключення Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Отримано 100000 пар для тренування.




Map:   0%|          | 0/95000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).




  trainer = Seq2SeqTrainer(


Step,Training Loss
500,1.526
1000,1.146
1500,1.0042
2000,0.92
2500,0.8681
3000,0.8215
3500,0.779
4000,0.7531
4500,0.7305
5000,0.7217




  adding: marianmt-en-uk-hplt-final/ (stored 0%)
  adding: marianmt-en-uk-hplt-final/model.safetensors (deflated 7%)
  adding: marianmt-en-uk-hplt-final/generation_config.json (deflated 43%)
  adding: marianmt-en-uk-hplt-final/vocab.json (deflated 78%)
  adding: marianmt-en-uk-hplt-final/source.spm (deflated 51%)
  adding: marianmt-en-uk-hplt-final/tokenizer_config.json (deflated 68%)
  adding: marianmt-en-uk-hplt-final/config.json (deflated 62%)
  adding: marianmt-en-uk-hplt-final/target.spm (deflated 57%)
  adding: marianmt-en-uk-hplt-final/special_tokens_map.json (deflated 35%)
  adding: marianmt-en-uk-hplt-final/training_args.bin (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
!pip install transformers evaluate sacrebleu
# 📊 10. Оцінка моделі за допомогою sacreBLEU
from datasets import load_metric

# Завантаження BLEU метрики
bleu = load_metric("sacrebleu")  # або "bleu"

# Отримання результатів моделі на тестовому датасеті
raw_preds = trainer.predict(tokenized_dataset["test"])

# Декодування
preds = tokenizer.batch_decode(raw_preds.predictions, skip_special_tokens=True)
labels = tokenizer.batch_decode(raw_preds.label_ids, skip_special_tokens=True)

# sacrebleu очікує: list of predictions, list of list of references
references = [[ref] for ref in labels]

# Розрахунок BLEU
bleu_score = bleu.compute(predictions=preds, references=references)
print(f"\n🎯 sacreBLEU: {bleu_score['score']:.2f}")

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py

  bleu = load_metric("sacrebleu")  # або "bleu"


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]


🎯 sacreBLEU: 18.70
