In [None]:
# ✅ Fine-tuning MarianMT on Dell 7567 (local GPU: GTX)
!pip uninstall keras
!pip install tf-keras

from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from evaluate import load as load_metric
from datasets import Dataset
import pandas as pd
import torch
import os

# ✅ Path to the local dataset file
file_path = "C://Users//skyjet//Downloads//en-uk_dataset.tsv"  # Replace with the actual local path

# 📊 Build the dataframe
# The TSV is streamed chunk by chunk and reading stops as soon as `sample_size`
# usable sentence pairs have been collected, so the full file is never loaded.
sample_size = 100_000
chunk_size = 50_000
samples = []
total_collected = 0

for chunk in pd.read_csv(
    file_path,
    sep="\t",
    names=["en", "uk"],
    dtype=str,  # keep numeric-looking lines as text so the `.str` filters below apply to them
    quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary text
    chunksize=chunk_size,
):
    # Drop rows where either side is missing.
    chunk = chunk.dropna()
    # Keep only pairs where both sides are between 4 and 256 characters long.
    chunk = chunk[
        chunk["en"].str.len().between(4, 256) &
        chunk["uk"].str.len().between(4, 256)
    ]
    available = len(chunk)
    need = sample_size - total_collected
    if available > need:
        # This chunk overshoots the target: keep only the rows still missing.
        chunk = chunk.sample(n=need, random_state=42)
    samples.append(chunk)
    total_collected += len(chunk)
    if total_collected >= sample_size:
        break

df = pd.concat(samples).reset_index(drop=True)
# Seed the split (same seed as the sampling above) so the held-out 5% is
# identical from run to run.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.05, seed=42)

# 🔠 Model + tokenizer
# Both are loaded from the same pretrained Marian checkpoint.
model_name = "Helsinki-NLP/opus-mt-en-uk"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# 🧼 Tokenization
def preprocess(example):
    """Tokenize a batch of English/Ukrainian pairs for seq2seq training.

    Args:
        example: Batch mapping with "en" (source texts) and "uk" (target texts).

    Returns:
        The tokenizer output for the source texts, with the target token ids
        stored under "labels".
    """
    # `text_target` encodes the Ukrainian side in the tokenizer's target mode and
    # stores the resulting ids under "labels".
    # No padding is applied here: the DataCollatorForSeq2Seq given to the trainer
    # pads every batch to its own longest sequence and fills label padding with
    # -100, so padding positions do not contribute to the loss.
    model_inputs = tokenizer(
        example["en"],
        text_target=example["uk"],
        max_length=256,
        truncation=True,
    )
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

# 📏 BLEU
bleu = load_metric("sacrebleu")
def compute_metrics(eval_preds):
    """Decode generated and reference token ids and score them with sacreBLEU.

    Args:
        eval_preds: Pair of (predictions, labels) token-id arrays from evaluation.

    Returns:
        The result dictionary produced by the sacreBLEU metric.
    """
    import numpy as np  # local import: numpy is not imported at file level

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some models return extra outputs alongside the generated ids.
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so map it
    # back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # sacreBLEU expects a list of references per prediction.
    return bleu.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])

# ⚙️ Training parameters (GTX-compatible)
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./marianmt-en-uk-hplt",
    logging_dir="./logs",
    # Small per-device batches with 4-step gradient accumulation
    # (4 x 4 = 16 samples per optimizer step on one device).
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimization schedule and regularization.
    num_train_epochs=6,
    label_smoothing_factor=0.1,
    fp16=False,  # GTX 10xx does not fully support native AMP
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Evaluate with generation, and reload the best checkpoint (by eval_loss)
    # when training finishes.
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# 🏋️‍♂️ Training
# The collator is built from the tokenizer and the model and handed to the trainer.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=seq2seq_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

# Run fine-tuning, then save the resulting model to its own directory.
trainer.train()
trainer.save_model("./marianmt-en-uk-hplt-final")
