In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch


!pip install --upgrade datasets transformers


def load_parquet_datasets():
    """Read the training shards and the validation split from /content.

    Returns:
        tuple: ``(combined_train, val)`` where ``combined_train`` is the three
        training parquet files concatenated with a fresh 0..n-1 index and
        ``val`` is the validation parquet file as read.
    """
    shard_paths = (
        "/content/train1.parquet",
        "/content/train2.parquet",
        "/content/train3.parquet",
    )
    # Shards are read in order so the concatenated row order matches the files.
    shards = [pd.read_parquet(shard_path) for shard_path in shard_paths]
    train_frame = pd.concat(shards, ignore_index=True)

    val_frame = pd.read_parquet("/content/val.parquet")
    return train_frame, val_frame

# Load the concatenated training frame and the validation frame from disk.
train_df, val_df = load_parquet_datasets()


def prepare_data(df):
    """Turn a frame of parallel sentences into a list of training records.

    Rows where either 'premise' or 'premise_ru' is missing are dropped. The
    remaining values are converted to text and stripped of surrounding
    whitespace.

    Args:
        df: DataFrame with 'premise' (source text) and 'premise_ru'
            (target text) columns.

    Returns:
        list[dict]: one ``{"input_text": ..., "target_text": ...}`` record per
        kept row, in the original row order.
    """
    valid = df[df['premise'].notna() & df['premise_ru'].notna()]

    # Vectorized clean-up instead of a Python-level iterrows() loop.
    # astype(str) keeps a stray non-string cell (e.g. a number) from raising
    # on .strip(); it is applied after the null filter so NaN never becomes
    # the literal text "nan".
    pairs = pd.DataFrame({
        "input_text": valid['premise'].astype(str).str.strip(),
        "target_text": valid['premise_ru'].astype(str).str.strip(),
    })
    return pairs.to_dict("records")

# Convert each split into a list of {"input_text", "target_text"} records.
train_examples = prepare_data(train_df)
val_examples = prepare_data(val_df)

# Wrap the records of each split in a Dataset and group them by split name.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(pd.DataFrame(split_records))
    for split_name, split_records in (
        ("train", train_examples),
        ("validation", val_examples),
    )
})


# Pretrained checkpoint to fine-tune (presumably English -> Russian, judging by
# the checkpoint name and the premise / premise_ru columns — confirm).
model_name = "Helsinki-NLP/opus-mt-en-ru"
# Tokenizer and seq2seq model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: batch mapping with "input_text" (source sentences) and
            "target_text" (reference translations), as supplied by
            ``dataset.map(..., batched=True)``.

    Returns:
        dict with "input_ids", "attention_mask" and "labels"; each value holds
        one unpadded token-id sequence per example, truncated to 64 tokens.
    """
    # text_target makes the tokenizer encode the references as target-language
    # text and return them under "labels"; calling the tokenizer on them
    # directly would process them as if they were source-language input.
    encoded = tokenizer(
        examples["input_text"],
        text_target=examples["target_text"],
        truncation=True,
        max_length=64,
    )

    # Deliberately no padding and no tensors here. Padding to max_length put
    # real pad-token ids into the labels, so padded positions counted towards
    # the loss. Left unpadded, the DataCollatorForSeq2Seq used for training
    # pads each batch and fills label padding with -100, which the loss ignores.
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": encoded["labels"],
    }

# Tokenize every split in batches across 64 worker processes and drop the raw
# text columns so only the outputs of tokenize_function remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=64,
    remove_columns=["input_text", "target_text"]
)

# Persist the tokenized splits to disk.
tokenized_datasets.save_to_disk("/content/tokenized_data")


# Fine-tuning hyper-parameters. Checkpoints and logs go to ./translation_model;
# evaluation and checkpointing both happen every 100 optimizer steps and only
# the most recent checkpoint is kept.
training_args = TrainingArguments(
    output_dir="./translation_model",
    per_device_train_batch_size=256,
    gradient_accumulation_steps=4,  # 256 * 4 = 1024 examples per optimizer step per device
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,
    remove_unused_columns=False,
    eval_strategy="steps",
    eval_steps=100,
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    # fp16 mixed precision needs a GPU; enable it only when CUDA is present so
    # the cell does not fail outright on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none"
)


# Seq2seq batch collator built from the tokenizer and the model; it is handed
# to the Trainer below to assemble training and evaluation batches.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)


# Wire the model, the training arguments, the tokenized train/validation splits
# and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator
)


# Announce the start of training (the message is Russian for
# "Training started...") and run the fine-tuning loop.
print("Начало обучения...")
trainer.train()





Map (num_proc=64):   0%|          | 0/1756548 [00:00<?, ? examples/s]

Map (num_proc=64):   0%|          | 0/34615 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/1756548 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/34615 [00:00<?, ? examples/s]

Начало обучения...


Step,Training Loss,Validation Loss
100,0.9478,0.728931
200,0.6351,0.471466
300,0.5005,0.356302
400,0.4288,0.301463
500,0.382,0.264466
600,0.3487,0.238
700,0.3251,0.217279
800,0.2994,0.201325
900,0.2859,0.187686
1000,0.2701,0.176507




TrainOutput(global_step=5148, training_loss=0.25049322540202434, metrics={'train_runtime': 8430.0151, 'train_samples_per_second': 625.105, 'train_steps_per_second': 0.611, 'total_flos': 8.931615767632282e+16, 'train_loss': 0.25049322540202434, 'epoch': 3.0})

In [None]:
# Recursively archive the training output directory into translation_model.zip.
# NOTE(review): training writes to the relative path ./translation_model, so
# this assumes the notebook's working directory is /content — confirm.
!zip -r translation_model.zip /content/translation_model

  adding: content/translation_model/ (stored 0%)
  adding: content/translation_model/runs/ (stored 0%)
  adding: content/translation_model/runs/Jul04_11-54-16_4c50ec277dc7/ (stored 0%)
  adding: content/translation_model/runs/Jul04_11-54-16_4c50ec277dc7/events.out.tfevents.1751630057.4c50ec277dc7.1958.1 (deflated 63%)
  adding: content/translation_model/runs/Jul04_11-42-23_4c50ec277dc7/ (stored 0%)
  adding: content/translation_model/runs/Jul04_11-42-23_4c50ec277dc7/events.out.tfevents.1751629346.4c50ec277dc7.1958.0 (deflated 62%)
  adding: content/translation_model/checkpoint-5148/ (stored 0%)
  adding: content/translation_model/checkpoint-5148/vocab.json (deflated 79%)
  adding: content/translation_model/checkpoint-5148/rng_state.pth (deflated 25%)
  adding: content/translation_model/checkpoint-5148/special_tokens_map.json (deflated 35%)
  adding: content/translation_model/checkpoint-5148/trainer_state.json (deflated 80%)
  adding: content/translation_model/checkpoint-5148/target.spm