In [None]:
import pandas as pd

In [None]:
# CSV files that feed the tokenizer corpus, in the order they are concatenated.
corpus = ["Book_of_John_RUS_MANS.csv", "Gospel_Mark_RUS_MANS.csv", "Bible_UDM_RUS.csv", "DICTIONARY_MANS_RUS_2.csv", "DICTIONARY_MANS_RUS.csv", "train.csv"]
# Column to read from each file above (same position in both lists).
corpus_columns = ['mans', 'mans', 'udm', 'mans', 'mans', 'pr_target']

# Flat list holding every cell value of the selected columns; empty cells are not filtered here.
corpus_text = []
for csv_name, column_name in zip(corpus, corpus_columns):
    corpus_text.extend(pd.read_csv(csv_name)[column_name].values)

In [None]:
import numpy as np

# Dump the corpus to a plain-text file, one entry per line, for tokenizer training.
# The encoding is pinned to UTF-8: without it open() falls back to the platform
# default, which is not guaranteed to be able to represent the Cyrillic text.
with open("corpus.txt", 'w', encoding="utf-8") as f:
    for text in corpus_text:
        # Only real strings are written; this drops None and the float NaN values
        # that pandas produces for empty cells, and avoids `text + '\n'` failing
        # on any other non-string value.
        if isinstance(text, str):
            f.write(text + '\n')

In [None]:
from datasets import load_dataset
from transformers import T5TokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments


# Load the pretrained tokenizer that the following cells extend with new tokens
BASE_TOKENIZER_NAME = 'ai-forever/ruT5-large'
fast_tokenizer = T5TokenizerFast.from_pretrained(BASE_TOKENIZER_NAME)

In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Initialize a Byte-Pair Encoding tokenizer.
# unk_token is set (and listed in the trainer's special tokens below) so that the
# saved tokenizer maps characters it never saw during training to "<unk>"
# rather than having no unknown token at all.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# Split on whitespace/punctuation before BPE, and use the matching BPE decoder
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.decoder = decoders.BPEDecoder()

# Prepare the tokenizer training: target vocabulary size and reserved special tokens
trainer = trainers.BpeTrainer(vocab_size=15000, special_tokens=["<pad>", "<unk>", "<s>", "</s>", "<mask>"])

# Train the tokenizer on the corpus file written by the previous cell
files = ["corpus.txt"]
tokenizer.train(files, trainer)

# Save the tokenizer
tokenizer.save("new_tokenizer.json")

In [None]:
# Token strings already known to the pretrained tokenizer (iterating the vocab dict yields its keys)
existing_vocab = set(fast_tokenizer.get_vocab())

In [None]:
# Token strings learned by the freshly trained BPE tokenizer, ordered by their BPE id.
# The vocab dict does not promise any particular key order, and the order of this list
# decides which ids the tokens receive once they are added to the pretrained tokenizer,
# so it is made deterministic here.
bpe_vocab = tokenizer.get_vocab()
new_tokens = sorted(bpe_vocab, key=bpe_vocab.get)

In [None]:
# Keep only the BPE tokens the pretrained tokenizer does not already contain (order of new_tokens is preserved)
tokens_to_add = [token for token in new_tokens if token not in existing_vocab]

In [None]:
# Extend the pretrained tokenizer in place; the cell output is the value returned by add_tokens
fast_tokenizer.add_tokens(tokens_to_add)

In [None]:
# Size of the tokenizer after the extension above
len(fast_tokenizer)

In [None]:
# Load the base checkpoint the tokenizer was taken from and grow its embedding matrix
# so that every id of the extended tokenizer has an embedding row.
# NOTE(review): the original statement was left unfinished (`model = `), which is a
# SyntaxError; this completion is inferred from the rest of the notebook — confirm.
model = T5ForConditionalGeneration.from_pretrained('ai-forever/ruT5-large')
model.resize_token_embeddings(len(fast_tokenizer))

In [None]:
# Save the extended tokenizer to the "rut5token_mans" directory.
# NOTE(review): the span-corruption training cell loads its tokenizer from "ruT5token", not from this directory — confirm which one is intended.
fast_tokenizer.save_pretrained("rut5token_mans")

In [None]:
# Number of entries in the vocabulary of whatever `tokenizer` currently refers to
len(tokenizer.get_vocab())

In [None]:
# Training texts for span corruption: drop None and float entries (pandas uses
# float NaN for empty cells) and keep only entries longer than 10 characters.
corpus_text_filtered = [
    text
    for text in corpus_text
    if not (text is None or isinstance(text, float)) and len(text) > 10
]

In [None]:
# Sanity check: the set of Python types found in the filtered corpus.
# Every element is inspected, so a stray non-string entry anywhere in the list shows up here.
{type(text) for text in corpus_text_filtered}

In [None]:
# Spot-check a single entry by position (assumes the filtered corpus has more than 22681 items — TODO confirm)
corpus_text_filtered[22681]

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import random
from tqdm import tqdm

# Define the dataset
class SpanCorruptionDataset(Dataset):
    """Denoising dataset: the model must reproduce a text from a partly masked copy.

    Each item is tokenized and padded to ``max_length``. With probability
    ``span_prob`` one contiguous span of up to ``span_length`` real tokens is
    overwritten in the encoder input with the tokenizer's ``<extra_id_0>``
    sentinel. The labels always hold the original token ids, so the loss covers
    the masked span; only padding positions are set to -100 and ignored.

    Args:
        texts: indexable collection of strings.
        tokenizer: callable tokenizer returning ``input_ids`` / ``attention_mask``
            tensors and providing ``convert_tokens_to_ids``.
        span_prob: probability that an item gets a corrupted span.
        span_length: maximum number of tokens in the corrupted span.
        max_length: fixed sequence length used for padding and truncation.
    """

    # Sentinel token used to overwrite the corrupted span
    SENTINEL_TOKEN = "<extra_id_0>"

    def __init__(self, texts, tokenizer, span_prob=0.15, span_length=5, max_length=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.span_prob = span_prob
        self.span_length = span_length
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors of length ``max_length``.

        Raises:
            ValueError: if the text at ``idx`` cannot be tokenized.
        """
        text = self.texts[idx]
        try:
            tokenized = self.tokenizer(
                text,
                return_tensors='pt',
                padding='max_length',
                max_length=self.max_length,
                truncation=True,
            )
        except Exception as exc:
            # Report which sample is broken, but keep the original error chained
            raise ValueError(f"Failed to tokenize sample {idx}: {text!r}") from exc

        # Drop the batch dimension added by return_tensors='pt'
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)

        input_ids_corrupted = input_ids.clone()

        # Targets are the uncorrupted ids; padding must not contribute to the loss
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        # Real tokens only; the last attended position is left untouched
        # (assumed to be the end-of-sequence token appended by the tokenizer).
        num_content = int(attention_mask.sum().item()) - 1

        if num_content > 0 and random.random() < self.span_prob:
            # Sample the span inside the real tokens so it never lands in padding
            start_idx = random.randint(0, max(num_content - self.span_length, 0))
            end_idx = min(start_idx + self.span_length, num_content)
            sentinel_id = self.tokenizer.convert_tokens_to_ids(self.SENTINEL_TOKEN)
            input_ids_corrupted[start_idx:end_idx] = sentinel_id

        return {'input_ids': input_ids_corrupted, 'attention_mask': attention_mask, 'labels': labels}

# Initialize tokenizer and model from previously saved artifacts.
# NOTE(review): "ruT5token" and 'ruT5_3ep.pt' must already exist on disk; within this
# notebook they are only written by a later cell, so a fresh Run All stops here.
tokenizer = T5TokenizerFast.from_pretrained("ruT5token")
model = T5ForConditionalGeneration.from_pretrained('ruT5_3ep.pt')
# The embedding resize stays disabled; presumably the checkpoint was saved with the
# extended vocabulary already — TODO confirm.
#model.resize_token_embeddings(len(tokenizer))

# Create dataset and dataloader
dataset = SpanCorruptionDataset(corpus_text_filtered, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Epoch numbering continues from the loaded checkpoint; the scheduler horizon and the
# loop below are both derived from these two bounds so they cannot drift apart.
start_epoch, end_epoch = 3, 10
num_epochs = end_epoch - start_epoch

# Set up optimizer and a linear-decay schedule without warmup over all remaining steps
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader) * num_epochs,
)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()
loss_hist = []  # mean training loss per epoch
for epoch in range(start_epoch, end_epoch):
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is stepped once per batch

        total_loss += loss.item()

    epoch_loss = total_loss / len(dataloader)
    loss_hist.append(epoch_loss)
    # Checkpoint model and tokenizer after every epoch, named by the 1-based epoch number
    model.save_pretrained(f'ruT5_ep{epoch+1}')
    tokenizer.save_pretrained(f'ruT5tok_ep{epoch+1}')
    print(f"Epoch {epoch + 1}: Loss = {epoch_loss}")

In [None]:
# Persist the current model and tokenizer under the names the span-corruption training cell loads from.
# NOTE(review): run after that cell, this overwrites 'ruT5_3ep.pt' with whatever weights `model` holds at that point — confirm this is intended.
model.save_pretrained('ruT5_3ep.pt')
tokenizer.save_pretrained('ruT5token')

In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import T5TokenizerFast, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from datasets import load_metric
import sacrebleu

# Load the training and validation tables from CSV
csv_file_path = "train.csv"
df, df_val = (pd.read_csv(path) for path in (csv_file_path, 'val.csv'))

# Load the model and its tokenizer from the "ruT5_ep7" / "ruT5tok_ep7" directories
model = T5ForConditionalGeneration.from_pretrained("ruT5_ep7")
tokenizer = T5TokenizerFast.from_pretrained("ruT5tok_ep7")

# Custom dataset for the translation fine-tuning
class TranslationDataset(Dataset):
    """Tokenized (source, target) text pairs read row by row from a DataFrame.

    Args:
        dataframe: table holding the parallel texts.
        tokenizer: callable tokenizer returning ``input_ids`` / ``attention_mask`` tensors.
        source_col: name of the column with the input text.
        target_col: name of the column with the text to generate.
        max_length: fixed length both sides are padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_col, target_col, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.source_col = source_col
        self.target_col = target_col
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def _encode(self, text):
        """Tokenize one text as a batch of one, padded/truncated to ``max_length``."""
        return self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx`` (by position)."""
        row = self.dataframe.iloc[idx]
        source_encoding = self._encode(row[self.source_col])
        target_encoding = self._encode(row[self.target_col])

        # Padding positions of the target are set to -100 so the loss ignores them;
        # compute_metrics below restores the pad id before decoding.
        labels = target_encoding['input_ids'].squeeze(0).clone()
        labels[target_encoding['attention_mask'].squeeze(0) == 0] = -100

        return {
            'input_ids': source_encoding['input_ids'].squeeze(0),
            'attention_mask': source_encoding['attention_mask'].squeeze(0),
            'labels': labels,
        }

# Build the Dataset objects
train_dataset = TranslationDataset(df, tokenizer, 'pr_source', 'pr_target')
val_dataset = TranslationDataset(df_val, tokenizer, 'pr_source', 'pr_target')

# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    output_dir="./results",
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
)

# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute corpus-level BLEU and chrF from predicted token ids and label ids.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is either an
            array of token ids or a tuple whose first element is that array;
            ``labels`` is an array of token ids in which -100 marks ignored positions.

    Returns:
        dict with the keys ``"bleu"`` and ``"chrf"``.
    """
    import numpy as np  # local import: this cell does not import numpy itself

    predictions, labels = eval_pred
    # When preprocess_logits_for_metrics returns (pred_ids, labels), the ids arrive first in a tuple
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 cannot be decoded. Predictions at ignored positions are blanked out too,
    # because the model is not trained to produce anything meaningful there
    # (assumes predictions and labels have the same shape — TODO confirm).
    pad_id = tokenizer.pad_token_id
    ignored = labels == -100
    predictions = np.where(ignored, pad_id, predictions)
    labels = np.where(ignored, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Short preview of the decoded texts for eyeballing the format
    formatted_labels = [[label] for label in decoded_labels[:2]]
    print(f"Пример decoded_preds: {decoded_preds[:2]}")
    print(f"Пример formatted_labels: {formatted_labels}")

    # One reference stream, aligned with the predictions
    bleu_score = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])
    chrf_score = sacrebleu.corpus_chrf(decoded_preds, [decoded_labels])
    return {"bleu": bleu_score.score, "chrf": chrf_score.score}

def preprocess_logits_for_metrics(logits, labels):
    """Reduce model outputs to predicted token ids before they are accumulated for metrics.

    Keeping only the argmax ids avoids storing the full vocabulary-sized logits
    for every evaluated batch.

    Args:
        logits: either the logits tensor itself or a tuple/list whose first
            element is the logits tensor.
        labels: label ids, passed through unchanged.

    Returns:
        Tuple ``(pred_ids, labels)`` where ``pred_ids`` is the argmax of the
        logits over the last dimension.
    """
    # Unwrap only when the outputs really are a sequence; indexing a bare tensor
    # with [0] would silently select the first batch element instead.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

# Collect everything the Trainer needs, then build it
trainer_config = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "compute_metrics": compute_metrics,
    "preprocess_logits_for_metrics": preprocess_logits_for_metrics,
}
trainer = Trainer(**trainer_config)

# Run the fine-tuning
trainer.train()

# Evaluate on the validation dataset and show the resulting metrics
results = trainer.evaluate()
print(results)

In [None]:
# Reload the model from a checkpoint directory under ./results (assumes "results/checkpoint-762" exists — TODO confirm)
model = T5ForConditionalGeneration.from_pretrained("results/checkpoint-762")

In [None]:
import random

In [None]:
# Seed Python's random module and draw one integer between 0 and 100 (both ends included)
random.seed(5)
random.randint(0,100)

In [None]:
import pandas as pd
# Read train.csv into a DataFrame
df_train = pd.read_csv("train.csv")

In [None]:
# Assemble one parallel table with the columns "mans" and "rus".
# train.csv / test.csv call these columns pr_target / pr_source, so they are renamed
# by name rather than by position. All files are read here, which keeps the cell
# re-runnable without first restoring df_train.
# NOTE(review): test.csv is merged into the training table — confirm this is intended.
split_frames = [
    pd.read_csv(path)[["pr_target", "pr_source"]].rename(columns={"pr_target": "mans", "pr_source": "rus"})
    for path in ("train.csv", "test.csv")
]
extra_frames = [
    pd.read_csv(path)[["mans", "rus"]]
    for path in (
        "DICTIONARY_MANS_RUS.csv",
        "DICTIONARY_MANS_RUS_2.csv",
        "Gospel_Mark_RUS_MANS.csv",
        "Book_of_John_RUS_MANS.csv",
    )
]
# ignore_index gives every row a unique label instead of repeating each file's own index
df_train = pd.concat(split_frames + extra_frames, ignore_index=True)

In [None]:
# Write the merged table to train_mansi.csv (the DataFrame index is written as the first column)
df_train.to_csv("train_mansi.csv")

In [None]:
# Display the contents of test.csv as the cell output
pd.read_csv("test.csv")