In [None]:
!pip install transformers
!pip install transformers torch
!pip install python-docx
!pip install --upgrade jupyter ipywidgets
!pip install pymilvus
!pip install pymongo
!pip install datasets
!pip install accelerate -U
!pip install pymongo
!pip install 'pymongo[srv]'


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
[0mColle

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the .docx inputs and outputs used by the
# cells below are addressed through paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import logging
from docx import Document
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

# Send INFO-and-above records to training_logs.txt.
# force=True replaces handlers already attached to the root logger; without it
# basicConfig() silently does nothing when handlers exist (e.g. when this cell is
# re-run, or when the notebook host pre-configured logging), and no log file is written.
logging.basicConfig(filename='training_logs.txt', level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s', force=True)

def extract_text(doc_path):
    """Return the text of every non-blank paragraph in a .docx file.

    Args:
        doc_path: Path of the document to open with ``docx.Document``.

    Returns:
        A list of paragraph strings, in document order, skipping paragraphs
        whose text is empty or whitespace-only.
    """
    logging.info(f"Извлечение документов {doc_path}")
    collected = []
    for paragraph in Document(doc_path).paragraphs:
        content = paragraph.text
        if content.strip():
            collected.append(content)
    return collected

def create_dataset(directory_bad, directory_good, tokenizer, max_length=512):
    """Build a seq2seq dataset from two parallel directories of .docx files.

    Files are paired by position after sorting the ``.docx`` names of each
    directory, and paragraphs inside a file pair are paired by position among
    the non-blank paragraphs. Unmatched trailing files/paragraphs are skipped
    (a warning is logged so silent misalignment is visible in the log).

    Args:
        directory_bad: Directory with the source ("bad") documents.
        directory_good: Directory with the target ("good") documents.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` for a single string.
        max_length: Fixed sequence length used for truncation and padding of
            both inputs and targets.

    Returns:
        A ``datasets.Dataset`` with columns ``input_ids``, ``attention_mask``
        and ``labels``. Padding positions in ``labels`` are set to -100 so the
        model's cross-entropy loss ignores them.
    """
    logging.info("Объявление датасета")
    bad_files = sorted(f for f in os.listdir(directory_bad) if f.endswith('.docx'))
    good_files = sorted(f for f in os.listdir(directory_good) if f.endswith('.docx'))
    if len(bad_files) != len(good_files):
        logging.warning(
            f"Количество файлов не совпадает: {len(bad_files)} в {directory_bad} и "
            f"{len(good_files)} в {directory_good}; лишние файлы будут пропущены"
        )

    columns = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for bad_file, good_file in zip(bad_files, good_files):
        bad_texts = extract_text(os.path.join(directory_bad, bad_file))
        good_texts = extract_text(os.path.join(directory_good, good_file))
        if len(bad_texts) != len(good_texts):
            logging.warning(
                f"Количество абзацев не совпадает: {bad_file} ({len(bad_texts)}) и "
                f"{good_file} ({len(good_texts)}); лишние абзацы будут пропущены"
            )
        for bad_text, good_text in zip(bad_texts, good_texts):
            # Plain Python lists are enough here: Dataset.from_dict stores the
            # values as integer sequences, so no tensor round-trip is needed.
            encoded_input = tokenizer(bad_text, max_length=max_length, truncation=True, padding="max_length")
            encoded_target = tokenizer(good_text, max_length=max_length, truncation=True, padding="max_length")
            columns['input_ids'].append(encoded_input['input_ids'])
            columns['attention_mask'].append(encoded_input['attention_mask'])
            # Replace padding positions (attention_mask == 0) with -100, the
            # index the loss ignores; otherwise most of the loss comes from
            # predicting pad tokens.
            columns['labels'].append([
                token_id if keep else -100
                for token_id, keep in zip(encoded_target['input_ids'], encoded_target['attention_mask'])
            ])
    return Dataset.from_dict(columns)

# Load the base mT5 checkpoint that will be fine-tuned.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small").to('cuda')

# Parallel corpora on the mounted Drive: source documents and their target versions.
bad_directory = '/content/drive/My Drive/Colab Notebooks/плохо'
good_directory = '/content/drive/My Drive/Colab Notebooks/хорошо'
dataset = create_dataset(bad_directory, good_directory, tokenizer)
# Fixed seed so the 90/10 split (and therefore the reported validation loss)
# is reproducible across runs.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict(train=train_test_split['train'], test=train_test_split['test'])

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    logging_dir='./logs',
    # Log the training loss once per epoch; with the default step-based interval
    # the per-epoch table shows "No log" when an epoch has fewer steps than the interval.
    logging_strategy="epoch",
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test']
)

logging.info("Начало обучения")
trainer.train()
# Persist the fine-tuned weights (and the tokenizer files) so the result of
# training survives the session and can be loaded with from_pretrained().
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
logging.info("Умная модель готова")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Epoch,Training Loss,Validation Loss
1,No log,47.575459
2,No log,44.598907


Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
import os
import logging
import re
from docx import Document
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
import concurrent.futures

# Send INFO-and-above records to generation_logs.txt.
# force=True is required: the root logger is already configured by the training
# cell of this notebook, and basicConfig() is otherwise a silent no-op in that
# case, so generation messages would never reach this file.
logging.basicConfig(filename='generation_logs.txt', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

base_model_name = "google/mt5-small"
# './results' is the Trainer output_dir used by the training cell. When a saved
# model is present there, generate with those fine-tuned weights; otherwise fall
# back to the base checkpoint.
fine_tuned_dir = './results'
model_name = fine_tuned_dir if os.path.isfile(os.path.join(fine_tuned_dir, 'config.json')) else base_model_name
logging.info(f"Model weights loaded from: {model_name}")
# The vocabulary is not modified by fine-tuning, so the base tokenizer is always valid.
tokenizer = MT5Tokenizer.from_pretrained(base_model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
model.to('cuda')

def clean_text(text):
    """Strip generation artefacts from ``text``.

    Removes every ``<extra_id_N>`` sentinel token and every period character,
    then trims leading and trailing whitespace.

    Args:
        text: Decoded model output.

    Returns:
        The cleaned string.
    """
    without_sentinels = re.sub(r'<extra_id_\d+>', '', text)
    without_periods = without_sentinels.replace('.', '')
    return without_periods.strip()

def generate_improved_text(text):
    """Generate a rewritten version of ``text`` with the module-level model.

    The text is prefixed with ``"improve text: "``, encoded with the
    module-level ``tokenizer``, passed to ``model.generate`` on the CUDA device
    (output limited to 512 tokens), decoded without special tokens and passed
    through ``clean_text``.

    Args:
        text: Paragraph text to rewrite.

    Returns:
        The cleaned generated string.
    """
    logging.info(f"Начало обработки текста: {text[:30]}...")  # log the start of generation
    prompt = "improve text: " + text
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to('cuda')
    output_ids = model.generate(prompt_ids, max_length=512)
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Post-process the decoded output
    cleaned = clean_text(decoded)
    logging.info("Конец генерации")
    return cleaned

def process_paragraph(text):
    """Rewrite one paragraph, passing blank paragraphs through as ``""``.

    Args:
        text: Paragraph text.

    Returns:
        An empty string when ``text`` is empty or whitespace-only, otherwise
        the result of ``generate_improved_text(text)``.
    """
    if not text.strip():
        return ""
    return generate_improved_text(text)

# Source document on the mounted Drive and a fresh document for the rewritten text.
doc_path = '/content/drive/My Drive/Colab Notebooks/elibrary_37083625_67327706.docx'
doc = Document(doc_path)
new_doc = Document()

# Rewrite every paragraph; executor.map preserves input order, so results[i]
# corresponds to the i-th source paragraph.
source_paragraphs = doc.paragraphs
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(executor.map(process_paragraph, [para.text for para in source_paragraphs]))

for result, para in zip(results, source_paragraphs):
    # Write ONLY the generated text. Previously the paragraph was created with
    # the generated text and then every original run's text was appended as
    # well, so each output paragraph contained the new text followed by the
    # full original text.
    new_para = new_doc.add_paragraph()
    new_run = new_para.add_run(result)
    # The generated text cannot be mapped back onto the original runs, so the
    # character formatting of the first original run (if any) is applied to
    # the whole rewritten paragraph.
    if para.runs:
        template_run = para.runs[0]
        new_run.bold = template_run.bold
        new_run.italic = template_run.italic
        new_run.underline = template_run.underline
        if template_run.font.size:
            new_run.font.size = template_run.font.size
        if template_run.font.name:
            new_run.font.name = template_run.font.name

new_doc_path = '/content/drive/My Drive/Colab Notebooks/19updated_elibrary_37083625_67327706.docx'
new_doc.save(new_doc_path)
logging.info("Файл успешно создан")
