In [None]:
!pip install Cython transformers[torch] accelerate

Collecting transformers[torch]
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[torch])
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from

In [None]:
import torch

import locale
# Force every later locale.getpreferredencoding() call to report UTF-8.
# NOTE(review): presumably a Colab workaround for shell/`!` commands failing on a non-UTF-8 locale — confirm.
locale.getpreferredencoding = lambda: "UTF-8"

from transformers import GPT2Tokenizer, T5ForConditionalGeneration
# FRED-T5 checkpoint loaded with a GPT2-style tokenizer; '</s>' is declared as end-of-sequence.
tokenizer = GPT2Tokenizer.from_pretrained('ai-forever/FRED-T5-large',eos_token='</s>')
# Register BOS/EOS/PAD so that tokenizer.eos_token_id etc. are defined for the cells below.
# NOTE(review): assumes these three tokens already exist in the vocabulary (no embedding resize is done) — confirm.
tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})
# Pretrained encoder-decoder weights that the Trainer fine-tunes later.
model = T5ForConditionalGeneration.from_pretrained('ai-forever/FRED-T5-large')


In [None]:

import os
import json
import sys
import io
import random
import itertools
from typing import Any, Dict, List, Optional, Tuple, Union
import shutil
import logging
from dataclasses import dataclass, field

import torch
import torch.optim
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from transformers import HfArgumentParser

In [None]:
import pandas as pd
# Base frame comes from true_labels_50.csv; the 'Неправильный вариант' column is taken
# (index-aligned) from raw_labels_50.csv and added to / overwritten in that frame.
df = pd.read_csv('/content/true_labels_50.csv').assign(
    **{'Неправильный вариант': pd.read_csv('/content/raw_labels_50.csv')['Неправильный вариант']}
)

In [None]:
# Full-range slice of `df`: every row is kept. load_samples() reads this frame by its global name.
df_train = df[:]

In [None]:
from tqdm.auto import trange
def load_samples(dataset_path, tokenizer, frame=None):
    """Build tokenized (prompt, target) training samples from a DataFrame.

    Args:
        dataset_path: Unused. The rows have always come from an in-memory
            DataFrame, never from this path; the parameter is kept only so
            existing calls keep working.
        tokenizer: Object whose ``encode(text, add_special_tokens=...,
            truncation=..., max_length=...)`` returns a list of token ids.
        frame: DataFrame with the columns 'Неправильный вариант' (source text)
            and 'Правильный вариант' (target text). Defaults to the
            notebook-level ``df_train``.

    Returns:
        A list of dicts with keys 'input_tokens', 'output_tokens' (token-id
        lists, each truncated to 256 tokens) and 'seed', 'reply' (the original,
        non-lower-cased prompt and target strings).
    """
    if frame is None:
        # Fall back to the notebook-level frame, as the original implementation did.
        frame = df_train

    samples = []
    # The original also opened '/content/true_labels_50.csv' here but never read from
    # the handle; that pointless (and failure-prone) open has been removed.
    for n in trange(len(frame)):
        # 2023-05-01 experiment: use text labels instead of the special tokens <b> and <h>.
        seed = '<LM> Исправь ошибки, добавь HTML тэги: ' + str(frame['Неправильный вариант'].iloc[n])
        reply = 'HTML:' + str(frame['Правильный вариант'].iloc[n])
        # Text is lower-cased before tokenization; inference prompts must do the same.
        input_tokens = tokenizer.encode(seed.lower(), add_special_tokens=False, truncation=True, max_length=256)
        output_tokens = tokenizer.encode(reply.lower(), add_special_tokens=False, truncation=True, max_length=256)
        # Limit verbosity for now. With truncation at 256 tokens this guard always
        # passes; it is kept as a safety net in case the truncation limit is raised.
        if len(input_tokens) < 512 and len(output_tokens) < 512:
            samples.append({'input_tokens': input_tokens,
                            'output_tokens': output_tokens,
                            'seed': seed,
                            'reply': reply})
    return samples




In [None]:
import numpy as np
# The statement `toks = np.argmax(p.predictions[0], axis=-1)` used to be duplicated here,
# but `p` is only created later by `p = trainer.predict(joke_dataset)`, so on a fresh
# kernel this cell raised NameError. The computation lives in the cell that follows the
# predict() call; only the import is needed at this point.

In [None]:
class FinetuneDataset(Dataset):
    """Seq2seq fine-tuning dataset padded to the dataset-wide maximum lengths.

    Each input sample is a dict carrying 'input_tokens' and 'output_tokens'
    (lists of token ids). An end-of-sequence id is appended to every target.
    Items are padded on the right so that all items of one dataset share the
    same input length and the same label length.

    Args:
        samples: Iterable of dicts with 'input_tokens' and 'output_tokens'.
        tokenizer: Tokenizer whose ``encode(text, add_special_tokens=False)``
            returns the ids of '<s>', '</s>' and '<pad>' as its first element.
    """

    def __init__(self, samples, tokenizer):
        self.tokenizer = tokenizer
        self.max_input_len = 0
        self.max_output_len = 0
        self.samples = []

        # Special-token ids are looked up once by encoding the literal token strings.
        self.bos_token_id = tokenizer.encode('<s>', add_special_tokens=False)[0]
        self.eos_token_id = tokenizer.encode('</s>', add_special_tokens=False)[0]
        self.pad_token_id = tokenizer.encode('<pad>', add_special_tokens=False)[0]

        for sample in samples:
            input_ids = sample['input_tokens']
            # Targets always end with EOS so the model learns where to stop.
            output_ids = sample['output_tokens'] + [self.eos_token_id]
            self.samples.append((input_ids, output_ids))
            self.max_input_len = max(self.max_input_len, len(input_ids))
            self.max_output_len = max(self.max_output_len, len(output_ids))

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.samples)

    def __getitem__(self, index: int):
        """Return one padded example as a dict of LongTensors.

        Returns:
            'input_ids': input ids right-padded with the pad id to max_input_len.
            'attention_mask': 1 for real input tokens, 0 for padding.
            'labels': target ids right-padded with -100 to max_output_len
                (-100 is the default ignore_index of PyTorch's cross-entropy loss).
        """
        input_ids, output_ids = self.samples[index]

        input_npad = self.max_input_len - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * input_npad
        input_ids = input_ids + input_npad * [self.pad_token_id]

        output_npad = self.max_output_len - len(output_ids)
        labels = output_ids + output_npad * [-100]

        # All three fields are tensors: attention_mask used to be a plain list,
        # unlike its siblings, which breaks collators that only stack tensors.
        return {'input_ids': torch.LongTensor(input_ids),
                'attention_mask': torch.LongTensor(attention_mask),
                'labels': torch.LongTensor(labels),
                }


In [None]:
# No other visible cell assigns `samples`, so on a fresh kernel this cell failed with
# NameError. Build the samples here; load_samples() takes its rows from `df_train`.
samples = load_samples('/content/true_labels_50.csv', tokenizer)
out_dataset = FinetuneDataset(samples, tokenizer)

In [None]:
# Hold-out split: the first 40 samples are used for training, the remainder for evaluation.
train_dataset = FinetuneDataset(samples[:40], tokenizer)
test_dataset = FinetuneDataset(samples[40:], tokenizer)

# Training configuration: outputs go to the current directory, one example per
# device step for both training and evaluation, Adafactor optimizer, full precision.
span_args = TrainingArguments(
    output_dir=".",
    num_train_epochs=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    optim='adafactor',
    fp16=False,
    logging_steps=50,
    save_steps=200,
)


In [None]:
# Archive a training checkpoint directory into zip_model.zip (existing archive entries are updated).
# NOTE(review): the path names checkpoint-450, while the recorded output below lists checkpoint-250
# and the recorded training run stops at step 400 with save_steps=200 — verify that this directory
# exists, and run this cell only after training has produced it.
!zip -r zip_model.zip "/content/checkpoint-450"

updating: content/checkpoint-250/ (stored 0%)
updating: content/checkpoint-250/trainer_state.json (deflated 48%)
updating: content/checkpoint-250/optimizer.pt


zip error: Interrupted (aborting)


In [None]:
!cp -r /content/checkpoint-html/ drive/MyDrive/

In [None]:
trainer.save_model("checkpoint-html")

In [None]:
    trainer = Trainer(
        model=model,
        args=span_args,
        train_dataset=train_dataset,
        eval_dataset = test_dataset,
        tokenizer=tokenizer,
    )

In [None]:
trainer.train()

Step,Training Loss
50,0.1009
100,0.0892
150,0.0795
200,0.0779
250,0.0644
300,0.0608
350,0.0574
400,0.0612


TrainOutput(global_step=400, training_loss=0.07391866445541381, metrics={'train_runtime': 362.8515, 'train_samples_per_second': 1.102, 'train_steps_per_second': 1.102, 'total_flos': 424445853696000.0, 'train_loss': 0.07391866445541381, 'epoch': 10.0})

In [None]:
# One-element slice (the sixth sample from the end) wrapped as a dataset; the next cell
# feeds it to trainer.predict() as a quick sanity check.
joke_dataset = FinetuneDataset(samples[-6:-5], tokenizer)


In [None]:
# Run the trainer's prediction loop on the one-sample dataset. The later cells read
# `p.predictions` and `p.label_ids` from the result.
p = trainer.predict(joke_dataset)

In [None]:
# Take the argmax over the last axis of the first prediction array, then decode the
# token ids of the first example in the batch back to text.
# NOTE(review): presumably predictions[0] holds the decoder logits and predict() feeds the
# labels to the decoder (teacher forcing), so this is not free-running generation and a
# perfect match with the labels does not prove generate() would reproduce it — confirm.
toks = np.argmax(p.predictions[0],axis = -1)
tokenizer.decode(toks[0][:])

'html:<p>задачи волонтеров фестиваля:</p>\n<ul>\n  <li>помощь в координации, встрече и регистрации команд-участников;</li>\n  <li>помощь в координации посетителей;</li>\n  <li>помощь в раздаче материалов для участников;</li>\n  <li>помощь сотрудникам фестиваля во время застройки площадок фестиваля;</li>\n  <li>ассистирование.</li>\n</ul></s>'

In [None]:
# FinetuneDataset right-pads labels with -100, which is not a valid token id.
# Strip that padding before decoding so this also works when the dataset holds
# examples of different target lengths.
label_ids = p.label_ids[0]
tokenizer.decode(label_ids[label_ids != -100])


'html:<p>задачи волонтеров фестиваля:</p>\n<ul>\n  <li>помощь в координации, встрече и регистрации команд-участников;</li>\n  <li>помощь в координации посетителей;</li>\n  <li>помощь в раздаче материалов для участников;</li>\n  <li>помощь сотрудникам фестиваля во время застройки площадок фестиваля;</li>\n  <li>ассистирование.</li>\n</ul></s>'

In [None]:
# Generate HTML for every held-out row with the prompt format used to build the training samples.
# `df_test` was never defined in this notebook (the recorded NameError); use the rows that
# correspond to the samples[40:] evaluation split and the column name that `df` actually has.
df_test = df.iloc[40:]
preds = []
for t in trange(len(df_test)):
  # Same prompt as in load_samples(): a space after the colon, the whole string lower-cased,
  # no extra special tokens, input truncated to 256 tokens.
  lm_text = ('<LM> Исправь ошибки, добавь HTML тэги: ' + str(df_test['Неправильный вариант'].iloc[t])).lower()
  token_ids = tokenizer.encode(lm_text, add_special_tokens=False, truncation=True, max_length=256)
  # Follow the model's device instead of hard-coding CUDA.
  input_ids = torch.tensor([token_ids]).to(model.device)
  # Without an explicit limit generate() uses the generation config's default length, which
  # cuts HTML answers short; allow as many new tokens as the training targets could contain.
  outputs = model.generate(input_ids, eos_token_id=tokenizer.eos_token_id, early_stopping=True, max_new_tokens=256)
  # Position 0 is skipped (the recorded single-example output shows a leading <pad> token there).
  s = tokenizer.decode(outputs[0][1:])
  print(s)
  preds.append(s)

NameError: ignored

In [None]:
# Single-example smoke test. The prompt must match the training prompt built in load_samples():
# a space after the colon, lower-cased text, and no ' HTML:' suffix on the encoder input
# (during training 'HTML:' is the start of the target, not part of the input).
prompt = ('<LM> Исправь ошибки, добавь HTML тэги: ' + str(df['Неправильный вариант'].iloc[1])).lower()
# Follow the model's device instead of hard-coding CUDA.
input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(model.device)
# An explicit max_new_tokens keeps the answer from being cut off at the default generation length.
outputs = model.generate(input_ids, eos_token_id=tokenizer.eos_token_id, early_stopping=True, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))




<pad>html:p>По всем вопросам вы можете обращаться по эл. почте mobility@av


In [None]:
task_prefix = "<LM> Текст: "                 # Tokenize the data

# NOTE(review): `input_sequence` (singular) is not used in this cell — possibly a typo for
# `input_sequences`; kept as-is because other code may rely on it.
input_sequence = ''
# Normalize the input to a list of strings. A single string becomes a one-element list;
# any other non-list iterable (tuple, Series, ...) is expanded element by element instead
# of being wrapped whole, which previously made `task_prefix + sequence` fail.
if isinstance(input_sequences, str):
    input_sequences = [input_sequences]
elif not isinstance(input_sequences, list):
    input_sequences = list(input_sequences)
# Prefix every sequence, pad the batch to its longest member and truncate to MAX_INPUT tokens.
encoded = tokenizer(
  [task_prefix + sequence for sequence in input_sequences],
  padding="longest",
  max_length=MAX_INPUT,
  truncation=True,
  return_tensors="pt",
)


TypeError: ignored

In [None]:
%%timeit




категория:нектар бренд товара: фруктовый сад</s>
категория:нектар бренд товара: фруктовый сад</s>
категория:нектар бренд товара: фруктовый сад</s>
категория:нектар бренд товара: фруктовый сад</s>
категория:нектар бренд товара: фруктовый сад</s>
категория:нектар бренд товара: фруктовый сад</s>
категория:нектар бренд товара: фруктовый сад</s>
категория:нектар бренд товара: фруктовый сад</s>
1.02 s ± 345 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Generate one prediction per row of df_test['name'] and collect the decoded strings.
# NOTE(review): neither `df_test` nor a 'name' column is defined in the visible notebook
# (hence the recorded NameError), and the bare '<LM>' prompt differs from the fine-tuning
# prompt — this cell appears to be carried over from a different task; confirm before use.
preds = []
for t in trange(len(df_test)):
  lm_text = ('<LM>' + str(df_test['name'].iloc[t])).lower()
  # Follow the model's device instead of hard-coding CUDA so the cell also runs on CPU.
  input_ids = torch.tensor([tokenizer.encode(lm_text)]).to(model.device)
  outputs = model.generate(input_ids, eos_token_id=tokenizer.eos_token_id, early_stopping=True)
  # Position 0 of the generated sequence is skipped before decoding.
  s = tokenizer.decode(outputs[0][1:])
  print(s)
  preds.append(s)

NameError: ignored