In [1]:
# Google Drive link to the CSV file that is read by summarize_news.
inputs = [
    "https://drive.google.com/uc?id=1ki7iNkXGdQ7lSmnze5Ox9CPzHWvvtCQ_",
]

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import pandas as pd
from typing import List

# Model card: https://huggingface.co/cointegrated/rut5-base-absum

def summarize(
    tokenizer, model, text,
    n_words=None, compression=None,
    max_length=1000, num_beams=3, do_sample=False, repetition_penalty=10.0,
    **kwargs
):
    """
    Produce a summary of ``text`` with the given tokenizer/model pair.

    An optional length hint is prepended to the text in square brackets
    before tokenization. The two hints are meant to be mutually exclusive;
    when both are supplied, ``n_words`` wins because ``compression`` is only
    consulted when ``n_words`` is falsy.

    Args:
        tokenizer: callable used to encode the text (its result is moved to
            ``model.device`` and unpacked into ``model.generate``); also
            used to decode the generated ids.
        model: object exposing ``device`` and ``generate``.
        text: the text to summarize.
        n_words (int): approximate number of words to generate.
        compression (float): approximate length ratio of summary to the
            original text; rendered with one significant digit.
        max_length, num_beams, do_sample, repetition_penalty: forwarded
            unchanged to ``model.generate``.
        **kwargs: any extra keyword arguments for ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Prepend the length hint, if any, in the bracketed form used as the prompt prefix.
    if n_words:
        hint = '[{}] '.format(n_words)
        text = hint + text
    elif compression:
        hint = '[{0:.1g}] '.format(compression)
        text = hint + text

    encoded = tokenizer(text, return_tensors='pt', padding=True)
    encoded = encoded.to(model.device)

    generation_options = dict(
        max_length=max_length,
        num_beams=num_beams,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        **kwargs
    )
    with torch.inference_mode():
        generated = model.generate(**encoded, **generation_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


def summarize_news(input: List[str]) -> List:
    """
    Summarize every news text stored in a CSV file.

    Loads the 'cointegrated/rut5-base-absum' model and tokenizer, moves the
    model to CUDA when available (CPU otherwise), reads the "inputs" column
    of the CSV located at ``input[0]`` and summarizes each entry with the
    default settings of ``summarize``.

    Args:
        input: list whose first element is a path or URL accepted by
            ``pandas.read_csv``; only that first element is used.

    Returns:
        A list with one summary per row of the "inputs" column, in order.
    """
    checkpoint = 'cointegrated/rut5-base-absum'
    model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)

    if torch.cuda.is_available():
        target_device = 'cuda'
    else:
        target_device = 'cpu'
    model.to(target_device)
    model.eval()

    articles = pd.read_csv(input[0])["inputs"].tolist()
    return [summarize(tokenizer, model, article) for article in articles]

# Run the summarization on the CSV referenced by `inputs` and keep the summaries.
outputs = summarize_news(inputs)

  from .autonotebook import tqdm as notebook_tqdm
Downloading config.json: 100%|██████████| 753/753 [00:00<00:00, 368kB/s]
Downloading pytorch_model.bin: 100%|██████████| 932M/932M [00:15<00:00, 62.3MB/s] 
Downloading spiece.model: 100%|██████████| 808k/808k [00:00<00:00, 8.27MB/s]
Downloading special_tokens_map.json: 100%|██████████| 65.0/65.0 [00:00<00:00, 36.2kB/s]
Downloading tokenizer_config.json: 100%|██████████| 315/315 [00:00<00:00, 185kB/s]
  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Show the generated summaries as the cell output.
outputs

['Встреча началась с песенки о дружбе. Участники встречи отвечали на вопросы викторины, рассказывали пословицы о дружбе.',
 'Амурский тигр и дальневосточный леопард спасены от угрозы полного исчезновения.']

In [1]:
#!:docker-publish noda_clone:1.0.1 cr.yandex/crp6ermefad6f9dc0jbr:noda

Registry username: json_key
Secret name of password: docker


Pushing image: layers = 4/4, pushed = 9.273/9.273 GB                          