# Utils

In [1]:
from scripts import utils
from scripts import metrics
from scripts import inference_model
from tqdm.notebook import tqdm

In [2]:
# Dataset identifier passed to utils.get_dataset by the cells below.
DATASET_NAME = 'rcp-meetings/rudialogsum_v2'

## MTSAIR_Cotype_Nano_inference

In [13]:
# Model identifier handed to utils.get_model_pipeline in this section.
MODEL_NAME = "MTSAIR/Cotype-Nano"

In [4]:
# Load the dataset and keep its 'train' entry
# (presumably the training split -- confirm in utils.get_dataset).
data = utils.get_dataset(DATASET_NAME)['train']

In [5]:
# Build the pipeline object for MODEL_NAME; the loops in this section pass it
# to inference_model.get_predicts_hf.
pipe = utils.get_model_pipeline(MODEL_NAME)

### zero_shot

In [None]:
# Zero-shot run: every dialog is sent with a fixed system instruction and the
# model's answer is stored under the sample's position in the dataset.
infer_dict = {}

# tqdm wraps the dataset itself (not the enumerate generator) so it can read
# the dataset length and show a real total / ETA instead of "0it [00:00, ?it/s]".
for idx, sample in enumerate(tqdm(data, desc="zero_shot")):
    messages = [
        {"role": "system", "content": "Реши задачу по инструкции ниже. Не давай никаких объяснений и пояснений к своему ответу. Инструкция: суммаризируй диалог до одного двух предложений."},
        {"role": "user", "content": sample['dialog']},
    ]
    infer_dict[idx] = inference_model.get_predicts_hf(pipe, messages)

In [8]:
# Write infer_dict (sample index -> model output) through utils.save_json.
utils.save_json(infer_dict, './DATA/MTSAIR_Cotype_Nano_inference/pred_zero_shot.json')

### one_shot

In [None]:
# One-shot run: utils.make_prompt builds the user message from the example
# sample(s) listed in example_indices_full plus the sample at position idx.
example_indices_full = [30]

infer_dict = {}

# NOTE(review): index 30 is used as the in-prompt example and is also predicted
# and stored here, unless utils.make_prompt special-cases it -- confirm before
# including that item in the metrics.
#
# Only the position is needed (make_prompt indexes into `data` itself), so
# iterate over a sized range: tqdm gets a total and no row is fetched twice.
for idx in tqdm(range(len(data)), desc="one_shot"):
    one_shot_prompt = utils.make_prompt(data, example_indices_full, idx)
    messages = [
        {"role": "system", "content": "Ты ИИ-помощник. Инструкция: суммаризируй диалог до одного двух предложений."},
        {"role": "user", "content": one_shot_prompt},
    ]
    infer_dict[idx] = inference_model.get_predicts_hf(pipe, messages)

0it [00:00, ?it/s]

In [10]:
# Write infer_dict (sample index -> model output) through utils.save_json.
utils.save_json(infer_dict, './DATA/MTSAIR_Cotype_Nano_inference/pred_one_shot.json')

## T_lite_it_1.0_Q8_0_GGUF

In [3]:
# Model tag passed to inference_model.get_predicts_ollama in this section.
MODEL_NAME = "hf.co/t-tech/T-lite-it-1.0-Q8_0-GGUF:Q8_0"

In [4]:
# Load the dataset and keep its 'train' entry
# (presumably the training split -- confirm in utils.get_dataset).
data = utils.get_dataset(DATASET_NAME)['train']

### one_shot

In [5]:
# One-shot run against the model named by MODEL_NAME via
# inference_model.get_predicts_ollama. utils.make_prompt builds the user
# message from the example sample(s) plus the sample at position idx.
example_indices_full = [30]

infer_dict = {}

# NOTE(review): index 30 is used as the in-prompt example and is also predicted
# and stored here, unless utils.make_prompt special-cases it -- confirm before
# including that item in the metrics.
#
# Only the position is needed (make_prompt indexes into `data` itself), so
# iterate over a sized range: tqdm gets a total instead of "0it [00:00, ?it/s]".
for idx in tqdm(range(len(data)), desc="one_shot"):
    one_shot_prompt = utils.make_prompt(data, example_indices_full, idx)
    messages = [
        {"role": "system", "content": "Ты ИИ-помощник. Тебе даны два диалога, первый написан как пример, а второй нужно сократить до двух предложений. Не пиши ничего лишнего, только сокращенный текст."},
        {"role": "user", "content": one_shot_prompt},
    ]
    infer_dict[idx] = inference_model.get_predicts_ollama(MODEL_NAME, messages)

0it [00:00, ?it/s]

In [6]:
# Write infer_dict (sample index -> model output) through utils.save_json.
utils.save_json(infer_dict, './DATA/T_lite_it_1.0_Q8_0_GGUF/pred_one_shot.json')

## T_pro_it_1.0_Q4_K_M_GGUF

In [None]:
# Model tag passed to inference_model.get_predicts_ollama in this section.
MODEL_NAME = "hf.co/t-tech/T-pro-it-1.0-Q4_K_M-GGUF:latest"

In [None]:
# Load the dataset and keep its 'train' entry
# (presumably the training split -- confirm in utils.get_dataset).
data = utils.get_dataset(DATASET_NAME)['train']

### one_shot

In [None]:
# One-shot run against the model named by MODEL_NAME via
# inference_model.get_predicts_ollama. utils.make_prompt builds the user
# message from the example sample(s) plus the sample at position idx.
example_indices_full = [30]

infer_dict = {}

# NOTE(review): index 30 is used as the in-prompt example and is also predicted
# and stored here, unless utils.make_prompt special-cases it -- confirm before
# including that item in the metrics.
#
# Only the position is needed (make_prompt indexes into `data` itself), so
# iterate over a sized range: tqdm gets a total and can show progress / ETA.
for idx in tqdm(range(len(data)), desc="one_shot"):
    one_shot_prompt = utils.make_prompt(data, example_indices_full, idx)
    messages = [
        {"role": "system", "content": "Ты ИИ-помощник. Тебе даны два диалога, первый написан как пример, а второй нужно сократить до двух предложений. Не пиши ничего лишнего, только сокращенный текст."},
        {"role": "user", "content": one_shot_prompt},
    ]
    infer_dict[idx] = inference_model.get_predicts_ollama(MODEL_NAME, messages)

In [None]:
# Write the T-pro predictions into the T-pro directory. The previous target was
# the T-lite folder, which overwrote the T-lite results and left the file read
# by the Metrics section (./DATA/T_pro_it_1.0_Q4_K_M_GGUF/pred_one_shot.json)
# unwritten by this notebook.
utils.save_json(infer_dict, './DATA/T_pro_it_1.0_Q4_K_M_GGUF/pred_one_shot.json')

# Metrics

In [19]:
# Reload the reference dataset and the stored one-shot predictions, then line
# the predictions up with the dataset order.
data = utils.get_dataset(DATASET_NAME)['train']
preds_by_idx = utils.load_json('./DATA/T_pro_it_1.0_Q4_K_M_GGUF/pred_one_shot.json')
# The loaded mapping is looked up by the stringified sample position.
preds = [preds_by_idx[key] for key in map(str, range(len(data)))]

In [4]:
# Score the predictions against the dataset's 'summary' column. The captured
# output below reports Rouge, Bleu, Meteor and Bertscore being computed.
metrics_model = metrics.get_metrics(preds, data['summary'])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wzakh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\wzakh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wzakh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Computing metrics...
Rouge computed
Bleu computed
Meteor computed


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Bertscore computed
Metrics computed


In [16]:
# Store the computed metrics next to the predictions they were derived from.
utils.save_json(metrics_model, './DATA/T_pro_it_1.0_Q4_K_M_GGUF/metrics_one_shot.json')

# DATASET inference

In [28]:
import json

In [3]:
# Model tag passed to inference_model.get_predicts_ollama for the dataset run.
# The commented-out line is the alternative model; swap the two to switch.
MODEL_NAME = "hf.co/t-tech/T-lite-it-1.0-Q8_0-GGUF:Q8_0"
# MODEL_NAME = "hf.co/t-tech/T-pro-it-1.0-Q4_K_M-GGUF:latest"

In [4]:
# Load the local dialog dataset; the inference loop below reads data['dialog'].
# NOTE(review): this rebinds `data`, which earlier sections use for the
# dataset returned by utils.get_dataset.
data = utils.load_json('./DATASET/DATASET_DIALOG_SUMMARY.json')

In [16]:
# One-shot run over the local dialog dataset: utils.make_prompt_final builds
# the user message from the example sample(s) plus the sample at position idx,
# and the model is asked to answer with JSON shaped like the example.
example_indices_full = [30]

infer_dict = {}

# NOTE(review): an earlier comment said the example sample is left out because
# it serves as the prompt example, but the skip was disabled, so index 30 is
# predicted and stored like every other sample. Restore
# `if idx in example_indices_full: continue` if it must be excluded.
#
# The position is the only loop value needed; iterating a sized range gives
# tqdm a total instead of "0it [00:00, ?it/s]".
for idx in tqdm(range(len(data['dialog'])), desc="one_shot"):
    one_shot_prompt = utils.make_prompt_final(data, example_indices_full, idx)
    messages = [
        {"role": "system", "content": "Ты ИИ-помощник. Тебе даны два диалога, первый написан как пример, а второй нужно сократить до такой же структуры json файла, как и в примере. Напиши только JSON файл такой же структуры. Если у работника несколько задач, то перечисли их через запятую."},
        {"role": "user", "content": one_shot_prompt},
    ]
    infer_dict[idx] = inference_model.get_predicts_ollama(MODEL_NAME, messages)

0it [00:00, ?it/s]

In [18]:
# Write infer_dict (sample index -> model output) through utils.save_json.
utils.save_json(infer_dict, './DATASET/predicts/pred_one_shot_hard_prompt.json')