## Prepare model

### Imports & Definitions

In [1]:
import warnings

warnings.filterwarnings(
    "ignore",
    message="torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.",
)

warnings.filterwarnings(
    "ignore",
    message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.",
)

warnings.filterwarnings(
    "ignore",
    message="You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset",
)

warnings.filterwarnings(
    "ignore",
    message="`do_sample` is set to `False`. However, `temperature` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.",
)

In [2]:
import os
from huggingface_hub import login
login(os.environ['hf-read-token'])

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /tmp/xdg_cache/huggingface/token
Login successful


In [3]:
import sys
sys.path.append("../../utils")
from definitions import *

/home/jupyter/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [4]:
CACHE_DIR = Path("../../../../cache/")
DATASET_DIR = Path("/home/jupyter/mnt/datasets/diplomas/russian_dataset/")

In [5]:
MODEL_EXPERIMENT_NAME = "learnt_16k"

MODEL_MAX_LENGTH = 16384

In [5]:
import sys
sys.path.append("../../../../LongLoRA-diploma-research")

### Prompt Dict

In [9]:
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
    "prompt_no_input_llama2":(
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
    "prompt_input_llama2": (
        "[INST] <<SYS>>\n"
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "<</SYS>> \n\n {instruction} \n{input} [/INST]"
    ),
    "prompt_llama2": "[INST]{instruction}[/INST]",
    "prompt_input_diploma_special":(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\nBelow is a diploma text. Your task is to generate abstract of this diploma.\n\n### Input:\n{input}\n\n### Response:"
    ),
}

### Load model & tokenizer

In [25]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[LLAMA_2_7B], 
    cache_dir=CACHE_DIR, 
    # config=config,
    torch_dtype=torch.bfloat16,
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

print("Loaded model")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[LLAMA_2_7B],
    cache_dir=CACHE_DIR,
    # model_max_length=MODEL_MAX_LENGTH,
    padding_side="right",
    use_fast=True,
)

print("Loaded tokenizer")

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.13s/it]


Loaded model
Loaded tokenizer


#### Load peft_model

In [None]:
import sys
import io
import os
import copy
import json
import math
import logging
import pandas as pd
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
import transformers
from torch.utils.data import Dataset
from transformers import Trainer, DataCollatorForLanguageModeling
from llama_attn_replace_sft import replace_llama_attn
from gptneox_attn_replace import replace_gpt_neox_attn
from peft import LoraConfig, get_peft_model
from torch.distributed import barrier


model_name = LLAMA_2_7B

replace_llama_attn(True, False, inference=True)

# Set RoPE scaling factor
config = transformers.AutoConfig.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[model_name],
    cache_dir=CACHE_DIR,
)

orig_rope_scaling = getattr(config, "rope_scaling", None)
if orig_rope_scaling is None:
    orig_rope_scaling = {"factor": 1}
orig_rope_scaling_factor = orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
orig_ctx_len = getattr(config, "max_position_embeddings", None)
if orig_ctx_len:
    orig_ctx_len *= orig_rope_scaling_factor
    if MODEL_MAX_LENGTH > orig_ctx_len:
        scaling_factor = float(math.ceil(MODEL_MAX_LENGTH / orig_ctx_len))
        config.rope_scaling = {"type": "linear", "factor": scaling_factor}

print("Created config")

# Load model and tokenizer
model = transformers.AutoModelForCausalLM.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[model_name], 
    cache_dir=CACHE_DIR, 
    config=config,
    torch_dtype=torch.bfloat16,
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

print("Loaded model")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[model_name],
    cache_dir=CACHE_DIR,
    model_max_length=MODEL_MAX_LENGTH,
    padding_side="right",
    use_fast=True,
)

print("Loaded tokenizer")

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"


special_tokens_dict = dict()
if tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
    special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None:
    special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

# if training_args.low_rank_training:
#     if model_args.model_type == "gpt-neox":
#         # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
#         targets = ["query_key_value", "dense"]
#     else:
#         targets=["q_proj", "k_proj", "v_proj", "o_proj"]

#     config = LoraConfig(
#         r=8,
#         lora_alpha=16,
#         target_modules=targets,
#         lora_dropout=0,
#         bias="none",
#         task_type="CAUSAL_LM",
#     )
#     model = get_peft_model(model, config)
#     # enable trainable params
#     [p.requires_grad_() for n, p in model.named_parameters() if any([k in n for k in training_args.trainable_params.split(",")])]

# model.config.use_cache = False         # required for gradient checkpointing
# model.enable_input_require_grads()     # required for gradient checkpointing
# model.gradient_checkpointing_enable()  # enable gradient checkpointing

# print("Prepared model to learn")

###

# model_name = LLAMA_2_7B
# model = AutoModelForCausalLM.from_pretrained(
#     HUGGINGFACE_MODEL_TO_REPO[model_name], 
#     cache_dir=CACHE_DIR, 
#     device_map='auto'
# )
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model.to(device)

# MODEL_MAX_LENGTH = 16384

# tokenizer = AutoTokenizer.from_pretrained(
#     HUGGINGFACE_MODEL_TO_REPO[model_name], 
#     cache_dir=CACHE_DIR, 
#     model_max_length=MODEL_MAX_LENGTH,
#     padding_side="right",
#     use_fast=True
# )

# IGNORE_INDEX = -100
# DEFAULT_PAD_TOKEN = "[PAD]"
# DEFAULT_EOS_TOKEN = "</s>"
# DEFAULT_BOS_TOKEN = "<s>"
# DEFAULT_UNK_TOKEN = "<unk>"

# def smart_tokenizer_and_embedding_resize(
#     special_tokens_dict: Dict,
#     tokenizer: transformers.PreTrainedTokenizer,
#     model: transformers.PreTrainedModel,
# ):
#     """Resize tokenizer and embedding.

#     Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
#     """
#     num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
#     model.resize_token_embeddings(len(tokenizer))

#     if num_new_tokens > 0:
#         input_embeddings = model.get_input_embeddings().weight.data
#         output_embeddings = model.get_output_embeddings().weight.data

#         input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
#         output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

#         input_embeddings[-num_new_tokens:] = input_embeddings_avg
#         output_embeddings[-num_new_tokens:] = output_embeddings_avg

# special_tokens_dict = dict()
# if tokenizer.pad_token is None:
#     special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
# if tokenizer.eos_token is None:
#     special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
# if tokenizer.bos_token is None:
#     special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
# if tokenizer.unk_token is None:
#     special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

# smart_tokenizer_and_embedding_resize(
#     special_tokens_dict=special_tokens_dict,
#     tokenizer=tokenizer,
#     model=model,
# )



Created config


Loading checkpoint shards: 100%|██████████| 2/2 [03:40<00:00, 110.36s/it]


Loaded model


Using pad_token, but it is not set yet.


Loaded tokenizer


In [None]:
    
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg
        
smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)

In [None]:
from peft import PeftModel

model_id = "nvdenisov2002/llama-longLoRA-v3-16k-5000-samples-220-iterations"
peft_model = PeftModel.from_pretrained(model, model_id)

Downloading adapter_config.json: 100%|██████████| 674/674 [00:00<00:00, 1.75MB/s]
Downloading adapter_model.bin: 100%|██████████| 541M/541M [00:08<00:00, 60.5MB/s] 


## Inference model on tasks

#### Try inference smth

In [26]:
print("kek")

kek


In [34]:
text = "How to write diploma work?"

In [56]:
tokenizer.use_default_system_prompt

True

In [57]:
model_inputs = tokenizer(text, return_tensors="pt").to("cpu")

In [58]:
model_inputs

{'input_ids': tensor([[    1,  1128,   304,  2436,   652,   572,  4125,   664, 29973]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [60]:
tokenizer.decode(model_inputs["input_ids"].flatten())

'<s> How to write diploma work?'

In [37]:
config = transformers.GenerationConfig(max_new_tokens=512, max_time=15)

In [50]:
generated = model.generate(**model_inputs, generation_config=config)

In [51]:
generated.shape

torch.Size([1, 105])

In [52]:
print(tokenizer.decode(generated.flatten()))

<s> How to write diploma work?
The diploma work is a scientific research work, which is written by the student in the last year of study. The diploma work is a mandatory part of the educational program. The diploma work is a scientific research work, which is written by the student in the last year of study. The diploma work is a mandatory part of the educational program.
The diploma work is a scientific research work, which is written by the


In [23]:
prompt = PROMPT_DICT["prompt_no_input_llama2"].format(instruction=text)
prompt_tokens = torch.tensor(tokenizer(prompt)["input_ids"]).to(device)
generated_tokens = model.generate(input_ids=prompt_tokens.reshape((1, -1)))

In [31]:
len(prompt_tokens)

147

In [33]:
print(tokenizer.decode(prompt_tokens))

<s> [INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>> 

 How to write diploma work? [/INST]


In [28]:
len(generated_tokens.flatten())

4096

In [30]:
decoded_part = tokenizer.decode(generated_tokens.flatten()[:200])
decoded_part

"<s> [INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>> \n\n How to write diploma work? [/INST]\n\n[INST] <<SYS>>\n\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist"

In [25]:
decoded = tokenizer.decode(generated_tokens.flatten())
decoded

"<s> [INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>> \n\n How to write diploma work? [/INST]\n\n[INST] <<SYS>>\n\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of a

In [12]:
prefix_tokens = torch.tensor(tokenizer("a cat sat on a ")["input_ids"], dtype=torch.int)
prefix_tokens

tensor([    1,   263,  6635,  3290,   373,   263, 29871], dtype=torch.int32)

In [16]:
generated = peft_model.generate(input_ids=prefix_tokens.reshape((1, -1)).to(device), do_sample=False, num_beams=1)



In [17]:
tokenizer.decode(generated.to('cpu').flatten())

'<s> a cat sat on a 10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

#### Define some constants

In [9]:
first_k = int(1e5)
INF = int(1e7)

In [None]:
first_k, INF

(100000, 10000000)

#### Try inference model on diplomas

In [14]:
df = pd.read_csv(ARTIFACTS_DIR_PATH.joinpath("diplomas_abstracts/mcs_raw_learnt_abstract_learnt8k.csv"))
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,year,diploma,abstract,study_field,degree,original_diploma_extension,raw_model,learnt,learnt_8k
0,0,12,45042,2023,АЙВАЗЬЯН Аршак Владимирович\nВыпускная квалифи...,В этой работе мы строим правую трансферную мод...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматривается модельная структура н...,В данной работе рассматривается модельная стру...,В работе рассматриваются алгебраические теории...
1,1,25,45043,2023,Санкт-Петербургский государственный университе...,"Пусть 𝐾 выпуклое тело в ℝ^𝑛. Определим 𝑑𝑛,𝑛−1(...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Плотность решетки трансляций - это минимальная...,В работе рассматриваются плотности решеток тра...,В
2,2,37,45044,2023,Санкт-Петербургский государственный университе...,Работа посвящена повышению производительности ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе представлены результаты исслед...,В работе рассматривается задача булевой выполн...,In this work we propose a method for improving...
3,3,101,45046,2023,Санкт-Петербургский государственный университе...,В работе мы обобщаем результаты об энергии нат...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматриваются классы случайных проц...,В данной работе рассматривается энергетически-...,В работе мы обобщаем результаты об энергии нат...
4,4,152,45047,2023,Санкт–Петербургский государственный университе...,В рамках данной работы рассматривается подход ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе рассматривается задача настраи...,В данной работе рассматривается задача добавле...,В работе рассматривается применение добавления...


In [17]:
from llm_helpers_fixed_tokenizer import get_prefix_len_and_tokens

prefix_len, prefix_tokens = get_prefix_len_and_tokens(tokenizer, df.loc[0], INF)
prefix_len, prefix_tokens 

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


(13700, tensor([    1, 13866,   338,  ...,    15,    13,    13]))

In [18]:
peft_model.eval()
generated = peft_model.generate(input_ids=prefix_tokens.reshape((1, -1)).to(device), do_sample=True, num_beams=5)
generated_continue = tokenizer.decode(generated.to('cpu').flatten()[prefix_len:])
generated_continue



OutOfMemoryError: CUDA out of memory. Tried to allocate 536.00 MiB. GPU 

In [23]:
tokenizer.decode(prefix_tokens)

'<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nBelow is a diploma text. Your task is to generate abstract of this diploma.\n\n### Input:\nАЙВАЗЬЯН Аршак Владимирович\nВыпускная квалификационная работа\n\nМодельная структура на категории алгебр над\nобогащенной теорией Ловера\nОбразовательная программа бакалавриат «Математика»\nНаправление и код: 01.03.01 «Математика»\nШифр ОП: СВ.5000.2019\n\nНаучный руководитель:\nд.ф.-м.н., доцент\nБондарко Михаил Владимирович\nРецензент:\nк.ф.-м.н., Пекинский институт\nматематических наук и приложений\nу озера Янчи\nИванов Сергей Олегович\n\nСанкт-Петербург\n2023 год\n\n\x0c1\n\nВведение\n\nКлассически, алгебраическая теория – это следующий набор данных:\n• множество носителей (или сортов) S\n• множество операций O, имеющих dom вида S1 × ... × Sn (формальное произведение; может\nбыть пустым — 1) и cod вида Si .\n• 

### Inference model on asessors questions

In [None]:
from llm_helpers_fixed_tokenizer import get_some_model_result

test_df = pd.read_csv(ARTIFACTS_DIR_PATH.joinpath("diplomas_abstracts/mcs_raw_learnt_abstract_learnt8k.csv"))

new_rows = []
for _, row in tqdm(test_df[:first_k].iterrows(), total=len(test_df[:first_k]), desc="Rows..."):
    new_row = copy.deepcopy(row)
    new_row[MODEL_EXPERIMENT_NAME] = get_some_model_result(peft_model, tokenizer, row, device, diploma_prefix_len=INF)
    new_rows.append(new_row)
new_df = pd.DataFrame(new_rows)
new_df.head()

Rows...:   0%|          | 0/70 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]



Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]



Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]



Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]



Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

Texts...:   0%|          | 0/1 [00:00<?, ?it/s]

  input_id = torch.tensor(res, dtype=torch.int)


In [None]:
new_df.to_csv(ARTIFACTS_DIR_PATH.joinpath(f"diplomas_abstracts/baselines_with_{MODEL_EXPERIMENT_NAME}.csv"))

### Inference model on MERA

In [None]:
from mera_helpers import construct_prompt
from path_helpers import get_dataset_path, get_metric_dir_path
from llm_helpers import calculate_token_interest_probs, get_answer

for name, dataset_meta in tqdm(HUGGINGFACE_NAME_TO_DATASET.items(), desc="Datasets..."):
    for subset, split in tqdm(zip(dataset_meta["subsets"], dataset_meta["splits"]), total=len(dataset_meta["splits"]), desc="Splits..."):
        path = get_dataset_path(subset, name, split)
        dataset = load_from_disk(path)
        probs_list = []
        a_list = []
        for row in tqdm(list(dataset)[:first_k], desc="Rows..."):
            q = construct_prompt(row)
            probs = calculate_token_interest_probs(q, tokenizer, peft_model)
            probs_list.append({
                "probs": probs,
                "meta": row["meta"],
            })
            a = get_answer(probs)
            a_list.append({
                "answer": a,
                "meta": row["meta"],
            })
        print(a_list)
        metric_dir_path = get_metric_dir_path(MODEL_EXPERIMENT_NAME, subset, name, split)
        print(metric_dir_path)
        metric_dir_path.mkdir(exist_ok=True, parents=True)
        with open(metric_dir_path.joinpath("probs.jsonl"), "w") as f:
            json.dump(probs_list, f, ensure_ascii=False, indent=2)
        with open(metric_dir_path.joinpath("answers.jsonl"), "w") as f:
            json.dump(a_list, f, ensure_ascii=False, indent=2)

### Inference & eval model on landmark passkey

In [None]:
from landmark_run_test import test_passkey_full, load_pipes
from transformers import pipeline


models = [MODEL_EXPERIMENT_NAME]
pipe = pipeline("text-generation", model=peft_model, tokenizer=tokenizer, device='cuda')
pipes = {MODEL_EXPERIMENT_NAME: pipe}
test_passkey_full(pipes, models, n_values=[14000, 18000])

## Evaluate model results on tasks

### Evaluate asessors questions

In [None]:
new_df = new_df.set_index("Unnamed: 0")
new_df

In [10]:
! ls ..

data_analization
data_collection
experiments
junk
metrics_calculation
pattern.ipynb


In [27]:
df = pd.read_csv(ARTIFACTS_DIR_PATH.joinpath("diplomas_abstracts/mcs_raw_learnt_abstract.csv"))
new_df = df.set_index("Unnamed: 0")
new_df

Unnamed: 0_level_0,id,year,diploma,abstract,study_field,degree,original_diploma_extension,raw_model,learnt
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12,45042,2023,АЙВАЗЬЯН Аршак Владимирович\nВыпускная квалифи...,В этой работе мы строим правую трансферную мод...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматривается модельная структура н...,В данной работе рассматривается модельная стру...
25,45043,2023,Санкт-Петербургский государственный университе...,"Пусть 𝐾 выпуклое тело в ℝ^𝑛. Определим 𝑑𝑛,𝑛−1(...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Плотность решетки трансляций - это минимальная...,В работе рассматриваются плотности решеток тра...
37,45044,2023,Санкт-Петербургский государственный университе...,Работа посвящена повышению производительности ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе представлены результаты исслед...,В работе рассматривается задача булевой выполн...
101,45046,2023,Санкт-Петербургский государственный университе...,В работе мы обобщаем результаты об энергии нат...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматриваются классы случайных проц...,В данной работе рассматривается энергетически-...
152,45047,2023,Санкт–Петербургский государственный университе...,В рамках данной работы рассматривается подход ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе рассматривается задача настраи...,В данной работе рассматривается задача добавле...
...,...,...,...,...,...,...,...,...,...
1062,45131,2023,Санкт-Петербургский государственный университе...,В настоящей работе были рассмотрены две задачи...,MATHEMATICS AND COMPUTER SCIENCE,MASTER'S STUDIES,.pdf,"В работе рассматриваются морфизмы, порождающие...",В работе рассматриваются некоторые гипотезы о ...
1064,45132,2023,Санкт-Петербургский государственный университе...,"В данной работе исследовались MS/MS спектры, с...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Данная работа посвящена разработке алгоритмов ...,Данная работа посвящена поиску и классификации...
1079,45133,2023,Санкт-Петербургский государственный университе...,В выпускной работе описываются триангуляции ве...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматриваются асимптотики числа три...,Автор производит подсчёт асимптотики числа три...
1209,45135,2023,Санкт–Петербургский государственный университе...,В рамках данной работы было разработано програ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной выпускной квалификационной работе пре...,Работа посвящена разработке программного средс...


In [63]:
len(new_df["diploma"][12])

35565

In [1]:
print("kek")

kek


In [2]:
asessors_df

NameError: name 'asessors_df' is not defined

In [20]:
metric_dir_path = ARTIFACTS_DIR_PATH.joinpath(f"metrics/diplomas_asessors_questions/diploma_test/{LLAMA_2_7B}/") # get_metric_dir_path(baseline_model, subset, name, split)
metric_dir_path.mkdir(exist_ok=True, parents=True)
probs_entry = {
    "probs": {"A": 0, "B": 0.1, "C": 0.1, "D": 0.2},
    "meta": {
      "abstract": "В этой работе мы строим правую трансферную модельную структуру на категории алгебр обогащенной алгебраической теории в обогащенной модельной категории, удовлетворяющей некоторым свойствам. Также мы даем несколько общих конструкций генерирующих замкнутые симметричные моноидальные локально конечно представимые категори, которые служат базой обогащения.",
      "id": "12"
    },
}
with open(metric_dir_path.joinpath("probs_appended.jsons"), "a") as f:
    json.dump(probs_entry, f, ensure_ascii=False)
    f.write("\n")

In [6]:
metric_dir_path = ARTIFACTS_DIR_PATH.joinpath(f"metrics/diplomas_asessors_questions/diploma_appended/{LLAMA_2_7B}/") # get_metric_dir_path(baseline_model, subset, name, split)
with open(metric_dir_path.joinpath("probs_appended.jsons"), "r") as f:
    probs = [json.loads(x) for x in f.readlines()]
with open(metric_dir_path.joinpath("answers_appended.jsons"), "r") as f:
    answers = [json.loads(x) for x in f.readlines()]
len(probs), len(answers)

(22, 22)

In [15]:
with open(ARTIFACTS_DIR_PATH.joinpath("datasets/diplomas_asessors_questions/mcs_df_human_filled_processed.json"), "r") as f:
    asessors_dataset = json.load(f)

In [None]:
from mera_helpers import construct_prompt
from llm_helpers import calculate_token_interest_probs, get_answer




for split in ["diploma"]:
    # a_list = []
    # probs_list = []
    metric_dir_path = ARTIFACTS_DIR_PATH.joinpath(f"metrics/diplomas_asessors_questions/{split}_appended/{LLAMA_2_7B}/") # get_metric_dir_path(baseline_model, subset, name, split)
    metric_dir_path.mkdir(exist_ok=True, parents=True)
    for row in tqdm(asessors_dataset[1:], desc="Rows..."):
        x = copy.deepcopy(row)
        if split == "empty":
            x['inputs']['context'] = ""
        else:
            x['inputs']['context'] = new_df[split].loc[int(x['meta']['id'])] # df["diploma"].iloc[int(x['meta']['id'])]
        q = construct_prompt(x)
        probs = calculate_token_interest_probs(q, tokenizer, model)
        probs_entry = {
            "probs": probs,
            "meta": row["meta"],
        }
        a = get_answer(probs)
        a_entry = {
            "answer": a,
            "meta": row["meta"],
        }
        with open(metric_dir_path.joinpath("probs_appended.jsons"), "a") as f:
            json.dump(probs_entry, f, ensure_ascii=False)
            f.write("\n")
        with open(metric_dir_path.joinpath("answers_appended.jsons"), "a") as f:
            json.dump(a_entry, f, ensure_ascii=False)
            f.write("\n")

Rows...:   0%|          | 0/69 [00:00<?, ?it/s]

[0;31mKernelOutOfMemory[0m: Kernel ran out of memory and has been restarted. If the restart fails, restart the kernel from the Kernel menu.
If the error persists, try choosing a different configuration or optimizing your code.

In [13]:
! ls /home/jupyter/work/resources/long_context_LLMs/artifacts/metrics/diplomas_asessors_questions/diploma_appended/llama-2-7b/answers_appended.jsons

/home/jupyter/work/resources/long_context_LLMs/artifacts/metrics/diplomas_asessors_questions/diploma_appended/llama-2-7b/answers_appended.jsons


In [21]:
rows = []
pred_by_model = dict()
# for baseline_model in HUGGINFACE_BASELINE_MODELS:
splits = ["diploma", "empty", "abstract"]
# if baseline_model == LLAMA_2_7B:
#     splits.extend(["learnt", "learnt_8k", MODEL_EXPERIMENT_NAME])
for split in splits:
    if split == "diploma":
        metric_path = ARTIFACTS_DIR_PATH.joinpath(f"metrics/diplomas_asessors_questions/{split}_appended/{LLAMA_2_7B}/") # get_metric_dir_path(baseline_model, subset, name, split)
    else:
        metric_path = ARTIFACTS_DIR_PATH.joinpath(f"metrics/diplomas_asessors_questions/{split}/{LLAMA_2_7B}/") # get_metric_dir_path(baseline_model, subset, name, split)
    if split == "diploma":
        some_path = metric_path.joinpath("answers_appended.jsons")
        with open(some_path, "r") as f:
            answers = [json.loads(x) for x in f.readlines()]
    else:
        some_path = metric_path.joinpath("answers.jsonl")     
        with open(some_path, "r") as f:
            answers = json.load(f) #[json.loads(x) for x in f.readlines()]
    # with open(metric_path.joinpath("answers.jsonl"), "r") as f:
        # answers = json.load(f)
    pred = [x["answer"] for x in answers[:22]]
    pred_by_model[f"{split}_{LLAMA_2_7B}"] = pred
    true = [x["outputs"] for x in asessors_dataset[:22]]
    rows.append({
        "model": LLAMA_2_7B,
        "subset": "asessors_questions",
        "split": split,
        "accuracy_score": accuracy_score(true, pred),
    })
asessors_df = pd.DataFrame(rows)
asessors_df

Unnamed: 0,model,subset,split,accuracy_score
0,llama-2-7b,asessors_questions,diploma,0.090909
1,llama-2-7b,asessors_questions,empty,0.363636
2,llama-2-7b,asessors_questions,abstract,0.636364


In [None]:
asessors_df.to_csv(METRICS_DIR_PATH.joinpath(f"asessors_baseline.csv"))

### Evaluate MERA

In [None]:
rows = []
for baseline_model in HUGGINFACE_BASELINE_MODELS.union([VIKHR_7B, SAIGA_MISTRAL_7B_LORA]).union([MODEL_EXPERIMENT_NAME]):
    for name, dataset_meta in HUGGINGFACE_NAME_TO_DATASET.items():
        for subset, split in zip(dataset_meta["subsets"], dataset_meta["splits"]):
            dataset_path = get_dataset_path(subset, name, split)
            dataset = load_from_disk(dataset_path)
            metric_path = get_metric_dir_path(baseline_model, subset, name, split)
            with open(metric_path.joinpath("answers.jsonl"), "r") as f:
                answers = json.load(f)
            pred = [x["answer"] for x in answers[:first_k]]
            true = [x["outputs"] for x in list(dataset)[:first_k]]
            rows.append({
                "model": baseline_model,
                "subset": subset,
                "split": split,
                "accuracy_score": accuracy_score(true, pred),
            })
df = pd.DataFrame(rows)
df

In [None]:
df.to_csv(METRICS_DIR_PATH.joinpath(f"ru_metrics_with_{MODEL_EXPERIMENT_NAME}.csv"))