## Prepare model

### Imports & Definitions

In [32]:
import warnings

warnings.filterwarnings(
    "ignore",
    message="torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.",
)

warnings.filterwarnings(
    "ignore",
    message="torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.",
)

warnings.filterwarnings(
    "ignore",
    message="You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset",
)

In [2]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login; renders the login widget shown in the cell output.
# NOTE(review): presumably required to download the model/adapter used below — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import sys
# Extend the import path so that `definitions` can be imported from the utils directory.
# NOTE(review): the helper modules imported in later cells presumably live there too — confirm.
sys.path.append("../../utils")
# NOTE(review): the star import presumably supplies the names this notebook uses without
# importing them explicitly (e.g. Path, torch, pd, tqdm, json, copy, the model constants)
# — confirm against `definitions` and consider explicit imports.
from definitions import *

In [4]:
# Passed as `cache_dir` to `from_pretrained` when the model and tokenizer are loaded.
CACHE_DIR = Path("../../../../cache/")
# NOTE(review): absolute, machine-specific path that is not referenced in the cells
# visible here — confirm it is still needed.
DATASET_DIR = Path("/home/jupyter/mnt/datasets/diplomas/russian_dataset/")

In [5]:
# Label of this run: used as the name of the generated-abstract column and as the
# directory name under which this run's metric files are written.
MODEL_EXPERIMENT_NAME = "test_pipeline_working"

### Load model & tokenizer

In [6]:
model_name = LLAMA_2_7B
# `device_map='auto'` already places the weights on the available device(s)
# while loading. The model is therefore deliberately NOT moved again with
# `model.to(...)`: re-moving a model that was dispatched this way is a no-op at
# best and produces a warning or an error when the weights are sharded or
# offloaded.
model = AutoModelForCausalLM.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[model_name],
    cache_dir=CACHE_DIR,
    device_map='auto'
)
# Device name handed to the inference helper in a later cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Maximum sequence length configured on the tokenizer.
MODEL_MAX_LENGTH = 16384

tokenizer = AutoTokenizer.from_pretrained(
    HUGGINGFACE_MODEL_TO_REPO[model_name],
    cache_dir=CACHE_DIR,
    model_max_length=MODEL_MAX_LENGTH,
    padding_side="right",
    use_fast=True
)

# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
IGNORE_INDEX = -100
# Fallback special tokens; each one is added below only if the tokenizer lacks it.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Add special tokens to the tokenizer and resize the model embeddings to match.

    The embeddings are always resized to ``len(tokenizer)``. When new tokens
    were actually added, their rows in both the input and the output embedding
    matrices are set to the mean of the rows that existed before.

    Note: this is the unoptimized version; the resulting embedding size may not
    be divisible by 64.

    Args:
        special_tokens_dict: Mapping passed to ``tokenizer.add_special_tokens``.
        tokenizer: Tokenizer that receives the special tokens.
        model: Model whose token embeddings are resized (modified in place).
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    # Nothing new was registered: there are no fresh rows to initialise.
    if not added > 0:
        return

    embedding_layers = (model.get_input_embeddings(), model.get_output_embeddings())
    for layer in embedding_layers:
        weights = layer.weight.data
        # The new tokens occupy the last `added` rows; fill them with the
        # average of all earlier rows.
        weights[-added:] = weights[:-added].mean(dim=0, keepdim=True)

# Fallback value for every special token the tokenizer might be missing.
# The order matters only in that the attributes are checked in this order.
default_special_tokens = {
    "pad_token": DEFAULT_PAD_TOKEN,
    "eos_token": DEFAULT_EOS_TOKEN,
    "bos_token": DEFAULT_BOS_TOKEN,
    "unk_token": DEFAULT_UNK_TOKEN,
}

# Keep only the tokens that are not set on the tokenizer yet.
special_tokens_dict = {
    token_name: fallback
    for token_name, fallback in default_special_tokens.items()
    if getattr(tokenizer, token_name) is None
}

# Register the missing tokens and resize the model's embeddings accordingly.
smart_tokenizer_and_embedding_resize(
    model=model,
    tokenizer=tokenizer,
    special_tokens_dict=special_tokens_dict,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.58s/it]
Using pad_token, but it is not set yet.


In [9]:
from peft import PeftModel

# Wrap the base model loaded above with the adapter identified by `model_id`
# (the adapter files are downloaded, as shown in the cell output).
model_id = "nvdenisov2002/llama-longLoRA-v1"
peft_model = PeftModel.from_pretrained(model, model_id)

Downloading adapter_config.json: 100%|██████████| 674/674 [00:00<00:00, 5.04MB/s]
Downloading adapter_model.bin: 100%|██████████| 1.08G/1.08G [00:17<00:00, 60.8MB/s]


## Run model inference on tasks

In [None]:
# Number of leading rows/examples processed by every inference and evaluation cell below.
first_k = 3

### Run model inference on assessors' questions

In [11]:
from llm_helpers import get_some_model_result

test_df = pd.read_csv(ARTIFACTS_DIR_PATH.joinpath("diplomas_abstracts/mcs_raw_learnt_abstract_learnt8k.csv"))

# Only the first `first_k` rows are processed.
rows_to_process = test_df[:first_k]

new_rows = []
for _, source_row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process), desc="Rows..."):
    # Work on a copy so the row coming from `test_df` is left untouched.
    enriched_row = copy.deepcopy(source_row)
    # Store this run's model output in a column named after the experiment.
    enriched_row[MODEL_EXPERIMENT_NAME] = get_some_model_result(
        peft_model, tokenizer, source_row, device, diploma_prefix_len=8000
    )
    new_rows.append(enriched_row)

new_df = pd.DataFrame(new_rows)
# Persisting the result is intentionally left disabled:
# new_df.to_csv(ARTIFACTS_DIR_PATH.joinpath(f"diplomas_abstracts/mcs_raw_learnt_abstract_learnt8k_{MODEL_EXPERIMENT_NAME}.csv"))
new_df.head()

Rows...:   0%|          | 0/3 [00:00<?, ?it/s]



Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,year,diploma,abstract,study_field,degree,original_diploma_extension,raw_model,learnt,learnt_8k,test_pipeline_working
0,0,12,45042,2023,АЙВАЗЬЯН Аршак Владимирович\nВыпускная квалифи...,В этой работе мы строим правую трансферную мод...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматривается модельная структура н...,В данной работе рассматривается модельная стру...,В работе рассматриваются алгебраические теории...,"Алгебраическая теория – это понятие, которое о..."
1,1,25,45043,2023,Санкт-Петербургский государственный университе...,"Пусть 𝐾 выпуклое тело в ℝ^𝑛. Определим 𝑑𝑛,𝑛−1(...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Плотность решетки трансляций - это минимальная...,В работе рассматриваются плотности решеток тра...,В,\n
2,2,37,45044,2023,Санкт-Петербургский государственный университе...,Работа посвящена повышению производительности ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе представлены результаты исслед...,В работе рассматривается задача булевой выполн...,In this work we propose a method for improving...,The Boolean satisfiability problem (often abbr...


In [37]:
# Total number of rows available in the abstracts table.
len(test_df)

70

### Run model inference on MERA

In [57]:
from itertools import islice

from mera_helpers import construct_prompt
from path_helpers import get_dataset_path, get_metric_dir_path
from llm_helpers import calculate_token_interest_probs, get_answer

# For every dataset/split: build a prompt for each of the first `first_k` rows,
# compute the probabilities with the adapter-wrapped model, derive the answer,
# and store both next to the other metric files of this experiment.
for name, dataset_meta in tqdm(HUGGINGFACE_NAME_TO_DATASET.items(), desc="Datasets..."):
    for subset, split in tqdm(zip(dataset_meta["subsets"], dataset_meta["splits"]), total=len(dataset_meta["splits"]), desc="Splits..."):
        path = get_dataset_path(subset, name, split)
        dataset = load_from_disk(path)
        probs_list = []
        a_list = []
        # Take only the first `first_k` rows instead of materialising the whole dataset.
        for row in tqdm(list(islice(dataset, first_k)), desc="Rows..."):
            q = construct_prompt(row)
            probs = calculate_token_interest_probs(q, tokenizer, peft_model)
            probs_list.append({
                "probs": probs,
                "meta": row["meta"],
            })
            a = get_answer(probs)
            a_list.append({
                "answer": a,
                "meta": row["meta"],
            })
        print(a_list)
        metric_dir_path = get_metric_dir_path(MODEL_EXPERIMENT_NAME, subset, name, split)
        print(metric_dir_path)
        metric_dir_path.mkdir(exist_ok=True, parents=True)
        # Each file holds one JSON array (read back with `json.load`), despite the
        # `.jsonl` extension. Non-ASCII text is written verbatim
        # (`ensure_ascii=False`), so the encoding is fixed to UTF-8 rather than
        # left to the platform default.
        with open(metric_dir_path.joinpath("probs.jsonl"), "w", encoding="utf-8") as f:
            json.dump(probs_list, f, ensure_ascii=False, indent=2)
        with open(metric_dir_path.joinpath("answers.jsonl"), "w", encoding="utf-8") as f:
            json.dump(a_list, f, ensure_ascii=False, indent=2)

Datasets...:   0%|          | 0/1 [00:00<?, ?it/s]

Splits...:   0%|          | 0/2 [00:00<?, ?it/s]

Rows...:   0%|          | 0/3 [00:00<?, ?it/s]

[{'answer': 'B', 'meta': {'domain': 'moral_scenarios', 'id': 0}}, {'answer': 'B', 'meta': {'domain': 'moral_scenarios', 'id': 1}}, {'answer': 'B', 'meta': {'domain': 'moral_scenarios', 'id': 2}}]
/home/jupyter/work/resources/long_context_LLMs/artifacts/metrics/rummlu/mera/public_test/test_pipeline_working


Rows...:   0%|          | 0/3 [00:00<?, ?it/s]

[{'answer': 'D', 'meta': {'id': 0}}, {'answer': 'D', 'meta': {'id': 1}}, {'answer': 'D', 'meta': {'id': 2}}]
/home/jupyter/work/resources/long_context_LLMs/artifacts/metrics/ruopenbookqa/mera/train/test_pipeline_working


### Run inference & evaluate model on landmark passkey

In [29]:
# Sanity check: show the device reported by the adapter-wrapped model.
peft_model.device

device(type='cuda', index=0)

In [31]:
from landmark_run_test import test_passkey_full, load_pipes
from transformers import pipeline


# Build a text-generation pipeline around the adapter-wrapped model and register
# it under the experiment name; the list of model names is derived from that
# registry.
pipe = pipeline(task="text-generation", model=peft_model, tokenizer=tokenizer)
pipes = {MODEL_EXPERIMENT_NAME: pipe}
models = list(pipes)

# Run the passkey test for both `n` values.
test_passkey_full(pipes, models, n_values=[14000, 18000])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartForCausalLM', 'Prophe

models = ['test_pipeline_working']


n_values...:   0%|          | 0/2 [00:00<?, ?it/s]

Tests...:   0%|          | 0/50 [00:00<?, ?it/s]



Accuracy test_pipeline_working for n = 14000: 0.98%


Tests...:   0%|          | 0/50 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Accuracy test_pipeline_working for n = 18000: 0.0%


## Evaluate model results on tasks

### Evaluate assessors' questions

In [40]:
# Index the generated abstracts by the "Unnamed: 0" column; the evaluation cell
# below looks rows up with `.loc` using each question's `meta["id"]`.
# The guard makes the cell safe to re-run: once the column has become the index,
# calling `set_index` again would raise a KeyError.
if new_df.index.name != "Unnamed: 0":
    new_df = new_df.set_index("Unnamed: 0")
new_df

Unnamed: 0_level_0,Unnamed: 0.1,id,year,diploma,abstract,study_field,degree,original_diploma_extension,raw_model,learnt,learnt_8k,test_pipeline_working
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12,0,45042,2023,АЙВАЗЬЯН Аршак Владимирович\nВыпускная квалифи...,В этой работе мы строим правую трансферную мод...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В работе рассматривается модельная структура н...,В данной работе рассматривается модельная стру...,В работе рассматриваются алгебраические теории...,"Алгебраическая теория – это понятие, которое о..."
25,1,45043,2023,Санкт-Петербургский государственный университе...,"Пусть 𝐾 выпуклое тело в ℝ^𝑛. Определим 𝑑𝑛,𝑛−1(...",MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,Плотность решетки трансляций - это минимальная...,В работе рассматриваются плотности решеток тра...,В,\n
37,2,45044,2023,Санкт-Петербургский государственный университе...,Работа посвящена повышению производительности ...,MATHEMATICS AND COMPUTER SCIENCE,BACHELOR STUDIES,.pdf,В данной работе представлены результаты исслед...,В работе рассматривается задача булевой выполн...,In this work we propose a method for improving...,The Boolean satisfiability problem (often abbr...


In [45]:
# Load the assessors' questions (a JSON list of examples). The encoding is set
# explicitly so that reading does not depend on the platform default.
with open(ARTIFACTS_DIR_PATH.joinpath("datasets/diplomas_asessors_questions/mcs_df_human_filled_processed.json"), "r", encoding="utf-8") as f:
    dataset = json.load(f)

a_list = []
probs_list = []
for row in tqdm(dataset[:first_k], desc="Rows..."):
    # Copy the example so the loaded dataset itself is not modified.
    sample = copy.deepcopy(row)
    # Use the abstract generated by this experiment as the question's context;
    # `new_df` must already be indexed by the id column (see the previous cell).
    sample['inputs']['context'] = new_df[MODEL_EXPERIMENT_NAME].loc[int(sample['meta']['id'])]
    prompt = construct_prompt(sample)
    probs = calculate_token_interest_probs(prompt, tokenizer, peft_model)
    probs_list.append({
        "probs": probs,
        "meta": row["meta"],
    })
    answer = get_answer(probs)
    a_list.append({
        "answer": answer,
        "meta": row["meta"],
    })

metric_dir_path = ARTIFACTS_DIR_PATH.joinpath(f"metrics/diplomas_asessors_questions/{MODEL_EXPERIMENT_NAME}/{LLAMA_2_7B}/")
metric_dir_path.mkdir(exist_ok=True, parents=True)
# Each file holds one JSON array (read back with `json.load`), despite the
# `.jsonl` extension. Text is written verbatim (`ensure_ascii=False`), so the
# encoding is fixed to UTF-8.
with open(metric_dir_path.joinpath("probs.jsonl"), "w", encoding="utf-8") as f:
    json.dump(probs_list, f, ensure_ascii=False, indent=2)
with open(metric_dir_path.joinpath("answers.jsonl"), "w", encoding="utf-8") as f:
    json.dump(a_list, f, ensure_ascii=False, indent=2)

Rows...:   0%|          | 0/3 [00:00<?, ?it/s]

In [48]:
rows = []
pred_by_model = dict()
# Reference answers are the same for every model and split, so compute them once.
# NOTE: `dataset` must be the assessors' question list loaded from JSON in the
# previous cell; the MERA cells rebind the same name to a different object.
true = [x["outputs"] for x in dataset[:first_k]]
for baseline_model in HUGGINFACE_BASELINE_MODELS:
    splits = ["abstract", "empty", "diploma"]
    # Extra result sets (including this experiment) exist only for LLaMA-2-7B.
    if baseline_model == LLAMA_2_7B:
        splits.extend(["learnt", "learnt_8k", MODEL_EXPERIMENT_NAME])
    for split in splits:
        metric_path = ARTIFACTS_DIR_PATH.joinpath(f"metrics/diplomas_asessors_questions/{split}/{baseline_model}/")
        # Explicit encoding: the answer files may contain non-ASCII text.
        with open(metric_path.joinpath("answers.jsonl"), "r", encoding="utf-8") as f:
            answers = json.load(f)
        pred = [x["answer"] for x in answers[:first_k]]
        pred_by_model[f"{split}_{baseline_model}"] = pred
        rows.append({
            "model": baseline_model,
            "subset": "asessors_questions",
            # The "diploma" split is reported as "diploma_8000".
            "split": f"{split}_8000" if split == "diploma" else split,
            "accuracy_score": accuracy_score(true, pred),
        })
df = pd.DataFrame(rows)
# df.to_csv(METRICS_DIR_PATH.joinpath("asessors.csv"))
df

Unnamed: 0,model,subset,split,accuracy_score
0,mistral-7b,asessors_questions,abstract,0.0
1,mistral-7b,asessors_questions,empty,0.333333
2,mistral-7b,asessors_questions,diploma_8000,0.333333
3,llama-2-7b,asessors_questions,abstract,0.333333
4,llama-2-7b,asessors_questions,empty,0.333333
5,llama-2-7b,asessors_questions,diploma_8000,0.0
6,llama-2-7b,asessors_questions,learnt,0.0
7,llama-2-7b,asessors_questions,learnt_8k,0.0
8,llama-2-7b,asessors_questions,test_pipeline_working,0.0


### Evaluate MERA

In [58]:
from itertools import islice

# Reference answers depend only on the dataset, not on the evaluated model:
# load every dataset once and keep just the first `first_k` answers, instead of
# reloading and fully materialising each dataset once per model.
references = []
for name, dataset_meta in HUGGINGFACE_NAME_TO_DATASET.items():
    for subset, split in zip(dataset_meta["subsets"], dataset_meta["splits"]):
        dataset_path = get_dataset_path(subset, name, split)
        dataset = load_from_disk(dataset_path)
        true = [x["outputs"] for x in islice(dataset, first_k)]
        references.append((name, subset, split, true))

rows = []
for baseline_model in HUGGINFACE_BASELINE_MODELS.union([VIKHR_7B, SAIGA_MISTRAL_7B_LORA]).union([MODEL_EXPERIMENT_NAME]):
    for name, subset, split, true in references:
        metric_path = get_metric_dir_path(baseline_model, subset, name, split)
        # Explicit encoding: the answer files may contain non-ASCII text.
        with open(metric_path.joinpath("answers.jsonl"), "r", encoding="utf-8") as f:
            answers = json.load(f)
        pred = [x["answer"] for x in answers[:first_k]]
        rows.append({
            "model": baseline_model,
            "subset": subset,
            "split": split,
            "accuracy_score": accuracy_score(true, pred),
        })
df = pd.DataFrame(rows)
# df.to_csv(METRICS_DIR_PATH/"ru_metrics.csv")
df

Unnamed: 0,model,subset,split,accuracy_score
0,saiga_mistral_7b_lora,rummlu,public_test,0.333333
1,saiga_mistral_7b_lora,ruopenbookqa,train,1.0
2,mistral-7b,rummlu,public_test,0.666667
3,mistral-7b,ruopenbookqa,train,1.0
4,test_pipeline_working,rummlu,public_test,0.333333
5,test_pipeline_working,ruopenbookqa,train,0.0
6,vikhr-7b,rummlu,public_test,0.333333
7,vikhr-7b,ruopenbookqa,train,1.0
8,llama-2-7b,rummlu,public_test,0.333333
9,llama-2-7b,ruopenbookqa,train,0.0
