## Check if GPU has free memory

In [1]:
import subprocess

def get_gpu_memory():
    """Print the free and total memory of every GPU reported by ``nvidia-smi``.

    Runs ``nvidia-smi --query-gpu=memory.free,memory.total`` in CSV mode
    (no header, no units) and prints one line per GPU. The reported integers
    are divided by 1024 before printing; nvidia-smi documents these fields as
    MiB, so the printed figures are GiB.

    Returns:
        None. The result is only printed.

    Raises:
        FileNotFoundError: propagated from ``subprocess.run`` when the
            ``nvidia-smi`` executable cannot be found.
        RuntimeError: when ``nvidia-smi`` exits with a non-zero status.
    """
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=memory.free,memory.total', '--format=csv,nounits,noheader'],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )

    # A failing nvidia-smi prints a diagnostic instead of CSV rows. Surface it
    # directly rather than letting int() choke on it further down.
    if result.returncode != 0:
        details = (result.stderr or result.stdout or "").strip()
        raise RuntimeError(
            f"nvidia-smi exited with status {result.returncode}: {details}"
        )

    # Drop blank lines so that empty output does not reach the integer parsing.
    rows = [row for row in result.stdout.splitlines() if row.strip()]
    if not rows:
        print("No GPUs reported by nvidia-smi.")
        return

    for i, row in enumerate(rows):
        free, total = map(int, row.split(','))
        print(f"GPU {i}: {free / 1024: .4} GiB free / {total / 1024: .4} GiB total")

# Print the free/total memory of each GPU reported by nvidia-smi.
get_gpu_memory()

GPU 0:  23.46 GiB free /  23.99 GiB total


## Setup

In [2]:
import os
import sys
from pathlib import Path

# Every path below is derived from the current working directory, so the
# notebook has to be started from the project root.
PROJECT_DIR = Path.cwd()
HAYSTACK_DIR = PROJECT_DIR / "haystack"
RELEVANT_DIR = HAYSTACK_DIR / "relevant"
IRRELEVANT_DIR = HAYSTACK_DIR / "irrelevant"
# Misspelled alias kept for backward compatibility: other cells still use it.
IRRELAVANT_DIR = IRRELEVANT_DIR

# Make project-local modules importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if str(PROJECT_DIR) not in sys.path:
    sys.path.append(str(PROJECT_DIR))

# Set Hugging Face cache path. Use an absolute path.
# NOTE(review): this location is machine-specific; adjust it when running elsewhere.
os.environ['HF_HOME'] = '/cs/student/projects1/2021/aagarwal/SNLP_Project/.cache/hf_with_quota'

In [None]:
import json
from argparse import Namespace

def get_args(id, context_type, data):
    """Build the tester arguments for one needle and one haystack variant.

    Args:
        id: Value compared against the ``"id"`` field of each entry in
            ``data``. The name shadows the builtin but is kept because
            callers pass it by keyword.
        context_type: ``"relevant"`` or ``"irrelevant"``. Selects both the
            haystack directory and which filename field of the entry is used.
        data: Iterable of dict entries. Each must provide ``"id"``,
            ``"needle"``, ``"real_needle"``, ``"question"`` and the filename
            field matching ``context_type`` (``"context_relevant"`` or
            ``"context_irrelevant"``).

    Returns:
        argparse.Namespace holding the model settings, the context-length and
        depth settings, the needle fields of the selected entry and the
        resolved ``haystack_file`` path. The run cell unpacks it with
        ``vars()`` into the tester's constructor.

    Raises:
        ValueError: if no entry has the requested id, or if ``context_type``
            is not one of the two supported values.
    """
    # A default of None avoids leaking a bare StopIteration when the id is absent.
    item = next((entry for entry in data if entry["id"] == id), None)
    if item is None:
        raise ValueError(f"no entry with id {id!r} found in data")

    if context_type == "relevant":
        haystack_file = RELEVANT_DIR / item["context_relevant"]
    elif context_type == "irrelevant":
        haystack_file = IRRELAVANT_DIR / item["context_irrelevant"]
    else:
        # Without this branch haystack_file would be unbound further down.
        raise ValueError(
            f"context_type must be 'relevant' or 'irrelevant', got {context_type!r}"
        )

    args = Namespace(
        model_name = "yaofu/llama-2-7b-80k",
        # Single quotes inside the f-string: reusing the outer double quotes
        # is a SyntaxError on Python versions before 3.12.
        model_name_suffix = f"{context_type} + {item['id']}",
        model_provider = "LLaMA",

        context_lengths_min = 0,
        context_lengths_max = 5000,
        context_lengths_num_intervals = 10,

        document_depth_percent_intervals = 10,

        needle = item["needle"],
        real_needle = item["real_needle"],
        retrieval_question = item["question"],
        haystack_file = haystack_file,
    )

    return args

## Run Tests

In [None]:
from retrieval_head_detection import LLMNeedleHaystackTester as Tester

# Load the needle entries that get_args() searches by id.
with open(HAYSTACK_DIR / "needles.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# get_args() takes the loaded entries as its third argument; leaving it out
# raises TypeError when the notebook is run on a fresh kernel.
args = get_args(id=44, context_type="relevant", data=data)
tester = Tester(**vars(args))

tester.start_test()

loading from yaofu/llama-2-7b-80k
layer number: 32, head number 32


The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
LlamaForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]