In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import ujson
import json
import sys
import os

import pandas as pd
import numpy as np
import torch
import random

from tqdm import tqdm
from collections import defaultdict
from typing import Optional

from bioel.utils.umls_utils import UmlsMappings
from bioel.utils.bigbio_utils import CUIS_TO_REMAP, CUIS_TO_EXCLUDE, DATASET_NAMES, VALIDATION_DOCUMENT_IDS
from bioel.utils.bigbio_utils import load_bigbio_dataset, add_deabbreviations, load_dataset_df, dataset_to_documents, dataset_to_df, load_dataset_df, resolve_abbreviation, dataset_unique_tax_ids
from bioel.utils.solve_abbreviation.solve_abbreviations import create_abbrev

from bioel.ontology import BiomedicalOntology
from bioel.models.arboel.biencoder.data.data_utils import process_ontology
from bioel.evaluate import Evaluate

from torch.utils.data import DataLoader
from peft import PeftModel
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import argparse
import concurrent
from dotenv import load_dotenv
from tqdm import tqdm
import textgrad as tg
from textgrad.tasks import load_task
import random
load_dotenv(override=True)

import openai
import json
from ids import open_ai_api_key
openai.api_key = open_ai_api_key
os.environ["OPENAI_API_KEY"] = open_ai_api_key
import re
import logging
from collections import Counter, defaultdict
from utils_functions import *

  from .autonotebook import tqdm as notebook_tqdm




2024-09-12 15:53:17,530	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logging.getLogger("httpx").setLevel(logging.WARNING)
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 
sampling_params = SamplingParams(temperature=0, top_p=0.9, max_tokens=20, stop=["<|eot_id|>"])

In [None]:
ontology_dir = "/mitchell/entity-linking/kbs/medic.tsv"
name = "medic"
ontology2 = BiomedicalOntology.load_medic(filepath=ontology_dir, name=name)

# entrez_dict = {"name" : "entrez",
#              "filepath" : "/mitchell/entity-linking/el-robustness-comparison/data/gene_info.tsv",
#              "dataset" : "gnormplus",}
# ontology = BiomedicalOntology.load_entrez(**entrez_dict)


In [12]:
dataset_name = 'ncbi_disease'
# dataset_name = 'gnormplus'
path_to_abbrev = "/home2/cye73/data_test2/abbreviations.json"
dataset = load_bigbio_dataset(dataset_name)
dataset = add_deabbreviations(dataset, path_to_abbrev)

In [13]:
dataset_df = dataset_to_df(dataset)
test_df = dataset_df[dataset_df['split'] == 'test']
train_df = dataset_df[dataset_df['split'] == 'train']
# test_df
docs = dataset_to_documents(dataset)

# TextGrad

In [6]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

In [7]:
def eval_sample(item, eval_fn, model):
    """
    This function allows us to evaluate if an answer to a question in the prompt is a good answer.

    """
    x, y = item
    x = tg.Variable(x, requires_grad=False, role_description="query to the language model")
    y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query")
    response = model(x)
    try:
        eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y))
        return int(eval_output_variable.value)
    except:
        eval_output_variable = eval_fn([x, y, response])
        eval_output_parsed = eval_fn.parse_output(eval_output_variable)
        return int(eval_output_parsed)

In [8]:
def eval_dataset(test_set, eval_fn, model, max_samples: int=None):
    if max_samples is None:
        max_samples = len(test_set)
    accuracy_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = []
        for _, sample in enumerate(test_set):
            
            future = executor.submit(eval_sample, sample, eval_fn, model)
            futures.append(future)
            if len(futures) >= max_samples:
                break
        tqdm_loader = tqdm(concurrent.futures.as_completed(futures), total=len(futures), position=0)
        for future in tqdm_loader:
            acc_item = future.result()
            accuracy_list.append(acc_item)
            tqdm_loader.set_description(f"Accuracy: {np.mean(accuracy_list)}")
    return accuracy_list 

In [9]:
def run_validation_revert(system_prompt: tg.Variable, results, model, eval_fn, val_set):
    val_performance = np.mean(eval_dataset(val_set, eval_fn, model))
    previous_performance = np.mean(results["validation_acc"][-1])
    print("val_performance: ", val_performance)
    print("previous_performance: ", previous_performance)
    previous_prompt = results["prompt"][-1]
    
    if val_performance < previous_performance:
        print(f"rejected prompt: {system_prompt.value}")
        system_prompt.set_value(previous_prompt)
        val_performance = previous_performance

    results["validation_acc"].append(val_performance)

In [3]:
set_seed(12)
# Set CUDA devices to see both GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

llm_api_test = tg.get_engine(engine_name="vllm-meta-llama/Meta-Llama-3.1-8B-Instruct", 
                             dtype='half', 
                             enforce_eager=True, 
                             gpu_memory_utilization=0.6
                             )

llm_api_eval = tg.get_engine(engine_name="gpt-4o-mini", 
                             dtype='half', 
                             enforce_eager=True, 
                             gpu_memory_utilization=0.6)




INFO 09-12 15:53:51 llm_engine.py:213] Initializing an LLM engine (v0.6.0) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=25000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Meta-Llama-3.1-8B-Instruct, use_v2_block_manager=False, num_scheduler_steps=1, enable_p

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 09-12 15:54:01 model_runner.py:915] Starting to load model meta-llama/Meta-Llama-3.1-8B-Instruct...
INFO 09-12 15:54:01 selector.py:217] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 09-12 15:54:01 selector.py:116] Using XFormers backend.
INFO 09-12 15:54:01 weight_utils.py:236] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:05,  1.86s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:02<00:02,  1.13s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:04<00:01,  1.44s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:06<00:00,  1.62s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:06<00:00,  1.55s/it]



INFO 09-12 15:54:09 model_runner.py:926] Loading model weights took 14.9888 GB
INFO 09-12 15:54:13 gpu_executor.py:122] # GPU blocks: 11158, # CPU blocks: 2048
INFO 09-12 15:54:16 model_runner.py:1217] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-12 15:54:16 model_runner.py:1221] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-12 15:54:36 model_runner.py:1335] Graph capturing finished in 20 secs.


In [4]:
tg.set_backward_engine(llm_api_eval, override=True)

# Load the data and the evaluation function
train_set, val_set, test_set, eval_fn = load_task("BBH_object_counting", evaluation_api=llm_api_eval)
print("Train/Val/Test Set Lengths: ", len(train_set), len(val_set), len(test_set))
STARTING_SYSTEM_PROMPT = train_set.get_task_description()

Train/Val/Test Set Lengths:  50 100 100


In [11]:
train_loader = tg.tasks.DataLoader(train_set, batch_size=3, shuffle=True)


# Testing the 0-shot performance of the evaluation engine
system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT, 
                            requires_grad=True, 
                            role_description="system prompt to the language model")
model_evaluation = tg.BlackboxLLM(llm_api_eval, system_prompt)

system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT, 
                            requires_grad=True,
                            role_description="structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task")
model = tg.BlackboxLLM(llm_api_test, system_prompt)

optimizer = tg.TextualGradientDescent(engine=llm_api_eval, parameters=[system_prompt])

results = {"test_acc": [], "prompt": [], "validation_acc": []}
results["test_acc"].append(eval_dataset(test_set, eval_fn, model))
results["validation_acc"].append(eval_dataset(val_set, eval_fn, model))
results["prompt"].append(system_prompt.get_value())


  0%|          | 0/100 [00:00<?, ?it/s]


AssertionError: Value must be a string, int, or image (bytes). Got: <class 'numpy.int64'>

In [None]:
for epoch in range(3):
    for steps, (batch_x, batch_y) in enumerate((pbar := tqdm(train_loader, position=0))):
        pbar.set_description(f"Training step {steps}. Epoch {epoch}")
        optimizer.zero_grad()
        losses = []
        for (x, y) in zip(batch_x, batch_y):
            x = tg.Variable(x, requires_grad=False, role_description="query to the language model")
            y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query")
            response = model(x)
            try:
                eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y))
            except:
                eval_output_variable = eval_fn([x, y, response])
            losses.append(eval_output_variable)
        total_loss = tg.sum(losses)
        total_loss.backward()
        optimizer.step()
        
        run_validation_revert(system_prompt, results, model, eval_fn, val_set)
        
        print("sys prompt: ", system_prompt)
        test_acc = eval_dataset(test_set, eval_fn, model)
        results["test_acc"].append(test_acc)
        results["prompt"].append(system_prompt.get_value())
        if steps == 3:
            break