### Summarizer head evaluation

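Assess the quality of the generated text by scoring the query engine's answers for faithfulness and relevancy, first on precise queries and then on broad questions.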

In [1]:
import pickle

from llama_index.core import (
    VectorStoreIndex,
    ServiceContext)
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core.evaluation import (
    BatchEvalRunner,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)

from finetuning import get_tuned_model
from text_generation_evaluation import generate_broad_qa, get_topic_lists_from_pdf, preprocess


# Locations of the pickled inputs that the loading cell reads:
#   nodes_path      -> the corpus nodes that get embedded and indexed
#   qa_dataset_path -> the precise-query QA dataset (generated in data_preprocessing)
nodes_path = "../data/nodes_icrc_semantic2_2_1024.pkl"
qa_dataset_path = "../data/icrc_qa_dataset_semantic2_2_1024.pkl"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE(review): HuggingFaceEmbedding is only referenced by the commented-out
# alternative at the bottom of this cell.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model used to build the index in this notebook: the model that was
# fine-tuned earlier in embedding_finetuning.ipynb, loaded under the name "test_model".
embed_model = get_tuned_model("test_model")
# Alternative kept for reference (presumably the untuned base model — confirm):
# embed_model = "dunzhang/stella_en_400M_v5"
# embed_model = HuggingFaceEmbedding(model_name =embed_model, trust_remote_code=True)

You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


In [3]:
# Load the pickled corpus nodes and the dataset of precise queries.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been unpickled (the bare `open(...)` calls never closed them).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that were produced by this project's own notebooks.
with open(nodes_path, 'rb') as nodes_file:
    nodes = pickle.load(nodes_file)

with open(qa_dataset_path, 'rb') as qa_dataset_file:
    qa_dataset = pickle.load(qa_dataset_file)  # generated in data_preprocessing

In [4]:
# LLM used by the evaluation cells of this notebook: GPT-2 XL (Q5_K_M GGUF),
# downloaded from the URL below.
#
# Alternative checkpoints that were tried with exactly the same LlamaCPP settings:
#   https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf
#   https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.gguf
#
# NOTE(review): the loader log printed by this cell reports
# gpt2.context_length = 1024, which is smaller than the 3900-token
# context_window requested here — confirm this mismatch is intended.
model_url = "https://huggingface.co/RichardErkhov/openai-community_-_gpt2-xl-gguf/resolve/main/gpt2-xl.Q5_K_M.gguf"
llm = LlamaCPP(
    model_path=None,                    # no local file: the weights are fetched from model_url
    model_url=model_url,
    context_window=3900,
    max_new_tokens=512,
    temperature=0.1,
    model_kwargs={"n_gpu_layers": -1},
    generate_kwargs={},
    verbose=True,
)

llama_model_loader: loaded meta data with 17 key-value pairs and 581 tensors from /tmp/llama_index/models/gpt2-xl.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gpt2
llama_model_loader: - kv   1:                               general.name str              = gpt2-xl
llama_model_loader: - kv   2:                           gpt2.block_count u32              = 48
llama_model_loader: - kv   3:                        gpt2.context_length u32              = 1024
llama_model_loader: - kv   4:                      gpt2.embedding_length u32              = 1600
llama_model_loader: - kv   5:                   gpt2.feed_forward_length u32              = 6400
llama_model_loader: - kv   6:                  gpt2.attention.head_count u32              = 25
llama_model_loader: - kv   7:          gpt2.attention.layer_norm_epsilo

### Evaluation with precise queries

In [8]:
# Build the vector index over all loaded nodes, embedding them with the tuned model.
# NOTE(review): `device` is not an obvious VectorStoreIndex argument — confirm
# that it actually has an effect here.
index = VectorStoreIndex(
    nodes,
    embed_model=embed_model,
    show_progress=True,
    device='cuda',
)

Generating embeddings: 100%|██████████| 2048/2048 [01:02<00:00, 32.64it/s]
Generating embeddings: 100%|██████████| 2048/2048 [01:09<00:00, 29.45it/s]
Generating embeddings: 100%|██████████| 2048/2048 [01:06<00:00, 30.89it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:34<00:00, 58.54it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:34<00:00, 58.96it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:38<00:00, 53.01it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:42<00:00, 48.69it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:42<00:00, 48.27it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:42<00:00, 47.69it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:43<00:00, 46.82it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:54<00:00, 37.62it/s]
Generating embeddings: 100%|██████████| 1295/1295 [00:27<00:00, 47.87it/s]


In [6]:
# Patch asyncio so that event loops can be nested. Presumably required because
# the evaluation cells `await` the batch runner from inside the notebook's
# already-running event loop — TODO confirm it is still needed.
import nest_asyncio
nest_asyncio.apply() 

In [7]:
# Display how many precise queries the QA dataset contains.
sum(1 for _ in qa_dataset.queries.values())

500

In [8]:
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

faithfulness = FaithfulnessEvaluator(service_context=service_context)
relevancy = RelevancyEvaluator(service_context=service_context)

batch_eval_queries = list(qa_dataset.queries.values())[0:100]

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness, "relevancy": relevancy},
    workers=8,
    show_progress=True
)
query_engine = index.as_query_engine(llm=llm, similar_top_k=3)
# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)

  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
  0%|          | 0/100 [00:00<?, ?it/s]
llama_print_timings:        load time =    7884.44 ms
llama_print_timings:      sample time =       3.97 ms /    28 runs   (    0.14 ms per token,  7045.80 tokens per second)
llama_print_timings: prompt eval time =    7884.27 ms /   309 tokens (   25.52 ms per token,    39.19 tokens per second)
llama_print_timings:        eval time =    2762.87 ms /    27 runs   (  102.33 ms per token,     9.77 tokens per second)
llama_print_timings:       total time =   10659.92 ms /   336 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    7884.44 ms
llama_print_timings:      sample time =      49.83 ms /   322 runs   (    0.15 ms per token,  6462.49 tokens per second)
llama_print_timings: prompt eval time =   20884.63 ms /   970 tokens (   21.53 ms per token,    46.45 tokens per second)
llama_print_timings:        eval time =   34151.31 ms /   321

In [10]:
# Faithfulness score: fraction of evaluation results flagged as passing.
faithfulness_flags = [outcome.passing for outcome in eval_results['faithfulness']]
faithfulness_score = sum(faithfulness_flags) / len(faithfulness_flags)

print("faithfulness_score", faithfulness_score)

# Relevancy score: the same pass ratio, computed over the relevancy results.
relevancy_flags = [outcome.passing for outcome in eval_results['relevancy']]
relevancy_score = sum(relevancy_flags) / len(relevancy_flags)

print("relevancy_score", relevancy_score)

faithfulness_score 0.84
relevancy_score 0.97


### Evaluation with broad questions

In [5]:
# Separate LLM (Meta-Llama-3-8B-Instruct, Q4_K_M GGUF) that is handed to
# generate_broad_qa to produce the broad questions.
# Note that this cell rebinds the notebook-level name `model_url`.
model_url = "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
llm_gen = LlamaCPP(
    model_path=None,                    # no local file: the weights are fetched from model_url
    model_url=model_url,
    context_window=3900,
    max_new_tokens=512,
    temperature=0.1,
    model_kwargs={"n_gpu_layers": -1},
    generate_kwargs={},
    verbose=True,
)

llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from /tmp/llama_index/models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:      

llm_load_vocab: special tokens cache size = 256
llm_load_vocab: token to piece cache size = 0.8000 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = llama
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 128256
llm_load_print_meta: n_merges         = 280147
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 8192
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_layer          = 32
llm_load_print_meta: n_head           = 32
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 4
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0

In [6]:
# Generate broad queries: derive topic-word lists from the first 500 nodes,
# then pass those lists to generate_broad_qa together with llm_gen.
# NOTE(review): the meaning of the numeric arguments (100, 10 and 10) is defined
# by the helpers in text_generation_evaluation — confirm there.
topic_source_nodes = nodes[:500]
list_of_topicwords = get_topic_lists_from_pdf(topic_source_nodes, 100, 10)
broad_queries = generate_broad_qa(list_of_topicwords, llm_gen, 10)

[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/nvme/home/durech/camille/rag/lib/python3.10/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package stopwords is already up-to-date!
  0%|          | 0/20 [00:00<?, ?it/s]
llama_print_timings:        load time =    2971.87 ms
llama_print_timings:      sample time =     127.66 ms /   233 runs   (    0.55 ms per token,  1825.12 tokens per second)
llama_print_timings: prompt eval time =    2971.75 ms /   102 tokens (   29.13 ms per token,    34.32 tokens per second)
llama_print_timings:        eval time =   21696.75 ms /   232 runs   (   93.52 ms per token,    10.69 tokens per second)
llama_print_timings:       total time =   24954.75 ms /   334 tokens
  5%|▌         | 1/20 [00:24<07:54, 24.96s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =    2971.87 ms
llama_print_timings:      sample time =      25.34 ms /    45 runs   (    0.56 ms per token,  1776.13 toke

In [7]:
# Display how many broad queries were generated.
len(broad_queries)

173

In [9]:
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

faithfulness = FaithfulnessEvaluator(service_context=service_context)
relevancy = RelevancyEvaluator(service_context=service_context)

batch_eval_queries = broad_queries

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness, "relevancy": relevancy},
    workers=8,
    show_progress=True
)
query_engine = index.as_query_engine(llm=llm, similar_top_k=3)
# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)

  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
  0%|          | 0/173 [00:00<?, ?it/s]
llama_print_timings:        load time =    2114.37 ms
llama_print_timings:      sample time =     108.39 ms /   512 runs   (    0.21 ms per token,  4723.68 tokens per second)
llama_print_timings: prompt eval time =    2114.21 ms /   336 tokens (    6.29 ms per token,   158.92 tokens per second)
llama_print_timings:        eval time =   17853.29 ms /   511 runs   (   34.94 ms per token,    28.62 tokens per second)
llama_print_timings:       total time =   20358.63 ms /   847 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    2114.37 ms
llama_print_timings:      sample time =      68.46 ms /   512 runs   (    0.13 ms per token,  7479.15 tokens per second)
llama_print_timings: prompt eval time =    6110.95 ms /  1119 tokens (    5.46 ms per token,   183.11 tokens per second)
llama_print_timings:        eval time =   20512.79 ms /   511

In [10]:
# Faithfulness score on the broad queries: share of results flagged as passing.
faithfulness_outcomes = eval_results['faithfulness']
faithfulness_score = sum(entry.passing for entry in faithfulness_outcomes) / len(faithfulness_outcomes)
print("faithfulness_score", faithfulness_score)

# Relevancy score on the broad queries: same pass ratio over the relevancy results.
relevancy_outcomes = eval_results['relevancy']
relevancy_score = sum(entry.passing for entry in relevancy_outcomes) / len(relevancy_outcomes)
print("relevancy_score", relevancy_score)

faithfulness_score 0.21965317919075145
relevancy_score 0.1676300578034682
