#### Embedding Improvement
This notebook tests and assesses the performance of different improvement of the embedding model stella en 400M

In [2]:
import pickle
import random
import sys
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

from finetuning import get_tuned_model, clf_text_topic, group_by_topic, build_df, train_model, EmbeddingDataset, train_xgb_model

# LlamaIndex imports
from llama_index.core import VectorStoreIndex
from llama_index.core.evaluation import (
    generate_qa_embedding_pairs
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP

# Adding the parent directory to sys.path
sys.path.append('../')

# Utility imports
from utils import (
    retriever_evaluation, 
    display_results_retriever, 
    print_result_lang
)

# load data and base model
base_embed_model = HuggingFaceEmbedding(model_name = "dunzhang/stella_en_400M_v5", device = "cuda", trust_remote_code=True, embed_batch_size=20)
nodes = pickle.load(open( "../data/nodes_icrc_semantic2_2_1024.pkl",'rb'))

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  torch.load(os.path.jo

### Generate training data

In [2]:
model_url = "https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q4_0.gguf"
llm_llama3 = LlamaCPP(
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=512,
    context_window=3900,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
)

llama_model_loader: loaded meta data with 27 key-value pairs and 291 tensors from /tmp/llama_index/models/Meta-Llama-3-8B-Instruct.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Models
llama_model_loader: - kv   3:                         general.size_label str              = 8.0B
llama_model_loader: - kv   4:                            general.license str              = llama3
llama_model_loader: - kv   5:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv   6:                          general.languages arr[str,1]       = ["en"]
llama_model

In [3]:
# split train val nodes randomly :
random.shuffle(nodes)

# Determine the split index
split_index = int(0.7 * len(nodes))

# Split the list into two parts
train_nodes = nodes[:split_index]
val_nodes = nodes[split_index:]

In [14]:
train_dataset = generate_qa_embedding_pairs(
    llm=llm_llama3, nodes= random.sample(list(train_nodes), k = 1000)
)
val_dataset = generate_qa_embedding_pairs(
    llm=llm_llama3, nodes=random.sample(list(train_nodes), k = 500)
)

train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

100%|██████████| 1000/1000 [22:46<00:00,  1.37s/it]
100%|██████████| 500/500 [11:08<00:00,  1.34s/it]


In [3]:
# [Optional] Load
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

### Evaluation Before Training

In [None]:
index = VectorStoreIndex(nodes, embed_model=base_embed_model, show_progress=True, device='cuda')
base_retriever = index.as_retriever(similarity_top_k=3) # set retriever
base_retriever_evaluator = retriever_evaluation(base_retriever) #set evaluation
base_eval_results = await base_retriever_evaluator.aevaluate_dataset(val_dataset) # evaluate
display_results_retriever("Base Retriever", base_eval_results) # display

### Fine tuning embedding model

In [None]:
import torch
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# finetune
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="dunzhang/stella_en_400M_v5",
    model_output_path="stella_finetuned",
    val_dataset=val_dataset,
    trust_remote_code=True,
    device = "cuda",
    epochs = 4,
    batch_size = 10
)

finetune_engine.finetune()

In [18]:
embed_model = get_tuned_model("stella_finetuned") #load finetuned model

You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


In [9]:
#evaluate
index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True, device='cuda')
base_retriever = index.as_retriever(similarity_top_k=3)
base_retriever_evaluator = retriever_evaluation(base_retriever)
base_eval_results = await base_retriever_evaluator.aevaluate_dataset(val_dataset)
display_results_retriever("Base Retriever", base_eval_results)

Generating embeddings: 100%|██████████| 2048/2048 [00:46<00:00, 44.16it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:47<00:00, 43.08it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 42.10it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 42.58it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 42.62it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 41.81it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 42.14it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 42.32it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 41.93it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 42.15it/s]
Generating embeddings: 100%|██████████| 2048/2048 [00:48<00:00, 42.14it/s]
Generating embeddings: 100%|██████████| 1295/1295 [00:30<00:00, 42.71it/s]


Unnamed: 0,Retriever Name,mrr,hit_rate
0,Base Retriever,0.5875,0.7


In [10]:
print_result_lang(index, base_eval_results, nodes)

100%|██████████| 1000/1000 [00:00<00:00, 1053.48it/s]

0.7
0.6395348837209303
0.7210242587601078





(0.7, 0.6395348837209303, 0.7210242587601078)

### Linear adapter

In [12]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine

base_embed_model=HuggingFaceEmbedding(model_name = "dunzhang/stella_en_400M_v5", device = "cuda", trust_remote_code=True, embed_batch_size=20)

# add an adapter
finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="stella_linear",
    epochs=4,
    batch_size = 10,
    verbose=True,
    device = "cuda",
    # can optionally pass along any parameters that go into `train_model`
    # optimizer_class=torch.optim.SGD,
    # optimizer_params={"lr": 0.01}
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: dunzhang/stella_en_400M_v5
Load pretrained SentenceTransformer: dunzhang/stella_en_400M_v5



You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))
Batches: 100%|██████████| 1/1 [00:00<00:00, 126.03it/s]


In [None]:
# finetune
finetune_engine.finetune()
embed_model_linear = finetune_engine.get_finetuned_model()

In [15]:
#evaluate
index = VectorStoreIndex(nodes, embed_model=embed_model_linear, show_progress=True, device='cuda')
base_retriever = index.as_retriever(similarity_top_k=3)
base_retriever_evaluator = retriever_evaluation(base_retriever)
base_eval_results = await base_retriever_evaluator.aevaluate_dataset(val_dataset)
display_results_retriever("Base Retriever", base_eval_results)

Batches: 100%|██████████| 1/1 [00:00<00:00, 48.72it/s]?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.33it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00, 43.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.87it/s]<01:17, 26.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 75.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.

Unnamed: 0,Retriever Name,mrr,hit_rate
0,Base Retriever,0.558167,0.656


In [16]:
print_result_lang(index, base_eval_results, nodes)

100%|██████████| 1000/1000 [00:00<00:00, 1127.54it/s]

0.656
0.5930232558139535
0.6778975741239892





(0.656, 0.5930232558139535, 0.6778975741239892)

### 2 layer NN

In [3]:
# requires torch dependency
from llama_index.legacy.embeddings.adapter_utils import TwoLayerNN

from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.core.embeddings import resolve_embed_model
from llama_index.embeddings.adapter import AdapterEmbeddingModel

In [23]:
# set adapter
adapter_model = TwoLayerNN(
    1024,  # input dimension
    2048,  # hidden dimension
    1024,  # output dimension
    bias=True,
    add_residual=True,
)

finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="model5_output_test",
    model_checkpoint_path="stella_nn",
    adapter_model=adapter_model,
    epochs=4,
    batch_size = 10,
    verbose=True,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 32.99it/s]

INFO:llama_index.finetuning.embeddings.adapter:Use pytorch device: cuda
Use pytorch device: cuda





In [25]:
#finetuning
finetune_engine.finetune()
embed_model_2layer = finetune_engine.get_finetuned_model(
    adapter_cls=TwoLayerNN
)

INFO:llama_index.embeddings.adapter.base:Use pytorch device: cuda
Use pytorch device: cuda


  torch.load(


In [26]:
#evaluate
index = VectorStoreIndex(nodes, embed_model=embed_model_2layer, show_progress=True, device='cuda')
base_retriever = index.as_retriever(similarity_top_k=3)
base_retriever_evaluator = retriever_evaluation(base_retriever)
base_eval_results = await base_retriever_evaluator.aevaluate_dataset(val_dataset)
display_results_retriever("Base Retriever", base_eval_results)

Batches: 100%|██████████| 1/1 [00:00<00:00, 48.11it/s]?, ?it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00, 28.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.86it/s]<01:17, 26.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.73it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 75.

Unnamed: 0,Retriever Name,mrr,hit_rate
0,Base Retriever,0.562833,0.662


In [27]:
print_result_lang(index, base_eval_results, nodes)

100%|██████████| 1000/1000 [00:01<00:00, 846.54it/s]

0.662
0.5736434108527132
0.692722371967655





(0.662, 0.5736434108527132, 0.692722371967655)

### Combine finetuning and adapter

In [None]:
embed_model_ft = get_tuned_model("tella_finetuned")

In [23]:
#linear if commented otherwise nn
# from llama_index.legacy.embeddings.adapter_utils import TwoLayerNN
# adapter_model = TwoLayerNN(
#     1024,  # input dimension
#     2048,  # hidden dimension
#     1024,  # output dimension
#     bias=True,
#     add_residual=True,
# )

from llama_index.finetuning import EmbeddingAdapterFinetuneEngine

finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    embed_model_ft,
    model_output_path="embed_model_ft-l_adapter",
    model_checkpoint_path="model_ft-l_adapter",
    # adapter_model=adapter_model, # by default linear
    epochs=4,
    batch_size = 10,
    verbose=True,
)

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.65it/s]

INFO:llama_index.finetuning.embeddings.adapter:Use pytorch device: cuda
Use pytorch device: cuda





In [None]:
#finetuning
finetune_engine.finetune()
embed_model_2layer = finetune_engine.get_finetuned_model(
    # adapter_cls=TwoLayerNN
)

In [25]:
index = VectorStoreIndex(nodes, embed_model=embed_model_2layer, show_progress=True, device='cuda')

Batches: 100%|██████████| 1/1 [00:00<00:00, 57.51it/s]?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 55.25it/s]


Batches: 100%|██████████| 1/1 [00:00<00:00, 56.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 78.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.49it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.38it/s]<00:48, 41.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 54.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 24.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.87it/s]<01:01, 33.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.02it/s]
Batches: 100%|██████████| 1/1

In [26]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
base_retriever = index.as_retriever(similarity_top_k=3)
# bge_reranker = FlagEmbeddingReranker(
#     top_n=3,
#     model="BAAI/bge-reranker-large",
#     use_fp16=False
# ) #add a reranker

base_bge_retriever_evaluator = retriever_evaluation(base_retriever, metrics =["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"])
base_bge_eval_results =  await base_bge_retriever_evaluator.aevaluate_dataset(val_dataset)
display_results_retriever("Base and bge Retriever", base_bge_eval_results)

Batches: 100%|██████████| 1/1 [00:00<00:00, 133.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 137.55it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 134.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 135.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 134.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 135.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 136.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 134.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 124.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 121.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 123.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 135.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 123.42it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 124.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 121.


Batches: 100%|██████████| 1/1 [00:00<00:00, 121.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 133.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 120.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 120.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 121.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 130.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 129.88it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 120.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 132.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 121.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 131.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 131.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 130.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 131

Unnamed: 0,Retriever Name,mrr,hit_rate
0,Base and bge Retriever,0.596167,0.709


In [27]:
print_result_lang(base_bge_eval_results, nodes)

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:00<00:00, 1994.26it/s]

0.709
0.6511627906976745
0.72911051212938





(0.709, 0.6511627906976745, 0.72911051212938)

In [9]:
# assess faithfulness and relevancy with a sumamrizer head
model_url = "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
llm_llama = LlamaCPP(
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=512,
    context_window=3900,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
)


model_url = "https://huggingface.co/TheBloke/meditron-7B-GGUF/resolve/main/meditron-7b.Q4_K_M.gguf"
llm_meditron = LlamaCPP(
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=512,
    context_window=3900,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": -1},
    verbose=True,
)


llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from /tmp/llama_index/models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:      

In [10]:
from llama_index.core import (
    VectorStoreIndex,
    ServiceContext
)

from llama_index.core.evaluation import (
    BatchEvalRunner,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)

from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

service_context = ServiceContext.from_defaults(llm=llm_llama, embed_model=embed_model_2layer)

faithfulness = FaithfulnessEvaluator(service_context=service_context)
relevancy = RelevancyEvaluator(service_context=service_context)

batch_eval_queries = list(val_dataset.queries.values())[0:100]

# Initiate BatchEvalRunner to compute FaithFulness and Relevancy Evaluation.
runner = BatchEvalRunner(
    {"faithfulness": faithfulness, "relevancy": relevancy},
    workers=8,
    show_progress=True
)

base_retriever = index.as_retriever(similarity_top_k=10)
bge_reranker = FlagEmbeddingReranker(
    top_n=3,
    model="BAAI/bge-reranker-large", # "Alibaba-NLP/gte-Qwen2-7B-instruct"
    use_fp16=False
)
query_engine = index.as_query_engine(llm=llm_llama, similar_top_k=10, node_postprocessors = [bge_reranker])
# Compute evaluation
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)

INFO:datasets:PyTorch version 2.4.0 available.
PyTorch version 2.4.0 available.
INFO:datasets:TensorFlow version 2.17.0 available.
TensorFlow version 2.17.0 available.


  service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model_2layer)
2024-09-10 00:26:44.392297: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-10 00:26:44.398913: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 00:26:44.405957: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 00:26:44.407999: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has alrea

In [16]:
faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])

print("faithfulness_score", faithfulness_score)

# Let's get relevancy score

relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])

print("relevancy_score", relevancy_score)

faithfulness_score 0.93
relevancy_score 0.98


### Generate Topic training data

In [None]:
from tqdm import tqdm
topic_pairs = []
doc_pairs = []

corpus = {}

# generate the topic of each text
for node in tqdm(nodes):
    tag = clf_text_topic([node.text], llm_llama3)
    topic_pairs.append(tag[0])
    doc_pairs.append(node.id_)
    corpus[node.id_] = node.text

In [None]:
import random

# Step 1: Zip the lists together
zipped_lists = list(zip(topic_pairs, doc_pairs))

# Step 2: Shuffle the zipped list
random.shuffle(zipped_lists)

# Step 3: Unzip the lists back into separate lists
shuffled_topic, shuffled_doc= zip(*zipped_lists)

# Convert back to lists (because zip returns tuples)
shuffled_topic = list(shuffled_topic)
shuffled_doc = list(shuffled_doc)

In [None]:
#form a train and test set
split_index = int(0.7 * len(shuffled_topic))

# Split the list into two parts
train_topics = shuffled_topic[:split_index]
val_topics = shuffled_topic[split_index:]

train_docs = shuffled_doc[:split_index]
val_docs = shuffled_doc[split_index:]

train_corpus = {key: corpus[key] for key in train_docs}
val_corpus = {key: corpus[key] for key in val_docs}

In [None]:
train_queries, train_relevant_docs = group_by_topic(train_topics, train_docs) # group chunk that have the same topic
val_queries, val_relevant_docs = group_by_topic(val_topics, val_docs)

#generate dataset
train_dataset_topic = EmbeddingQAFinetuneDataset(queries = train_queries, relevant_docs=train_relevant_docs, corpus=train_corpus)
val_dataset_topic = EmbeddingQAFinetuneDataset(queries = val_queries, relevant_docs=val_relevant_docs, corpus=val_corpus)

### XGBOOST

In [3]:
import pickle

topic_pairs = pickle.load(open( "topic_pairs.pkl",'rb'))
doc_pairs = pickle.load(open( "doc_pairs.pkl",'rb'))
corpus = pickle.load(open( "corpus.pkl",'rb'))

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("dunzhang/stella_en_400M_v5", device = "cuda", trust_remote_code=True)


df = build_df(topic_pairs, doc_pairs, corpus, model)

You try to use a model that was created with version 3.0.1, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['doc_embeddings'], df['topic'], test_size=.32)

le = LabelEncoder()
y_train = le.fit_transform(y_train)
# y_test = le.transform(y_test)

In [9]:
# Train model
xgb_model = train_xgb_model(data=pd.DataFrame(list(X_train.values)), labels=y_train)

  feature_names = data.columns.format()


In [None]:
from transformers import AutoTokenizer, AutoModel

# Create the dataset
dataset = EmbeddingDataset(list(df['corpus'][0:1]), le.transform(df["topic"][0:1]))

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("dunzhang/stella_en_400M_v5", device = "cuda", trust_remote_code=True)
# Train the model
train_model(embedding_model=model, 
            xgboost_model=xgb_model, 
            dataset=dataset,
            num_epochs=10,
            learning_rate=0.1)


### Topic finetuning
finetune embedding model based on text-topic pairs

In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset_topic,
    model_id="dunzhang/stella_en_400M_v5",
    model_output_path="stella_topic_finetuning",
    val_dataset=val_dataset_topic,
    trust_remote_code=True,
    device = "cuda",
    epochs = 4,
    batch_size = 10
)

finetune_engine.finetune()

In [15]:
#evaluation
index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True, device='cuda')
base_retriever = index.as_retriever(similarity_top_k=3)
base_retriever_evaluator = retriever_evaluation(base_retriever)
base_eval_results = await base_retriever_evaluator.aevaluate_dataset(val_dataset_topic)
display_results_retriever("Base Retriever", base_eval_results)

Batches: 100%|██████████| 1/1 [00:00<00:00,  5.68it/s]?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.00it/s]<00:37, 53.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.95it/s]<00:46, 43.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.32it/s]<00:44, 45.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.33it/s]<00:45, 43.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.71it/s]<00:46, 42.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.98it/s]<00:49, 40.10it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.92it/s]<00:46, 42.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.47it/s]<00:41, 47.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.58it/s]<00:46, 42.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]2<00:49, 39.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.49it/s]2<00:51, 37.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]2<00:52, 36.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]3<0

Unnamed: 0,Retriever Name,mrr,hit_rate
0,Base Retriever,0.431833,0.526


In [18]:
print_result_lang(base_eval_results, nodes)

100%|██████████| 1000/1000 [00:00<00:00, 1752.69it/s]

0.526
0.43023255813953487
0.5592991913746631





(0.526, 0.43023255813953487, 0.5592991913746631)