<a href="https://colab.research.google.com/github/Deji01/LlamaIndex/blob/main/Finetune_Embedding_Llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetune Embeddings - LlamaIndex Example

In [9]:
# !pip install llama_index pypdf openai accelerate llama-cpp-python -q
!pip install llama-index langchain pypdf -q

In [2]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter
from llama_index.schema import MetadataMode

## Download Data

In [3]:
!mkdir -p 'data/10k/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/lyft_2021.pdf' -O 'data/10k/lyft_2021.pdf'

--2024-01-09 10:47:55--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1880483 (1.8M) [application/octet-stream]
Saving to: ‘data/10k/uber_2021.pdf’


2024-01-09 10:47:55 (31.0 MB/s) - ‘data/10k/uber_2021.pdf’ saved [1880483/1880483]

--2024-01-09 10:47:55--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/lyft_2021.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1440303 (1.4M) [application/oc

In [4]:
# Input PDFs for each split: the Lyft filing is the training source and the
# Uber filing is the validation source.
TRAIN_FILES = ["./data/10k/lyft_2021.pdf"]
VAL_FILES = ["./data/10k/uber_2021.pdf"]

# Corpus JSON paths. NOTE(review): not referenced by any cell visible in this
# notebook — confirm whether they are still needed.
TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
VAL_CORPUS_FPATH = "./data/val_corpus.json"

In [5]:
def load_corpus(files, verbose=False):
  """Load `files` and split the resulting documents into nodes.

  Args:
    files: Paths handed to `SimpleDirectoryReader(input_files=...)`.
    verbose: When true, print progress messages and show the splitter's
      progress bar.

  Returns:
    The nodes produced by `SentenceSplitter.get_nodes_from_documents`.
  """
  if verbose:
    print(f"Loading files {files}")
  documents = SimpleDirectoryReader(input_files=files).load_data()

  if verbose:
    print(f"Loaded {len(documents)} docs")
  splitter = SentenceSplitter()
  chunks = splitter.get_nodes_from_documents(documents, show_progress=verbose)

  if verbose:
    print(f"Parsed {len(chunks)} nodes")
  return chunks

In [6]:
# Parse each split's PDFs into nodes; verbose=True prints the doc/node counts.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['./data/10k/lyft_2021.pdf']
Loaded 238 docs


Parsing nodes:   0%|          | 0/238 [00:00<?, ?it/s]

Parsed 344 nodes
Loading files ['./data/10k/uber_2021.pdf']
Loaded 307 docs


Parsing nodes:   0%|          | 0/307 [00:00<?, ?it/s]

Parsed 410 nodes


## Generate synthetic queries

In [7]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

In [10]:
from llama_index.llms import OpenAILike

In [11]:
import os
from getpass import getpass


# Prompt for the Together API key only when it is not already set, so that a
# "Restart & Run All" (or a pre-configured environment) does not block on input.
# getpass keeps the typed key out of the notebook output.
if not os.environ.get("TOGETHER_API_KEY"):
  os.environ["TOGETHER_API_KEY"] = getpass("TOGETHER_API_KEY")
api_key = os.environ["TOGETHER_API_KEY"]


TOGETHER_API_KEY··········


In [13]:
# LLM passed to generate_qa_embedding_pairs below: Mixtral reached through the
# Together API base URL via the OpenAILike client.
llm = OpenAILike(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    api_base="https://api.together.xyz/v1",
    api_key=api_key,
)

In [14]:
# Generating the pairs calls the LLM once per node (about 10 minutes per split in
# the recorded run), so reuse the saved JSON when it exists. Each split is saved
# as soon as it is generated, so a failure on the second split does not discard
# the first. Delete the JSON files to force regeneration (e.g. after changing
# the input PDFs or the LLM).
if os.path.exists("train_dataset.json"):
  train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
else:
  train_dataset = generate_qa_embedding_pairs(train_nodes, llm=llm)
  train_dataset.save_json("train_dataset.json")

if os.path.exists("val_dataset.json"):
  val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")
else:
  val_dataset = generate_qa_embedding_pairs(val_nodes, llm=llm)
  val_dataset.save_json("val_dataset.json")

100%|██████████| 344/344 [10:07<00:00,  1.77s/it]
100%|██████████| 410/410 [09:12<00:00,  1.35s/it]


### Run Embedding Finetuning

In [17]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [22]:
!pip install -U sentence-transformers -q

In [23]:
# Configure fine-tuning of BAAI/bge-small-en on the synthetic training pairs.
# "test_model" is the output path that the evaluation cells later load from.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model",
    val_dataset=val_dataset
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [24]:
# Run the fine-tuning job (the recorded run shows 2 epochs of 79 iterations).
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/79 [00:00<?, ?it/s]

Iteration:   0%|          | 0/79 [00:00<?, ?it/s]

In [25]:
# Retrieve the fine-tuned model from the engine as an embedding model object.
embed_model = finetune_engine.get_finetuned_model()

In [26]:
# Display the model's repr to check which model and settings were loaded.
embed_model

HuggingFaceEmbedding(model_name='test_model', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7a00d5db39a0>, tokenizer_name='test_model', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

### Evaluate Finetuned Model

In [27]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [34]:
def evaluate(dataset, embed_model, top_k=5, verbose=False):
  """Compute per-query top-k retrieval hits for `embed_model` on `dataset`.

  Every corpus entry is indexed as a `TextNode` in a `VectorStoreIndex`, each
  query is run against that index, and a query counts as a hit when the first
  entry of its `relevant_docs` list is among the `top_k` retrieved node ids.

  Args:
    dataset: Object exposing `corpus`, `queries` and `relevant_docs` mappings.
    embed_model: Forwarded to `ServiceContext.from_defaults(embed_model=...)`.
    top_k: Number of nodes retrieved per query.
    verbose: Accepted but not used by this function.

  Returns:
    A list with one dict per query, with keys "is_hit", "retrieved",
    "expected" and "query" (which holds the query id, not the query text).
  """
  corpus, queries, relevant_docs = (
      dataset.corpus, dataset.queries, dataset.relevant_docs
  )

  service_context = ServiceContext.from_defaults(embed_model=embed_model)
  corpus_nodes = [
      TextNode(id_=node_id, text=body) for node_id, body in corpus.items()
  ]
  retriever = VectorStoreIndex(
      corpus_nodes,
      service_context=service_context,
      show_progress=True,
  ).as_retriever(similarity_top_k=top_k)

  def _score(query_id, query):
    # Only the first relevant document is treated as the expected answer.
    retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query)]
    expected_id = relevant_docs[query_id][0]
    return {
        "is_hit": expected_id in retrieved_ids,
        "retrieved": retrieved_ids,
        "expected": expected_id,
        "query": query_id,
    }

  return [_score(query_id, query) for query_id, query in tqdm(queries.items())]

In [29]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(dataset, model_id, name, output_path="results/"):
  """Run sentence-transformers' InformationRetrievalEvaluator on `dataset`.

  Args:
    dataset: Object exposing `queries`, `corpus` and `relevant_docs`.
    model_id: Model name or local path passed to `SentenceTransformer`.
    name: Label passed to the evaluator; later cells read the results from
      "Information-Retrieval_evaluation_{name}_results.csv" under the
      output directory.
    output_path: Directory the evaluator writes into; created if missing.
      Defaults to "results/", the location the later cells read from.

  Returns:
    Whatever the evaluator call returns (a single float score in the
    recorded runs).
  """
  evaluator = InformationRetrievalEvaluator(
      dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
  )
  model = SentenceTransformer(model_id)
  # Create the directory up front so the evaluator has somewhere to write.
  Path(output_path).mkdir(exist_ok=True, parents=True)
  return evaluator(model, output_path=output_path)

### Run Evals

In [32]:
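# Optional OpenAI embedding baseline (`ada`). Left disabled; uncomment this cell
# to run it (it prompts for an OpenAI API key).
# import openai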

# os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY")
# openai.api_key = os.environ["OPENAI_API_KEY"]

# ada = OpenAIEmbedding()
# ada_val_results = evaluate(val_dataset, ada)

# df_ada = pd.DataFrame(ada_val_results)

# hit_rate_ada = df_ada["is_hit"].mean()
# hit_rate_ada

In [35]:
# Baseline: top-k hit rate of the un-tuned BAAI/bge-small-en model on the
# validation set. NOTE(review): the "local:" prefix presumably makes LlamaIndex
# load the model locally — confirm against the ServiceContext docs.
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)
df_bge = pd.DataFrame(bge_val_results)
# Mean of the boolean is_hit column = fraction of queries that were hits.
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating embeddings:   0%|          | 0/410 [00:00<?, ?it/s]

  0%|          | 0/1006 [00:00<?, ?it/s]

0.6163021868787276

In [36]:
# Same baseline model through the sentence-transformers evaluator; the name
# "bge" determines the results CSV that a later cell reads.
evaluate_st(val_dataset, "BAAI/bge-small-en", name="bge")

0.46355582695619196

In [37]:
# Evaluate the fine-tuned model, loaded from the "test_model" output path.
finetuned = "local:test_model"
val_results_finetuned = evaluate(val_dataset, finetuned)

Generating embeddings:   0%|          | 0/410 [00:00<?, ?it/s]

  0%|          | 0/1006 [00:00<?, ?it/s]

In [38]:
# One row per validation query (is_hit, retrieved, expected, query).
df_finetuned = pd.DataFrame(val_results_finetuned)

In [39]:
# Fraction of validation queries that were hits for the fine-tuned model.
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

0.668986083499006

In [41]:
# Fine-tuned model through the sentence-transformers evaluator; the name
# "finetuned" determines the results CSV that a later cell reads.
evaluate_st(val_dataset, "test_model", name="finetuned")

0.5478756097147947

### Summary of Results

In [42]:
# Tag each per-query frame with its model name so the two can be concatenated
# and grouped by model. Note this adds a column to the existing frames in place.
df_bge["model"] = "bge"
df_finetuned["model"] = "fine_tuned"

In [43]:
# Mean of `is_hit` per model, i.e. each model's hit rate.
df_all = pd.concat([df_bge, df_finetuned])
# Select the column explicitly: groupby(...).mean() takes `numeric_only` as its
# first positional argument, so passing "is_hit" there only worked because a
# non-empty string is truthy.
df_all.groupby("model")[["is_hit"]].mean()

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
bge,0.616302
fine_tuned,0.668986


### Information Retrieval Evaluator

In [44]:
# Load the per-model metric CSVs from the "results/" directory that the
# evaluate_st calls above (name="bge" and name="finetuned") wrote into.
df_st_bge = pd.read_csv(
    "results/Information-Retrieval_evaluation_bge_results.csv"
)
df_st_finetuned = pd.read_csv(
    "results/Information-Retrieval_evaluation_finetuned_results.csv"
)

In [45]:
# Label each metrics frame with its model, then stack them into one table
# indexed by model name for side-by-side comparison.
df_st_bge["model"] = "bge"
df_st_finetuned["model"] = "fine_tuned"
df_st_all = pd.concat([df_st_bge, df_st_finetuned]).set_index("model")
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bge,-1,-1,0.359841,0.525845,0.581511,0.650099,0.359841,0.359841,0.175282,0.525845,...,0.359841,0.175282,0.525845,0.116302,0.581511,0.06501,0.650099,0.456223,0.50306,0.463556
fine_tuned,-1,-1,0.452286,0.605368,0.668986,0.720676,0.452286,0.452286,0.201789,0.605368,...,0.452286,0.201789,0.605368,0.133797,0.668986,0.072068,0.720676,0.54236,0.585675,0.547876
