# Embedding Finetuning

### Setup

In [4]:
import os
import openai
from pathlib import Path
from pprint import pprint
import ray
from tqdm import tqdm

In [5]:
# Put the parent directory on the import path.
import sys
sys.path.append("..")

# Suppress all warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Load environment variables via python-dotenv (the credentials cell reads them
# from os.environ).
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
# Shared-storage directory, and the parent of the notebook's working directory.
EFS_DIR = Path("/efs/shared_storage/simon")
ROOT_DIR = Path.cwd().parent
print(ROOT_DIR)

/home/ray/default/llm-applications


In [8]:
# Credentials: copy the API endpoints/keys and the DB connection string from this
# process's environment into the Ray runtime environment.
# os.environ[...] is indexed directly, so a missing variable raises KeyError.
ray.init(
    runtime_env={
        "env_vars": {
            var_name: os.environ[var_name]
            for var_name in (
                "OPENAI_API_BASE",
                "OPENAI_API_KEY",
                "ANYSCALE_API_BASE",
                "ANYSCALE_API_KEY",
                "DB_CONNECTION_STRING",
            )
        }
    }
)

2023-09-10 23:33:07,204	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.28.181:6379...
2023-09-10 23:33:07,213	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at https://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com
2023-09-10 23:33:07,218	INFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_58a20c9210cc0cd96d977f813530a2fd.zip' (1.13MiB) to Ray cluster...
2023-09-10 23:33:07,220	INFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_58a20c9210cc0cd96d977f813530a2fd.zip'.


0,1
Python version:,3.9.15
Ray version:,2.6.2
Dashboard:,http://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com


In [10]:
import json

def write_json(data, filename):
    """Write `data` to `filename` as JSON indented by 4 spaces.

    The file is opened in text write mode, so existing content is replaced.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to write.
    """
    with open(filename, mode='w') as out_file:
        json.dump(data, out_file, indent=4)

def read_json(filename):
    """Load and return the JSON document stored at `filename`.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default locale encoding, so non-ASCII text is read the same way everywhere.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized Python object.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

### Load data

First, we load the data and convert each entry into a LlamaIndex `Document`.

In [16]:
from llama_index.node_parser import SimpleNodeParser
from llama_index import Document

In [17]:
def to_doc(entry_dict):
    """Build a LlamaIndex `Document` from a corpus entry.

    Args:
        entry_dict (dict): Must contain the keys 'text' and 'source'.

    Returns:
        Document: The entry's text, with its source stored under metadata['source'].
    """
    body = entry_dict['text']
    meta = {'source': entry_dict['source']}
    return Document(text=body, metadata=meta)

In [19]:
# Read the corpus entries from disk and wrap each one in a Document.
sections = read_json(ROOT_DIR / "datasets/eval_full_corpus.json")
docs = list(map(to_doc, sections))

Now, we parse the documents into 512-token chunks.

In [37]:
# Split every document into nodes using a chunk size of 512.
parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = parser.get_nodes_from_documents(docs, show_progress=True)
print(f'Parsed {len(docs)} docs into {len(nodes)} nodes')

Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Parsed 8944 docs into 14242 nodes


### Create train/val dataset

In [50]:
# Fraction of the train and validation nodes kept by the subsampling step below.
SUBSAMPLE_RATIO = 0.05

First, we create a train/validation split, with 80% of the nodes used for training and 20% for validation.

In [40]:
import random

def train_test_split(data, split_ratio=0.8, seed=None):
    """
    Split a list of items into training and testing sets.

    The input is copied before shuffling, so the caller's list keeps its
    original order.

    Args:
        data (list): The list of items to be split.
        split_ratio (float): The ratio of items to include in the training set (default is 0.8).
        seed (int, optional): If given, a dedicated random generator seeded with
            this value is used, making the split reproducible. If None (default),
            the module-level `random` generator is used.

    Returns:
        tuple: A tuple containing two lists - the training set and the testing set.

    Raises:
        ValueError: If `split_ratio` is not between 0 and 1.
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError("Split ratio must be between 0 and 1")

    # Use the global generator unless the caller asked for a reproducible split.
    rng = random if seed is None else random.Random(seed)

    # Shuffle a copy so the caller's list is never reordered as a side effect.
    shuffled = list(data)
    rng.shuffle(shuffled)

    # Everything before the split index is training data, the rest is test data.
    split_index = int(len(shuffled) * split_ratio)
    return shuffled[:split_index], shuffled[split_index:]

def subsample(data, ratio, seed=None):
    """
    Subsample a list to a given ratio.

    Items are drawn without replacement; the input list is not modified.

    Args:
        data (list): The list of items to be subsampled.
        ratio (float): The ratio of items to retain in the subsample.
        seed (int, optional): If given, a dedicated random generator seeded with
            this value is used, making the subsample reproducible. If None
            (default), the module-level `random` generator is used.

    Returns:
        list: A subsampled list containing the specified ratio of items.

    Raises:
        ValueError: If `ratio` is not between 0 and 1.
    """
    if not 0 <= ratio <= 1:
        raise ValueError("Ratio must be between 0 and 1")

    # Use the global generator unless the caller asked for a reproducible sample.
    rng = random if seed is None else random.Random(seed)

    # The number of retained items is rounded down, so small inputs may yield [].
    num_items_to_retain = int(len(data) * ratio)

    return rng.sample(data, num_items_to_retain)

In [46]:
# Seed the global RNG so the shuffle-based split (and the subsampling that
# follows) picks the same nodes on every fresh run, provided `nodes` is in the
# same order.
random.seed(42)
train_nodes, val_nodes = train_test_split(nodes)

In [47]:
# Report the size of each split.
print(f'{len(train_nodes)} train nodes, {len(val_nodes)} val nodes')

11393 train nodes, 2849 val nodes


In [49]:
# Keep only SUBSAMPLE_RATIO of each split.
# NOTE: both names are rebound to their own subsample, so re-running this cell
# shrinks the splits again.
train_nodes = subsample(train_nodes, SUBSAMPLE_RATIO)
val_nodes = subsample(val_nodes, SUBSAMPLE_RATIO)
print(f'After subsampling: {len(train_nodes)} train nodes, {len(val_nodes)} val nodes')

After subsampling: 569 train nodes, 142 val nodes


In [51]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

In [None]:
# Build the QA embedding-pair dataset from the subsampled training nodes.
# NOTE(review): presumably this calls an LLM using the credentials loaded during
# setup, so it may be slow and costly; confirm before re-running.
train_dataset = generate_qa_embedding_pairs(train_nodes)
# Validation-set generation is currently disabled:
# val_dataset = generate_qa_embedding_pairs(val_nodes)

In [None]:
# Save the generated dataset to "train_dataset.json" (a relative path, so it is
# resolved against the current working directory).
train_dataset.save_json("train_dataset.json")
# val_dataset.save_json("val_dataset.json")

### Run embedding finetuning

In [54]:
# Load the training dataset from "train_dataset.json", the file written by the
# save_json cell above.
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
# val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [55]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [57]:
# Configure fine-tuning of the "BAAI/bge-small-en" embedding model on the
# training dataset; "test_model" is passed as the model output path.
# No validation dataset is supplied (that argument is commented out).
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model",
    # val_dataset=val_dataset,
)

Downloading (…)ab102/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)2d2d7ab102/README.md:   0%|          | 0.00/78.9k [00:00<?, ?B/s]

Downloading (…)2d7ab102/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)ab102/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)2d2d7ab102/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)d7ab102/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [58]:
# Run the fine-tuning job (the recorded progress bars show 2 epochs of 121 iterations).
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/121 [00:00<?, ?it/s]

Iteration:   0%|          | 0/121 [00:00<?, ?it/s]

In [59]:
# Fetch the fine-tuned model; it is used as the embedding model for the index below.
embed_model = finetune_engine.get_finetuned_model()

### Retrieval Evaluation

In [60]:
import re
import json
from pathlib import Path

In [61]:
# Load the evaluation set, stored as one JSON object per line.
# The file is decoded explicitly as UTF-8 because the questions contain non-ASCII
# characters (e.g. curly apostrophes). Blank lines are skipped so that a stray
# empty line cannot make json.loads fail.
with open(Path(ROOT_DIR, "datasets/eval-dataset-v1.jsonl"), "r", encoding="utf-8") as f:
    test_dataset = [json.loads(line) for line in f if line.strip()]

In [64]:
# Clean up: rewrite every expected source URL from the "latest" docs prefix to
# the "master" prefix (presumably to match the URLs in the corpus; confirm).
for row in test_dataset:
    row.update(
        source=row["source"].replace(
            "https://docs.ray.io/en/latest/",
            "https://docs.ray.io/en/master/",
        )
    )

In [65]:
# Preview the first five entries; each has a 'question' and its expected 'source'.
test_dataset[:5]

[{'question': 'I’m struggling a bit with Ray Data type conversions when I do map_batches. Any advice?',
  'source': 'https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-format'},
 {'question': 'How does autoscaling work in a Ray Serve application?',
  'source': 'https://docs.ray.io/en/master/serve/scaling-and-resource-allocation.html#autoscaling'},
 {'question': 'how do I get the address of a ray node',
  'source': 'https://docs.ray.io/en/master/ray-core/miscellaneous.html#node-information'},
 {'question': 'Does Ray support NCCL?',
  'source': 'https://docs.ray.io/en/master/ray-more-libs/ray-collective.html'},
 {'question': 'Is Ray integrated with DeepSpeed?',
  'source': 'https://docs.ray.io/en/master/ray-air/examples/gptj_deepspeed_fine_tuning.html#fine-tuning-the-model-with-ray-air-a-name-train-a'}]

In [66]:
def evaluate_index(
    dataset,
    index,
    top_k=5,
    verbose=False,
):
    """Check, for every labelled question, whether retrieval finds its source.

    Args:
        dataset: Iterable of dicts, each with a 'question' and the expected 'source'.
        index: Object exposing `as_retriever(similarity_top_k=...)`.
        top_k (int): Number of nodes to retrieve per question (default is 5).
        verbose (bool): Accepted for interface compatibility; currently unused.

    Returns:
        list: One dict per entry with the keys 'is_hit', 'retrieved',
        'expected' and 'query', in dataset order.
    """
    retriever = index.as_retriever(similarity_top_k=top_k)

    def _evaluate(entry):
        # Score a single dataset entry against the retriever.
        question = entry['question']
        target = entry['source']

        hits = retriever.retrieve(question)
        sources = [hit.node.metadata['source'] for hit in hits]

        return {
            # Each question is assumed to have exactly one relevant source.
            'is_hit': target in sources,
            'retrieved': sources,
            'expected': target,
            'query': question,
        }

    return [_evaluate(entry) for entry in tqdm(dataset)]

In [67]:
from llama_index import VectorStoreIndex, Document, ServiceContext
from llama_index.embeddings import OpenAIEmbedding, LangchainEmbedding
from langchain.embeddings import HuggingFaceEmbeddings

In [68]:
# Service context that uses the fine-tuned embedding model, with the same
# 512 chunk size used when parsing the nodes earlier.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    chunk_size=512,
)

In [69]:
# Build the vector index over all docs; per the recorded output this re-parses
# the documents into nodes and generates an embedding for each node.
index = VectorStoreIndex.from_documents(docs, service_context=service_context, show_progress=True)

Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/14242 [00:00<?, ?it/s]

In [71]:
# Evaluate top-5 retrieval on the test set.
# NOTE: verbose=True is passed, but evaluate_index does not use that argument.
results = evaluate_index(test_dataset, index, top_k=5, verbose=True)

100%|██████████| 177/177 [01:02<00:00,  2.85it/s]


In [73]:
import pandas as pd
# Hit rate = fraction of questions whose expected source appears among the
# retrieved sources (the mean of the boolean 'is_hit' column).
df = pd.DataFrame(results)
hit_rate = df['is_hit'].mean()
hit_rate

0.2937853107344633

### Reference

In [77]:
from llama_index import VectorStoreIndex, Document, ServiceContext
from langchain.embeddings import HuggingFaceEmbeddings

# Baseline for comparison: the "BAAI/bge-small-en" model loaded directly (the
# same model id that was fine-tuned earlier), with the same 512 chunk size.
# NOTE: this rebinds `service_context`, replacing the fine-tuned configuration.
service_context = ServiceContext.from_defaults(
    embed_model=HuggingFaceEmbeddings(model_name="BAAI/bge-small-en"),
    chunk_size=512,
)

In [None]:
# Rebuild the index over the same docs with the baseline service context
# (rebinds `index`, replacing the fine-tuned index).
index = VectorStoreIndex.from_documents(docs, service_context=service_context, show_progress=True)

Parsing documents into nodes:   0%|          | 0/8944 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/14242 [00:00<?, ?it/s]

In [None]:
# Evaluate the baseline index on the same test set with the same top_k
# (rebinds `results`, replacing the fine-tuned results).
results = evaluate_index(test_dataset, index, top_k=5, verbose=True)

In [None]:
import pandas as pd
# Baseline hit rate, computed the same way as for the fine-tuned model
# (the mean of the boolean 'is_hit' column).
df = pd.DataFrame(results)
hit_rate = df['is_hit'].mean()
hit_rate