<a href="https://colab.research.google.com/github/AreebAhmad-02/Embedding-Models-Finetuning/blob/main/Fine_tuning_Embedding_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## imports and install

In [2]:
!pip install llama_index -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [3]:
import json

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader,StorageContext,load_index_from_storage
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode
from llama_index.core import Settings

In [4]:
# JSON files that hold the sentence-split corpora; they are written by a later
# cell after chunking with `load_corpus_sentence_splitter`.
TRAIN_CORPUS_FPATH_SENTENCE_SPLIT = './corpus/train_corpus_sentence_split.json'
EVAL_CORPUS_FPATH_SENTENCE_SPLIT = './corpus/eval_corpus_sentence_split.json'


In [5]:
# Input text files: two batches feed the training corpus, one batch the validation corpus.
TRAIN_FILES = ["/content/data/batch1preprocessed_file.txt","/content/data/batch2preprocessed_file.txt"]
VAL_FILES = ['/content/data/batch0preprocessed_file.txt']

# JSON files that receive the corpora produced by `load_corpus` (node id -> chunk text).
TRAIN_CORPUS_FPATH = './corpus/train_corpus.json'
VAL_CORPUS_FPATH = './corpus/val_corpus.json'

# Chunking

## Simple chunking

In [14]:
def load_corpus(files, verbose=False):
    """Read ``files`` and chunk them into a ``{node_id: text}`` corpus.

    Args:
        files: Paths handed to ``SimpleDirectoryReader(input_files=...)``.
        verbose: When true, print progress messages; the flag is also
            forwarded to the parser as ``show_progress``.

    Returns:
        dict mapping every parsed node's ``node_id`` to its content rendered
        with ``MetadataMode.NONE``.
    """
    if verbose:
        print(f"Loading files {files}")
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f'Loaded {len(documents)} docs')

    # Chunk the documents with the parser's default settings.
    chunks = SimpleNodeParser.from_defaults().get_nodes_from_documents(
        documents, show_progress=verbose
    )
    if verbose:
        print(f'Parsed {len(chunks)} nodes')

    corpus = {}
    for chunk in chunks:
        chunk_id = chunk.node_id
        corpus[chunk_id] = chunk.get_content(metadata_mode=MetadataMode.NONE)
    return corpus

In [36]:
train_corpus = load_corpus(TRAIN_FILES, verbose=True)
val_corpus = load_corpus(VAL_FILES, verbose=True)

Loading files ['/content/batch1preprocessed_file.txt', '/content/batch2preprocessed_file.txt']
Loaded 2 docs


Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Parsed 66 nodes
Loading files ['/content/data/preprocessed_file.txt']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 30 nodes


In [37]:
import os

# Create the output directory first: on a fresh runtime './corpus' does not
# exist and open(..., 'w') would raise FileNotFoundError.
for corpus_path in (TRAIN_CORPUS_FPATH, VAL_CORPUS_FPATH):
    os.makedirs(os.path.dirname(corpus_path) or '.', exist_ok=True)

# Persist the chunked corpora ({node_id: text}) as JSON.
with open(TRAIN_CORPUS_FPATH, 'w') as f:
    json.dump(train_corpus, f)

with open(VAL_CORPUS_FPATH, 'w') as f:
    json.dump(val_corpus, f)

## Sentence split chunking using `SentenceSplitter`

In [29]:
from llama_index.core.node_parser import SentenceSplitter


def load_corpus_sentence_splitter(files, verbose=False, chunk_size=524, chunk_overlap=20):
    """Read ``files`` and chunk them with ``SentenceSplitter``.

    Args:
        files: Paths handed to ``SimpleDirectoryReader(input_files=...)``.
        verbose: When true, print progress messages; the flag is also
            forwarded to the splitter as ``show_progress``.
        chunk_size: Forwarded to ``SentenceSplitter(chunk_size=...)``.
            NOTE(review): the default 524 may be a typo for 512; it is kept
            so existing results stay reproducible -- confirm.
        chunk_overlap: Forwarded to ``SentenceSplitter(chunk_overlap=...)``.

    Returns:
        dict mapping every parsed node's ``node_id`` to its content rendered
        with ``MetadataMode.NONE``.
    """
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f'Loaded {len(docs)} docs')

    splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    nodes = splitter.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    return {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }


In [7]:
train_corpus_sentence_split = load_corpus_sentence_splitter(TRAIN_FILES, verbose=True)
val_corpus_sentence_split = load_corpus_sentence_splitter(VAL_FILES, verbose=True)

Loading files ['/content/data/batch1preprocessed_file.txt', '/content/data/batch2preprocessed_file.txt']
Loaded 2 docs


Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Parsed 119 nodes
Loading files ['/content/data/batch0preprocessed_file.txt']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 52 nodes


We do a very naive train/val split by having the batch1 and batch2 files as the train dataset, and the batch0 file as the val dataset.


In [8]:
import os

# Create the output directory first: on a fresh runtime './corpus' does not
# exist and open(..., 'w') would raise FileNotFoundError.
for corpus_path in (TRAIN_CORPUS_FPATH_SENTENCE_SPLIT, EVAL_CORPUS_FPATH_SENTENCE_SPLIT):
    os.makedirs(os.path.dirname(corpus_path) or '.', exist_ok=True)

# Persist the sentence-split corpora ({node_id: text}) as JSON.
with open(TRAIN_CORPUS_FPATH_SENTENCE_SPLIT, 'w') as f:
    json.dump(train_corpus_sentence_split, f)

with open(EVAL_CORPUS_FPATH_SENTENCE_SPLIT, 'w') as f:
    json.dump(val_corpus_sentence_split, f)

# Generate synthetic queries
Now, we use an LLM (HuggingFaceH4/zephyr-7b-beta, loaded in 4-bit through `HuggingFaceLLM`) to generate questions using each text chunk in the corpus as context.

Each pair of (generated question, text chunk used as context) becomes a datapoint in the finetuning dataset (either for training or evaluation).

In [9]:
import re
import uuid

from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode
from tqdm.notebook import tqdm

In [10]:
# Presumably the destinations for the generated queries and their
# query -> relevant-doc mappings (train split).
# NOTE(review): none of these four constants is referenced in the visible cells -- confirm they are still needed.
TRAIN_QUERIES_FPATH = './data/train_queries.json'
TRAIN_RELEVANT_DOCS_FPATH = './data/train_relevant_docs.json'

# Same pair of files for the validation split.
VAL_QUERIES_FPATH = './data/val_queries.json'
VAL_RELEVANT_DOCS_FPATH = './data/val_relevant_docs.json'

In [26]:
# Reload the sentence-split corpora ({node_id: text}) written earlier.
# Plain read mode is enough; 'r+' would additionally require write access.
with open(TRAIN_CORPUS_FPATH_SENTENCE_SPLIT, 'r') as f:
    train_corpus = json.load(f)

# The validation corpus comes from the eval file so that it stays disjoint
# from the training corpus.
with open(EVAL_CORPUS_FPATH_SENTENCE_SPLIT, 'r') as f:
    val_corpus = json.load(f)

In [13]:
!pip install llama-index-llms-huggingface -q -U

In [22]:
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install accelerate -q -U
%pip install -i https://pypi.org/simple/ bitsandbytes -q
%pip install sentence-transformers -q
%pip install --upgrade transformers -q
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-finetuning



  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting llama-index-finetuning
  Downloading llama_index_finetuning-0.1.7-py3-none-any.whl (26 kB)
Collecting llama-index-embeddings-adapter<0.2.0,>=0.1.2 (from llama-index-finetuning)
  Downloading llama_index_embeddings_adapter-0.1.3-py3-none-any.whl (4.5 kB)
Collecting llama-index-llms-gradient<0.2.0,>=0.1.1 (from llama-index-finetuning)
  Downloading llama_index_llms_gradient-0.1.2-py3-none-any.whl (2.9 kB)
Collecting llama-index-postprocessor-cohere-rerank<0.2.0,>=0.1.1 (from llama-index-finetuning)
  Downloading llama_index_postprocessor_cohere_rerank-0.1.7-py3-none-any.whl (2.8 kB)
Collecting sentence-transformers<3.0.0,>=2.3.0 (from llama-index-finetuning)
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [

In [23]:
%pip install llama-index-finetuning -q -U



In [14]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM


import os
import warnings

warnings.filterwarnings('ignore')




In [15]:
# Quantization settings used as the default `quantization_config` of
# `huggingface_llm`: 4-bit loading with the "nf4" quant type, double
# quantization enabled, and float16 as the compute dtype.
quantization_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# We rebuild the chat prompt by hand so that it follows the format zephyr-7b-beta expects, as per
# https://github.com/run-llama/llama_index/issues/9277#issuecomment-1837545398

def messages_to_prompt(messages):
    """Render chat ``messages`` as a single zephyr-style prompt string.

    Each message whose ``role`` equals 'system', 'user' or 'assistant' becomes
    a ``<|role|>\\n{content}</s>\\n`` segment; messages with any other role are
    skipped. An empty system segment is prepended when the result does not
    already start with one, and a trailing ``<|assistant|>\\n`` is appended.
    """
    known_roles = ("system", "user", "assistant")
    segments = []
    for msg in messages:
        # Compare with == (not a dict lookup) so role objects that merely
        # compare equal to these strings are still recognised.
        for role in known_roles:
            if msg.role == role:
                segments.append(f"<|{role}|>\n{msg.content}</s>\n")
                break

    rendered = "".join(segments)

    # The prompt must open with a system segment; add a blank one if needed.
    if not rendered.startswith("<|system|>\n"):
        rendered = "<|system|>\n</s>\n" + rendered

    # Leave the assistant turn open for the model to complete.
    return rendered + "<|assistant|>\n"

In [16]:
def huggingface_llm(model_name="HuggingFaceH4/zephyr-7b-beta",
                    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
                    context_window=3900,
                    max_new_tokens=256,
                    quantization_config=quantization_conf):
    """Build a ``HuggingFaceLLM`` configured for zephyr-style prompting.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceLLM``.
        tokenizer_name: Tokenizer identifier forwarded to ``HuggingFaceLLM``.
        context_window: Forwarded as ``context_window``.
        max_new_tokens: Forwarded as ``max_new_tokens``.
        quantization_config: Placed in ``model_kwargs`` under the
            ``"quantization_config"`` key.

    Returns:
        The constructed ``HuggingFaceLLM`` instance.
    """
    # Wrapper applied to plain completion queries (empty system turn + user turn).
    zephyr_query_wrapper = PromptTemplate(
        "<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"
    )
    # NOTE(review): in transformers these sampling knobs normally only apply
    # when do_sample=True -- confirm whether sampling is actually enabled here.
    sampling_options = {"temperature": 0.7, "top_k": 50, "top_p": 0.95}

    return HuggingFaceLLM(
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        query_wrapper_prompt=zephyr_query_wrapper,
        context_window=context_window,
        max_new_tokens=max_new_tokens,
        model_kwargs={"quantization_config": quantization_config},
        generate_kwargs=sampling_options,
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )


llm = huggingface_llm()

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [24]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

In [30]:
from typing import Dict, List, Tuple

from llama_index.core.bridge.pydantic import BaseModel
from llama_index.core.llms.utils import LLM
from llama_index.core.schema import MetadataMode, TextNode
from tqdm import tqdm


In [31]:
class EmbeddingQAFinetuneDataset(BaseModel):
    """Queries, corpus and relevance labels for embedding fine-tuning.

    Args:
        queries (Dict[str, str]): query id -> query text.
        corpus (Dict[str, str]): doc id -> doc text.
        relevant_docs (Dict[str, List[str]]): query id -> ids of the docs
            that are relevant to that query.

    """

    queries: Dict[str, str]  # query id -> query text
    corpus: Dict[str, str]  # doc id -> doc text
    relevant_docs: Dict[str, List[str]]  # query id -> relevant doc ids
    mode: str = "text"

    @property
    def query_docid_pairs(self) -> List[Tuple[str, List[str]]]:
        """Return one ``(query text, relevant doc ids)`` tuple per query."""
        pairs = []
        for qid, query_text in self.queries.items():
            pairs.append((query_text, self.relevant_docs[qid]))
        return pairs

    def save_json(self, path: str) -> None:
        """Write the dataset to ``path`` as JSON indented by four spaces."""
        with open(path, "w") as out:
            json.dump(self.dict(), out, indent=4)

    @classmethod
    def from_json(cls, path: str) -> "EmbeddingQAFinetuneDataset":
        """Build a dataset from the JSON file at ``path``."""
        with open(path) as src:
            fields = json.load(src)
        return cls(**fields)


# Default prompt for asking the LLM to write quiz-style questions about one
# text chunk. `generate_qa_embedding_pairs` fills it with `str.format`,
# supplying `context_str` (the chunk text) and `num_questions_per_chunk`.
# NOTE(review): the template ends with a stray double quote after "provided."
# -- left untouched because it is part of the runtime prompt; confirm intent.
DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and no prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""
# generate queries as a convenience function

def generate_qa_embedding_pairs(
    node_dict,
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
) -> EmbeddingQAFinetuneDataset:
    """Generate synthetic (question, chunk) pairs for every chunk in ``node_dict``.

    Args:
        node_dict: Mapping of node id -> chunk text; it is also stored
            unchanged as the ``corpus`` of the returned dataset.
        llm: Object whose ``complete(prompt)`` result is converted with
            ``str()`` and parsed line by line.
        qa_generate_prompt_tmpl: ``str.format`` template that receives
            ``context_str`` and ``num_questions_per_chunk``.
        num_questions_per_chunk: Maximum number of questions kept per chunk.

    Returns:
        EmbeddingQAFinetuneDataset whose ``queries`` map a fresh UUID to each
        question and whose ``relevant_docs`` map that UUID to ``[node_id]``.

    A warning is issued for a chunk that yields fewer questions than requested.
    """
    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(node_dict.items()):
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)

        # Strip each line BEFORE removing the leading "1." / "2)" numbering:
        # the pattern is anchored at the start of the line, so an indented
        # "  2. ..." line would otherwise keep its number.
        questions = [
            re.sub(r"^\d+[\).\s]", "", line.strip()).strip()
            for line in str(response).strip().split("\n")
        ]
        # Drop blank lines and keep at most the requested number of questions.
        questions = [question for question in questions if question][
            :num_questions_per_chunk
        ]

        num_questions_generated = len(questions)
        if num_questions_generated < num_questions_per_chunk:
            warnings.warn(
                f"Fewer questions generated ({num_questions_generated}) "
                f"than requested ({num_questions_per_chunk})."
            )

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

        # Echo the questions kept for this chunk.
        print(questions)

    # construct dataset
    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )

In [32]:
train_dataset = generate_qa_embedding_pairs(train_corpus, llm)

  0%|          | 0/119 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 1/119 [00:30<1:00:15, 30.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 2/119 [00:53<50:37, 25.96s/it]  Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 3/119 [01:01<34:28, 17.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 4/119 [01:15<31:29, 16.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▍         | 5/119 [01:21<24:02, 12.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  5%|▌         | 6/119 [01:33<23:01, 12.23s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▌         | 7/119 [01:43<21:32, 11.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  7%|▋         | 8/119 [02:05<27:48, 15.03s/it]Setting `pad_token_id` to `eos_token_

In [41]:
train_dataset.save_json("trained_synthetic_dataset.json")

In [36]:


# Also keep a pickled copy of the generated dataset (a JSON export is written
# by the previous cell).
# NOTE(review): unpickling presumably needs the EmbeddingQAFinetuneDataset class
# to be defined in the loading session, and pickle files should only ever be
# loaded from trusted sources -- confirm this copy is needed in addition to the JSON.
import pickle as pkl

with open("trained_synthetic_data.pkl", 'wb+') as f:
    pkl.dump(train_dataset, f)
