Notebook for creating a fine-tuning dataset and fine-tuning a model on it.

In [None]:
from langchain.vectorstores import Chroma
import chromadb
import os
import numpy as np
from llama_index.core.schema import TextNode
from llama_index.core.evaluation import generate_qa_embedding_pairs
from llama_index.llms.openai import OpenAI

def create_dataset_for_finetuning(
    collection_name: str,
    chunk_comb: str,
    sample_size: int = 1000,
    train_fraction: float = 0.8,
):
    """
    Create train/validation QA datasets for embedding fine-tuning from an existing Chroma collection.

    A random sample of documents is drawn from the collection, split into a
    train and a validation part, and an LLM generates questions for every
    sampled document. The results are written to
    ./datasets/<chunk_comb>/train_dataset.json and
    ./datasets/<chunk_comb>/val_dataset.json.

    Args:
        collection_name: Name of the Chroma collection to read the documents from.
        chunk_comb: Identifier used as the sub-directory of ./datasets in which
            the generated datasets are stored.
        sample_size: Maximum number of documents to sample. If the collection
            holds fewer documents, all of them are used.
        train_fraction: Share of the sampled documents that goes into the train
            set; the remainder forms the validation set.

    Raises:
        ValueError: If sample_size is smaller than 1, train_fraction is not
            strictly between 0 and 1, or the collection contains no documents.
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be at least 1, got {sample_size}")
    if not 0 < train_fraction < 1:
        raise ValueError(
            f"train_fraction must be strictly between 0 and 1, got {train_fraction}"
        )

    new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))
    vectordb = Chroma(
        client=new_client,
        collection_name=collection_name,
    )

    documents = vectordb._collection.get()["documents"]
    if not documents:
        raise ValueError(f"Collection '{collection_name}' contains no documents")

    # Create the output directory before the expensive LLM calls so that a
    # missing or unwritable path fails early instead of after generation.
    os.makedirs(f"./datasets/{chunk_comb}", exist_ok=True)

    # Draw a random sample without replacement. Sampling indices (rather than
    # the strings themselves) keeps the documents as plain Python strings, and
    # capping at the collection size avoids an error on small collections.
    n_samples = min(sample_size, len(documents))
    sampled_indices = np.random.permutation(len(documents))[:n_samples]
    selected_documents = [documents[index] for index in sampled_indices]

    # The sample is already in random order, so a plain slice yields a random
    # train/validation split.
    train_size = int(n_samples * train_fraction)
    train_nodes = [TextNode(text=doc) for doc in selected_documents[:train_size]]
    val_nodes = [TextNode(text=doc) for doc in selected_documents[train_size:]]

    # Generates fitting questions based on the context with the help of an LLM.
    llm = OpenAI(model="gpt-3.5-turbo")

    train_dataset = generate_qa_embedding_pairs(llm=llm, nodes=train_nodes)
    train_dataset.save_json(f"./datasets/{chunk_comb}/train_dataset.json")

    val_dataset = generate_qa_embedding_pairs(llm=llm, nodes=val_nodes)
    val_dataset.save_json(f"./datasets/{chunk_comb}/val_dataset.json")

In [None]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.finetuning.embeddings.sentence_transformer import SentenceTransformersFinetuneEngine

def finetune_model_on_documents(chunk_combination: str):
    """
    Fine-tune the BAAI/bge-large-en-v1.5 embedding model with llama index.

    The train and validation datasets are read from
    ./datasets/<chunk_combination>/, every query is prefixed with the
    retrieval instruction, and the fine-tuning engine is run with
    ./models/finetuned-BGE-large-ISO-27001_<chunk_combination> as its
    model output path.
    """

    def prepend_instruction(queries):
        # Build a new mapping in which each query text carries the retrieval
        # instruction in front of it; the keys stay untouched.
        return {
            query_id: f"Represent this sentence for searching relevant passages: {query_text}"
            for query_id, query_text in queries.items()
        }

    training_set = EmbeddingQAFinetuneDataset.from_json(
        f"./datasets/{chunk_combination}/train_dataset.json"
    )
    validation_set = EmbeddingQAFinetuneDataset.from_json(
        f"./datasets/{chunk_combination}/val_dataset.json"
    )

    training_set.queries = prepend_instruction(training_set.queries)
    validation_set.queries = prepend_instruction(validation_set.queries)

    engine = SentenceTransformersFinetuneEngine(
        training_set,
        val_dataset=validation_set,
        model_id="BAAI/bge-large-en-v1.5",
        model_output_path=f"./models/finetuned-BGE-large-ISO-27001_{chunk_combination}",
        batch_size=8,
        epochs=5,
    )
    engine.finetune()