In [None]:
import json
import os

import dotenv
from llama_index import SimpleDirectoryReader
from llama_index.finetuning import (
    EmbeddingQAFinetuneDataset,
    SentenceTransformersFinetuneEngine,
    generate_qa_embedding_pairs,
)
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode

In [None]:
# Input PDFs for the training and validation splits.
# NOTE(review): the file names suggest page ranges 16-30 and 31-45 of the same
# report -- confirm against the document store.
TRAIN_FILES = [
    "../resources/dev/document_store/msg_life-gb-2021-EN_final_16-30.pdf",
]
VAL_FILES = [
    "../resources/dev/document_store/msg_life-gb-2021-EN_final_31-45.pdf",
]

# Paths for the serialized corpora (not referenced by the cells visible here).
TRAIN_CORPUS_FPATH = "./emb_model_finetune/train_corpus.json"
VAL_CORPUS_FPATH = "./emb_model_finetune/val_corpus.json"

In [None]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into nodes.

    Args:
        files: File paths passed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print a message before loading, after loading and
            after parsing, and ask the node parser to show its progress.

    Returns:
        The nodes produced by ``SimpleNodeParser.get_nodes_from_documents``
        for the loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")

    # Load every input file into document objects.
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Split the documents into nodes using the parser's default settings.
    node_parser = SimpleNodeParser.from_defaults()
    parsed_nodes = node_parser.get_nodes_from_documents(
        documents,
        show_progress=verbose,
    )
    if verbose:
        print(f"Parsed {len(parsed_nodes)} nodes")

    return parsed_nodes

# Parse the training and validation files into nodes, printing progress for each.
train_nodes = load_corpus(files=TRAIN_FILES, verbose=True)
val_nodes = load_corpus(files=VAL_FILES, verbose=True)

In [None]:
# Pull settings from the nearest .env file into the process environment;
# override=True lets values from .env replace variables that are already set.
# find_dotenv() returns an empty string when no file is found, in which case
# load_dotenv() just returns False -- report that here instead of letting the
# generation step below fail with a less obvious error.
# NOTE(review): presumably the .env file carries the LLM API key -- confirm.
if not dotenv.load_dotenv(dotenv.find_dotenv(), override=True):
    print(
        "Warning: no .env file found (or it defined no variables); "
        "relying on variables already present in the environment."
    )

# Build question/answer pair datasets from the training and validation nodes.
# NOTE(review): presumably this queries an LLM per node, so it can be slow and
# billable -- confirm before re-running.
train_dataset = generate_qa_embedding_pairs(train_nodes)
val_dataset = generate_qa_embedding_pairs(val_nodes)

In [None]:
# Persist the generated datasets as JSON.
# Create the target directory first so a missing folder cannot make the save
# fail right after the generation step.
# NOTE(review): assumes save_json does not create parent directories itself -- confirm.
os.makedirs("./emb_model_finetune", exist_ok=True)

train_dataset.save_json("./emb_model_finetune/train_dataset.json")
val_dataset.save_json("./emb_model_finetune/val_dataset.json")

In [None]:
# Set up fine-tuning of the "BAAI/bge-small-en" embedding model on the training
# dataset, with the validation dataset passed alongside it.
# NOTE(review): presumably the fine-tuned model is written to model_output_path
# and val_dataset is used for evaluation during training -- confirm.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="./emb_model_finetune/test_model",
    val_dataset=val_dataset,
)

In [None]:
# Run fine-tuning with the engine configured in the previous cell.
# NOTE(review): presumably long-running and writes the model to the engine's
# model_output_path -- confirm.
finetune_engine.finetune()