# Install

In [1]:
import json
import openai
import os

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import MetadataMode
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    Response
)

Load corpus

In [29]:
from llama_index.node_parser import SimpleNodeParser
def load_corpus(docs, for_training=False, verbose=False):
    parser = SimpleNodeParser.from_defaults()
    if for_training:
        nodes = parser.get_nodes_from_documents(docs[:116], show_progress=verbose)
    else:
        nodes = parser.get_nodes_from_documents(docs[117:], show_progress=verbose)

    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    return nodes

SEC_FILE = ["/mnt/d/operators_manual2307.pdf"]

print(f"Loading files {SEC_FILE}")

reader = SimpleDirectoryReader(input_files=SEC_FILE)
docs = reader.load_data()
print(f'Loaded {len(docs)} docs')

train_nodes = load_corpus(docs, for_training=True, verbose=True)
val_nodes = load_corpus(docs, for_training=False, verbose=True)

Loading files ['/mnt/d/operators_manual2307.pdf']
Loaded 237 docs


Parsing documents into nodes:   0%|          | 0/116 [00:00<?, ?it/s]

Parsed 116 nodes


Parsing documents into nodes:   0%|          | 0/120 [00:00<?, ?it/s]

Parsed 120 nodes


Generate synthetic queries

In [3]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.llms import OpenAI

In [None]:
os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]

train_dataset = generate_qa_embedding_pairs(train_nodes)
val_dataset = generate_qa_embedding_pairs(val_nodes)

train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

100%|██████████| 116/116 [03:33<00:00,  1.84s/it]
100%|██████████| 120/120 [03:23<00:00,  1.70s/it]


In [31]:
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

Fine-tune embedding model

In [32]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [33]:
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model",
    val_dataset=val_dataset,
)

In [34]:
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/24 [00:00<?, ?it/s]

Iteration:   0%|          | 0/24 [00:00<?, ?it/s]

In [35]:
embed_model = finetune_engine.get_finetuned_model()

In [36]:
embed_model

LangchainEmbedding(model_name='test_model', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7f90baf48940>)

Evaluate fine-tuned model

In [37]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [38]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(nodes, service_context=service_context, show_progress=True)
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]
        is_hit = expected_id in retrieved_ids  # assume 1 relevant doc

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)
    return eval_results

In [39]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer


def evaluate_st(
    dataset,
    model_id,
    name,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name=name)
    model = SentenceTransformer(model_id)
    return evaluator(model, output_path="results/")

OpenAI

In [40]:
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)

Generating embeddings:   0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

In [41]:
df_ada = pd.DataFrame(ada_val_results)

In [42]:
hit_rate_ada = df_ada['is_hit'].mean()
hit_rate_ada

0.9291666666666667

BAAI/bge-small-en

In [43]:
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)

Generating embeddings:   0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

In [44]:
df_bge = pd.DataFrame(bge_val_results)

In [45]:
hit_rate_bge = df_bge['is_hit'].mean()
hit_rate_bge

0.7666666666666667

In [46]:
evaluate_st(val_dataset, "BAAI/bge-small-en", name='bge')

0.5877716530517968

Fine-tuned model

In [47]:
finetuned = "local:test_model"
val_results_finetuned = evaluate(val_dataset, finetuned)

Generating embeddings:   0%|          | 0/120 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

In [50]:
df_finetuned = pd.DataFrame(val_results_finetuned)

In [51]:
hit_rate_finetuned = df_finetuned['is_hit'].mean()
hit_rate_finetuned

0.8083333333333333

In [52]:
evaluate_st(val_dataset, "test_model", name='finetuned')

0.6328135286339621

Hit rate

In [53]:
df_ada['model'] = 'ada'
df_bge['model'] = 'bge'
df_finetuned['model'] = 'fine_tuned'

In [54]:
df_all = pd.concat([df_ada, df_bge, df_finetuned])
df_all.groupby('model').mean('is_hit')

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
ada,0.929167
bge,0.766667
fine_tuned,0.808333


InformationRetrievalEvaluator

In [55]:
df_st_bge = pd.read_csv('results/Information-Retrieval_evaluation_bge_results.csv')
df_st_finetuned = pd.read_csv('results/Information-Retrieval_evaluation_finetuned_results.csv')

df_st_bge['model'] = 'bge'
df_st_finetuned['model'] = 'fine_tuned'
df_st_all = pd.concat([df_st_bge, df_st_finetuned])
df_st_all = df_st_all.set_index('model')
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bge,-1,-1,1.0,1.0,1.0,1.0,1.0,1.0,0.333333,1.0,...,1.0,0.333333,1.0,0.2,1.0,0.1,1.0,1.0,1.0,1.0
bge,-1,-1,0.445833,0.670833,0.75,0.866667,0.445833,0.445833,0.223611,0.670833,...,0.445833,0.223611,0.670833,0.15,0.75,0.086667,0.866667,0.581286,0.649846,0.587772
fine_tuned,-1,-1,1.0,1.0,1.0,1.0,1.0,1.0,0.333333,1.0,...,1.0,0.333333,1.0,0.2,1.0,0.1,1.0,1.0,1.0,1.0
fine_tuned,-1,-1,0.4875,0.729167,0.808333,0.925,0.4875,0.4875,0.243056,0.729167,...,0.4875,0.243056,0.729167,0.161667,0.808333,0.0925,0.925,0.628697,0.70003,0.632814
