# Fine-tuning

Reference:
- [Fine-tuning to Memorize Knowledge](https://gpt-index.readthedocs.io/en/stable/examples/finetuning/knowledge/finetune_knowledge.html)

In [33]:
# import os
# import openai
from llama_index import ServiceContext
from llama_index import SimpleDirectoryReader
from llama_index import SummaryIndex
from llama_index import PromptTemplate
# from llama_index import VectorStoreIndex
# from llama_index import LLMPredictor, ServiceContext
# from llama_index import StorageContext, load_index_from_storage
# from llama_index import PromptHelper
# from llama_index import Prompt
from llama_index.llms import OpenAI
from llama_index.callbacks import CallbackManager
from llama_index.evaluation import DatasetGenerator
from llama_index.node_parser import SimpleNodeParser
from llama_index.finetuning import OpenAIFinetuneEngine

# from langchain.chat_models import ChatOpenAI
import openai
import environ
# from IPython.display import Markdown, display


# from llama_index.evaluation import QueryResponseEvaluator, ResponseEvaluator


In [2]:
# Populate the process environment first (read_env presumably loads a local
# .env file -- confirm its location), then read the OpenAI key from it so the
# credential never appears in the notebook itself.
environ.Env.read_env()
env = environ.Env()
API_KEY = env("OPENAI_API_KEY")
openai.api_key = API_KEY



In [3]:
# A single callback manager (with no handlers) is shared by both contexts.
callback_manager = CallbackManager([])

# GPT-3.5 context: used further down to answer the generated questions.
gpt_35_llm = OpenAI(model="gpt-3.5-turbo-0613", temperature=0.3)
gpt_35_context = ServiceContext.from_defaults(
    llm=gpt_35_llm, callback_manager=callback_manager
)

# GPT-4 context: used further down to write the questions.
gpt_4_llm = OpenAI(model="gpt-4-0613", temperature=0.3)
gpt_4_context = ServiceContext.from_defaults(
    llm=gpt_4_llm, callback_manager=callback_manager
)

# Load Data

In [4]:
# Folder holding the source documents; every file the reader finds in it is
# loaded into `docs`.
doc_path = "documents_pdf"
reader = SimpleDirectoryReader(input_dir=doc_path)
docs = reader.load_data()

# Generate Dataset

In [5]:
# Split the loaded documents into nodes using the parser's default settings.
node_parser = SimpleNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(docs)

In [6]:
# How many nodes the parser produced for the whole document set.
len(nodes)

802

In [11]:
# Keep only the first three nodes for the steps below
# (presumably to keep this experimental run small -- use `nodes` for a full run).
nodes_subset = nodes[:3]

In [12]:
# Sanity check: the slice is a separate list object from `nodes`
# (the two printed ids differ).
print(id(nodes))
print(id(nodes_subset))

140325892919360
140325899637632


In [13]:
# Inspect the selected nodes (text, metadata and relationships).
nodes_subset

[TextNode(id_='2902614e-ac52-47d7-bc0a-f7be2f38bc22', embedding=None, metadata={'page_label': '1', 'file_name': 'ba146349.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='da0e5988-f2f5-4597-8915-69fe3505bdd6', node_type=None, metadata={'page_label': '1', 'file_name': 'ba146349.pdf'}, hash='592bb4a567ff530dbc0607be761d79990988384fdd21bdcda882ef64fc386f94')}, hash='a7904a32b72a275d1173b76477e9771a60010bdd2e142d5925208f125944f67d', text='Familienkasse\nWir helfen Familien.\nMerkblatt\nKinderzuschlag\nDer Zuschlag zum Kindergeld für Familien  \nmit kleinem Einkommen', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='d2677291-1079-4afc-b563-23f31579b1e1', embedding=None, metadata={'page_label': '2', 'file_name': 'ba146349.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys

In [15]:
from tqdm.notebook import tqdm
import json

# Number of questions requested from the LLM for each node (chunk); the value
# is interpolated into the prompt below.
num_questions_per_chunk = 2 # 10

# Instruction given to the question-writing LLM for every node.
# The interpolated count is followed by the noun "questions"; without it the
# sentence read "formulate 2 that captures ..." and never said what to produce.
question_gen_query = (
    "You are a Teacher/ Professor. Your task is to setup "
    "a quiz/examination. Using the provided context, "
    f"formulate {num_questions_per_chunk} questions that each capture an important fact from the "
    "context. \n"
    "You MUST obey the following criteria:\n"
    "- The questions should be diverse in nature across the document.\n"
    "- Restrict the question to the context information provided.\n"
    "- Do NOT create a question that cannot be answered from the context.\n"
    "- Phrase the question so that it does NOT refer to specific context. "
    'For instance, do NOT put phrases like "given provided context" or "in this work" in the question, '
    "because if the question is asked elsewhere it wouldn't be provided specific context. Replace these terms "
    "with specific details.\n\n"
    "Generate the questions below:\n"
)

# Build the raw question/answer dataset, one JSON object per line with the
# keys "query" and "response". For every node, questions are generated with
# the GPT-4 context and each question is then answered with the GPT-3.5
# context, using only that node as the index content.
with open("data_finetuning/qa_pairs.jsonl", "w") as fp:
    for idx, node in enumerate(nodes_subset):
        dataset_generator = DatasetGenerator(
            [node],
            question_gen_query=question_gen_query,
            service_context=gpt_4_context,
            # metadata_mode="all", # TODO: this argument does NOT exist, I have to use a newer version of LlamaIndex
        )
        # Use the same count the prompt asks for instead of a hard-coded 10,
        # so the two settings cannot drift apart.
        node_questions = dataset_generator.generate_questions_from_nodes(
            num=num_questions_per_chunk
        )
        print(f"[Node {idx}] Generated questions:\n {node_questions}")

        # The index and query engine depend only on the node, not on the
        # question, so they are built once per node rather than per question.
        index = SummaryIndex([node], service_context=gpt_35_context)
        query_engine = index.as_query_engine()

        # for each question, get a response
        for question in tqdm(node_questions):
            response = query_engine.query(question)
            out_dict = {"query": question, "response": str(response)}
            fp.write(json.dumps(out_dict) + "\n")

# fp = open("data_finetuning/qa_pairs.jsonl", "w")
# for idx, node in enumerate(nodes):
#     dataset_generator = DatasetGenerator(
#         [node],
#         question_gen_query=question_gen_query,
#         service_context=gpt_4_context,
#         metadata_mode="all",
#     )
#     node_questions_0 = dataset_generator.generate_questions_from_nodes(num=10)
#     print(f"[Node {idx}] Generated questions:\n {node_questions_0}")
#     # for each question, get a response
#     for question in tqdm(node_questions_0):
#         index = SummaryIndex([node], service_context=gpt_35_context)
#         query_engine = index.as_query_engine()
#         response = query_engine.query(question)
#         out_dict = {"query": question, "response": str(response)}
#         print(f"[Node {idx}] Outputs: {out_dict}")
#         fp.write(json.dumps(out_dict) + "\n")
# 
# fp.close()

[Node 0] Generated questions:
 ['What is the purpose of the Kinderzuschlag as mentioned in the document from Familienkasse?', 'Who is the intended beneficiary of the Kinderzuschlag according to the information in the ba146349.pdf file?']


  0%|          | 0/2 [00:00<?, ?it/s]

[Node 0] Outputs: {'query': 'What is the purpose of the Kinderzuschlag as mentioned in the document from Familienkasse?', 'response': 'The purpose of the Kinderzuschlag mentioned in the document from Familienkasse is to provide an additional financial support to families with low income, in addition to the regular child benefit.'}
[Node 0] Outputs: {'query': 'Who is the intended beneficiary of the Kinderzuschlag according to the information in the ba146349.pdf file?', 'response': 'The intended beneficiary of the Kinderzuschlag according to the information in the ba146349.pdf file is families with low income.'}
[Node 1] Generated questions:
 ['What are the different factors that can affect the amount of Kinderzuschlag a family can receive, according to the document?', 'What are the necessary steps one must take to receive Kinderzuschlag, as outlined in the document?']


  0%|          | 0/2 [00:00<?, ?it/s]

[Node 1] Outputs: {'query': 'What are the different factors that can affect the amount of Kinderzuschlag a family can receive, according to the document?', 'response': 'The document mentions that the amount of Kinderzuschlag a family can receive can be affected by factors such as the income and assets of both the child and the parents.'}
[Node 1] Outputs: {'query': 'What are the necessary steps one must take to receive Kinderzuschlag, as outlined in the document?', 'response': 'To receive Kinderzuschlag, one must take the following necessary steps as outlined in the document:\n\n1. Determine if you are eligible for Kinderzuschlag based on the criteria outlined in section 1.\n2. Calculate and assess your income and assets, both for the child and the parents, as explained in section 2.\n3. Consider what income and assets should be taken into account, as described in section 3.\n4. Understand the duration and assessment period for receiving Kinderzuschlag, as explained in section 4.\n5. E

  0%|          | 0/2 [00:00<?, ?it/s]

[Node 2] Outputs: {'query': 'What are the conditions under which parents can receive the Kinderzuschlag, according to the German social security system?', 'response': 'Parents can receive the Kinderzuschlag if they earn enough for themselves but their income is not sufficient for their entire family. Additionally, if the housing costs are particularly high or there are multiple children in the household, a reduced Kinderzuschlag can be received even in middle-income ranges.'}
[Node 2] Outputs: {'query': 'What additional benefits are available to those who receive the Kinderzuschlag?', 'response': 'Those who receive the Kinderzuschlag are also eligible for additional benefits such as educational and participation benefits, including free meals in daycare and school, a school supplies package worth 150 euros per school year, and exemption from daycare fees.'}


# Evaluate questions

In [18]:
# Prompt for the LLM judge that decides whether a generated response actually
# answers its query. The judge is asked to reply with YES or NO only.
# Wording fixes relative to the earlier text:
#   - the NO rule now refers to the *response* (it previously said "query");
#   - a space separates the quoted example from "Please return NO ...";
#   - "You be given" -> "You will be given".
query_eval_tmpl = PromptTemplate(
    "Your task is to evaluate the following: If the response for the query isn't able to answer the question provided.\n"
    "If the response isn't able to answer the question, answer NO.\n"
    "Otherwise answer YES.\n"
    "To elaborate, you might get an answer like the following: 'The context does not contain the answer to this question.' "
    "Please return NO in that case. "
    "You will be given the query and response. Return YES or NO as the answer.\n"
    "Query: \n {query_str}\n"
    "Response: \n {response_str}\n"
    "Answer: "
)

# LLM used as the judge for the prompt above.
eval_llm = OpenAI(model="gpt-4-0613")

In [19]:
def filter_data(path: str, out_path: str, llm=None, prompt_tmpl=None) -> None:
    """Copy only the answerable Q/A pairs from one JSONL file to another.

    Each input line is a JSON object with the keys ``"query"`` and
    ``"response"``. The pair is formatted into the evaluation prompt and sent
    to the judge LLM; lines whose verdict text contains ``"NO"`` are dropped,
    all others are written to ``out_path`` unchanged.

    Args:
        path: JSONL file with the raw Q/A pairs.
        out_path: Destination JSONL file (overwritten).
        llm: Object exposing ``complete(prompt)``. Defaults to the notebook's
            module-level ``eval_llm``.
        prompt_tmpl: Object exposing ``format(query_str=..., response_str=...)``.
            Defaults to the notebook's module-level ``query_eval_tmpl``.
    """
    judge = eval_llm if llm is None else llm
    template = query_eval_tmpl if prompt_tmpl is None else prompt_tmpl

    # Context managers close both files even if a line fails to parse or the
    # LLM call raises.
    with open(path, "r") as in_fp, open(out_path, "w") as out_fp:
        for line in in_fp:
            if not line.strip():
                # Tolerate blank lines instead of crashing in json.loads.
                continue

            qa_pair = json.loads(line)
            verdict = judge.complete(
                template.format(
                    query_str=qa_pair["query"], response_str=qa_pair["response"]
                )
            )

            # complete() returns a response object, not a string. Testing
            # `"NO" in <object>` never inspects the generated text, so the
            # verdict is converted to its text before the check.
            if "NO" in str(verdict):
                continue
            out_fp.write(line)

In [21]:
# Run the LLM judge over the raw pairs and keep only the answerable ones.
filter_data(
    path="data_finetuning/qa_pairs.jsonl",
    out_path="data_finetuning/qa_pairs_filtered.jsonl",
)

[0] QA Pair: {'query': 'What is the purpose of the Kinderzuschlag as mentioned in the document from Familienkasse?', 'response': 'The purpose of the Kinderzuschlag mentioned in the document from Familienkasse is to provide an additional financial support to families with low income, in addition to the regular child benefit.'} 
 Eval: YES
[1] QA Pair: {'query': 'Who is the intended beneficiary of the Kinderzuschlag according to the information in the ba146349.pdf file?', 'response': 'The intended beneficiary of the Kinderzuschlag according to the information in the ba146349.pdf file is families with low income.'} 
 Eval: YES
[2] QA Pair: {'query': 'What are the different factors that can affect the amount of Kinderzuschlag a family can receive, according to the document?', 'response': 'The document mentions that the amount of Kinderzuschlag a family can receive can be affected by factors such as the income and assets of both the child and the parents.'} 
 Eval: YES
[3] QA Pair: {'query'

# Split Train and Validation Sets

In [22]:
from copy import deepcopy
import random


def split_train_val(path: str, out_train_path: str, out_val_path: str, train_split=0.7, seed=None):
    """Shuffle a JSONL file and split it into train and validation files.

    Args:
        path: Input JSONL file, one record per line.
        out_train_path: Output file receiving the first ``train_split``
            fraction of the shuffled lines (overwritten).
        out_val_path: Output file receiving the remaining lines (overwritten).
        train_split: Fraction of lines assigned to the train file.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used, as before.
    """
    with open(path, "r") as fp:
        # Give every record exactly one trailing newline and drop blank lines.
        # Otherwise a final line without "\n" would be glued to the next
        # record once the order is shuffled.
        lines = [raw.rstrip("\r\n") + "\n" for raw in fp if raw.strip()]

    # shuffle the lines to make sure that the "train questions" cover most of the context
    if seed is None:
        random.shuffle(lines)
    else:
        random.Random(seed).shuffle(lines)

    split_idx = int(train_split * len(lines))

    with open(out_train_path, "w") as out_fp:
        out_fp.writelines(lines[:split_idx])

    with open(out_val_path, "w") as out_fp:
        out_fp.writelines(lines[split_idx:])

In [23]:
# Split the filtered pairs: 70% go to the train file, the rest to validation.
split_train_val(
    path="data_finetuning/qa_pairs_filtered.jsonl",
    out_train_path="data_finetuning/qa_pairs_train.jsonl",
    out_val_path="data_finetuning/qa_pairs_val.jsonl",
    train_split=0.7
)

# Format into Training Data

In [31]:
# TODO: try with different system prompts
# System message prepended to every training example.
system_prompt = {
    "role": "system",
    # "content": "You are a helpful assistant helping to answer questions about the Llama 2 paper.",
    "content": (
        "You are an expert on the German administration system and your job is to answer technical questions. "
        "Assume that all questions are related to the the provided context. "
        "Keep your answers based on facts, do not hallucinate information."
    )
}

# Turn every train Q/A pair into one chat-style record:
# {"messages": [system, user, assistant]}, written as one JSON object per line.
# The context manager closes both files even if a line fails to parse.
with open("data_finetuning/qa_pairs_train.jsonl", "r") as fp, open(
    "data_finetuning/qa_pairs_openai.jsonl", "w"
) as out_fp:
    for line in fp:
        qa_pair = json.loads(line)
        user_prompt = {"role": "user", "content": qa_pair["query"]}
        assistant_prompt = {"role": "assistant", "content": qa_pair["response"]}
        out_dict = {
            "messages": [system_prompt, user_prompt, assistant_prompt],
        }
        out_fp.write(json.dumps(out_dict) + "\n")

# Fine-tune the Model

In [39]:
# Set up the fine-tuning engine with "gpt-3.5-turbo" (presumably the base
# model to fine-tune) and the chat-formatted training file written above.
finetune_engine = OpenAIFinetuneEngine(
    "gpt-3.5-turbo",
    "data_finetuning/qa_pairs_openai.jsonl",
    # start_job_id="<start-job-id>"  # if you have an existing job, can specify id here
)

In [35]:
# Start fine-tuning. In the recorded run this printed validation statistics
# for the training file and was then stopped by a KeyboardInterrupt.
finetune_engine.finetune()

Num examples: 4
First example:
{'role': 'system', 'content': 'You are an expert on the German administration system and your job is to answer technical questions. Assume that all questions are related to the the provided context. Keep your answers based on facts, do not hallucinate information.'}
{'role': 'user', 'content': 'What are the different factors that can affect the amount of Kinderzuschlag a family can receive, according to the document?'}
{'role': 'assistant', 'content': 'The document mentions that the amount of Kinderzuschlag a family can receive can be affected by factors such as the income and assets of both the child and the parents.'}
No errors found
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 117, 289
mean / median: 165.5, 128.0
p5 / p95: 117.0, 244.00000000000003

#### 

KeyboardInterrupt: 

In [None]:
# Fetch the engine's current fine-tuning job (presumably to check its status).
finetune_engine.get_current_job()