In [45]:
import os
from dotenv import load_dotenv

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from raft_util import get_doc_chunk, generate_qa_pair, process_qa_pair

# Load the variables defined in the local .env file, then look up the OpenAI
# key; the lookup yields None when "openai_api_key" is not set.
load_dotenv(dotenv_path=".env")
openai_token = os.environ.get("openai_api_key")

In [46]:
# openai model
# Chat model used for answer generation: temperature 0, one completion per call.
llm = OpenAI(temperature=0, n=1, model="gpt-3.5-turbo", api_key=openai_token)
# Hand the embedding client the same key explicitly. The token is read from the
# lower-case "openai_api_key" variable, so the library's default credential
# lookup is not guaranteed to find it on its own.
embed_model = OpenAIEmbedding(api_key=openai_token)

In [47]:
# Load the source documents from ./text/ as chunks using the project helper.
# `docs` is indexed by chunk position in later cells (e.g. docs[35] is one
# chunk of text).
docs = get_doc_chunk("./text/")

In [48]:
len(docs)  # number of chunks; author's note: this count is taken before truncation

75

In [49]:
# Spot-check one chunk. Index 35 is also the lower bound used further down when
# sampling the extra (distractor) context chunks.
docs[35]

'This eBook is for the use of anyone anywhere in the United States and most     other parts of the world at no cost and with almost no restrictions     whatsoever. You may copy it, give it away or re-use it under the terms     of the Project Gutenberg License included with this eBook or online     at www.gutenberg.org. If you     are not located in the United States, you will have to check the laws     of the country where you are located before using this eBook.'

In [50]:
### Load the previously generated questions
# questions.txt is consumed as consecutive 3-line records: the first line of a
# record is the question, the middle line is skipped, and the third line is the
# index of the chunk the question belongs to.
old_questions = []
with open("questions.txt", "r") as f:
    old_questions = f.readlines()

# Flatten the records into [question, chunk index, question, chunk index, ...].
target = []
for record_start in range(0, len(old_questions), 3):
    target.append(old_questions[record_start].strip())
    target.append(old_questions[record_start + 2].strip())

target

['Who are Tardo and Peo in "DISQUALIFIED" by Charles L. Fontenay?',
 '0',
 'What is the significance of the castle overlooking the area in "DISQUALIFIED" by Charles L. Fontenay?',
 '0',
 'Who entertained Tardo and Peo at luncheon?',
 '1',
 'What was served for dessert?',
 '1',
 'What technical aid is available aboard the ship?',
 '2',
 'What kind of equipment will not be received until a more thorough investigation is conducted?',
 '2',
 'What are some fundamental requirements for colonies in other star systems?',
 '3',
 'What difficulties have faced colonies in other star systems?',
 '3',
 'What is the reason why the ship just rusted away?',
 '4',
 'Who is the first ship to land on the planet since colonization?',
 '4',
 'What were the main difficulties faced by the colonizers on the planet?',
 '5',
 'Why did the colonizers know the planet was habitable before landing on it?',
 '5',
 'What did the colonists do to make the planet liveable?',
 '6',
 'Was slavery used by the colonists in

In [51]:
from llama_index.core.llms import ChatMessage

def generate_cot_answer(llm: OpenAI, question: str, context: str) -> str:
    """Ask the chat model for a chain-of-thought answer to ``question``.

    The prompt instructs the model to analyse ``context``, reason step by
    step, wrap verbatim quotes in ##begin_quote## / ##end_quote## and finish
    with a line of the form ``<ANSWER>: $answer``.

    Args:
        llm: Chat model; its ``chat`` method is called once.
        question: The question to answer.
        context: Text placed in the prompt as context (the caller may mix
            relevant and irrelevant chunks).

    Returns:
        The text of the model's reply, without any role prefix. An empty
        string is returned if the reply carries no text content.
    """
    prompt = f"""
        Question: {question}\nContext: {context}\n
        Answer the question using the information given in the context above or your own knowledge if context is irrelevant. 
        Here is things to pay attention to:
        - First analyse the given context.
        - You must identify any part of the context that is not relevant to answer the question.
        - Provide step-by-step reasoning on how to answer the question.
        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct.
    """
    messages = [
        ChatMessage(
            role="system",
            content="You are a helpful assistant who can provide an answer given a question and relevant context.",
        ),
        ChatMessage(role="user", content=prompt),
    ]
    response = llm.chat(messages)
    # str(response) renders the reply as "<role>: <content>", which would make
    # every saved answer start with "assistant: ". Return the message text only.
    answer = response.message.content
    return answer if answer is not None else ""

In [52]:
# generate CoT answer for each question
import random
from tqdm.auto import tqdm
# Separator written after each section (question / context / answer) of a record.
SECTION_BREAK = "=" * 5
# Half-open index range in `docs` from which the extra (distractor) chunk is drawn.
DISTRACTOR_START, DISTRACTOR_STOP = 35, 75
# Fixed seed so that re-running this cell selects the same distractor chunks.
DISTRACTOR_SEED = 42
distractor_rng = random.Random(DISTRACTOR_SEED)

# `target` alternates question, chunk index, question, chunk index, ...
# Pair the entries up so the progress bar has a known total.
qa_pairs = list(zip(target[0::2], target[1::2]))

# An explicit encoding keeps non-ASCII characters in chunks or model output from
# failing on platforms whose default text encoding is not UTF-8.
with open("raft_some_o_c.txt", "w", encoding="utf-8") as raft_file:
    for question, chunk_id in tqdm(qa_pairs, desc="CoT answers"):
        # question
        raft_file.write(question + "\n")
        raft_file.write(SECTION_BREAK + "\n")

        # context: the chunk the question was written for, followed by one
        # randomly chosen chunk from the distractor range
        true_context = docs[int(chunk_id)]
        o_c_context = docs[distractor_rng.randrange(DISTRACTOR_START, DISTRACTOR_STOP)]
        combined_context = true_context + "\n" + o_c_context
        raft_file.write(combined_context + "\n")
        raft_file.write(SECTION_BREAK + "\n")

        # CoT answer generated from the question and the combined context
        cot = generate_cot_answer(llm, question, combined_context)
        raft_file.write(cot + "\n")
        raft_file.write(SECTION_BREAK + "\n")

0it [00:00, ?it/s]

In [53]:
## Using a regex because I forgot to add "\n" after the context.
## regex: (?<!^)=====