In [None]:
!pip install llama-index==0.5.6
!pip install langchain==0.0.148

In [None]:
import os
import sys
from getpass import getpass

from IPython.display import Markdown, display
from langchain import OpenAI
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper, ServiceContext
from sklearn.metrics import precision_score, recall_score, f1_score

def construct_index(directory_path, temperature=0.0):
    """Build a vector index over the files in ``directory_path`` and save it.

    The documents are read with ``SimpleDirectoryReader``, indexed with
    ``GPTSimpleVectorIndex`` and the result is written to ``index.json`` in
    the current working directory.

    Args:
        directory_path: Directory handed to ``SimpleDirectoryReader``.
        temperature: Sampling temperature passed to the ``OpenAI`` LLM used
            by the returned index. Defaults to ``0.0`` so that repeated
            queries (and therefore the exact-match evaluation) are as
            repeatable as the model allows; raise it for more varied answers.

    Returns:
        The index object that was built and saved.
    """
    # set maximum input size
    max_input_size = 4096
    # set number of output tokens
    num_outputs = 2000
    # set maximum chunk overlap
    max_chunk_overlap = 20
    # set chunk size limit
    chunk_size_limit = 600

    # define prompt helper
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    # define LLM
    llm_predictor = LLMPredictor(
        llm=OpenAI(temperature=temperature, model_name="text-davinci-003", max_tokens=num_outputs)
    )

    documents = SimpleDirectoryReader(directory_path).load_data()

    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
    index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)

    # Persist the index so it can be reloaded later without rebuilding it.
    index.save_to_disk('index.json')

    return index

def ask_ai(index):
    """Interactively answer questions against ``index`` until the user quits.

    Each question is read with ``input``, sent to ``index.query`` and the
    ``response`` attribute of the result is rendered as Markdown.

    Args:
        index: Object exposing ``query(text)``. When ``None``, the index is
            loaded from ``index.json`` in the current working directory.

    Returns:
        None. The loop ends when the user submits an empty line, ``exit`` or
        ``quit``, or when input is closed or interrupted.
    """
    # Only fall back to the saved copy when the caller did not supply an index.
    if index is None:
        index = GPTSimpleVectorIndex.load_from_disk('index.json')

    print("Submit an empty line, 'exit' or 'quit' to stop.")
    while True:
        try:
            query = input("What do you want to ask? ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt while waiting for input ends the
            # session cleanly instead of surfacing a traceback.
            break
        if not query or query.lower() in ("exit", "quit"):
            break
        response = index.query(query)
        display(Markdown(f"{response.response}"))


In [None]:
def evaluate(index, test_questions=None):
    """Query ``index`` with labeled questions and print exact-match metrics.

    Every question is sent to ``index.query``; the ``response`` attribute of
    the result is compared with the expected answer after stripping
    surrounding whitespace from both. Precision, recall and F1 are computed
    by scikit-learn with macro averaging, using the answer strings themselves
    as the labels.

    Args:
        index: Object exposing ``query(question)``.
        test_questions: Optional iterable of ``(question, correct_answer)``
            pairs. When omitted, the built-in placeholder pairs are used.

    Returns:
        None. The metrics are printed. If there are no test questions a
        notice is printed and nothing is computed.
    """
    if test_questions is None:
        # Placeholder dataset: replace with real questions and answers.
        test_questions = [
            ("Question 1", "Correct Answer 1"),
            ("Question 2", "Correct Answer 2"),
            # Add more questions and correct answers as needed
        ]
    # Materialize once so generators can be both iterated and counted.
    test_questions = list(test_questions)

    if not test_questions:
        # Avoids a ZeroDivisionError in the accuracy computation below.
        print("No test questions provided; nothing to evaluate.")
        return

    predicted_answers = []
    correct_answers = []

    for question, correct_answer in test_questions:
        response = index.query(question)
        # Exact string comparison is brittle: surrounding whitespace in the
        # model output would otherwise count as a wrong answer. A missing
        # response is treated as an empty answer.
        predicted_answers.append(str(response.response or "").strip())
        correct_answers.append(str(correct_answer).strip())

    # Compute evaluation metrics
    matches = sum(1 for pred, gt in zip(predicted_answers, correct_answers) if pred == gt)
    accuracy = matches / len(correct_answers)
    # zero_division=0 scores labels that are never predicted (or never
    # expected) as 0 without emitting an undefined-metric warning per call.
    precision = precision_score(correct_answers, predicted_answers, average='macro', zero_division=0)
    recall = recall_score(correct_answers, predicted_answers, average='macro', zero_division=0)
    f1 = f1_score(correct_answers, predicted_answers, average='macro', zero_division=0)

    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")


In [None]:
# Read the key with getpass so it is not echoed into the notebook output,
# which would otherwise be saved (and shared) together with the notebook.
os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI key:")

In [None]:
# Build the vector index from the files under ./example_data_folder.
index = construct_index("./example_data_folder")

In [None]:
# Start the interactive question-answering prompt.
ask_ai(index)

In [None]:
# Query the index with the labeled test questions and print the metrics.
evaluate(index)