Install LlamaIndex Libraries

In [4]:
'''
!pip install llama-index
!pip install llama-index-llms-gemini
!pip install llama-index-embeddings-huggingface
!pip install spacy
!pip install llama-index-llms-langchain
'''

'\n!pip install llama-index\n!pip install llama-index-llms-gemini\n!pip install llama-index-embeddings-huggingface\n!pip install spacy\n!pip install llama-index-llms-langchain\n'

In [5]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings,
)
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
import os
import time
import nest_asyncio

nest_asyncio.apply()

Set up the LLM

In [6]:
from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI

# Definir el modelo local
llm = ChatOpenAI(
    model="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
    temperature=0.9,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    base_url="http://localhost:1234/v1",
    api_key="lm-studio"    # organization="...",
    # other params...
)

Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [34]:
import getpass
import os

# pip install langchain-groq
# Definir el modelo groq
from langchain_groq import ChatGroq

os.environ["GROQ_API_KEY"] = "XXX" # Definir la API Key de Groq
api_key = os.getenv("GROQ_API_KEY", "NotFound") # Obtener la API Key de las variables de entorno
print(api_key)

llm = ChatGroq(
    model="mixtral-8x7b-32768",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

Settings.llm = llm

Retrieve documents

In [35]:
documents = SimpleDirectoryReader(input_files=["18_EstructurasDeDatos_4196.json"]).load_data()

Generate sample questions

In [36]:
eval_documents = documents[:3]
data_generator = DatasetGenerator.from_documents(eval_documents)
eval_questions = data_generator.generate_questions_from_nodes()

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [37]:
print(eval_questions)

['What is the title and code of the subject related to data structures?', 'What is the purpose of studying data structures and algorithms in a real-world setting?', 'How many credits does this subject have in the pre-degree program?', 'What are the academic prerequisites for enrolling in this subject?', 'List the learning objectives of this subject.', 'What are the expected learning outcomes for students in this subject?', 'What are the four thematic contents of this subject?', 'Describe the first teaching strategy used in this subject.', 'Explain the fourth teaching strategy used in this subject and its benefits.', 'What are the two dates related to the creation and modification of the syllabus file?', 'What is the file type and size of the document "18\\_EstructurasDeDatos\\_4196.json"?', 'What are the three types of data structures covered in the course?', 'What is the weight of Parcial 1 in the overall course evaluation?', 'What is the objective of the first teaching strategy, "apr

Set up faithfulness and relevancy evaluators

In [38]:
faithfulness = FaithfulnessEvaluator()
relevancy = RelevancyEvaluator()

In [39]:
def evaluate(chunk_size, eval_questions):

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    vector_index = VectorStoreIndex.from_documents(
        eval_documents
    )

    query_engine = vector_index.as_query_engine()
    num_questions = len(eval_questions)
    # print("Number of questions: ", num_questions)

    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time
        
        faithfulness_result = faithfulness.evaluate_response(
            response=response_vector
        ).passing
        
        relevancy_result = relevancy.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

Evaluate using only 1 question

In [42]:
chunk_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]

one_question_array = []
one_question_array.append(eval_questions[1])

print("Question to evaluate: ", one_question_array)

for chunk_size in chunk_sizes:
  avg_response_time, avg_faithfulness, avg_relevancy = evaluate(chunk_size, one_question_array)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time}s, Average Faithfulness: {avg_faithfulness}, Average Relevancy: {avg_relevancy}")

Question to evaluate:  ['What is the purpose of studying data structures and algorithms in a real-world setting?']
Chunk size 1 - Average Response time: 11.852120637893677s, Average Faithfulness: 1.0, Average Relevancy: 1.0
Chunk size 2 - Average Response time: 27.78049921989441s, Average Faithfulness: 1.0, Average Relevancy: 1.0
Chunk size 4 - Average Response time: 27.750017166137695s, Average Faithfulness: 1.0, Average Relevancy: 1.0
Chunk size 8 - Average Response time: 28.74750590324402s, Average Faithfulness: 1.0, Average Relevancy: 1.0
Chunk size 16 - Average Response time: 28.742502689361572s, Average Faithfulness: 1.0, Average Relevancy: 1.0
Chunk size 32 - Average Response time: 28.722283363342285s, Average Faithfulness: 1.0, Average Relevancy: 1.0
Chunk size 64 - Average Response time: 28.76851749420166s, Average Faithfulness: 0.0, Average Relevancy: 1.0
Chunk size 128 - Average Response time: 27.713791131973267s, Average Faithfulness: 0.0, Average Relevancy: 1.0
Chunk size 

Evaluate using 10 questions

In [None]:
chunk_sizes = [128, 256, 512, 1024, 2048]

ten_question_array = []
for i in range(10):  # Desde 0 hasta 9 inclusive
    ten_question_array.append(eval_questions[i])

print("Question to evaluate: ", ten_question_array)

for chunk_size in chunk_sizes:
  avg_response_time, avg_faithfulness, avg_relevancy = evaluate(chunk_size, ten_question_array)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time}s, Average Faithfulness: {avg_faithfulness}, Average Relevancy: {avg_relevancy}")

Evaluate using all questions

In [None]:
chunk_sizes = [128, 256, 512, 1024, 2048]

print("Questions to evaluate: ", eval_questions)

for chunk_size in chunk_sizes:
  avg_response_time, avg_faithfulness, avg_relevancy = evaluate(chunk_size, eval_questions)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time}s, Average Faithfulness: {avg_faithfulness}, Average Relevancy: {avg_relevancy}")